From 3223c68992d568695b0e93148cfe74c0843d1a3a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:11:55 -0600 Subject: [PATCH 001/329] Update Cargo.toml --- rust/Cargo.toml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index de26f87c77846..0a4ef2a7f2c1a 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -17,18 +17,12 @@ [workspace] members = [ - "arrow", - "parquet", - "parquet_derive", - "parquet_derive_test", "datafusion", "datafusion-examples", - "arrow-flight", - "integration-testing", "benchmarks", ] # this package is excluded because it requires different compilation flags, thereby significantly changing # how it is compiled within the workspace, causing the whole workspace to be compiled from scratch # this way, this is a stand-alone package that compiles independently of the others. -exclude = ["arrow-pyarrow-integration-testing", "ballista"] +exclude = ["ballista"] From a93e4a790ac5f77e0cea63dcde2a6598447343e0 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:21:34 -0600 Subject: [PATCH 002/329] Update DataFusion dependencies on arrow-rs --- rust/benchmarks/Cargo.toml | 4 ++-- rust/datafusion-examples/Cargo.toml | 4 ++-- rust/datafusion/Cargo.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rust/benchmarks/Cargo.toml b/rust/benchmarks/Cargo.toml index 5cdf0f94ac32c..7fd8444716865 100644 --- a/rust/benchmarks/Cargo.toml +++ b/rust/benchmarks/Cargo.toml @@ -31,8 +31,8 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = { path = "../arrow" } -parquet = { path = "../parquet" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } datafusion = { path = "../datafusion" } structopt = { version = "0.3", default-features = false } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/rust/datafusion-examples/Cargo.toml b/rust/datafusion-examples/Cargo.toml index c86e7ccbe3cc3..1a060504df721 100644 --- a/rust/datafusion-examples/Cargo.toml +++ b/rust/datafusion-examples/Cargo.toml @@ -29,10 +29,10 @@ publish = false [dev-dependencies] +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } datafusion = { path = "../datafusion" } -arrow = { path = "../arrow" } prost = "0.7" -arrow-flight = { path = "../arrow-flight" } tonic = "0.4" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } futures = "0.3" diff --git a/rust/datafusion/Cargo.toml b/rust/datafusion/Cargo.toml index fd1c1b29590e6..88a50dd5a4432 100644 --- a/rust/datafusion/Cargo.toml +++ b/rust/datafusion/Cargo.toml @@ -50,8 +50,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { path = "../arrow", version = "4.0.0-SNAPSHOT", features = ["prettyprint"] } -parquet = { path = "../parquet", version = "4.0.0-SNAPSHOT", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c", features = ["arrow"] } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} From 95dba6c8da928289bb9cb6041027e6e35c482291 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:25:23 -0600 Subject: [PATCH 003/329] Update 
Ballista dependencies on arrow-rs --- rust/ballista/rust/benchmarks/tpch/Cargo.toml | 6 +++--- rust/ballista/rust/client/Cargo.toml | 4 ++-- rust/ballista/rust/core/Cargo.toml | 5 +++-- rust/ballista/rust/executor/Cargo.toml | 5 +++-- rust/ballista/rust/scheduler/Cargo.toml | 2 +- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/rust/ballista/rust/benchmarks/tpch/Cargo.toml b/rust/ballista/rust/benchmarks/tpch/Cargo.toml index 822d101d4e909..9a7d65199266e 100644 --- a/rust/ballista/rust/benchmarks/tpch/Cargo.toml +++ b/rust/ballista/rust/benchmarks/tpch/Cargo.toml @@ -26,10 +26,10 @@ edition = "2018" [dependencies] ballista = { path="../../client" } - -arrow = { path = "../../../../arrow" } datafusion = { path = "../../../../datafusion" } -parquet = { path = "../../../../parquet" } + +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } env_logger = "0.8" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/rust/ballista/rust/client/Cargo.toml b/rust/ballista/rust/client/Cargo.toml index de3effe87ca5d..bf40cdb499ace 100644 --- a/rust/ballista/rust/client/Cargo.toml +++ b/rust/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { path = "../../../arrow" } -datafusion = { path = "../../../datafusion" } \ No newline at end of file +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +datafusion = { path = "../../../datafusion" } diff --git a/rust/ballista/rust/core/Cargo.toml b/rust/ballista/rust/core/Cargo.toml index e37a1ea7caa74..58e6d22734acd 100644 --- a/rust/ballista/rust/core/Cargo.toml +++ b/rust/ballista/rust/core/Cargo.toml @@ -40,8 +40,9 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { path = "../../../arrow" } -arrow-flight = { path = "../../../arrow-flight" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } + datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/rust/ballista/rust/executor/Cargo.toml b/rust/ballista/rust/executor/Cargo.toml index 6b05b7c7fa931..ccf30cf16eb7b 100644 --- a/rust/ballista/rust/executor/Cargo.toml +++ b/rust/ballista/rust/executor/Cargo.toml @@ -45,8 +45,9 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { path = "../../../arrow" } -arrow-flight = { path = "../../../arrow-flight" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } + datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/rust/ballista/rust/scheduler/Cargo.toml b/rust/ballista/rust/scheduler/Cargo.toml index 71925ee52590c..197a2319154d2 100644 --- a/rust/ballista/rust/scheduler/Cargo.toml +++ b/rust/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { path = "../../../arrow" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } datafusion = { path = "../../../datafusion" } [dev-dependencies] From 9e302b1fdf773b93bec1197fd5a5ff73f6ec9641 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:33:39 -0600 Subject: [PATCH 004/329] Remove old README --- README.md | 106 ------------------------------------------------------ 1 file changed, 106 
deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 133018c72dfc3..0000000000000 --- a/README.md +++ /dev/null @@ -1,106 +0,0 @@ - - -# Apache Arrow - -[![Build Status](https://ci.appveyor.com/api/projects/status/github/apache/arrow/branch/master?svg=true)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/branch/master) -[![Coverage Status](https://codecov.io/gh/apache/arrow/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) -[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) -[![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt) -[![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) - -## Powering In-Memory Analytics - -Apache Arrow is a development platform for in-memory analytics. It contains a -set of technologies that enable big data systems to process and move data fast. - -Major components of the project include: - - - [The Arrow Columnar In-Memory Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst): - a standard and efficient in-memory representation of various datatypes, plain or nested - - [The Arrow IPC Format](https://github.com/apache/arrow/blob/master/docs/source/format/Columnar.rst#serialization-and-interprocess-communication-ipc): - an efficient serialization of the Arrow format and associated metadata, - for communication between processes and heterogeneous environments - - [The Arrow Flight RPC protocol](https://github.com/apache/arrow/tree/master/format/Flight.proto): - based on the Arrow IPC format, a building block for remote services exchanging - Arrow data with application-defined semantics (for example a storage server or a database) - - [C++ libraries](https://github.com/apache/arrow/tree/master/cpp) - - [C bindings using GLib](https://github.com/apache/arrow/tree/master/c_glib) - - [C# .NET libraries](https://github.com/apache/arrow/tree/master/csharp) - - [Gandiva](https://github.com/apache/arrow/tree/master/cpp/src/gandiva): - an [LLVM](https://llvm.org)-based Arrow expression compiler, part of the C++ codebase - - [Go libraries](https://github.com/apache/arrow/tree/master/go) - - [Java libraries](https://github.com/apache/arrow/tree/master/java) - - [JavaScript libraries](https://github.com/apache/arrow/tree/master/js) - - [Plasma Object Store](https://github.com/apache/arrow/tree/master/cpp/src/plasma): - a shared-memory blob store, part of the C++ codebase - - [Python libraries](https://github.com/apache/arrow/tree/master/python) - - [R libraries](https://github.com/apache/arrow/tree/master/r) - - [Ruby libraries](https://github.com/apache/arrow/tree/master/ruby) - - [Rust libraries](https://github.com/apache/arrow/tree/master/rust) - -Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at -[arrow.apache.org](https://arrow.apache.org). - -## What's in the Arrow libraries? 
- -The reference Arrow libraries contain many distinct software components: - -- Columnar vector and table-like containers (similar to data frames) supporting - flat or nested types -- Fast, language agnostic metadata messaging layer (using Google's Flatbuffers - library) -- Reference-counted off-heap buffer memory management, for zero-copy memory - sharing and handling memory-mapped files -- IO interfaces to local and remote filesystems -- Self-describing binary wire formats (streaming and batch/file-like) for - remote procedure calls (RPC) and interprocess communication (IPC) -- Integration tests for verifying binary compatibility between the - implementations (e.g. sending data from Java to C++) -- Conversions to and from other in-memory data structures -- Readers and writers for various widely-used file formats (such as Parquet, CSV) - -## Implementation status - -The official Arrow libraries in this repository are in different stages of -implementing the Arrow format and related features. See our current -[feature matrix](https://github.com/apache/arrow/blob/master/docs/source/status.rst) -on git master. - -## How to Contribute - -Please read our latest [project contribution guide][5]. - -## Getting involved - -Even if you do not plan to contribute to Apache Arrow itself or Arrow -integrations in other projects, we'd be happy to have you involved: - -- Join the mailing list: send an email to - [dev-subscribe@arrow.apache.org][1]. Share your ideas and use cases for the - project. -- [Follow our activity on JIRA][3] -- [Learn the format][2] -- Contribute code to one of the reference implementations - -[1]: mailto:dev-subscribe@arrow.apache.org -[2]: https://github.com/apache/arrow/tree/master/format -[3]: https://issues.apache.org/jira/browse/ARROW -[4]: https://github.com/apache/arrow -[5]: https://github.com/apache/arrow/blob/master/docs/source/developers/contributing.rst From 70afe4c459af33b8cb190383c923fcee09cde252 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:34:08 -0600 Subject: [PATCH 005/329] Move Rust folders to top level --- rust/Cargo.toml => Cargo.toml | 0 rust/README.md => README.md | 0 {rust/ballista => ballista}/.dockerignore | 0 {rust/ballista => ballista}/README.md | 0 {rust/ballista => ballista}/dev/build-rust-base.sh | 0 {rust/ballista => ballista}/dev/build-rust.sh | 0 .../ballista => ballista}/dev/integration-tests.sh | 0 {rust/ballista => ballista}/docker/README.md | 0 .../docker/rust-base.dockerfile | 0 {rust/ballista => ballista}/docker/rust.dockerfile | 0 {rust/ballista => ballista}/docs/README.md | 0 {rust/ballista => ballista}/docs/architecture.md | 0 {rust/ballista => ballista}/docs/dev-env-rust.md | 0 .../docs/images/query-execution.png | Bin .../docs/integration-testing.md | 0 {rust/ballista => ballista}/docs/release-process.md | 0 {rust/ballista => ballista}/docs/rust-docker.md | 0 .../docs/user-guide/.gitignore | 0 .../ballista => ballista}/docs/user-guide/README.md | 0 .../ballista => ballista}/docs/user-guide/book.toml | 0 .../docs/user-guide/src/SUMMARY.md | 0 .../docs/user-guide/src/client-rust.md | 0 .../docs/user-guide/src/clients.md | 0 .../docs/user-guide/src/configuration.md | 0 .../docs/user-guide/src/deployment.md | 0 .../docs/user-guide/src/docker-compose.md | 0 .../docs/user-guide/src/faq.md | 0 .../user-guide/src/img/ballista-architecture.png | Bin .../docs/user-guide/src/introduction.md | 0 .../docs/user-guide/src/kubernetes.md | 0 .../docs/user-guide/src/standalone.md | 0 {rust/ballista => ballista}/rust/.dockerignore 
| 0 {rust/ballista => ballista}/rust/.gitignore | 0 {rust/ballista => ballista}/rust/Cargo.toml | 0 .../rust/benchmarks/tpch/.dockerignore | 0 .../rust/benchmarks/tpch/.gitignore | 0 .../rust/benchmarks/tpch/Cargo.toml | 0 .../rust/benchmarks/tpch/README.md | 0 .../rust/benchmarks/tpch/docker-compose.yaml | 0 .../rust/benchmarks/tpch/entrypoint.sh | 0 .../rust/benchmarks/tpch/queries/q1.sql | 0 .../rust/benchmarks/tpch/queries/q10.sql | 0 .../rust/benchmarks/tpch/queries/q11.sql | 0 .../rust/benchmarks/tpch/queries/q12.sql | 0 .../rust/benchmarks/tpch/queries/q13.sql | 0 .../rust/benchmarks/tpch/queries/q14.sql | 0 .../rust/benchmarks/tpch/queries/q16.sql | 0 .../rust/benchmarks/tpch/queries/q17.sql | 0 .../rust/benchmarks/tpch/queries/q18.sql | 0 .../rust/benchmarks/tpch/queries/q19.sql | 0 .../rust/benchmarks/tpch/queries/q2.sql | 0 .../rust/benchmarks/tpch/queries/q20.sql | 0 .../rust/benchmarks/tpch/queries/q21.sql | 0 .../rust/benchmarks/tpch/queries/q22.sql | 0 .../rust/benchmarks/tpch/queries/q3.sql | 0 .../rust/benchmarks/tpch/queries/q4.sql | 0 .../rust/benchmarks/tpch/queries/q5.sql | 0 .../rust/benchmarks/tpch/queries/q6.sql | 0 .../rust/benchmarks/tpch/queries/q7.sql | 0 .../rust/benchmarks/tpch/queries/q8.sql | 0 .../rust/benchmarks/tpch/queries/q9.sql | 0 .../rust/benchmarks/tpch/run.sh | 0 .../rust/benchmarks/tpch/src/main.rs | 0 .../rust/benchmarks/tpch/tpch-gen.sh | 0 .../rust/benchmarks/tpch/tpchgen.dockerfile | 0 {rust/ballista => ballista}/rust/client/Cargo.toml | 0 {rust/ballista => ballista}/rust/client/README.md | 0 .../rust/client/src/columnar_batch.rs | 0 .../rust/client/src/context.rs | 0 {rust/ballista => ballista}/rust/client/src/lib.rs | 0 .../rust/client/src/prelude.rs | 0 {rust/ballista => ballista}/rust/core/Cargo.toml | 0 {rust/ballista => ballista}/rust/core/README.md | 0 {rust/ballista => ballista}/rust/core/build.rs | 0 .../rust/core/proto/ballista.proto | 0 {rust/ballista => ballista}/rust/core/src/client.rs | 0 .../rust/core/src/datasource.rs | 0 {rust/ballista => ballista}/rust/core/src/error.rs | 0 .../rust/core/src/execution_plans/mod.rs | 0 .../rust/core/src/execution_plans/query_stage.rs | 0 .../rust/core/src/execution_plans/shuffle_reader.rs | 0 .../core/src/execution_plans/unresolved_shuffle.rs | 0 {rust/ballista => ballista}/rust/core/src/lib.rs | 0 .../rust/core/src/memory_stream.rs | 0 .../rust/core/src/serde/logical_plan/from_proto.rs | 0 .../rust/core/src/serde/logical_plan/mod.rs | 0 .../rust/core/src/serde/logical_plan/to_proto.rs | 0 .../rust/core/src/serde/mod.rs | 0 .../rust/core/src/serde/physical_plan/from_proto.rs | 0 .../rust/core/src/serde/physical_plan/mod.rs | 0 .../rust/core/src/serde/physical_plan/to_proto.rs | 0 .../rust/core/src/serde/scheduler/from_proto.rs | 0 .../rust/core/src/serde/scheduler/mod.rs | 0 .../rust/core/src/serde/scheduler/to_proto.rs | 0 {rust/ballista => ballista}/rust/core/src/utils.rs | 0 .../ballista => ballista}/rust/executor/Cargo.toml | 0 {rust/ballista => ballista}/rust/executor/README.md | 0 {rust/ballista => ballista}/rust/executor/build.rs | 0 .../executor/examples/example_executor_config.toml | 0 .../rust/executor/executor_config_spec.toml | 0 .../rust/executor/src/collect.rs | 0 .../rust/executor/src/execution_loop.rs | 0 .../rust/executor/src/flight_service.rs | 0 .../ballista => ballista}/rust/executor/src/lib.rs | 0 .../ballista => ballista}/rust/executor/src/main.rs | 0 .../ballista => ballista}/rust/scheduler/Cargo.toml | 0 .../ballista => ballista}/rust/scheduler/README.md | 0 
{rust/ballista => ballista}/rust/scheduler/build.rs | 0 .../rust/scheduler/scheduler_config_spec.toml | 0 .../rust/scheduler/src/api/handlers.rs | 0 .../rust/scheduler/src/api/mod.rs | 0 .../ballista => ballista}/rust/scheduler/src/lib.rs | 0 .../rust/scheduler/src/main.rs | 0 .../rust/scheduler/src/planner.rs | 0 .../rust/scheduler/src/state/etcd.rs | 0 .../rust/scheduler/src/state/mod.rs | 0 .../rust/scheduler/src/state/standalone.rs | 0 .../rust/scheduler/src/test_utils.rs | 0 .../rust/scheduler/testdata/customer/customer.tbl | 0 .../rust/scheduler/testdata/lineitem/partition0.tbl | 0 .../rust/scheduler/testdata/lineitem/partition1.tbl | 0 .../rust/scheduler/testdata/nation/nation.tbl | 0 .../rust/scheduler/testdata/orders/orders.tbl | 0 .../rust/scheduler/testdata/part/part.tbl | 0 .../rust/scheduler/testdata/partsupp/partsupp.tbl | 0 .../rust/scheduler/testdata/region/region.tbl | 0 .../rust/scheduler/testdata/supplier/supplier.tbl | 0 {rust/ballista => ballista}/ui/scheduler/.gitignore | 0 {rust/ballista => ballista}/ui/scheduler/README.md | 0 {rust/ballista => ballista}/ui/scheduler/index.d.ts | 0 .../ballista => ballista}/ui/scheduler/package.json | 0 .../ui/scheduler/public/favicon.ico | Bin .../ui/scheduler/public/index.html | 0 .../ui/scheduler/public/logo192.png | Bin .../ui/scheduler/public/logo512.png | Bin .../ui/scheduler/public/manifest.json | 0 .../ui/scheduler/public/robots.txt | 0 .../ui/scheduler/react-table-config.d.ts | 0 .../ballista => ballista}/ui/scheduler/src/App.css | 0 .../ui/scheduler/src/App.test.tsx | 0 .../ballista => ballista}/ui/scheduler/src/App.tsx | 0 .../ui/scheduler/src/components/DataTable.tsx | 0 .../ui/scheduler/src/components/Empty.tsx | 0 .../ui/scheduler/src/components/Footer.tsx | 0 .../ui/scheduler/src/components/Header.tsx | 0 .../ui/scheduler/src/components/NodesList.tsx | 0 .../ui/scheduler/src/components/QueriesList.tsx | 0 .../ui/scheduler/src/components/Summary.tsx | 0 .../ui/scheduler/src/components/logo.svg | 0 .../ui/scheduler/src/index.css | 0 .../ui/scheduler/src/index.tsx | 0 .../ui/scheduler/src/react-app-env.d.ts | 0 .../ui/scheduler/src/reportWebVitals.ts | 0 .../ui/scheduler/src/setupTests.ts | 0 .../ui/scheduler/tsconfig.json | 0 {rust/ballista => ballista}/ui/scheduler/yarn.lock | 0 {rust/benchmarks => benchmarks}/Cargo.toml | 0 {rust/benchmarks => benchmarks}/README.md | 0 {rust/benchmarks => benchmarks}/src/bin/nyctaxi.rs | 0 {rust/benchmarks => benchmarks}/src/bin/tpch.rs | 0 .../Cargo.toml | 0 .../examples/README.md | 0 .../examples/csv_sql.rs | 0 .../examples/dataframe.rs | 0 .../examples/dataframe_in_memory.rs | 0 .../examples/flight_client.rs | 0 .../examples/flight_server.rs | 0 .../examples/parquet_sql.rs | 0 .../examples/simple_udaf.rs | 0 .../examples/simple_udf.rs | 0 {rust/datafusion => datafusion}/Cargo.toml | 0 {rust/datafusion => datafusion}/DEVELOPERS.md | 0 {rust/datafusion => datafusion}/Dockerfile | 0 {rust/datafusion => datafusion}/README.md | 0 .../benches/aggregate_query_sql.rs | 0 .../benches/filter_query_sql.rs | 0 .../benches/math_query_sql.rs | 0 {rust/datafusion => datafusion}/benches/scalar.rs | 0 .../benches/sort_limit_query_sql.rs | 0 {rust/datafusion => datafusion}/docs/cli.md | 0 .../docs/images/DataFusion-Logo-Dark.png | Bin .../docs/images/DataFusion-Logo-Dark.svg | 0 .../docs/images/DataFusion-Logo-Light.png | Bin .../docs/images/DataFusion-Logo-Light.svg | 0 {rust/datafusion => datafusion}/src/bin/main.rs | 0 {rust/datafusion => datafusion}/src/bin/repl.rs | 0 .../src/catalog/catalog.rs 
| 0 .../src/catalog/information_schema.rs | 0 {rust/datafusion => datafusion}/src/catalog/mod.rs | 0 .../datafusion => datafusion}/src/catalog/schema.rs | 0 {rust/datafusion => datafusion}/src/dataframe.rs | 0 .../datafusion => datafusion}/src/datasource/csv.rs | 0 .../src/datasource/datasource.rs | 0 .../src/datasource/empty.rs | 0 .../src/datasource/memory.rs | 0 .../datafusion => datafusion}/src/datasource/mod.rs | 0 .../src/datasource/parquet.rs | 0 {rust/datafusion => datafusion}/src/error.rs | 0 .../src/execution/context.rs | 0 .../src/execution/dataframe_impl.rs | 0 .../datafusion => datafusion}/src/execution/mod.rs | 0 {rust/datafusion => datafusion}/src/lib.rs | 0 .../src/logical_plan/builder.rs | 0 .../src/logical_plan/dfschema.rs | 0 .../src/logical_plan/display.rs | 0 .../src/logical_plan/expr.rs | 0 .../src/logical_plan/extension.rs | 0 .../src/logical_plan/mod.rs | 0 .../src/logical_plan/operators.rs | 0 .../src/logical_plan/plan.rs | 0 .../src/logical_plan/registry.rs | 0 .../src/optimizer/constant_folding.rs | 0 .../src/optimizer/filter_push_down.rs | 0 .../src/optimizer/hash_build_probe_order.rs | 0 .../src/optimizer/limit_push_down.rs | 0 .../datafusion => datafusion}/src/optimizer/mod.rs | 0 .../src/optimizer/optimizer.rs | 0 .../src/optimizer/projection_push_down.rs | 0 .../src/optimizer/utils.rs | 0 .../src/physical_optimizer/coalesce_batches.rs | 0 .../src/physical_optimizer/merge_exec.rs | 0 .../src/physical_optimizer/mod.rs | 0 .../src/physical_optimizer/optimizer.rs | 0 .../src/physical_optimizer/repartition.rs | 0 .../src/physical_plan/aggregates.rs | 0 .../src/physical_plan/array_expressions.rs | 0 .../src/physical_plan/coalesce_batches.rs | 0 .../src/physical_plan/common.rs | 0 .../src/physical_plan/crypto_expressions.rs | 0 .../src/physical_plan/csv.rs | 0 .../src/physical_plan/datetime_expressions.rs | 0 .../src/physical_plan/distinct_expressions.rs | 0 .../src/physical_plan/empty.rs | 0 .../src/physical_plan/explain.rs | 0 .../src/physical_plan/expressions/average.rs | 0 .../src/physical_plan/expressions/binary.rs | 0 .../src/physical_plan/expressions/case.rs | 0 .../src/physical_plan/expressions/cast.rs | 0 .../src/physical_plan/expressions/coercion.rs | 0 .../src/physical_plan/expressions/column.rs | 0 .../src/physical_plan/expressions/count.rs | 0 .../src/physical_plan/expressions/in_list.rs | 0 .../src/physical_plan/expressions/is_not_null.rs | 0 .../src/physical_plan/expressions/is_null.rs | 0 .../src/physical_plan/expressions/literal.rs | 0 .../src/physical_plan/expressions/min_max.rs | 0 .../src/physical_plan/expressions/mod.rs | 0 .../src/physical_plan/expressions/negative.rs | 0 .../src/physical_plan/expressions/not.rs | 0 .../src/physical_plan/expressions/nullif.rs | 0 .../src/physical_plan/expressions/sum.rs | 0 .../src/physical_plan/expressions/try_cast.rs | 0 .../src/physical_plan/filter.rs | 0 .../src/physical_plan/functions.rs | 0 .../src/physical_plan/group_scalar.rs | 0 .../src/physical_plan/hash_aggregate.rs | 0 .../src/physical_plan/hash_join.rs | 0 .../src/physical_plan/hash_utils.rs | 0 .../src/physical_plan/limit.rs | 0 .../src/physical_plan/math_expressions.rs | 0 .../src/physical_plan/memory.rs | 0 .../src/physical_plan/merge.rs | 0 .../src/physical_plan/mod.rs | 0 .../src/physical_plan/parquet.rs | 0 .../src/physical_plan/planner.rs | 0 .../src/physical_plan/projection.rs | 0 .../src/physical_plan/regex_expressions.rs | 0 .../src/physical_plan/repartition.rs | 0 .../src/physical_plan/sort.rs | 0 
.../src/physical_plan/string_expressions.rs | 0 .../src/physical_plan/type_coercion.rs | 0 .../src/physical_plan/udaf.rs | 0 .../src/physical_plan/udf.rs | 0 .../src/physical_plan/unicode_expressions.rs | 0 .../src/physical_plan/union.rs | 0 {rust/datafusion => datafusion}/src/prelude.rs | 0 {rust/datafusion => datafusion}/src/scalar.rs | 0 {rust/datafusion => datafusion}/src/sql/mod.rs | 0 {rust/datafusion => datafusion}/src/sql/parser.rs | 0 {rust/datafusion => datafusion}/src/sql/planner.rs | 0 {rust/datafusion => datafusion}/src/sql/utils.rs | 0 {rust/datafusion => datafusion}/src/test/exec.rs | 0 {rust/datafusion => datafusion}/src/test/mod.rs | 0 .../src/test/user_defined.rs | 0 .../datafusion => datafusion}/src/test/variable.rs | 0 {rust/datafusion => datafusion}/src/variable/mod.rs | 0 .../tests/aggregate_simple.csv | 0 .../tests/custom_sources.rs | 0 {rust/datafusion => datafusion}/tests/customer.csv | 0 {rust/datafusion => datafusion}/tests/dataframe.rs | 0 {rust/datafusion => datafusion}/tests/example.csv | 0 .../tests/provider_filter_pushdown.rs | 0 {rust/datafusion => datafusion}/tests/sql.rs | 0 .../tests/user_defined_plan.rs | 0 rust/pre-commit.sh => pre-commit.sh | 0 rust/rustfmt.toml => rustfmt.toml | 0 296 files changed, 0 insertions(+), 0 deletions(-) rename rust/Cargo.toml => Cargo.toml (100%) rename rust/README.md => README.md (100%) rename {rust/ballista => ballista}/.dockerignore (100%) rename {rust/ballista => ballista}/README.md (100%) rename {rust/ballista => ballista}/dev/build-rust-base.sh (100%) rename {rust/ballista => ballista}/dev/build-rust.sh (100%) rename {rust/ballista => ballista}/dev/integration-tests.sh (100%) rename {rust/ballista => ballista}/docker/README.md (100%) rename {rust/ballista => ballista}/docker/rust-base.dockerfile (100%) rename {rust/ballista => ballista}/docker/rust.dockerfile (100%) rename {rust/ballista => ballista}/docs/README.md (100%) rename {rust/ballista => ballista}/docs/architecture.md (100%) rename {rust/ballista => ballista}/docs/dev-env-rust.md (100%) rename {rust/ballista => ballista}/docs/images/query-execution.png (100%) rename {rust/ballista => ballista}/docs/integration-testing.md (100%) rename {rust/ballista => ballista}/docs/release-process.md (100%) rename {rust/ballista => ballista}/docs/rust-docker.md (100%) rename {rust/ballista => ballista}/docs/user-guide/.gitignore (100%) rename {rust/ballista => ballista}/docs/user-guide/README.md (100%) rename {rust/ballista => ballista}/docs/user-guide/book.toml (100%) rename {rust/ballista => ballista}/docs/user-guide/src/SUMMARY.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/client-rust.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/clients.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/configuration.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/deployment.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/docker-compose.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/faq.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/img/ballista-architecture.png (100%) rename {rust/ballista => ballista}/docs/user-guide/src/introduction.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/kubernetes.md (100%) rename {rust/ballista => ballista}/docs/user-guide/src/standalone.md (100%) rename {rust/ballista => ballista}/rust/.dockerignore (100%) rename {rust/ballista => ballista}/rust/.gitignore (100%) rename {rust/ballista => ballista}/rust/Cargo.toml 
(100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/.dockerignore (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/.gitignore (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/Cargo.toml (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/README.md (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/docker-compose.yaml (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/entrypoint.sh (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q1.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q10.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q11.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q12.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q13.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q14.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q16.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q17.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q18.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q19.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q2.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q20.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q21.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q22.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q3.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q4.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q5.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q6.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q7.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q8.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/queries/q9.sql (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/run.sh (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/src/main.rs (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/tpch-gen.sh (100%) rename {rust/ballista => ballista}/rust/benchmarks/tpch/tpchgen.dockerfile (100%) rename {rust/ballista => ballista}/rust/client/Cargo.toml (100%) rename {rust/ballista => ballista}/rust/client/README.md (100%) rename {rust/ballista => ballista}/rust/client/src/columnar_batch.rs (100%) rename {rust/ballista => ballista}/rust/client/src/context.rs (100%) rename {rust/ballista => ballista}/rust/client/src/lib.rs (100%) rename {rust/ballista => ballista}/rust/client/src/prelude.rs (100%) rename {rust/ballista => ballista}/rust/core/Cargo.toml (100%) rename {rust/ballista => ballista}/rust/core/README.md (100%) rename {rust/ballista => ballista}/rust/core/build.rs (100%) rename {rust/ballista => ballista}/rust/core/proto/ballista.proto (100%) rename {rust/ballista => ballista}/rust/core/src/client.rs (100%) rename {rust/ballista => ballista}/rust/core/src/datasource.rs (100%) rename {rust/ballista => ballista}/rust/core/src/error.rs (100%) rename {rust/ballista => ballista}/rust/core/src/execution_plans/mod.rs (100%) rename {rust/ballista => ballista}/rust/core/src/execution_plans/query_stage.rs (100%) rename {rust/ballista => ballista}/rust/core/src/execution_plans/shuffle_reader.rs (100%) rename 
{rust/ballista => ballista}/rust/core/src/execution_plans/unresolved_shuffle.rs (100%) rename {rust/ballista => ballista}/rust/core/src/lib.rs (100%) rename {rust/ballista => ballista}/rust/core/src/memory_stream.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/logical_plan/from_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/logical_plan/mod.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/logical_plan/to_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/mod.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/physical_plan/from_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/physical_plan/mod.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/physical_plan/to_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/scheduler/from_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/scheduler/mod.rs (100%) rename {rust/ballista => ballista}/rust/core/src/serde/scheduler/to_proto.rs (100%) rename {rust/ballista => ballista}/rust/core/src/utils.rs (100%) rename {rust/ballista => ballista}/rust/executor/Cargo.toml (100%) rename {rust/ballista => ballista}/rust/executor/README.md (100%) rename {rust/ballista => ballista}/rust/executor/build.rs (100%) rename {rust/ballista => ballista}/rust/executor/examples/example_executor_config.toml (100%) rename {rust/ballista => ballista}/rust/executor/executor_config_spec.toml (100%) rename {rust/ballista => ballista}/rust/executor/src/collect.rs (100%) rename {rust/ballista => ballista}/rust/executor/src/execution_loop.rs (100%) rename {rust/ballista => ballista}/rust/executor/src/flight_service.rs (100%) rename {rust/ballista => ballista}/rust/executor/src/lib.rs (100%) rename {rust/ballista => ballista}/rust/executor/src/main.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/Cargo.toml (100%) rename {rust/ballista => ballista}/rust/scheduler/README.md (100%) rename {rust/ballista => ballista}/rust/scheduler/build.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/scheduler_config_spec.toml (100%) rename {rust/ballista => ballista}/rust/scheduler/src/api/handlers.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/api/mod.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/lib.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/main.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/planner.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/state/etcd.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/state/mod.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/state/standalone.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/src/test_utils.rs (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/customer/customer.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/lineitem/partition0.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/lineitem/partition1.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/nation/nation.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/orders/orders.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/part/part.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/partsupp/partsupp.tbl (100%) rename {rust/ballista => ballista}/rust/scheduler/testdata/region/region.tbl (100%) rename {rust/ballista => 
ballista}/rust/scheduler/testdata/supplier/supplier.tbl (100%) rename {rust/ballista => ballista}/ui/scheduler/.gitignore (100%) rename {rust/ballista => ballista}/ui/scheduler/README.md (100%) rename {rust/ballista => ballista}/ui/scheduler/index.d.ts (100%) rename {rust/ballista => ballista}/ui/scheduler/package.json (100%) rename {rust/ballista => ballista}/ui/scheduler/public/favicon.ico (100%) rename {rust/ballista => ballista}/ui/scheduler/public/index.html (100%) rename {rust/ballista => ballista}/ui/scheduler/public/logo192.png (100%) rename {rust/ballista => ballista}/ui/scheduler/public/logo512.png (100%) rename {rust/ballista => ballista}/ui/scheduler/public/manifest.json (100%) rename {rust/ballista => ballista}/ui/scheduler/public/robots.txt (100%) rename {rust/ballista => ballista}/ui/scheduler/react-table-config.d.ts (100%) rename {rust/ballista => ballista}/ui/scheduler/src/App.css (100%) rename {rust/ballista => ballista}/ui/scheduler/src/App.test.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/App.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/DataTable.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/Empty.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/Footer.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/Header.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/NodesList.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/QueriesList.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/Summary.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/components/logo.svg (100%) rename {rust/ballista => ballista}/ui/scheduler/src/index.css (100%) rename {rust/ballista => ballista}/ui/scheduler/src/index.tsx (100%) rename {rust/ballista => ballista}/ui/scheduler/src/react-app-env.d.ts (100%) rename {rust/ballista => ballista}/ui/scheduler/src/reportWebVitals.ts (100%) rename {rust/ballista => ballista}/ui/scheduler/src/setupTests.ts (100%) rename {rust/ballista => ballista}/ui/scheduler/tsconfig.json (100%) rename {rust/ballista => ballista}/ui/scheduler/yarn.lock (100%) rename {rust/benchmarks => benchmarks}/Cargo.toml (100%) rename {rust/benchmarks => benchmarks}/README.md (100%) rename {rust/benchmarks => benchmarks}/src/bin/nyctaxi.rs (100%) rename {rust/benchmarks => benchmarks}/src/bin/tpch.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/Cargo.toml (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/README.md (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/csv_sql.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/dataframe.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/dataframe_in_memory.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/flight_client.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/flight_server.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/parquet_sql.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/simple_udaf.rs (100%) rename {rust/datafusion-examples => datafusion-examples}/examples/simple_udf.rs (100%) rename {rust/datafusion => datafusion}/Cargo.toml (100%) rename {rust/datafusion => datafusion}/DEVELOPERS.md (100%) rename {rust/datafusion => datafusion}/Dockerfile (100%) rename {rust/datafusion => datafusion}/README.md (100%) 
rename {rust/datafusion => datafusion}/benches/aggregate_query_sql.rs (100%) rename {rust/datafusion => datafusion}/benches/filter_query_sql.rs (100%) rename {rust/datafusion => datafusion}/benches/math_query_sql.rs (100%) rename {rust/datafusion => datafusion}/benches/scalar.rs (100%) rename {rust/datafusion => datafusion}/benches/sort_limit_query_sql.rs (100%) rename {rust/datafusion => datafusion}/docs/cli.md (100%) rename {rust/datafusion => datafusion}/docs/images/DataFusion-Logo-Dark.png (100%) rename {rust/datafusion => datafusion}/docs/images/DataFusion-Logo-Dark.svg (100%) rename {rust/datafusion => datafusion}/docs/images/DataFusion-Logo-Light.png (100%) rename {rust/datafusion => datafusion}/docs/images/DataFusion-Logo-Light.svg (100%) rename {rust/datafusion => datafusion}/src/bin/main.rs (100%) rename {rust/datafusion => datafusion}/src/bin/repl.rs (100%) rename {rust/datafusion => datafusion}/src/catalog/catalog.rs (100%) rename {rust/datafusion => datafusion}/src/catalog/information_schema.rs (100%) rename {rust/datafusion => datafusion}/src/catalog/mod.rs (100%) rename {rust/datafusion => datafusion}/src/catalog/schema.rs (100%) rename {rust/datafusion => datafusion}/src/dataframe.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/csv.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/datasource.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/empty.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/memory.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/mod.rs (100%) rename {rust/datafusion => datafusion}/src/datasource/parquet.rs (100%) rename {rust/datafusion => datafusion}/src/error.rs (100%) rename {rust/datafusion => datafusion}/src/execution/context.rs (100%) rename {rust/datafusion => datafusion}/src/execution/dataframe_impl.rs (100%) rename {rust/datafusion => datafusion}/src/execution/mod.rs (100%) rename {rust/datafusion => datafusion}/src/lib.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/builder.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/dfschema.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/display.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/expr.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/extension.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/mod.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/operators.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/plan.rs (100%) rename {rust/datafusion => datafusion}/src/logical_plan/registry.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/constant_folding.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/filter_push_down.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/hash_build_probe_order.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/limit_push_down.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/mod.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/optimizer.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/projection_push_down.rs (100%) rename {rust/datafusion => datafusion}/src/optimizer/utils.rs (100%) rename {rust/datafusion => datafusion}/src/physical_optimizer/coalesce_batches.rs (100%) rename {rust/datafusion => datafusion}/src/physical_optimizer/merge_exec.rs (100%) rename {rust/datafusion => datafusion}/src/physical_optimizer/mod.rs (100%) rename {rust/datafusion => 
datafusion}/src/physical_optimizer/optimizer.rs (100%) rename {rust/datafusion => datafusion}/src/physical_optimizer/repartition.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/aggregates.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/array_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/coalesce_batches.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/common.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/crypto_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/csv.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/datetime_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/distinct_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/empty.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/explain.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/average.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/binary.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/case.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/cast.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/coercion.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/column.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/count.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/in_list.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/is_not_null.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/is_null.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/literal.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/min_max.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/mod.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/negative.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/not.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/nullif.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/sum.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/expressions/try_cast.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/filter.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/functions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/group_scalar.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/hash_aggregate.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/hash_join.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/hash_utils.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/limit.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/math_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/memory.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/merge.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/mod.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/parquet.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/planner.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/projection.rs 
(100%) rename {rust/datafusion => datafusion}/src/physical_plan/regex_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/repartition.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/sort.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/string_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/type_coercion.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/udaf.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/udf.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/unicode_expressions.rs (100%) rename {rust/datafusion => datafusion}/src/physical_plan/union.rs (100%) rename {rust/datafusion => datafusion}/src/prelude.rs (100%) rename {rust/datafusion => datafusion}/src/scalar.rs (100%) rename {rust/datafusion => datafusion}/src/sql/mod.rs (100%) rename {rust/datafusion => datafusion}/src/sql/parser.rs (100%) rename {rust/datafusion => datafusion}/src/sql/planner.rs (100%) rename {rust/datafusion => datafusion}/src/sql/utils.rs (100%) rename {rust/datafusion => datafusion}/src/test/exec.rs (100%) rename {rust/datafusion => datafusion}/src/test/mod.rs (100%) rename {rust/datafusion => datafusion}/src/test/user_defined.rs (100%) rename {rust/datafusion => datafusion}/src/test/variable.rs (100%) rename {rust/datafusion => datafusion}/src/variable/mod.rs (100%) rename {rust/datafusion => datafusion}/tests/aggregate_simple.csv (100%) rename {rust/datafusion => datafusion}/tests/custom_sources.rs (100%) rename {rust/datafusion => datafusion}/tests/customer.csv (100%) rename {rust/datafusion => datafusion}/tests/dataframe.rs (100%) rename {rust/datafusion => datafusion}/tests/example.csv (100%) rename {rust/datafusion => datafusion}/tests/provider_filter_pushdown.rs (100%) rename {rust/datafusion => datafusion}/tests/sql.rs (100%) rename {rust/datafusion => datafusion}/tests/user_defined_plan.rs (100%) rename rust/pre-commit.sh => pre-commit.sh (100%) rename rust/rustfmt.toml => rustfmt.toml (100%) diff --git a/rust/Cargo.toml b/Cargo.toml similarity index 100% rename from rust/Cargo.toml rename to Cargo.toml diff --git a/rust/README.md b/README.md similarity index 100% rename from rust/README.md rename to README.md diff --git a/rust/ballista/.dockerignore b/ballista/.dockerignore similarity index 100% rename from rust/ballista/.dockerignore rename to ballista/.dockerignore diff --git a/rust/ballista/README.md b/ballista/README.md similarity index 100% rename from rust/ballista/README.md rename to ballista/README.md diff --git a/rust/ballista/dev/build-rust-base.sh b/ballista/dev/build-rust-base.sh similarity index 100% rename from rust/ballista/dev/build-rust-base.sh rename to ballista/dev/build-rust-base.sh diff --git a/rust/ballista/dev/build-rust.sh b/ballista/dev/build-rust.sh similarity index 100% rename from rust/ballista/dev/build-rust.sh rename to ballista/dev/build-rust.sh diff --git a/rust/ballista/dev/integration-tests.sh b/ballista/dev/integration-tests.sh similarity index 100% rename from rust/ballista/dev/integration-tests.sh rename to ballista/dev/integration-tests.sh diff --git a/rust/ballista/docker/README.md b/ballista/docker/README.md similarity index 100% rename from rust/ballista/docker/README.md rename to ballista/docker/README.md diff --git a/rust/ballista/docker/rust-base.dockerfile b/ballista/docker/rust-base.dockerfile similarity index 100% rename from rust/ballista/docker/rust-base.dockerfile rename to 
ballista/docker/rust-base.dockerfile diff --git a/rust/ballista/docker/rust.dockerfile b/ballista/docker/rust.dockerfile similarity index 100% rename from rust/ballista/docker/rust.dockerfile rename to ballista/docker/rust.dockerfile diff --git a/rust/ballista/docs/README.md b/ballista/docs/README.md similarity index 100% rename from rust/ballista/docs/README.md rename to ballista/docs/README.md diff --git a/rust/ballista/docs/architecture.md b/ballista/docs/architecture.md similarity index 100% rename from rust/ballista/docs/architecture.md rename to ballista/docs/architecture.md diff --git a/rust/ballista/docs/dev-env-rust.md b/ballista/docs/dev-env-rust.md similarity index 100% rename from rust/ballista/docs/dev-env-rust.md rename to ballista/docs/dev-env-rust.md diff --git a/rust/ballista/docs/images/query-execution.png b/ballista/docs/images/query-execution.png similarity index 100% rename from rust/ballista/docs/images/query-execution.png rename to ballista/docs/images/query-execution.png diff --git a/rust/ballista/docs/integration-testing.md b/ballista/docs/integration-testing.md similarity index 100% rename from rust/ballista/docs/integration-testing.md rename to ballista/docs/integration-testing.md diff --git a/rust/ballista/docs/release-process.md b/ballista/docs/release-process.md similarity index 100% rename from rust/ballista/docs/release-process.md rename to ballista/docs/release-process.md diff --git a/rust/ballista/docs/rust-docker.md b/ballista/docs/rust-docker.md similarity index 100% rename from rust/ballista/docs/rust-docker.md rename to ballista/docs/rust-docker.md diff --git a/rust/ballista/docs/user-guide/.gitignore b/ballista/docs/user-guide/.gitignore similarity index 100% rename from rust/ballista/docs/user-guide/.gitignore rename to ballista/docs/user-guide/.gitignore diff --git a/rust/ballista/docs/user-guide/README.md b/ballista/docs/user-guide/README.md similarity index 100% rename from rust/ballista/docs/user-guide/README.md rename to ballista/docs/user-guide/README.md diff --git a/rust/ballista/docs/user-guide/book.toml b/ballista/docs/user-guide/book.toml similarity index 100% rename from rust/ballista/docs/user-guide/book.toml rename to ballista/docs/user-guide/book.toml diff --git a/rust/ballista/docs/user-guide/src/SUMMARY.md b/ballista/docs/user-guide/src/SUMMARY.md similarity index 100% rename from rust/ballista/docs/user-guide/src/SUMMARY.md rename to ballista/docs/user-guide/src/SUMMARY.md diff --git a/rust/ballista/docs/user-guide/src/client-rust.md b/ballista/docs/user-guide/src/client-rust.md similarity index 100% rename from rust/ballista/docs/user-guide/src/client-rust.md rename to ballista/docs/user-guide/src/client-rust.md diff --git a/rust/ballista/docs/user-guide/src/clients.md b/ballista/docs/user-guide/src/clients.md similarity index 100% rename from rust/ballista/docs/user-guide/src/clients.md rename to ballista/docs/user-guide/src/clients.md diff --git a/rust/ballista/docs/user-guide/src/configuration.md b/ballista/docs/user-guide/src/configuration.md similarity index 100% rename from rust/ballista/docs/user-guide/src/configuration.md rename to ballista/docs/user-guide/src/configuration.md diff --git a/rust/ballista/docs/user-guide/src/deployment.md b/ballista/docs/user-guide/src/deployment.md similarity index 100% rename from rust/ballista/docs/user-guide/src/deployment.md rename to ballista/docs/user-guide/src/deployment.md diff --git a/rust/ballista/docs/user-guide/src/docker-compose.md b/ballista/docs/user-guide/src/docker-compose.md 
similarity index 100% rename from rust/ballista/docs/user-guide/src/docker-compose.md rename to ballista/docs/user-guide/src/docker-compose.md diff --git a/rust/ballista/docs/user-guide/src/faq.md b/ballista/docs/user-guide/src/faq.md similarity index 100% rename from rust/ballista/docs/user-guide/src/faq.md rename to ballista/docs/user-guide/src/faq.md diff --git a/rust/ballista/docs/user-guide/src/img/ballista-architecture.png b/ballista/docs/user-guide/src/img/ballista-architecture.png similarity index 100% rename from rust/ballista/docs/user-guide/src/img/ballista-architecture.png rename to ballista/docs/user-guide/src/img/ballista-architecture.png diff --git a/rust/ballista/docs/user-guide/src/introduction.md b/ballista/docs/user-guide/src/introduction.md similarity index 100% rename from rust/ballista/docs/user-guide/src/introduction.md rename to ballista/docs/user-guide/src/introduction.md diff --git a/rust/ballista/docs/user-guide/src/kubernetes.md b/ballista/docs/user-guide/src/kubernetes.md similarity index 100% rename from rust/ballista/docs/user-guide/src/kubernetes.md rename to ballista/docs/user-guide/src/kubernetes.md diff --git a/rust/ballista/docs/user-guide/src/standalone.md b/ballista/docs/user-guide/src/standalone.md similarity index 100% rename from rust/ballista/docs/user-guide/src/standalone.md rename to ballista/docs/user-guide/src/standalone.md diff --git a/rust/ballista/rust/.dockerignore b/ballista/rust/.dockerignore similarity index 100% rename from rust/ballista/rust/.dockerignore rename to ballista/rust/.dockerignore diff --git a/rust/ballista/rust/.gitignore b/ballista/rust/.gitignore similarity index 100% rename from rust/ballista/rust/.gitignore rename to ballista/rust/.gitignore diff --git a/rust/ballista/rust/Cargo.toml b/ballista/rust/Cargo.toml similarity index 100% rename from rust/ballista/rust/Cargo.toml rename to ballista/rust/Cargo.toml diff --git a/rust/ballista/rust/benchmarks/tpch/.dockerignore b/ballista/rust/benchmarks/tpch/.dockerignore similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/.dockerignore rename to ballista/rust/benchmarks/tpch/.dockerignore diff --git a/rust/ballista/rust/benchmarks/tpch/.gitignore b/ballista/rust/benchmarks/tpch/.gitignore similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/.gitignore rename to ballista/rust/benchmarks/tpch/.gitignore diff --git a/rust/ballista/rust/benchmarks/tpch/Cargo.toml b/ballista/rust/benchmarks/tpch/Cargo.toml similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/Cargo.toml rename to ballista/rust/benchmarks/tpch/Cargo.toml diff --git a/rust/ballista/rust/benchmarks/tpch/README.md b/ballista/rust/benchmarks/tpch/README.md similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/README.md rename to ballista/rust/benchmarks/tpch/README.md diff --git a/rust/ballista/rust/benchmarks/tpch/docker-compose.yaml b/ballista/rust/benchmarks/tpch/docker-compose.yaml similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/docker-compose.yaml rename to ballista/rust/benchmarks/tpch/docker-compose.yaml diff --git a/rust/ballista/rust/benchmarks/tpch/entrypoint.sh b/ballista/rust/benchmarks/tpch/entrypoint.sh similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/entrypoint.sh rename to ballista/rust/benchmarks/tpch/entrypoint.sh diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q1.sql b/ballista/rust/benchmarks/tpch/queries/q1.sql similarity index 100% rename from 
rust/ballista/rust/benchmarks/tpch/queries/q1.sql rename to ballista/rust/benchmarks/tpch/queries/q1.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q10.sql b/ballista/rust/benchmarks/tpch/queries/q10.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q10.sql rename to ballista/rust/benchmarks/tpch/queries/q10.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q11.sql b/ballista/rust/benchmarks/tpch/queries/q11.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q11.sql rename to ballista/rust/benchmarks/tpch/queries/q11.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q12.sql b/ballista/rust/benchmarks/tpch/queries/q12.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q12.sql rename to ballista/rust/benchmarks/tpch/queries/q12.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q13.sql b/ballista/rust/benchmarks/tpch/queries/q13.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q13.sql rename to ballista/rust/benchmarks/tpch/queries/q13.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q14.sql b/ballista/rust/benchmarks/tpch/queries/q14.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q14.sql rename to ballista/rust/benchmarks/tpch/queries/q14.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q16.sql b/ballista/rust/benchmarks/tpch/queries/q16.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q16.sql rename to ballista/rust/benchmarks/tpch/queries/q16.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q17.sql b/ballista/rust/benchmarks/tpch/queries/q17.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q17.sql rename to ballista/rust/benchmarks/tpch/queries/q17.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q18.sql b/ballista/rust/benchmarks/tpch/queries/q18.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q18.sql rename to ballista/rust/benchmarks/tpch/queries/q18.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q19.sql b/ballista/rust/benchmarks/tpch/queries/q19.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q19.sql rename to ballista/rust/benchmarks/tpch/queries/q19.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q2.sql b/ballista/rust/benchmarks/tpch/queries/q2.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q2.sql rename to ballista/rust/benchmarks/tpch/queries/q2.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q20.sql b/ballista/rust/benchmarks/tpch/queries/q20.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q20.sql rename to ballista/rust/benchmarks/tpch/queries/q20.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q21.sql b/ballista/rust/benchmarks/tpch/queries/q21.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q21.sql rename to ballista/rust/benchmarks/tpch/queries/q21.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q22.sql b/ballista/rust/benchmarks/tpch/queries/q22.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q22.sql rename to ballista/rust/benchmarks/tpch/queries/q22.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q3.sql b/ballista/rust/benchmarks/tpch/queries/q3.sql similarity index 100% rename from 
rust/ballista/rust/benchmarks/tpch/queries/q3.sql rename to ballista/rust/benchmarks/tpch/queries/q3.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q4.sql b/ballista/rust/benchmarks/tpch/queries/q4.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q4.sql rename to ballista/rust/benchmarks/tpch/queries/q4.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q5.sql b/ballista/rust/benchmarks/tpch/queries/q5.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q5.sql rename to ballista/rust/benchmarks/tpch/queries/q5.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q6.sql b/ballista/rust/benchmarks/tpch/queries/q6.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q6.sql rename to ballista/rust/benchmarks/tpch/queries/q6.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q7.sql b/ballista/rust/benchmarks/tpch/queries/q7.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q7.sql rename to ballista/rust/benchmarks/tpch/queries/q7.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q8.sql b/ballista/rust/benchmarks/tpch/queries/q8.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q8.sql rename to ballista/rust/benchmarks/tpch/queries/q8.sql diff --git a/rust/ballista/rust/benchmarks/tpch/queries/q9.sql b/ballista/rust/benchmarks/tpch/queries/q9.sql similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/queries/q9.sql rename to ballista/rust/benchmarks/tpch/queries/q9.sql diff --git a/rust/ballista/rust/benchmarks/tpch/run.sh b/ballista/rust/benchmarks/tpch/run.sh similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/run.sh rename to ballista/rust/benchmarks/tpch/run.sh diff --git a/rust/ballista/rust/benchmarks/tpch/src/main.rs b/ballista/rust/benchmarks/tpch/src/main.rs similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/src/main.rs rename to ballista/rust/benchmarks/tpch/src/main.rs diff --git a/rust/ballista/rust/benchmarks/tpch/tpch-gen.sh b/ballista/rust/benchmarks/tpch/tpch-gen.sh similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/tpch-gen.sh rename to ballista/rust/benchmarks/tpch/tpch-gen.sh diff --git a/rust/ballista/rust/benchmarks/tpch/tpchgen.dockerfile b/ballista/rust/benchmarks/tpch/tpchgen.dockerfile similarity index 100% rename from rust/ballista/rust/benchmarks/tpch/tpchgen.dockerfile rename to ballista/rust/benchmarks/tpch/tpchgen.dockerfile diff --git a/rust/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml similarity index 100% rename from rust/ballista/rust/client/Cargo.toml rename to ballista/rust/client/Cargo.toml diff --git a/rust/ballista/rust/client/README.md b/ballista/rust/client/README.md similarity index 100% rename from rust/ballista/rust/client/README.md rename to ballista/rust/client/README.md diff --git a/rust/ballista/rust/client/src/columnar_batch.rs b/ballista/rust/client/src/columnar_batch.rs similarity index 100% rename from rust/ballista/rust/client/src/columnar_batch.rs rename to ballista/rust/client/src/columnar_batch.rs diff --git a/rust/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs similarity index 100% rename from rust/ballista/rust/client/src/context.rs rename to ballista/rust/client/src/context.rs diff --git a/rust/ballista/rust/client/src/lib.rs b/ballista/rust/client/src/lib.rs similarity index 100% rename from rust/ballista/rust/client/src/lib.rs 
rename to ballista/rust/client/src/lib.rs diff --git a/rust/ballista/rust/client/src/prelude.rs b/ballista/rust/client/src/prelude.rs similarity index 100% rename from rust/ballista/rust/client/src/prelude.rs rename to ballista/rust/client/src/prelude.rs diff --git a/rust/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml similarity index 100% rename from rust/ballista/rust/core/Cargo.toml rename to ballista/rust/core/Cargo.toml diff --git a/rust/ballista/rust/core/README.md b/ballista/rust/core/README.md similarity index 100% rename from rust/ballista/rust/core/README.md rename to ballista/rust/core/README.md diff --git a/rust/ballista/rust/core/build.rs b/ballista/rust/core/build.rs similarity index 100% rename from rust/ballista/rust/core/build.rs rename to ballista/rust/core/build.rs diff --git a/rust/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto similarity index 100% rename from rust/ballista/rust/core/proto/ballista.proto rename to ballista/rust/core/proto/ballista.proto diff --git a/rust/ballista/rust/core/src/client.rs b/ballista/rust/core/src/client.rs similarity index 100% rename from rust/ballista/rust/core/src/client.rs rename to ballista/rust/core/src/client.rs diff --git a/rust/ballista/rust/core/src/datasource.rs b/ballista/rust/core/src/datasource.rs similarity index 100% rename from rust/ballista/rust/core/src/datasource.rs rename to ballista/rust/core/src/datasource.rs diff --git a/rust/ballista/rust/core/src/error.rs b/ballista/rust/core/src/error.rs similarity index 100% rename from rust/ballista/rust/core/src/error.rs rename to ballista/rust/core/src/error.rs diff --git a/rust/ballista/rust/core/src/execution_plans/mod.rs b/ballista/rust/core/src/execution_plans/mod.rs similarity index 100% rename from rust/ballista/rust/core/src/execution_plans/mod.rs rename to ballista/rust/core/src/execution_plans/mod.rs diff --git a/rust/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs similarity index 100% rename from rust/ballista/rust/core/src/execution_plans/query_stage.rs rename to ballista/rust/core/src/execution_plans/query_stage.rs diff --git a/rust/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs similarity index 100% rename from rust/ballista/rust/core/src/execution_plans/shuffle_reader.rs rename to ballista/rust/core/src/execution_plans/shuffle_reader.rs diff --git a/rust/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs similarity index 100% rename from rust/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs rename to ballista/rust/core/src/execution_plans/unresolved_shuffle.rs diff --git a/rust/ballista/rust/core/src/lib.rs b/ballista/rust/core/src/lib.rs similarity index 100% rename from rust/ballista/rust/core/src/lib.rs rename to ballista/rust/core/src/lib.rs diff --git a/rust/ballista/rust/core/src/memory_stream.rs b/ballista/rust/core/src/memory_stream.rs similarity index 100% rename from rust/ballista/rust/core/src/memory_stream.rs rename to ballista/rust/core/src/memory_stream.rs diff --git a/rust/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/logical_plan/from_proto.rs rename to ballista/rust/core/src/serde/logical_plan/from_proto.rs diff --git 
a/rust/ballista/rust/core/src/serde/logical_plan/mod.rs b/ballista/rust/core/src/serde/logical_plan/mod.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/logical_plan/mod.rs rename to ballista/rust/core/src/serde/logical_plan/mod.rs diff --git a/rust/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/logical_plan/to_proto.rs rename to ballista/rust/core/src/serde/logical_plan/to_proto.rs diff --git a/rust/ballista/rust/core/src/serde/mod.rs b/ballista/rust/core/src/serde/mod.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/mod.rs rename to ballista/rust/core/src/serde/mod.rs diff --git a/rust/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/physical_plan/from_proto.rs rename to ballista/rust/core/src/serde/physical_plan/from_proto.rs diff --git a/rust/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/physical_plan/mod.rs rename to ballista/rust/core/src/serde/physical_plan/mod.rs diff --git a/rust/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/physical_plan/to_proto.rs rename to ballista/rust/core/src/serde/physical_plan/to_proto.rs diff --git a/rust/ballista/rust/core/src/serde/scheduler/from_proto.rs b/ballista/rust/core/src/serde/scheduler/from_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/scheduler/from_proto.rs rename to ballista/rust/core/src/serde/scheduler/from_proto.rs diff --git a/rust/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/scheduler/mod.rs rename to ballista/rust/core/src/serde/scheduler/mod.rs diff --git a/rust/ballista/rust/core/src/serde/scheduler/to_proto.rs b/ballista/rust/core/src/serde/scheduler/to_proto.rs similarity index 100% rename from rust/ballista/rust/core/src/serde/scheduler/to_proto.rs rename to ballista/rust/core/src/serde/scheduler/to_proto.rs diff --git a/rust/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs similarity index 100% rename from rust/ballista/rust/core/src/utils.rs rename to ballista/rust/core/src/utils.rs diff --git a/rust/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml similarity index 100% rename from rust/ballista/rust/executor/Cargo.toml rename to ballista/rust/executor/Cargo.toml diff --git a/rust/ballista/rust/executor/README.md b/ballista/rust/executor/README.md similarity index 100% rename from rust/ballista/rust/executor/README.md rename to ballista/rust/executor/README.md diff --git a/rust/ballista/rust/executor/build.rs b/ballista/rust/executor/build.rs similarity index 100% rename from rust/ballista/rust/executor/build.rs rename to ballista/rust/executor/build.rs diff --git a/rust/ballista/rust/executor/examples/example_executor_config.toml b/ballista/rust/executor/examples/example_executor_config.toml similarity index 100% rename from rust/ballista/rust/executor/examples/example_executor_config.toml rename to ballista/rust/executor/examples/example_executor_config.toml diff --git 
a/rust/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml similarity index 100% rename from rust/ballista/rust/executor/executor_config_spec.toml rename to ballista/rust/executor/executor_config_spec.toml diff --git a/rust/ballista/rust/executor/src/collect.rs b/ballista/rust/executor/src/collect.rs similarity index 100% rename from rust/ballista/rust/executor/src/collect.rs rename to ballista/rust/executor/src/collect.rs diff --git a/rust/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs similarity index 100% rename from rust/ballista/rust/executor/src/execution_loop.rs rename to ballista/rust/executor/src/execution_loop.rs diff --git a/rust/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs similarity index 100% rename from rust/ballista/rust/executor/src/flight_service.rs rename to ballista/rust/executor/src/flight_service.rs diff --git a/rust/ballista/rust/executor/src/lib.rs b/ballista/rust/executor/src/lib.rs similarity index 100% rename from rust/ballista/rust/executor/src/lib.rs rename to ballista/rust/executor/src/lib.rs diff --git a/rust/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs similarity index 100% rename from rust/ballista/rust/executor/src/main.rs rename to ballista/rust/executor/src/main.rs diff --git a/rust/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml similarity index 100% rename from rust/ballista/rust/scheduler/Cargo.toml rename to ballista/rust/scheduler/Cargo.toml diff --git a/rust/ballista/rust/scheduler/README.md b/ballista/rust/scheduler/README.md similarity index 100% rename from rust/ballista/rust/scheduler/README.md rename to ballista/rust/scheduler/README.md diff --git a/rust/ballista/rust/scheduler/build.rs b/ballista/rust/scheduler/build.rs similarity index 100% rename from rust/ballista/rust/scheduler/build.rs rename to ballista/rust/scheduler/build.rs diff --git a/rust/ballista/rust/scheduler/scheduler_config_spec.toml b/ballista/rust/scheduler/scheduler_config_spec.toml similarity index 100% rename from rust/ballista/rust/scheduler/scheduler_config_spec.toml rename to ballista/rust/scheduler/scheduler_config_spec.toml diff --git a/rust/ballista/rust/scheduler/src/api/handlers.rs b/ballista/rust/scheduler/src/api/handlers.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/api/handlers.rs rename to ballista/rust/scheduler/src/api/handlers.rs diff --git a/rust/ballista/rust/scheduler/src/api/mod.rs b/ballista/rust/scheduler/src/api/mod.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/api/mod.rs rename to ballista/rust/scheduler/src/api/mod.rs diff --git a/rust/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/lib.rs rename to ballista/rust/scheduler/src/lib.rs diff --git a/rust/ballista/rust/scheduler/src/main.rs b/ballista/rust/scheduler/src/main.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/main.rs rename to ballista/rust/scheduler/src/main.rs diff --git a/rust/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/planner.rs rename to ballista/rust/scheduler/src/planner.rs diff --git a/rust/ballista/rust/scheduler/src/state/etcd.rs b/ballista/rust/scheduler/src/state/etcd.rs similarity index 100% rename from 
rust/ballista/rust/scheduler/src/state/etcd.rs rename to ballista/rust/scheduler/src/state/etcd.rs diff --git a/rust/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/state/mod.rs rename to ballista/rust/scheduler/src/state/mod.rs diff --git a/rust/ballista/rust/scheduler/src/state/standalone.rs b/ballista/rust/scheduler/src/state/standalone.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/state/standalone.rs rename to ballista/rust/scheduler/src/state/standalone.rs diff --git a/rust/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs similarity index 100% rename from rust/ballista/rust/scheduler/src/test_utils.rs rename to ballista/rust/scheduler/src/test_utils.rs diff --git a/rust/ballista/rust/scheduler/testdata/customer/customer.tbl b/ballista/rust/scheduler/testdata/customer/customer.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/customer/customer.tbl rename to ballista/rust/scheduler/testdata/customer/customer.tbl diff --git a/rust/ballista/rust/scheduler/testdata/lineitem/partition0.tbl b/ballista/rust/scheduler/testdata/lineitem/partition0.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/lineitem/partition0.tbl rename to ballista/rust/scheduler/testdata/lineitem/partition0.tbl diff --git a/rust/ballista/rust/scheduler/testdata/lineitem/partition1.tbl b/ballista/rust/scheduler/testdata/lineitem/partition1.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/lineitem/partition1.tbl rename to ballista/rust/scheduler/testdata/lineitem/partition1.tbl diff --git a/rust/ballista/rust/scheduler/testdata/nation/nation.tbl b/ballista/rust/scheduler/testdata/nation/nation.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/nation/nation.tbl rename to ballista/rust/scheduler/testdata/nation/nation.tbl diff --git a/rust/ballista/rust/scheduler/testdata/orders/orders.tbl b/ballista/rust/scheduler/testdata/orders/orders.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/orders/orders.tbl rename to ballista/rust/scheduler/testdata/orders/orders.tbl diff --git a/rust/ballista/rust/scheduler/testdata/part/part.tbl b/ballista/rust/scheduler/testdata/part/part.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/part/part.tbl rename to ballista/rust/scheduler/testdata/part/part.tbl diff --git a/rust/ballista/rust/scheduler/testdata/partsupp/partsupp.tbl b/ballista/rust/scheduler/testdata/partsupp/partsupp.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/partsupp/partsupp.tbl rename to ballista/rust/scheduler/testdata/partsupp/partsupp.tbl diff --git a/rust/ballista/rust/scheduler/testdata/region/region.tbl b/ballista/rust/scheduler/testdata/region/region.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/region/region.tbl rename to ballista/rust/scheduler/testdata/region/region.tbl diff --git a/rust/ballista/rust/scheduler/testdata/supplier/supplier.tbl b/ballista/rust/scheduler/testdata/supplier/supplier.tbl similarity index 100% rename from rust/ballista/rust/scheduler/testdata/supplier/supplier.tbl rename to ballista/rust/scheduler/testdata/supplier/supplier.tbl diff --git a/rust/ballista/ui/scheduler/.gitignore b/ballista/ui/scheduler/.gitignore similarity index 100% rename from rust/ballista/ui/scheduler/.gitignore rename to 
ballista/ui/scheduler/.gitignore diff --git a/rust/ballista/ui/scheduler/README.md b/ballista/ui/scheduler/README.md similarity index 100% rename from rust/ballista/ui/scheduler/README.md rename to ballista/ui/scheduler/README.md diff --git a/rust/ballista/ui/scheduler/index.d.ts b/ballista/ui/scheduler/index.d.ts similarity index 100% rename from rust/ballista/ui/scheduler/index.d.ts rename to ballista/ui/scheduler/index.d.ts diff --git a/rust/ballista/ui/scheduler/package.json b/ballista/ui/scheduler/package.json similarity index 100% rename from rust/ballista/ui/scheduler/package.json rename to ballista/ui/scheduler/package.json diff --git a/rust/ballista/ui/scheduler/public/favicon.ico b/ballista/ui/scheduler/public/favicon.ico similarity index 100% rename from rust/ballista/ui/scheduler/public/favicon.ico rename to ballista/ui/scheduler/public/favicon.ico diff --git a/rust/ballista/ui/scheduler/public/index.html b/ballista/ui/scheduler/public/index.html similarity index 100% rename from rust/ballista/ui/scheduler/public/index.html rename to ballista/ui/scheduler/public/index.html diff --git a/rust/ballista/ui/scheduler/public/logo192.png b/ballista/ui/scheduler/public/logo192.png similarity index 100% rename from rust/ballista/ui/scheduler/public/logo192.png rename to ballista/ui/scheduler/public/logo192.png diff --git a/rust/ballista/ui/scheduler/public/logo512.png b/ballista/ui/scheduler/public/logo512.png similarity index 100% rename from rust/ballista/ui/scheduler/public/logo512.png rename to ballista/ui/scheduler/public/logo512.png diff --git a/rust/ballista/ui/scheduler/public/manifest.json b/ballista/ui/scheduler/public/manifest.json similarity index 100% rename from rust/ballista/ui/scheduler/public/manifest.json rename to ballista/ui/scheduler/public/manifest.json diff --git a/rust/ballista/ui/scheduler/public/robots.txt b/ballista/ui/scheduler/public/robots.txt similarity index 100% rename from rust/ballista/ui/scheduler/public/robots.txt rename to ballista/ui/scheduler/public/robots.txt diff --git a/rust/ballista/ui/scheduler/react-table-config.d.ts b/ballista/ui/scheduler/react-table-config.d.ts similarity index 100% rename from rust/ballista/ui/scheduler/react-table-config.d.ts rename to ballista/ui/scheduler/react-table-config.d.ts diff --git a/rust/ballista/ui/scheduler/src/App.css b/ballista/ui/scheduler/src/App.css similarity index 100% rename from rust/ballista/ui/scheduler/src/App.css rename to ballista/ui/scheduler/src/App.css diff --git a/rust/ballista/ui/scheduler/src/App.test.tsx b/ballista/ui/scheduler/src/App.test.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/App.test.tsx rename to ballista/ui/scheduler/src/App.test.tsx diff --git a/rust/ballista/ui/scheduler/src/App.tsx b/ballista/ui/scheduler/src/App.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/App.tsx rename to ballista/ui/scheduler/src/App.tsx diff --git a/rust/ballista/ui/scheduler/src/components/DataTable.tsx b/ballista/ui/scheduler/src/components/DataTable.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/DataTable.tsx rename to ballista/ui/scheduler/src/components/DataTable.tsx diff --git a/rust/ballista/ui/scheduler/src/components/Empty.tsx b/ballista/ui/scheduler/src/components/Empty.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/Empty.tsx rename to ballista/ui/scheduler/src/components/Empty.tsx diff --git a/rust/ballista/ui/scheduler/src/components/Footer.tsx 
b/ballista/ui/scheduler/src/components/Footer.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/Footer.tsx rename to ballista/ui/scheduler/src/components/Footer.tsx diff --git a/rust/ballista/ui/scheduler/src/components/Header.tsx b/ballista/ui/scheduler/src/components/Header.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/Header.tsx rename to ballista/ui/scheduler/src/components/Header.tsx diff --git a/rust/ballista/ui/scheduler/src/components/NodesList.tsx b/ballista/ui/scheduler/src/components/NodesList.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/NodesList.tsx rename to ballista/ui/scheduler/src/components/NodesList.tsx diff --git a/rust/ballista/ui/scheduler/src/components/QueriesList.tsx b/ballista/ui/scheduler/src/components/QueriesList.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/QueriesList.tsx rename to ballista/ui/scheduler/src/components/QueriesList.tsx diff --git a/rust/ballista/ui/scheduler/src/components/Summary.tsx b/ballista/ui/scheduler/src/components/Summary.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/components/Summary.tsx rename to ballista/ui/scheduler/src/components/Summary.tsx diff --git a/rust/ballista/ui/scheduler/src/components/logo.svg b/ballista/ui/scheduler/src/components/logo.svg similarity index 100% rename from rust/ballista/ui/scheduler/src/components/logo.svg rename to ballista/ui/scheduler/src/components/logo.svg diff --git a/rust/ballista/ui/scheduler/src/index.css b/ballista/ui/scheduler/src/index.css similarity index 100% rename from rust/ballista/ui/scheduler/src/index.css rename to ballista/ui/scheduler/src/index.css diff --git a/rust/ballista/ui/scheduler/src/index.tsx b/ballista/ui/scheduler/src/index.tsx similarity index 100% rename from rust/ballista/ui/scheduler/src/index.tsx rename to ballista/ui/scheduler/src/index.tsx diff --git a/rust/ballista/ui/scheduler/src/react-app-env.d.ts b/ballista/ui/scheduler/src/react-app-env.d.ts similarity index 100% rename from rust/ballista/ui/scheduler/src/react-app-env.d.ts rename to ballista/ui/scheduler/src/react-app-env.d.ts diff --git a/rust/ballista/ui/scheduler/src/reportWebVitals.ts b/ballista/ui/scheduler/src/reportWebVitals.ts similarity index 100% rename from rust/ballista/ui/scheduler/src/reportWebVitals.ts rename to ballista/ui/scheduler/src/reportWebVitals.ts diff --git a/rust/ballista/ui/scheduler/src/setupTests.ts b/ballista/ui/scheduler/src/setupTests.ts similarity index 100% rename from rust/ballista/ui/scheduler/src/setupTests.ts rename to ballista/ui/scheduler/src/setupTests.ts diff --git a/rust/ballista/ui/scheduler/tsconfig.json b/ballista/ui/scheduler/tsconfig.json similarity index 100% rename from rust/ballista/ui/scheduler/tsconfig.json rename to ballista/ui/scheduler/tsconfig.json diff --git a/rust/ballista/ui/scheduler/yarn.lock b/ballista/ui/scheduler/yarn.lock similarity index 100% rename from rust/ballista/ui/scheduler/yarn.lock rename to ballista/ui/scheduler/yarn.lock diff --git a/rust/benchmarks/Cargo.toml b/benchmarks/Cargo.toml similarity index 100% rename from rust/benchmarks/Cargo.toml rename to benchmarks/Cargo.toml diff --git a/rust/benchmarks/README.md b/benchmarks/README.md similarity index 100% rename from rust/benchmarks/README.md rename to benchmarks/README.md diff --git a/rust/benchmarks/src/bin/nyctaxi.rs b/benchmarks/src/bin/nyctaxi.rs similarity index 100% rename from rust/benchmarks/src/bin/nyctaxi.rs 
rename to benchmarks/src/bin/nyctaxi.rs diff --git a/rust/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs similarity index 100% rename from rust/benchmarks/src/bin/tpch.rs rename to benchmarks/src/bin/tpch.rs diff --git a/rust/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml similarity index 100% rename from rust/datafusion-examples/Cargo.toml rename to datafusion-examples/Cargo.toml diff --git a/rust/datafusion-examples/examples/README.md b/datafusion-examples/examples/README.md similarity index 100% rename from rust/datafusion-examples/examples/README.md rename to datafusion-examples/examples/README.md diff --git a/rust/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs similarity index 100% rename from rust/datafusion-examples/examples/csv_sql.rs rename to datafusion-examples/examples/csv_sql.rs diff --git a/rust/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs similarity index 100% rename from rust/datafusion-examples/examples/dataframe.rs rename to datafusion-examples/examples/dataframe.rs diff --git a/rust/datafusion-examples/examples/dataframe_in_memory.rs b/datafusion-examples/examples/dataframe_in_memory.rs similarity index 100% rename from rust/datafusion-examples/examples/dataframe_in_memory.rs rename to datafusion-examples/examples/dataframe_in_memory.rs diff --git a/rust/datafusion-examples/examples/flight_client.rs b/datafusion-examples/examples/flight_client.rs similarity index 100% rename from rust/datafusion-examples/examples/flight_client.rs rename to datafusion-examples/examples/flight_client.rs diff --git a/rust/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs similarity index 100% rename from rust/datafusion-examples/examples/flight_server.rs rename to datafusion-examples/examples/flight_server.rs diff --git a/rust/datafusion-examples/examples/parquet_sql.rs b/datafusion-examples/examples/parquet_sql.rs similarity index 100% rename from rust/datafusion-examples/examples/parquet_sql.rs rename to datafusion-examples/examples/parquet_sql.rs diff --git a/rust/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/simple_udaf.rs similarity index 100% rename from rust/datafusion-examples/examples/simple_udaf.rs rename to datafusion-examples/examples/simple_udaf.rs diff --git a/rust/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/simple_udf.rs similarity index 100% rename from rust/datafusion-examples/examples/simple_udf.rs rename to datafusion-examples/examples/simple_udf.rs diff --git a/rust/datafusion/Cargo.toml b/datafusion/Cargo.toml similarity index 100% rename from rust/datafusion/Cargo.toml rename to datafusion/Cargo.toml diff --git a/rust/datafusion/DEVELOPERS.md b/datafusion/DEVELOPERS.md similarity index 100% rename from rust/datafusion/DEVELOPERS.md rename to datafusion/DEVELOPERS.md diff --git a/rust/datafusion/Dockerfile b/datafusion/Dockerfile similarity index 100% rename from rust/datafusion/Dockerfile rename to datafusion/Dockerfile diff --git a/rust/datafusion/README.md b/datafusion/README.md similarity index 100% rename from rust/datafusion/README.md rename to datafusion/README.md diff --git a/rust/datafusion/benches/aggregate_query_sql.rs b/datafusion/benches/aggregate_query_sql.rs similarity index 100% rename from rust/datafusion/benches/aggregate_query_sql.rs rename to datafusion/benches/aggregate_query_sql.rs diff --git a/rust/datafusion/benches/filter_query_sql.rs 
b/datafusion/benches/filter_query_sql.rs similarity index 100% rename from rust/datafusion/benches/filter_query_sql.rs rename to datafusion/benches/filter_query_sql.rs diff --git a/rust/datafusion/benches/math_query_sql.rs b/datafusion/benches/math_query_sql.rs similarity index 100% rename from rust/datafusion/benches/math_query_sql.rs rename to datafusion/benches/math_query_sql.rs diff --git a/rust/datafusion/benches/scalar.rs b/datafusion/benches/scalar.rs similarity index 100% rename from rust/datafusion/benches/scalar.rs rename to datafusion/benches/scalar.rs diff --git a/rust/datafusion/benches/sort_limit_query_sql.rs b/datafusion/benches/sort_limit_query_sql.rs similarity index 100% rename from rust/datafusion/benches/sort_limit_query_sql.rs rename to datafusion/benches/sort_limit_query_sql.rs diff --git a/rust/datafusion/docs/cli.md b/datafusion/docs/cli.md similarity index 100% rename from rust/datafusion/docs/cli.md rename to datafusion/docs/cli.md diff --git a/rust/datafusion/docs/images/DataFusion-Logo-Dark.png b/datafusion/docs/images/DataFusion-Logo-Dark.png similarity index 100% rename from rust/datafusion/docs/images/DataFusion-Logo-Dark.png rename to datafusion/docs/images/DataFusion-Logo-Dark.png diff --git a/rust/datafusion/docs/images/DataFusion-Logo-Dark.svg b/datafusion/docs/images/DataFusion-Logo-Dark.svg similarity index 100% rename from rust/datafusion/docs/images/DataFusion-Logo-Dark.svg rename to datafusion/docs/images/DataFusion-Logo-Dark.svg diff --git a/rust/datafusion/docs/images/DataFusion-Logo-Light.png b/datafusion/docs/images/DataFusion-Logo-Light.png similarity index 100% rename from rust/datafusion/docs/images/DataFusion-Logo-Light.png rename to datafusion/docs/images/DataFusion-Logo-Light.png diff --git a/rust/datafusion/docs/images/DataFusion-Logo-Light.svg b/datafusion/docs/images/DataFusion-Logo-Light.svg similarity index 100% rename from rust/datafusion/docs/images/DataFusion-Logo-Light.svg rename to datafusion/docs/images/DataFusion-Logo-Light.svg diff --git a/rust/datafusion/src/bin/main.rs b/datafusion/src/bin/main.rs similarity index 100% rename from rust/datafusion/src/bin/main.rs rename to datafusion/src/bin/main.rs diff --git a/rust/datafusion/src/bin/repl.rs b/datafusion/src/bin/repl.rs similarity index 100% rename from rust/datafusion/src/bin/repl.rs rename to datafusion/src/bin/repl.rs diff --git a/rust/datafusion/src/catalog/catalog.rs b/datafusion/src/catalog/catalog.rs similarity index 100% rename from rust/datafusion/src/catalog/catalog.rs rename to datafusion/src/catalog/catalog.rs diff --git a/rust/datafusion/src/catalog/information_schema.rs b/datafusion/src/catalog/information_schema.rs similarity index 100% rename from rust/datafusion/src/catalog/information_schema.rs rename to datafusion/src/catalog/information_schema.rs diff --git a/rust/datafusion/src/catalog/mod.rs b/datafusion/src/catalog/mod.rs similarity index 100% rename from rust/datafusion/src/catalog/mod.rs rename to datafusion/src/catalog/mod.rs diff --git a/rust/datafusion/src/catalog/schema.rs b/datafusion/src/catalog/schema.rs similarity index 100% rename from rust/datafusion/src/catalog/schema.rs rename to datafusion/src/catalog/schema.rs diff --git a/rust/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs similarity index 100% rename from rust/datafusion/src/dataframe.rs rename to datafusion/src/dataframe.rs diff --git a/rust/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs similarity index 100% rename from 
rust/datafusion/src/datasource/csv.rs rename to datafusion/src/datasource/csv.rs diff --git a/rust/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs similarity index 100% rename from rust/datafusion/src/datasource/datasource.rs rename to datafusion/src/datasource/datasource.rs diff --git a/rust/datafusion/src/datasource/empty.rs b/datafusion/src/datasource/empty.rs similarity index 100% rename from rust/datafusion/src/datasource/empty.rs rename to datafusion/src/datasource/empty.rs diff --git a/rust/datafusion/src/datasource/memory.rs b/datafusion/src/datasource/memory.rs similarity index 100% rename from rust/datafusion/src/datasource/memory.rs rename to datafusion/src/datasource/memory.rs diff --git a/rust/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs similarity index 100% rename from rust/datafusion/src/datasource/mod.rs rename to datafusion/src/datasource/mod.rs diff --git a/rust/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs similarity index 100% rename from rust/datafusion/src/datasource/parquet.rs rename to datafusion/src/datasource/parquet.rs diff --git a/rust/datafusion/src/error.rs b/datafusion/src/error.rs similarity index 100% rename from rust/datafusion/src/error.rs rename to datafusion/src/error.rs diff --git a/rust/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs similarity index 100% rename from rust/datafusion/src/execution/context.rs rename to datafusion/src/execution/context.rs diff --git a/rust/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs similarity index 100% rename from rust/datafusion/src/execution/dataframe_impl.rs rename to datafusion/src/execution/dataframe_impl.rs diff --git a/rust/datafusion/src/execution/mod.rs b/datafusion/src/execution/mod.rs similarity index 100% rename from rust/datafusion/src/execution/mod.rs rename to datafusion/src/execution/mod.rs diff --git a/rust/datafusion/src/lib.rs b/datafusion/src/lib.rs similarity index 100% rename from rust/datafusion/src/lib.rs rename to datafusion/src/lib.rs diff --git a/rust/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs similarity index 100% rename from rust/datafusion/src/logical_plan/builder.rs rename to datafusion/src/logical_plan/builder.rs diff --git a/rust/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs similarity index 100% rename from rust/datafusion/src/logical_plan/dfschema.rs rename to datafusion/src/logical_plan/dfschema.rs diff --git a/rust/datafusion/src/logical_plan/display.rs b/datafusion/src/logical_plan/display.rs similarity index 100% rename from rust/datafusion/src/logical_plan/display.rs rename to datafusion/src/logical_plan/display.rs diff --git a/rust/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs similarity index 100% rename from rust/datafusion/src/logical_plan/expr.rs rename to datafusion/src/logical_plan/expr.rs diff --git a/rust/datafusion/src/logical_plan/extension.rs b/datafusion/src/logical_plan/extension.rs similarity index 100% rename from rust/datafusion/src/logical_plan/extension.rs rename to datafusion/src/logical_plan/extension.rs diff --git a/rust/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs similarity index 100% rename from rust/datafusion/src/logical_plan/mod.rs rename to datafusion/src/logical_plan/mod.rs diff --git a/rust/datafusion/src/logical_plan/operators.rs 
b/datafusion/src/logical_plan/operators.rs similarity index 100% rename from rust/datafusion/src/logical_plan/operators.rs rename to datafusion/src/logical_plan/operators.rs diff --git a/rust/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs similarity index 100% rename from rust/datafusion/src/logical_plan/plan.rs rename to datafusion/src/logical_plan/plan.rs diff --git a/rust/datafusion/src/logical_plan/registry.rs b/datafusion/src/logical_plan/registry.rs similarity index 100% rename from rust/datafusion/src/logical_plan/registry.rs rename to datafusion/src/logical_plan/registry.rs diff --git a/rust/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs similarity index 100% rename from rust/datafusion/src/optimizer/constant_folding.rs rename to datafusion/src/optimizer/constant_folding.rs diff --git a/rust/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs similarity index 100% rename from rust/datafusion/src/optimizer/filter_push_down.rs rename to datafusion/src/optimizer/filter_push_down.rs diff --git a/rust/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs similarity index 100% rename from rust/datafusion/src/optimizer/hash_build_probe_order.rs rename to datafusion/src/optimizer/hash_build_probe_order.rs diff --git a/rust/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs similarity index 100% rename from rust/datafusion/src/optimizer/limit_push_down.rs rename to datafusion/src/optimizer/limit_push_down.rs diff --git a/rust/datafusion/src/optimizer/mod.rs b/datafusion/src/optimizer/mod.rs similarity index 100% rename from rust/datafusion/src/optimizer/mod.rs rename to datafusion/src/optimizer/mod.rs diff --git a/rust/datafusion/src/optimizer/optimizer.rs b/datafusion/src/optimizer/optimizer.rs similarity index 100% rename from rust/datafusion/src/optimizer/optimizer.rs rename to datafusion/src/optimizer/optimizer.rs diff --git a/rust/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs similarity index 100% rename from rust/datafusion/src/optimizer/projection_push_down.rs rename to datafusion/src/optimizer/projection_push_down.rs diff --git a/rust/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs similarity index 100% rename from rust/datafusion/src/optimizer/utils.rs rename to datafusion/src/optimizer/utils.rs diff --git a/rust/datafusion/src/physical_optimizer/coalesce_batches.rs b/datafusion/src/physical_optimizer/coalesce_batches.rs similarity index 100% rename from rust/datafusion/src/physical_optimizer/coalesce_batches.rs rename to datafusion/src/physical_optimizer/coalesce_batches.rs diff --git a/rust/datafusion/src/physical_optimizer/merge_exec.rs b/datafusion/src/physical_optimizer/merge_exec.rs similarity index 100% rename from rust/datafusion/src/physical_optimizer/merge_exec.rs rename to datafusion/src/physical_optimizer/merge_exec.rs diff --git a/rust/datafusion/src/physical_optimizer/mod.rs b/datafusion/src/physical_optimizer/mod.rs similarity index 100% rename from rust/datafusion/src/physical_optimizer/mod.rs rename to datafusion/src/physical_optimizer/mod.rs diff --git a/rust/datafusion/src/physical_optimizer/optimizer.rs b/datafusion/src/physical_optimizer/optimizer.rs similarity index 100% rename from rust/datafusion/src/physical_optimizer/optimizer.rs rename to 
datafusion/src/physical_optimizer/optimizer.rs diff --git a/rust/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs similarity index 100% rename from rust/datafusion/src/physical_optimizer/repartition.rs rename to datafusion/src/physical_optimizer/repartition.rs diff --git a/rust/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs similarity index 100% rename from rust/datafusion/src/physical_plan/aggregates.rs rename to datafusion/src/physical_plan/aggregates.rs diff --git a/rust/datafusion/src/physical_plan/array_expressions.rs b/datafusion/src/physical_plan/array_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/array_expressions.rs rename to datafusion/src/physical_plan/array_expressions.rs diff --git a/rust/datafusion/src/physical_plan/coalesce_batches.rs b/datafusion/src/physical_plan/coalesce_batches.rs similarity index 100% rename from rust/datafusion/src/physical_plan/coalesce_batches.rs rename to datafusion/src/physical_plan/coalesce_batches.rs diff --git a/rust/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs similarity index 100% rename from rust/datafusion/src/physical_plan/common.rs rename to datafusion/src/physical_plan/common.rs diff --git a/rust/datafusion/src/physical_plan/crypto_expressions.rs b/datafusion/src/physical_plan/crypto_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/crypto_expressions.rs rename to datafusion/src/physical_plan/crypto_expressions.rs diff --git a/rust/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs similarity index 100% rename from rust/datafusion/src/physical_plan/csv.rs rename to datafusion/src/physical_plan/csv.rs diff --git a/rust/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/datetime_expressions.rs rename to datafusion/src/physical_plan/datetime_expressions.rs diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/distinct_expressions.rs rename to datafusion/src/physical_plan/distinct_expressions.rs diff --git a/rust/datafusion/src/physical_plan/empty.rs b/datafusion/src/physical_plan/empty.rs similarity index 100% rename from rust/datafusion/src/physical_plan/empty.rs rename to datafusion/src/physical_plan/empty.rs diff --git a/rust/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs similarity index 100% rename from rust/datafusion/src/physical_plan/explain.rs rename to datafusion/src/physical_plan/explain.rs diff --git a/rust/datafusion/src/physical_plan/expressions/average.rs b/datafusion/src/physical_plan/expressions/average.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/average.rs rename to datafusion/src/physical_plan/expressions/average.rs diff --git a/rust/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/binary.rs rename to datafusion/src/physical_plan/expressions/binary.rs diff --git a/rust/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs similarity index 100% rename from 
rust/datafusion/src/physical_plan/expressions/case.rs rename to datafusion/src/physical_plan/expressions/case.rs diff --git a/rust/datafusion/src/physical_plan/expressions/cast.rs b/datafusion/src/physical_plan/expressions/cast.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/cast.rs rename to datafusion/src/physical_plan/expressions/cast.rs diff --git a/rust/datafusion/src/physical_plan/expressions/coercion.rs b/datafusion/src/physical_plan/expressions/coercion.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/coercion.rs rename to datafusion/src/physical_plan/expressions/coercion.rs diff --git a/rust/datafusion/src/physical_plan/expressions/column.rs b/datafusion/src/physical_plan/expressions/column.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/column.rs rename to datafusion/src/physical_plan/expressions/column.rs diff --git a/rust/datafusion/src/physical_plan/expressions/count.rs b/datafusion/src/physical_plan/expressions/count.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/count.rs rename to datafusion/src/physical_plan/expressions/count.rs diff --git a/rust/datafusion/src/physical_plan/expressions/in_list.rs b/datafusion/src/physical_plan/expressions/in_list.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/in_list.rs rename to datafusion/src/physical_plan/expressions/in_list.rs diff --git a/rust/datafusion/src/physical_plan/expressions/is_not_null.rs b/datafusion/src/physical_plan/expressions/is_not_null.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/is_not_null.rs rename to datafusion/src/physical_plan/expressions/is_not_null.rs diff --git a/rust/datafusion/src/physical_plan/expressions/is_null.rs b/datafusion/src/physical_plan/expressions/is_null.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/is_null.rs rename to datafusion/src/physical_plan/expressions/is_null.rs diff --git a/rust/datafusion/src/physical_plan/expressions/literal.rs b/datafusion/src/physical_plan/expressions/literal.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/literal.rs rename to datafusion/src/physical_plan/expressions/literal.rs diff --git a/rust/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/min_max.rs rename to datafusion/src/physical_plan/expressions/min_max.rs diff --git a/rust/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/mod.rs rename to datafusion/src/physical_plan/expressions/mod.rs diff --git a/rust/datafusion/src/physical_plan/expressions/negative.rs b/datafusion/src/physical_plan/expressions/negative.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/negative.rs rename to datafusion/src/physical_plan/expressions/negative.rs diff --git a/rust/datafusion/src/physical_plan/expressions/not.rs b/datafusion/src/physical_plan/expressions/not.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/not.rs rename to datafusion/src/physical_plan/expressions/not.rs diff --git a/rust/datafusion/src/physical_plan/expressions/nullif.rs b/datafusion/src/physical_plan/expressions/nullif.rs similarity index 100% 
rename from rust/datafusion/src/physical_plan/expressions/nullif.rs rename to datafusion/src/physical_plan/expressions/nullif.rs diff --git a/rust/datafusion/src/physical_plan/expressions/sum.rs b/datafusion/src/physical_plan/expressions/sum.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/sum.rs rename to datafusion/src/physical_plan/expressions/sum.rs diff --git a/rust/datafusion/src/physical_plan/expressions/try_cast.rs b/datafusion/src/physical_plan/expressions/try_cast.rs similarity index 100% rename from rust/datafusion/src/physical_plan/expressions/try_cast.rs rename to datafusion/src/physical_plan/expressions/try_cast.rs diff --git a/rust/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs similarity index 100% rename from rust/datafusion/src/physical_plan/filter.rs rename to datafusion/src/physical_plan/filter.rs diff --git a/rust/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/functions.rs rename to datafusion/src/physical_plan/functions.rs diff --git a/rust/datafusion/src/physical_plan/group_scalar.rs b/datafusion/src/physical_plan/group_scalar.rs similarity index 100% rename from rust/datafusion/src/physical_plan/group_scalar.rs rename to datafusion/src/physical_plan/group_scalar.rs diff --git a/rust/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs similarity index 100% rename from rust/datafusion/src/physical_plan/hash_aggregate.rs rename to datafusion/src/physical_plan/hash_aggregate.rs diff --git a/rust/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs similarity index 100% rename from rust/datafusion/src/physical_plan/hash_join.rs rename to datafusion/src/physical_plan/hash_join.rs diff --git a/rust/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs similarity index 100% rename from rust/datafusion/src/physical_plan/hash_utils.rs rename to datafusion/src/physical_plan/hash_utils.rs diff --git a/rust/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs similarity index 100% rename from rust/datafusion/src/physical_plan/limit.rs rename to datafusion/src/physical_plan/limit.rs diff --git a/rust/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/math_expressions.rs rename to datafusion/src/physical_plan/math_expressions.rs diff --git a/rust/datafusion/src/physical_plan/memory.rs b/datafusion/src/physical_plan/memory.rs similarity index 100% rename from rust/datafusion/src/physical_plan/memory.rs rename to datafusion/src/physical_plan/memory.rs diff --git a/rust/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs similarity index 100% rename from rust/datafusion/src/physical_plan/merge.rs rename to datafusion/src/physical_plan/merge.rs diff --git a/rust/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs similarity index 100% rename from rust/datafusion/src/physical_plan/mod.rs rename to datafusion/src/physical_plan/mod.rs diff --git a/rust/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs similarity index 100% rename from rust/datafusion/src/physical_plan/parquet.rs rename to datafusion/src/physical_plan/parquet.rs diff --git a/rust/datafusion/src/physical_plan/planner.rs 
b/datafusion/src/physical_plan/planner.rs similarity index 100% rename from rust/datafusion/src/physical_plan/planner.rs rename to datafusion/src/physical_plan/planner.rs diff --git a/rust/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs similarity index 100% rename from rust/datafusion/src/physical_plan/projection.rs rename to datafusion/src/physical_plan/projection.rs diff --git a/rust/datafusion/src/physical_plan/regex_expressions.rs b/datafusion/src/physical_plan/regex_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/regex_expressions.rs rename to datafusion/src/physical_plan/regex_expressions.rs diff --git a/rust/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs similarity index 100% rename from rust/datafusion/src/physical_plan/repartition.rs rename to datafusion/src/physical_plan/repartition.rs diff --git a/rust/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs similarity index 100% rename from rust/datafusion/src/physical_plan/sort.rs rename to datafusion/src/physical_plan/sort.rs diff --git a/rust/datafusion/src/physical_plan/string_expressions.rs b/datafusion/src/physical_plan/string_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/string_expressions.rs rename to datafusion/src/physical_plan/string_expressions.rs diff --git a/rust/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs similarity index 100% rename from rust/datafusion/src/physical_plan/type_coercion.rs rename to datafusion/src/physical_plan/type_coercion.rs diff --git a/rust/datafusion/src/physical_plan/udaf.rs b/datafusion/src/physical_plan/udaf.rs similarity index 100% rename from rust/datafusion/src/physical_plan/udaf.rs rename to datafusion/src/physical_plan/udaf.rs diff --git a/rust/datafusion/src/physical_plan/udf.rs b/datafusion/src/physical_plan/udf.rs similarity index 100% rename from rust/datafusion/src/physical_plan/udf.rs rename to datafusion/src/physical_plan/udf.rs diff --git a/rust/datafusion/src/physical_plan/unicode_expressions.rs b/datafusion/src/physical_plan/unicode_expressions.rs similarity index 100% rename from rust/datafusion/src/physical_plan/unicode_expressions.rs rename to datafusion/src/physical_plan/unicode_expressions.rs diff --git a/rust/datafusion/src/physical_plan/union.rs b/datafusion/src/physical_plan/union.rs similarity index 100% rename from rust/datafusion/src/physical_plan/union.rs rename to datafusion/src/physical_plan/union.rs diff --git a/rust/datafusion/src/prelude.rs b/datafusion/src/prelude.rs similarity index 100% rename from rust/datafusion/src/prelude.rs rename to datafusion/src/prelude.rs diff --git a/rust/datafusion/src/scalar.rs b/datafusion/src/scalar.rs similarity index 100% rename from rust/datafusion/src/scalar.rs rename to datafusion/src/scalar.rs diff --git a/rust/datafusion/src/sql/mod.rs b/datafusion/src/sql/mod.rs similarity index 100% rename from rust/datafusion/src/sql/mod.rs rename to datafusion/src/sql/mod.rs diff --git a/rust/datafusion/src/sql/parser.rs b/datafusion/src/sql/parser.rs similarity index 100% rename from rust/datafusion/src/sql/parser.rs rename to datafusion/src/sql/parser.rs diff --git a/rust/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs similarity index 100% rename from rust/datafusion/src/sql/planner.rs rename to datafusion/src/sql/planner.rs diff --git a/rust/datafusion/src/sql/utils.rs 
b/datafusion/src/sql/utils.rs similarity index 100% rename from rust/datafusion/src/sql/utils.rs rename to datafusion/src/sql/utils.rs diff --git a/rust/datafusion/src/test/exec.rs b/datafusion/src/test/exec.rs similarity index 100% rename from rust/datafusion/src/test/exec.rs rename to datafusion/src/test/exec.rs diff --git a/rust/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs similarity index 100% rename from rust/datafusion/src/test/mod.rs rename to datafusion/src/test/mod.rs diff --git a/rust/datafusion/src/test/user_defined.rs b/datafusion/src/test/user_defined.rs similarity index 100% rename from rust/datafusion/src/test/user_defined.rs rename to datafusion/src/test/user_defined.rs diff --git a/rust/datafusion/src/test/variable.rs b/datafusion/src/test/variable.rs similarity index 100% rename from rust/datafusion/src/test/variable.rs rename to datafusion/src/test/variable.rs diff --git a/rust/datafusion/src/variable/mod.rs b/datafusion/src/variable/mod.rs similarity index 100% rename from rust/datafusion/src/variable/mod.rs rename to datafusion/src/variable/mod.rs diff --git a/rust/datafusion/tests/aggregate_simple.csv b/datafusion/tests/aggregate_simple.csv similarity index 100% rename from rust/datafusion/tests/aggregate_simple.csv rename to datafusion/tests/aggregate_simple.csv diff --git a/rust/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs similarity index 100% rename from rust/datafusion/tests/custom_sources.rs rename to datafusion/tests/custom_sources.rs diff --git a/rust/datafusion/tests/customer.csv b/datafusion/tests/customer.csv similarity index 100% rename from rust/datafusion/tests/customer.csv rename to datafusion/tests/customer.csv diff --git a/rust/datafusion/tests/dataframe.rs b/datafusion/tests/dataframe.rs similarity index 100% rename from rust/datafusion/tests/dataframe.rs rename to datafusion/tests/dataframe.rs diff --git a/rust/datafusion/tests/example.csv b/datafusion/tests/example.csv similarity index 100% rename from rust/datafusion/tests/example.csv rename to datafusion/tests/example.csv diff --git a/rust/datafusion/tests/provider_filter_pushdown.rs b/datafusion/tests/provider_filter_pushdown.rs similarity index 100% rename from rust/datafusion/tests/provider_filter_pushdown.rs rename to datafusion/tests/provider_filter_pushdown.rs diff --git a/rust/datafusion/tests/sql.rs b/datafusion/tests/sql.rs similarity index 100% rename from rust/datafusion/tests/sql.rs rename to datafusion/tests/sql.rs diff --git a/rust/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs similarity index 100% rename from rust/datafusion/tests/user_defined_plan.rs rename to datafusion/tests/user_defined_plan.rs diff --git a/rust/pre-commit.sh b/pre-commit.sh similarity index 100% rename from rust/pre-commit.sh rename to pre-commit.sh diff --git a/rust/rustfmt.toml b/rustfmt.toml similarity index 100% rename from rust/rustfmt.toml rename to rustfmt.toml From 8258f128bc6665ada0a6be90be351c4a7983fbcc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:36:29 -0600 Subject: [PATCH 006/329] Update gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 6f123362ef1fb..5397fe371dfdb 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,8 @@ cpp/Brewfile.lock.json # docker volumes used for caching .docker + +# Rust +target +Cargo.lock + From 2ac092e9e5099b137335418c79d48e18391ea06b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:37:46 -0600 
Subject: [PATCH 007/329] Move DataFusion README to top level --- README.md | 404 ++++++++++++++++++++++++++++++------------- datafusion/README.md | 356 -------------------------------------- 2 files changed, 287 insertions(+), 473 deletions(-) delete mode 100644 datafusion/README.md diff --git a/README.md b/README.md index 7fdef29bcdb88..e5849b84ca72a 100644 --- a/README.md +++ b/README.md @@ -17,170 +17,340 @@ under the License. --> -# Native Rust implementation of Apache Arrow +# DataFusion -[![Coverage Status](https://codecov.io/gh/apache/arrow/rust/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) + -Welcome to the implementation of Arrow, the popular in-memory columnar format, in [Rust](https://www.rust-lang.org/). +DataFusion is an extensible query execution framework, written in +Rust, that uses [Apache Arrow](https://arrow.apache.org) as its +in-memory format. -This part of the Arrow project is divided in 4 main components: +DataFusion supports both an SQL and a DataFrame API for building +logical query plans as well as a query optimizer and execution engine +capable of parallel execution against partitioned data sources (CSV +and Parquet) using threads. -| Crate | Description | Documentation | -|-----------|-------------|---------------| -|Arrow | Core functionality (memory layout, arrays, low level computations) | [(README)](arrow/README.md) | -|Parquet | Parquet support | [(README)](parquet/README.md) | -|Arrow-flight | Arrow data between processes | [(README)](arrow-flight/README.md) | -|DataFusion | In-memory query engine with SQL support | [(README)](datafusion/README.md) | -|Ballista | Distributed query execution | [(README)](ballista/README.md) | +## Use Cases -Independently, they support a vast array of functionality for in-memory computations. +DataFusion is used to create modern, fast and efficient data +pipelines, ETL processes, and database systems, which need the +performance of Rust and Apache Arrow and want to provide their users +the convenience of an SQL interface or a DataFrame API. -Together, they allow users to write an SQL query or a `DataFrame` (using the `datafusion` crate), run it against a parquet file (using the `parquet` crate), evaluate it in-memory using Arrow's columnar format (using the `arrow` crate), and send to another process (using the `arrow-flight` crate). +## Why DataFusion? -Generally speaking, the `arrow` crate offers functionality to develop code that uses Arrow arrays, and `datafusion` offers most operations typically found in SQL, with the notable exceptions of: +* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. 
-* `join`
-* `window` functions
+## Known Uses
-There are too many features to enumerate here, but some notable mentions:
+Here are some of the projects known to use DataFusion:
-* `Arrow` implements all formats in the specification except certain dictionaries
-* `Arrow` supports SIMD operations to some of its vertical operations
-* `DataFusion` supports `async` execution
-* `DataFusion` supports user-defined functions, aggregates, and whole execution nodes
+* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform
+* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust)
+* [Cube.js](https://github.com/cube-js/cube.js)
+* [datafusion-python](https://pypi.org/project/datafusion)
+* [delta-rs](https://github.com/delta-io/delta-rs)
+* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database
+* [ROAPI](https://github.com/roapi/roapi)
-You can find more details about each crate in their respective READMEs.
+(if you know of another project, please submit a PR to add a link!)
-## Arrow Rust Community
+## Example Usage
-We use the official [ASF Slack](https://s.apache.org/slack-invite) for informal discussions and coordination. This is
-a great place to meet other contributors and get guidance on where to contribute. Join us in the `arrow-rust` channel.
+Run a SQL query against data stored in a CSV:
-We use [ASF JIRA](https://issues.apache.org/jira/secure/Dashboard.jspa) as the system of record for new features
-and bug fixes and this plays a critical role in the release process.
+```rust
+use datafusion::prelude::*;
+use arrow::util::pretty::print_batches;
+use arrow::record_batch::RecordBatch;
-For design discussions we generally collaborate on Google documents and file a JIRA linking to the document.
+#[tokio::main]
+async fn main() -> datafusion::error::Result<()> {
+    // register the table
+    let mut ctx = ExecutionContext::new();
+    ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;
-There is also a bi-weekly Rust-specific sync call for the Arrow Rust community. This is hosted on Google Meet
-at https://meet.google.com/ctp-yujs-aee on alternate Wednesday's at 09:00 US/Pacific, 12:00 US/Eastern. During
-US daylight savings time this corresponds to 16:00 UTC and at other times this is 17:00 UTC.
+    // create a plan to run a SQL query
+    let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;
-## Developer's guide to Arrow Rust
-
-### How to compile
-
-This is a standard cargo project with workspaces. To build it, you need to have `rust` and `cargo`:
-
-```bash
-cd /rust && cargo build
-```
-
-You can also use rust's official docker image:
-
-```bash
-docker run --rm -v $(pwd)/rust:/rust -it rust /bin/bash -c "cd /rust && cargo build"
+    // execute and print results
+    let results: Vec<RecordBatch> = df.collect().await?;
+    print_batches(&results)?;
+    Ok(())
+}
```
-The command above assumes that are in the root directory of the project, not in the same
-directory as this README.md.
+Use the DataFrame API to process data stored in a CSV:
-You can also compile specific workspaces:
+```rust
+use datafusion::prelude::*;
+use arrow::util::pretty::print_batches;
+use arrow::record_batch::RecordBatch;
-```bash
-cd /rust/arrow && cargo build
-```
-
-### Git Submodules
-
-Before running tests and examples, it is necessary to set up the local development environment.
+#[tokio::main]
+async fn main() -> datafusion::error::Result<()> {
+    // create the dataframe
+    let mut ctx = ExecutionContext::new();
+    let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
-The tests rely on test data that is contained in git submodules.
+    let df = df.filter(col("a").lt_eq(col("b")))?
+           .aggregate(vec![col("a")], vec![min(col("b"))])?
+           .limit(100)?;
-To pull down this data run the following:
-
-```bash
-git submodule update --init
+    // execute and print results
+    let results: Vec<RecordBatch> = df.collect().await?;
+    print_batches(&results)?;
+    Ok(())
+}
```
-This populates data in two git submodules:
-
-- `../cpp/submodules/parquet_testing/data` (sourced from https://github.com/apache/parquet-testing.git)
-- `../testing` (sourced from https://github.com/apache/arrow-testing)
-
-By default, `cargo test` will look for these directories at their
-standard location. The following environment variables can be used to override the location:
+Both of these examples will produce
-```bash
-# Optionaly specify a different location for test data
-export PARQUET_TEST_DATA=$(cd ../cpp/submodules/parquet-testing/data; pwd)
-export ARROW_TEST_DATA=$(cd ../testing/data; pwd)
+```text
++---+--------+
+| a | MIN(b) |
++---+--------+
+| 1 | 2 |
++---+--------+
```
-From here on, this is a pure Rust project and `cargo` can be used to run tests, benchmarks, docs and examples as usual.
-### Running the tests
-Run tests using the Rust standard `cargo test` command:
-```bash
-# run all tests.
-cargo test
-
-# run only tests for the arrow crate
-cargo test -p arrow
```
+## Using DataFusion as a library
+DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/).
+To get started, add the following to your `Cargo.toml` file:
+```toml
+[dependencies]
+datafusion = "4.0.0-SNAPSHOT"
```
+## Using DataFusion as a binary
+
+DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information.
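The examples above read CSV files; Parquet follows the same pattern. The sketch below is not taken from the crate's documented examples: it assumes the `register_parquet` method on `ExecutionContext` and uses a placeholder file path (`data/example.parquet`), so adjust both for your own project.

```rust
use datafusion::prelude::*;
use arrow::util::pretty::print_batches;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // register a Parquet file as the table "example" (path is a placeholder)
    let mut ctx = ExecutionContext::new();
    ctx.register_parquet("example", "data/example.parquet")?;

    // plan the same aggregate query used in the CSV examples
    let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;

    // execute the plan and print the resulting record batches
    let results = df.collect().await?;
    print_batches(&results)?;
    Ok(())
}
```

The only change from the CSV examples is the registration call; planning, execution, and printing stay the same.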
+
+# Status
+
+## General
+
+- [x] SQL Parser
+- [x] SQL Query Planner
+- [x] Query Optimizer
+  - [x] Constant folding
+  - [x] Join Reordering
+  - [x] Limit Pushdown
+  - [x] Projection push down
+  - [x] Predicate push down
+- [x] Type coercion
+- [x] Parallel query execution
+
+## SQL Support
+
+- [x] Projection
+- [x] Filter (WHERE)
+- [x] Filter post-aggregate (HAVING)
+- [x] Limit
+- [x] Aggregate
+- [x] Common math functions
+- [x] cast
+- [x] try_cast
+- Postgres compatible String functions
+  - [x] ascii
+  - [x] bit_length
+  - [x] btrim
+  - [x] char_length
+  - [x] character_length
+  - [x] chr
+  - [x] concat
+  - [x] concat_ws
+  - [x] initcap
+  - [x] left
+  - [x] length
+  - [x] lpad
+  - [x] ltrim
+  - [x] octet_length
+  - [x] regexp_replace
+  - [x] repeat
+  - [x] replace
+  - [x] reverse
+  - [x] right
+  - [x] rpad
+  - [x] rtrim
+  - [x] split_part
+  - [x] starts_with
+  - [x] strpos
+  - [x] substr
+  - [x] to_hex
+  - [x] translate
+  - [x] trim
+- Miscellaneous/Boolean functions
+  - [x] nullif
+- Common date/time functions
+  - [ ] Basic date functions
+  - [ ] Basic time functions
+  - [x] Basic timestamp functions
+- nested functions
+  - [x] Array of columns
+- [x] Schema Queries
+  - [x] SHOW TABLES
+  - [x] SHOW COLUMNS
+  - [x] information_schema.{tables, columns}
+  - [ ] information_schema other views
+- [x] Sorting
+- [ ] Nested types
+- [ ] Lists
+- [x] Subqueries
+- [x] Common table expressions
+- [ ] Set Operations
+  - [x] UNION ALL
+  - [ ] UNION
+  - [ ] INTERSECT
+  - [ ] MINUS
+- [x] Joins
+  - [x] INNER JOIN
+  - [ ] CROSS JOIN
+  - [ ] OUTER JOIN
+- [ ] Window
+
+## Data Sources
+
+- [x] CSV
+- [x] Parquet primitive types
+- [ ] Parquet nested types
+
+
+## Extensibility
+
+DataFusion is designed to be extensible at all points. To that end, you can provide your own custom:
+
+- [x] User Defined Functions (UDFs)
+- [x] User Defined Aggregate Functions (UDAFs)
+- [x] User Defined Table Source (`TableProvider`) for tables
+- [x] User Defined `Optimizer` passes (plan rewrites)
+- [x] User Defined `LogicalPlan` nodes
+- [x] User Defined `ExecutionPlan` nodes
+
+
+# Supported SQL
+
+This library currently supports many SQL constructs, including
+
+* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations
+* `SELECT ... FROM ...` together with any expression
+* `ALIAS` to name an expression
+* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)`
+* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`.
+* `WHERE` to filter
+* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG`
+* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST`
+
+
+## Supported Functions
+
+DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible.
+
+Currently, only a subset of the PostgreSQL dialect is implemented, and we will document any deviations.
+
+## Schema Metadata / Information Schema Support
+
+DataFusion supports showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands.
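Note that the `information_schema` views generally have to be enabled on the execution context before these queries work. A rough sketch follows, assuming the `with_information_schema` option on `ExecutionConfig` (check the crate documentation for the exact name and default in your version):

```rust
use datafusion::prelude::*;
use datafusion::execution::context::ExecutionConfig;
use arrow::util::pretty::print_batches;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // build a context with the information_schema views enabled
    let config = ExecutionConfig::new().with_information_schema(true);
    let mut ctx = ExecutionContext::with_config(config);

    // register a table so there is something to list
    ctx.register_csv("t", "tests/example.csv", CsvReadOptions::new())?;

    // SHOW TABLES (and information_schema queries) then run through the normal SQL entry point
    let results = ctx.sql("SHOW TABLES")?.collect().await?;
    print_batches(&results)?;
    Ok(())
}
```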
+
+More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html).
+
+
+To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view:
+
+```sql
+> show tables;
++---------------+--------------------+------------+------------+
+| table_catalog | table_schema | table_name | table_type |
++---------------+--------------------+------------+------------+
+| datafusion | public | t | BASE TABLE |
+| datafusion | information_schema | tables | VIEW |
++---------------+--------------------+------------+------------+
+
+> select * from information_schema.tables;
+
++---------------+--------------------+------------+--------------+
+| table_catalog | table_schema | table_name | table_type |
++---------------+--------------------+------------+--------------+
+| datafusion | public | t | BASE TABLE |
+| datafusion | information_schema | TABLES | SYSTEM TABLE |
++---------------+--------------------+------------+--------------+
```
-## Clippy Lints
-
-We recommend using `clippy` for checking lints during development. While we do not yet enforce `clippy` checks, we recommend not introducing new `clippy` errors or warnings.
-
-Run the following to check for clippy lints.
-
-```
-cargo clippy
+To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the `information_schema.columns` view:
+
+```sql
+> show columns from t;
++---------------+--------------+------------+-------------+-----------+-------------+
+| table_catalog | table_schema | table_name | column_name | data_type | is_nullable |
++---------------+--------------+------------+-------------+-----------+-------------+
+| datafusion | public | t | a | Int32 | NO |
+| datafusion | public | t | b | Utf8 | NO |
+| datafusion | public | t | c | Float32 | NO |
++---------------+--------------+------------+-------------+-----------+-------------+
+
+> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns;
++------------+-------------+------------------+-------------+-----------+
+| table_name | column_name | ordinal_position | is_nullable | data_type |
++------------+-------------+------------------+-------------+-----------+
+| t | a | 0 | NO | Int32 |
+| t | b | 1 | NO | Utf8 |
+| t | c | 2 | NO | Float32 |
++------------+-------------+------------------+-------------+-----------+
```
-If you use Visual Studio Code with the `rust-analyzer` plugin, you can enable `clippy` to run each time you save a file. See https://users.rust-lang.org/t/how-to-use-clippy-in-vs-code-with-rust-analyzer/41881.
-One of the concerns with `clippy` is that it often produces a lot of false positives, or that some recommendations may hurt readability. We do not have a policy of which lints are ignored, but if you disagree with a `clippy` lint, you may disable the lint and briefly justify it.
-Search for `allow(clippy::` in the codebase to identify lints that are ignored/allowed. We currently prefer ignoring lints on the lowest unit possible.
-* If you are introducing a line that returns a lint warning or error, you may disable the lint on that line.
-* If you have several lints on a function or module, you may disable the lint on the function or module.
-* If a lint is pervasive across multiple modules, you may disable it at the crate level.
+## Supported Data Types
-## Git Pre-Commit Hook
+DataFusion uses Arrow, and thus the Arrow type system, for query
+execution. The SQL types from
+[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57)
+are mapped to Arrow types according to the following table
-We can use [git pre-commit hook](https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks) to automate various kinds of git pre-commit checking/formatting.
-Suppose you are in the root directory of the project.
+
+| SQL Data Type | Arrow DataType |
+| --------------- | -------------------------------- |
+| `CHAR` | `Utf8` |
+| `VARCHAR` | `Utf8` |
+| `UUID` | *Not yet supported* |
+| `CLOB` | *Not yet supported* |
+| `BINARY` | *Not yet supported* |
+| `VARBINARY` | *Not yet supported* |
+| `DECIMAL` | `Float64` |
+| `FLOAT` | `Float32` |
+| `SMALLINT` | `Int16` |
+| `INT` | `Int32` |
+| `BIGINT` | `Int64` |
+| `REAL` | `Float64` |
+| `DOUBLE` | `Float64` |
+| `BOOLEAN` | `Boolean` |
+| `DATE` | `Date32` |
+| `TIME` | `Time64(TimeUnit::Millisecond)` |
+| `TIMESTAMP` | `Date64` |
+| `INTERVAL` | *Not yet supported* |
+| `REGCLASS` | *Not yet supported* |
+| `TEXT` | *Not yet supported* |
+| `BYTEA` | *Not yet supported* |
+| `CUSTOM` | *Not yet supported* |
+| `ARRAY` | *Not yet supported* |
-First check if the file already exists:
-```bash
-ls -l .git/hooks/pre-commit
-```
+
+# Architecture Overview
-If the file already exists, to avoid mistakenly **overriding**, you MAY have to check
-the link source or file content. Else if not exist, let's safely soft link [pre-commit.sh](pre-commit.sh) as file `.git/hooks/pre-commit`:
+
+There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together.
-
-```
-ln -s ../../rust/pre-commit.sh .git/hooks/pre-commit
-```
+
+* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934)
+* (February 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow*: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ)
-If sometimes you want to commit without checking, just run `git commit` with `--no-verify`:
-```bash
-git commit --no-verify -m "... commit message ..."
-```
+
+# Developer's guide
+
+Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion.
diff --git a/datafusion/README.md b/datafusion/README.md
deleted file mode 100644
index e5849b84ca72a..0000000000000
--- a/datafusion/README.md
+++ /dev/null
@@ -1,356 +0,0 @@
-
-
-# DataFusion
-
-
-
-DataFusion is an extensible query execution framework, written in
-Rust, that uses [Apache Arrow](https://arrow.apache.org) as its
-in-memory format.
-
-DataFusion supports both an SQL and a DataFrame API for building
-logical query plans as well as a query optimizer and execution engine
-capable of parallel execution against partitioned data sources (CSV
-and Parquet) using threads.
-
-## Use Cases
-
-DataFusion is used to create modern, fast and efficient data
-pipelines, ETL processes, and database systems, which need the
-performance of Rust and Apache Arrow and want to provide their users
-the convenience of an SQL interface or a DataFrame API.
-
-## Why DataFusion?
- -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. - -## Known Uses - -Here are some of the projects known to use DataFusion: - -* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform -* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube.js](https://github.com/cube-js/cube.js) -* [datafusion-python](https://pypi.org/project/datafusion) -* [delta-rs](https://github.com/delta-io/delta-rs) -* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database -* [ROAPI](https://github.com/roapi/roapi) - -(if you know of another project, please submit a PR to add a link!) - -## Example Usage - -Run a SQL query against data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let mut ctx = ExecutionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; - - // execute and print results - let results: Vec = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let mut ctx = ExecutionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; - - let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(100)?; - - // execute and print results - let results: Vec = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Both of these examples will produce - -```text -+---+--------+ -| a | MIN(b) | -+---+--------+ -| 1 | 2 | -+---+--------+ -``` - - - -## Using DataFusion as a library - -DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). - -To get started, add the following to your `Cargo.toml` file: - -```toml -[dependencies] -datafusion = "4.0.0-SNAPSHOT" -``` - -## Using DataFusion as a binary - -DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information. 
- -# Status - -## General - -- [x] SQL Parser -- [x] SQL Query Planner -- [x] Query Optimizer - - [x] Constant folding - - [x] Join Reordering - - [x] Limit Pushdown - - [x] Projection push down - - [x] Predicate push down -- [x] Type coercion -- [x] Parallel query execution - -## SQL Support - -- [x] Projection -- [x] Filter (WHERE) -- [x] Filter post-aggregate (HAVING) -- [x] Limit -- [x] Aggregate -- [x] Common math functions -- [x] cast -- [x] try_cast -- Postgres compatible String functions - - [x] ascii - - [x] bit_length - - [x] btrim - - [x] char_length - - [x] character_length - - [x] chr - - [x] concat - - [x] concat_ws - - [x] initcap - - [x] left - - [x] length - - [x] lpad - - [x] ltrim - - [x] octet_length - - [x] regexp_replace - - [x] repeat - - [x] replace - - [x] reverse - - [x] right - - [x] rpad - - [x] rtrim - - [x] split_part - - [x] starts_with - - [x] strpos - - [x] substr - - [x] to_hex - - [x] translate - - [x] trim -- Miscellaneous/Boolean functions - - [x] nullif -- Common date/time functions - - [ ] Basic date functions - - [ ] Basic time functions - - [x] Basic timestamp functions -- nested functions - - [x] Array of columns -- [x] Schema Queries - - [x] SHOW TABLES - - [x] SHOW COLUMNS - - [x] information_schema.{tables, columns} - - [ ] information_schema other views -- [x] Sorting -- [ ] Nested types -- [ ] Lists -- [x] Subqueries -- [x] Common table expressions -- [ ] Set Operations - - [x] UNION ALL - - [ ] UNION - - [ ] INTERSECT - - [ ] MINUS -- [x] Joins - - [x] INNER JOIN - - [ ] CROSS JOIN - - [ ] OUTER JOIN -- [ ] Window - -## Data Sources - -- [x] CSV -- [x] Parquet primitive types -- [ ] Parquet nested types - - -## Extensibility - -DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: - -- [x] User Defined Functions (UDFs) -- [x] User Defined Aggregate Functions (UDAFs) -- [x] User Defined Table Source (`TableProvider`) for tables -- [x] User Defined `Optimizer` passes (plan rewrites) -- [x] User Defined `LogicalPlan` nodes -- [x] User Defined `ExecutionPlan` nodes - - -# Supported SQL - -This library currently supports many SQL constructs, including - -* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -* `SELECT ... FROM ...` together with any expression -* `ALIAS` to name an expression -* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -* `WHERE` to filter -* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` -* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - - -## Supported Functions - -DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. - -Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations. - -## Schema Metadata / Information Schema Support - -DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. 
- -More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). - - -To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: - -```sql -> show tables; -+---------------+--------------------+------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | tables | VIEW | -+---------------+--------------------+------------+------------+ - -> select * from information_schema.tables; - -+---------------+--------------------+------------+--------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+--------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | TABLES | SYSTEM TABLE | -+---------------+--------------------+------------+--------------+ -``` - -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: - -```sql -> show columns from t; -+---------------+--------------+------------+-------------+-----------+-------------+ -| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | -+---------------+--------------+------------+-------------+-----------+-------------+ -| datafusion | public | t | a | Int32 | NO | -| datafusion | public | t | b | Utf8 | NO | -| datafusion | public | t | c | Float32 | NO | -+---------------+--------------+------------+-------------+-----------+-------------+ - -> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; -+------------+-------------+------------------+-------------+-----------+ -| table_name | column_name | ordinal_position | is_nullable | data_type | -+------------+-------------+------------------+-------------+-----------+ -| t | a | 0 | NO | Int32 | -| t | b | 1 | NO | Utf8 | -| t | c | 2 | NO | Float32 | -+------------+-------------+------------------+-------------+-----------+ -``` - - - -## Supported Data Types - -DataFusion uses Arrow, and thus the Arrow type system, for query -execution. The SQL types from -[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) -are mapped to Arrow types according to the following table - - -| SQL Data Type | Arrow DataType | -| --------------- | -------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | *Not yet supported* | -| `CLOB` | *Not yet supported* | -| `BINARY` | *Not yet supported* | -| `VARBINARY` | *Not yet supported* | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float64` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Date64` | -| `INTERVAL` | *Not yet supported* | -| `REGCLASS` | *Not yet supported* | -| `TEXT` | *Not yet supported* | -| `BYTEA` | *Not yet supported* | -| `CUSTOM` | *Not yet supported* | -| `ARRAY` | *Not yet supported* | - - -# Architecture Overview - -There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. 
- -* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) -* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) - - -# Developer's guide - -Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion. From 44c2c6a628b04eb163d86e87ee13deac0118801d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:38:31 -0600 Subject: [PATCH 008/329] Fix image link in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e5849b84ca72a..9e6b7a2a78b5b 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ # DataFusion - + DataFusion is an extensible query execution framework, written in Rust, that uses [Apache Arrow](https://arrow.apache.org) as its From f5048b1e1b219f1a9210f3e86766c7966ccd0620 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:40:30 -0600 Subject: [PATCH 009/329] Remove non-Rust files --- appveyor.yml | 79 --- cmake-format.py | 59 -- docker-compose.yml | 1391 ------------------------------------------- run-cmake-format.py | 111 ---- 4 files changed, 1640 deletions(-) delete mode 100644 appveyor.yml delete mode 100644 cmake-format.py delete mode 100644 docker-compose.yml delete mode 100755 run-cmake-format.py diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index ace5bb97f00d5..0000000000000 --- a/appveyor.yml +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Operating system (build VM template) -os: Visual Studio 2017 - -only_commits: - # Skip commits not related to Python or C++ - files: - - appveyor.yml - - ci/appveyor* - - ci/conda* - - cpp/ - - format/ - - python/ - -cache: - - C:\Users\Appveyor\clcache1 - -matrix: - fast_finish: true - -environment: - global: - # Make these variables visible in all jobs and build steps - MSVC_DEFAULT_OPTIONS: ON - APPVEYOR_SAVE_CACHE_ON_ERROR: true - # Change the clcache dir to reset caches everywhere when a setting - # is changed incompatibly (e.g. CLCACHE_COMPRESS). 
- CLCACHE_DIR: C:\Users\Appveyor\clcache1 - CLCACHE_SERVER: 1 - CLCACHE_COMPRESS: 1 - CLCACHE_COMPRESSLEVEL: 6 - ARROW_BUILD_FLIGHT: "OFF" - ARROW_BUILD_GANDIVA: "OFF" - ARROW_LLVM_VERSION: "7.0.*" - ARROW_S3: "OFF" - PYTHON: "3.7" - ARCH: "64" - - matrix: - # NOTE: clcache seems to work best with Ninja and worst with msbuild - # (as generated by cmake) - - JOB: "Build" - GENERATOR: Ninja - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - - JOB: "Toolchain" - GENERATOR: Ninja - ARROW_S3: "ON" - ARROW_BUILD_FLIGHT: "ON" - ARROW_BUILD_GANDIVA: "ON" - - JOB: "Build_Debug" - GENERATOR: Ninja - -before_build: - - call ci\appveyor-cpp-setup.bat - -build_script: - - call ci\appveyor-cpp-build.bat - -# Disable test discovery -test: off - -after_build: - - clcache -s diff --git a/cmake-format.py b/cmake-format.py deleted file mode 100644 index 0976642031f72..0000000000000 --- a/cmake-format.py +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# cmake-format configuration file -# Use run-cmake-format.py to reformat all cmake files in the source tree - -# How wide to allow formatted cmake files -line_width = 90 - -# How many spaces to tab for indent -tab_size = 2 - -# If arglists are longer than this, break them always -max_subargs_per_line = 4 - -# If true, separate flow control names from their parentheses with a space -separate_ctrl_name_with_space = False - -# If true, separate function names from parentheses with a space -separate_fn_name_with_space = False - -# If a statement is wrapped to more than one line, than dangle the closing -# parenthesis on it's own line -dangle_parens = False - -# What style line endings to use in the output. -line_ending = 'unix' - -# Format command names consistently as 'lower' or 'upper' case -command_case = 'lower' - -# Format keywords consistently as 'lower' or 'upper' case -keyword_case = 'unchanged' - -# enable comment markup parsing and reflow -enable_markup = False - -# If comment markup is enabled, don't reflow the first comment block in -# eachlistfile. Use this to preserve formatting of your -# copyright/licensestatements. -first_comment_is_literal = False - -# If comment markup is enabled, don't reflow any comment block which matchesthis -# (regex) pattern. Default is `None` (disabled). -literal_comment_pattern = None diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 4a3092ec04d7b..0000000000000 --- a/docker-compose.yml +++ /dev/null @@ -1,1391 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Usage -# ----- -# -# The docker compose file is parametrized using environment variables, the -# defaults are set in .env file. -# -# Example: -# $ ARCH=arm64v8 docker-compose build ubuntu-cpp -# $ ARCH=arm64v8 docker-compose run ubuntu-cpp -# -# -# Coredumps -# --------- -# -# In order to enable coredumps for the C++ tests run by CTest either with -# command `make unittest` or `ctest --output-on-failure` the correct coredump -# patterns must be set. -# The kernel settings are coming from the host, so while it can be enabled from -# a running container using --priviled option the change will affect all other -# containers, so prefer setting it explicitly, directly on the host. -# WARNING: setting this will affect the host machine. -# -# Linux host: -# $ sudo sysctl -w kernel.core_pattern=core.%e.%p -# -# macOS host running Docker for Mac (won't persist between restarts): -# $ screen ~/Library/Containers/com.docker.docker/Data/vms/0/tty -# # echo "core.%e.%p" > /proc/sys/kernel/core_pattern -# -# The setup attempts to generate coredumps by default, but the correct paths -# above must be set. In order to disable the coredump generation set -# ULIMIT_CORE environment variable to 0 before running docker-compose -# (or by setting it in .env file): -# -# ULIMIT_CORE=0 docker-compose run --rm conda-cpp -# -# See more in cpp/build-support/run-test.sh::print_coredumps - -version: '3.5' - -x-ccache: &ccache - CCACHE_COMPILERCHECK: content - CCACHE_COMPRESS: 1 - CCACHE_COMPRESSLEVEL: 6 - CCACHE_MAXSIZE: 500M - CCACHE_DIR: /ccache - -x-with-gpus: - - ubuntu-cuda-cpp - - ubuntu-cuda-python - -x-hierarchy: - # This section is used by the archery tool to enable building nested images, - # so it is enough to call: - # archery run debian-ruby - # instead of a seguence of docker-compose commands: - # docker-compose build debian-cpp - # docker-compose build debian-c-glib - # docker-compose build debian-ruby - # docker-compose run --rm debian-ruby - # - # Each node must be either a string scalar of a list containing the - # descendant images if any. Archery checks that all node has a corresponding - # service entry, so any new image/service must be listed here. 
- - conda: - - conda-cpp: - - conda-cpp-hiveserver2 - - conda-cpp-valgrind - - conda-python: - - conda-python-pandas - - conda-python-dask - - conda-python-hdfs - - conda-python-jpype - - conda-python-turbodbc - - conda-python-kartothek - - conda-python-spark - - conda-integration - - debian-cpp: - - debian-c-glib: - - debian-ruby - - debian-python - - debian-go - - debian-java: - - debian-java-jni - - debian-js - - fedora-cpp: - - fedora-python - - ubuntu-cpp: - - ubuntu-c-glib: - - ubuntu-ruby - - ubuntu-lint - - ubuntu-python: - - ubuntu-docs - - ubuntu-python-sdist-test - - ubuntu-r - - ubuntu-cuda-cpp: - - ubuntu-cuda-python - - ubuntu-csharp - - ubuntu-cpp-sanitizer - - ubuntu-cpp-thread-sanitizer - - ubuntu-r-sanitizer - - python-sdist - - r - # helper services - - impala - - postgres - - python-wheel-manylinux-2010 - - python-wheel-manylinux-2014 - - python-wheel-manylinux-test-imports - - python-wheel-manylinux-test-unittests - - python-wheel-windows-vs2017 - - python-wheel-windows-test - -volumes: - conda-ccache: - name: ${ARCH}-conda-ccache - debian-ccache: - name: ${ARCH}-debian-${DEBIAN}-ccache - ubuntu-ccache: - name: ${ARCH}-ubuntu-${UBUNTU}-ccache - fedora-ccache: - name: ${ARCH}-fedora-${FEDORA}-ccache - debian-rust: - name: ${ARCH}-debian-${DEBIAN}-rust - maven-cache: - name: maven-cache - python-wheel-manylinux2010-ccache: - name: python-wheel-manylinux2010-ccache - python-wheel-manylinux2014-ccache: - name: python-wheel-manylinux2014-ccache - python-wheel-windows-clcache: - name: python-wheel-windows-clcache - -services: - - ################################# C++ ####################################### - # Release build: - # docker-compose run -e ARROW_BUILD_TYPE=release conda-cpp|debian-cpp|... - # Shared only: - # docker-compose run -e ARROW_BUILD_STATIC=OFF conda-cpp|debian-cpp|... - # Static only: - # docker-compose run \ - # -e ARROW_BUILD_SHARED=OFF \ - # -e ARROW_TEST_LINKAGE=static \ - # conda-cpp|debian-cpp|... - - conda: - # Base image for conda builds. - # - # Usage: - # docker-compose build con - # docker-compose run --rm conda - # Parameters: - # ARCH: amd64, arm32v7 - image: ${REPO}:${ARCH}-conda - build: - context: . - dockerfile: ci/docker/conda.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda - args: - arch: ${ARCH} - prefix: /opt/conda - volumes: - - .:/arrow:delegated - - conda-cpp: - # C++ build in conda environment, including the doxygen docs. - # - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose run --rm conda-cpp - # Parameters: - # ARCH: amd64, arm32v7 - image: ${REPO}:${ARCH}-conda-cpp - build: - context: . - dockerfile: ci/docker/conda-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-cpp - args: - repo: ${REPO} - arch: ${ARCH} - shm_size: &shm-size 2G - ulimits: &ulimits - core: ${ULIMIT_CORE} - environment: - <<: *ccache - ARROW_BUILD_BENCHMARKS: "ON" - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_MIMALLOC: "ON" - ARROW_USE_LD_GOLD: "ON" - ARROW_USE_PRECOMPILED_HEADERS: "ON" - volumes: &conda-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}conda-ccache:/ccache:delegated - command: &cpp-conda-command - ["/arrow/ci/scripts/cpp_build.sh /arrow /build true && - /arrow/ci/scripts/cpp_test.sh /arrow /build"] - - conda-cpp-valgrind: - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose run --rm conda-cpp-valgrind - # Parameters: - # ARCH: amd64, arm32v7 - image: ${REPO}:${ARCH}-conda-cpp - build: - context: . 
- dockerfile: ci/docker/conda-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-cpp - args: - repo: ${REPO} - arch: ${ARCH} - prefix: /opt/conda - shm_size: *shm-size - environment: - <<: *ccache - ARROW_CXXFLAGS: "-Og" # Shrink test runtime by enabling minimal optimizations - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_FLIGHT: "OFF" - ARROW_GANDIVA: "OFF" - ARROW_JEMALLOC: "OFF" - ARROW_RUNTIME_SIMD_LEVEL: "AVX2" # AVX512 not supported by Valgrind (ARROW-9851) - ARROW_S3: "OFF" - ARROW_TEST_MEMCHECK: "ON" - ARROW_USE_LD_GOLD: "ON" - BUILD_WARNING_LEVEL: "PRODUCTION" - volumes: *conda-volumes - command: *cpp-conda-command - - debian-cpp: - # Usage: - # docker-compose build debian-cpp - # docker-compose run --rm debian-cpp - # Parameters: - # ARCH: amd64, arm64v8, ... - # DEBIAN: 9, 10 - image: ${REPO}:${ARCH}-debian-${DEBIAN}-cpp - build: - context: . - dockerfile: ci/docker/debian-${DEBIAN}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-${DEBIAN}-cpp - args: - arch: ${ARCH} - llvm: ${LLVM} - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_MIMALLOC: "ON" - volumes: &debian-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}debian-ccache:/ccache:delegated - command: &cpp-command > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/cpp_test.sh /arrow /build" - - ubuntu-cpp: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose run --rm ubuntu-cpp - # Parameters: - # ARCH: amd64, arm64v8, s390x, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - build: - context: . - dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - args: - arch: ${ARCH} - base: "${ARCH}/ubuntu:${UBUNTU}" - clang_tools: ${CLANG_TOOLS} - llvm: ${LLVM} - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_MIMALLOC: "ON" - volumes: &ubuntu-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}ubuntu-ccache:/ccache:delegated - command: *cpp-command - - ubuntu-cuda-cpp: - # Usage: - # docker-compose build cuda-cpp - # docker-compose run --rm cuda-cpp - # Also need to edit the host docker configuration as follows: - # https://github.com/docker/compose/issues/6691#issuecomment-561504928 - # Parameters: - # ARCH: amd64 - # CUDA: 9.1, 10.0, 10.1 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cuda-${CUDA}-cpp - build: - context: . - dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cuda-${CUDA}-cpp - args: - arch: ${ARCH} - base: nvidia/cuda:${CUDA}-devel-ubuntu${UBUNTU} - clang_tools: ${CLANG_TOOLS} - llvm: ${LLVM} - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_CUDA: "ON" - volumes: *ubuntu-volumes - command: *cpp-command - - ubuntu-cpp-sanitizer: - # Usage: - # docker-compose build ubuntu-cpp-sanitizer - # docker-compose run --rm ubuntu-cpp-sanitizer - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - cap_add: - # For LeakSanitizer - - SYS_PTRACE - build: - context: . 
- dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - args: - arch: ${ARCH} - clang_tools: ${CLANG_TOOLS} - llvm: ${LLVM} - shm_size: *shm-size - volumes: *ubuntu-volumes - environment: - <<: *ccache - CC: clang-${CLANG_TOOLS} - CXX: clang++-${CLANG_TOOLS} - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_FUZZING: "ON" # Check fuzz regressions - ARROW_JEMALLOC: "OFF" - ARROW_ORC: "OFF" - ARROW_S3: "OFF" - ARROW_USE_ASAN: "ON" - ARROW_USE_UBSAN: "ON" - # utf8proc 2.1.0 in Ubuntu Bionic has test failures - utf8proc_SOURCE: "BUNDLED" - command: *cpp-command - - ubuntu-cpp-thread-sanitizer: - # Usage: - # docker-compose build ubuntu-cpp-thread-sanitizer - # docker-compose run --rm ubuntu-cpp-thread-sanitizer - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - build: - context: . - dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - args: - arch: ${ARCH} - clang_tools: ${CLANG_TOOLS} - llvm: ${LLVM} - shm_size: *shm-size - volumes: *ubuntu-volumes - environment: - <<: *ccache - CC: clang-${CLANG_TOOLS} - CXX: clang++-${CLANG_TOOLS} - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_DATASET: "ON" - ARROW_JEMALLOC: "OFF" - ARROW_ORC: "OFF" - ARROW_S3: "OFF" - ARROW_USE_TSAN: "ON" - command: *cpp-command - - fedora-cpp: - # Usage: - # docker-compose build fedora-cpp - # docker-compose run --rm fedora-cpp - # Parameters: - # ARCH: amd64, arm64v8, ... - # FEDORA: 33 - image: ${REPO}:${ARCH}-fedora-${FEDORA}-cpp - build: - context: . - dockerfile: ci/docker/fedora-${FEDORA}-cpp.dockerfile - cache_from: - - ${REPO}:${ARCH}-fedora-${FEDORA}-cpp - args: - arch: ${ARCH} - llvm: ${LLVM} - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_ENABLE_TIMING_TESTS: # inherit - ARROW_MIMALLOC: "ON" - volumes: &fedora-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}fedora-ccache:/ccache:delegated - command: *cpp-command - - ############################### C GLib ###################################### - - debian-c-glib: - # Usage: - # docker-compose build debian-cpp - # docker-compose build debian-c-glib - # docker-compose run --rm debian-c-glib - # Parameters: - # ARCH: amd64, arm64v8, ... - # DEBIAN: 9, 10 - image: ${REPO}:${ARCH}-debian-${DEBIAN}-c-glib - build: - context: . - dockerfile: ci/docker/linux-apt-c-glib.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-${DEBIAN}-c-glib - args: - base: ${REPO}:${ARCH}-debian-${DEBIAN}-cpp - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_GLIB_GTK_DOC: "true" - volumes: *debian-volumes - command: &c-glib-command > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/c_glib_build.sh /arrow /build && - /arrow/ci/scripts/c_glib_test.sh /arrow /build" - - ubuntu-c-glib: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-c-glib - # docker-compose run --rm ubuntu-c-glib - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-c-glib - build: - context: . 
- dockerfile: ci/docker/linux-apt-c-glib.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-c-glib - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - ARROW_GLIB_GTK_DOC: "true" - volumes: *ubuntu-volumes - command: *c-glib-command - - ############################### Ruby ######################################## - # Until Ruby is the only dependent implementation on top of C Glib we can - # test C Glib and Ruby in one pass. This is an optimization to avoid - # redundant (one for C GLib and one for Ruby doing the same work twice) - # builds on CI services. - - debian-ruby: - # Usage: - # docker-compose build debian-cpp - # docker-compose build debian-c-glib - # docker-compose build debian-ruby - # docker-compose run --rm debian-ruby - # Parameters: - # ARCH: amd64, arm64v8, ... - # DEBIAN: 9, 10 - image: ${REPO}:${ARCH}-debian-${DEBIAN}-ruby - build: - context: . - dockerfile: ci/docker/linux-apt-ruby.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-${DEBIAN}-ruby - args: - base: ${REPO}:${ARCH}-debian-${DEBIAN}-c-glib - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - volumes: *debian-volumes - command: &ruby-command > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/c_glib_build.sh /arrow /build && - /arrow/ci/scripts/c_glib_test.sh /arrow /build && - /arrow/ci/scripts/ruby_test.sh /arrow /build" - - ubuntu-ruby: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-c-glib - # docker-compose build ubuntu-ruby - # docker-compose run --rm ubuntu-ruby - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-ruby - build: - context: . - dockerfile: ci/docker/linux-apt-ruby.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-ruby - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-c-glib - shm_size: *shm-size - ulimits: *ulimits - environment: - <<: *ccache - volumes: *ubuntu-volumes - command: *ruby-command - - ############################### Python ###################################### - - conda-python: - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose run --rm conda-python - # Parameters: - # ARCH: amd64, arm32v7 - # PYTHON: 3.6, 3.7, 3.8, 3.9 - image: ${REPO}:${ARCH}-conda-python-${PYTHON} - build: - context: . - dockerfile: ci/docker/conda-python.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-volumes - command: &python-conda-command - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow"] - - ubuntu-cuda-python: - # Usage: - # docker-compose build cuda-cpp - # docker-compose build cuda-python - # docker-compose run --rm cuda-python - # Parameters: - # ARCH: amd64 - # CUDA: 8.0, 10.0, ... - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cuda-${CUDA}-python-3 - build: - context: . 
- dockerfile: ci/docker/linux-apt-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cuda-${CUDA}-python-3 - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cuda-${CUDA}-cpp - shm_size: *shm-size - environment: - <<: *ccache - ARROW_CUDA: "ON" - volumes: *ubuntu-volumes - command: &python-command > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow" - - debian-python: - # Usage: - # docker-compose build debian-cpp - # docker-compose build debian-python - # docker-compose run --rm debian-python - # Parameters: - # ARCH: amd64, arm64v8, ... - # DEBIAN: 9, 10 - image: ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 - build: - context: . - dockerfile: ci/docker/linux-apt-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 - args: - base: ${REPO}:${ARCH}-debian-${DEBIAN}-cpp - shm_size: *shm-size - environment: - <<: *ccache - volumes: *debian-volumes - command: *python-command - - ubuntu-python: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose run --rm ubuntu-python - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - build: - context: . - dockerfile: ci/docker/linux-apt-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - shm_size: *shm-size - environment: - <<: *ccache - volumes: *ubuntu-volumes - command: *python-command - - fedora-python: - # Usage: - # docker-compose build fedora-cpp - # docker-compose build fedora-python - # docker-compose run --rm fedora-python - # Parameters: - # ARCH: amd64, arm64v8, ... - # FEDORA: 33 - image: ${REPO}:${ARCH}-fedora-${FEDORA}-python-3 - build: - context: . - dockerfile: ci/docker/linux-dnf-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-fedora-${FEDORA}-python-3 - args: - base: ${REPO}:${ARCH}-fedora-${FEDORA}-cpp - shm_size: *shm-size - environment: - <<: *ccache - volumes: *fedora-volumes - command: *python-command - - ############################ Python sdist ################################### - - python-sdist: - # Usage: - # docker-compose build python-sdist - # docker-compose run --rm python-sdist - # Parameters: - # PYARROW_VERSION: The pyarrow version for sdist such as "3.0.0" - image: ${REPO}:python-sdist - build: - context: . - dockerfile: ci/docker/python-sdist.dockerfile - cache_from: - - ${REPO}:python-sdist - environment: - PYARROW_VERSION: ${PYARROW_VERSION:-} - volumes: - - .:/arrow:delegated - command: /arrow/ci/scripts/python_sdist_build.sh /arrow - - ubuntu-python-sdist-test: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python-sdist-test - # docker-compose run --rm ubuntu-python-sdist-test - # Parameters: - # ARCH: amd64, arm64v8, ... - # PYARROW_VERSION: The test target pyarrow version such as "3.0.0" - # UBUNTU: 18.04, 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - build: - context: . 
- dockerfile: ci/docker/linux-apt-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - shm_size: *shm-size - environment: - <<: *ccache - PYARROW_VERSION: ${PYARROW_VERSION:-} - volumes: *ubuntu-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_sdist_test.sh /arrow" - - ############################ Python wheels ################################## - - # See available versions at: - # https://quay.io/repository/pypa/manylinux2010_x86_64?tab=tags - # only amd64 arch is supported - python-wheel-manylinux-2010: - image: ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2010 - build: - args: - arch_alias: ${ARCH_ALIAS} - arch_short_alias: ${ARCH_SHORT_ALIAS} - base: quay.io/pypa/manylinux2010_${ARCH_ALIAS}:2020-12-03-912b0de - vcpkg: ${VCPKG} - python: ${PYTHON} - context: . - dockerfile: ci/docker/python-wheel-manylinux-201x.dockerfile - cache_from: - - ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2010 - environment: - <<: *ccache - MANYLINUX_VERSION: 2010 - volumes: - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}python-wheel-manylinux2010-ccache:/ccache:delegated - command: /arrow/ci/scripts/python_wheel_manylinux_build.sh - - # See available versions at: - # https://quay.io/repository/pypa/manylinux2014_x86_64?tab=tags - python-wheel-manylinux-2014: - image: ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2014 - build: - args: - arch_alias: ${ARCH_ALIAS} - arch_short_alias: ${ARCH_SHORT_ALIAS} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2020-11-11-bc8ce45 - vcpkg: ${VCPKG} - python: ${PYTHON} - context: . - dockerfile: ci/docker/python-wheel-manylinux-201x.dockerfile - cache_from: - - ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-2014 - environment: - <<: *ccache - MANYLINUX_VERSION: 2014 - volumes: - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}python-wheel-manylinux2014-ccache:/ccache:delegated - command: /arrow/ci/scripts/python_wheel_manylinux_build.sh - - python-wheel-manylinux-test-imports: - image: ${ARCH}/python:${PYTHON} - shm_size: 2G - volumes: - - .:/arrow:delegated - command: /arrow/ci/scripts/python_wheel_manylinux_test.sh imports - - python-wheel-manylinux-test-unittests: - image: ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-test - build: - args: - arch: ${ARCH} - python: ${PYTHON} - context: . - dockerfile: ci/docker/python-wheel-manylinux-test.dockerfile - cache_from: - - ${REPO}:${ARCH}-python-${PYTHON}-wheel-manylinux-test - shm_size: 2G - volumes: - - .:/arrow:delegated - command: /arrow/ci/scripts/python_wheel_manylinux_test.sh unittests - - python-wheel-windows-vs2017: - image: ${REPO}:python-${PYTHON}-wheel-windows-vs2017 - build: - args: - vcpkg: ${VCPKG} - python: ${PYTHON} - context: . - dockerfile: ci/docker/python-wheel-windows-vs2017.dockerfile - # This should make the pushed images reusable, but the image gets rebuilt. - # Uncomment if no local cache is available. - # cache_from: - # - mcr.microsoft.com/windows/servercore:ltsc2019 - # - ${REPO}:wheel-windows-vs2017 - volumes: - - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - - type: bind - source: . - target: "C:/arrow" - command: arrow\\ci\\scripts\\python_wheel_windows_build.bat - - python-wheel-windows-test: - image: python:${PYTHON}-windowsservercore-1809 - volumes: - - type: bind - source: . 
- target: "C:/arrow" - command: arrow\\ci\\scripts\\python_wheel_windows_test.bat - - ############################## Integration ################################# - - conda-python-pandas: - # Possible $PANDAS parameters: - # - `latest`: latest release - # - `master`: git master branch, use `docker-compose run --no-cache` - # - ``: specific version available on conda-forge - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-pandas - # docker-compose run --rm conda-python-pandas - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} - build: - context: . - dockerfile: ci/docker/conda-python-pandas.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - numpy: ${NUMPY} - pandas: ${PANDAS} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-volumes - command: *python-conda-command - - conda-python-dask: - # Possible $DASK parameters: - # - `latest`: latest release - # - `master`: git master branch, use `docker-compose run --no-cache` - # - ``: specific version available on conda-forge - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-dask - # docker-compose run --rm conda-python-dask - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-dask-${DASK} - build: - context: . - dockerfile: ci/docker/conda-python-dask.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-dask-${DASK} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - dask: ${DASK} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/integration_dask.sh"] - - conda-python-jpype: - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-jpype - # docker-compose run --rm conda-python-jpype - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-jpype - build: - context: . - dockerfile: ci/docker/conda-python-jpype.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-jpype - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - shm_size: *shm-size - environment: - <<: *ccache - ARROW_FLIGHT: "OFF" - ARROW_GANDIVA: "OFF" - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/java_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow"] - - conda-python-turbodbc: - # Possible $TURBODBC parameters: - # - `latest`: latest release - # - `master`: git master branch, use `docker-compose run --no-cache` - # - ``: specific version available under github releases - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-turbodbc - # docker-compose run --rm conda-python-turbodbc - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-turbodbc-${TURBODBC} - build: - context: . 
- dockerfile: ci/docker/conda-python-turbodbc.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-turbodbc-${TURBODBC} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - turbodbc: ${TURBODBC} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/integration_turbodbc.sh /turbodbc /build"] - - conda-python-kartothek: - # Possible $KARTOTHEK parameters: - # - `latest`: latest release - # - `master`: git master branch, use `docker-compose run --no-cache` - # - ``: specific version available under github releases - # Usage: - # docker-compose build conda - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-kartothek - # docker-compose run --rm conda-python-kartothek - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-kartothek-${KARTOTHEK} - build: - context: . - dockerfile: ci/docker/conda-python-kartothek.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-kartothek-${KARTOTHEK} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - kartothek: ${KARTOTHEK} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/integration_kartothek.sh /kartothek /build"] - - ################################## R ######################################## - - ubuntu-r: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-r - # docker-compose run ubuntu-r - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-r-${R} - build: - context: . - dockerfile: ci/docker/linux-apt-r.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-r-${R} - args: - arch: ${ARCH} - r: ${R} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - shm_size: *shm-size - environment: - <<: *ccache - ARROW_R_CXXFLAGS: '-Werror' - LIBARROW_BUILD: 'false' - NOT_CRAN: 'true' - volumes: *ubuntu-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/r_test.sh /arrow" - - r: - # This lets you test building/installing the arrow R package - # (including building the C++ library) on any Docker image that contains R - # - # Usage: - # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r - # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r - image: ${REPO}:r-${R_ORG}-${R_IMAGE}-${R_TAG} - build: - context: . 
- dockerfile: ci/docker/linux-r.dockerfile - cache_from: - - ${REPO}:r-${R_ORG}-${R_IMAGE}-${R_TAG} - args: - base: ${R_ORG}/${R_IMAGE}:${R_TAG} - r_dev: ${ARROW_R_DEV} - devtoolset_version: ${DEVTOOLSET_VERSION} - shm_size: *shm-size - environment: - LIBARROW_DOWNLOAD: "false" - ARROW_SOURCE_HOME: "/arrow" - ARROW_R_DEV: ${ARROW_R_DEV} - # To test for CRAN release, delete ^^ these two env vars so we download the Apache release - ARROW_USE_PKG_CONFIG: "false" - devtoolset_version: ${DEVTOOLSET_VERSION} - volumes: - - .:/arrow:delegated - command: > - /bin/bash -c "/arrow/ci/scripts/r_test.sh /arrow" - - ubuntu-r-sanitizer: - # Only 18.04 and amd64 supported - # Usage: - # docker-compose build ubuntu-r-sanitizer - # docker-compose run ubuntu-r-sanitizer - image: ${REPO}:amd64-ubuntu-18.04-r-sanitizer - cap_add: - # LeakSanitizer and gdb requires ptrace(2) - - SYS_PTRACE - build: - context: . - dockerfile: ci/docker/linux-r.dockerfile - cache_from: - - ${REPO}:amd64-ubuntu-18.04-r-sanitizer - args: - base: wch1/r-debug:latest - r_bin: RDsan - environment: - <<: *ccache - volumes: *ubuntu-volumes - command: > - /bin/bash -c " - /arrow/ci/scripts/r_sanitize.sh /arrow" - - ################################# Go ######################################## - - debian-go: - # Usage: - # docker-compose build debian-go - # docker-compose run debian-go - image: ${REPO}:${ARCH}-debian-10-go-${GO} - build: - context: . - dockerfile: ci/docker/debian-10-go.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-10-go-${GO} - args: - arch: ${ARCH} - go: ${GO} - shm_size: *shm-size - volumes: *debian-volumes - command: &go-command > - /bin/bash -c " - /arrow/ci/scripts/go_build.sh /arrow && - /arrow/ci/scripts/go_test.sh /arrow" - - ############################# JavaScript #################################### - - debian-js: - # Usage: - # docker-compose build debian-js - # docker-compose run debian-js - image: ${REPO}:${ARCH}-debian-10-js-${NODE} - build: - context: . - dockerfile: ci/docker/debian-10-js.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-10-js-${NODE} - args: - arch: ${ARCH} - node: ${NODE} - shm_size: *shm-size - volumes: *debian-volumes - command: &js-command > - /bin/bash -c " - /arrow/ci/scripts/js_build.sh /arrow && - /arrow/ci/scripts/js_test.sh /arrow" - - #################################### C# ##################################### - - ubuntu-csharp: - # Usage: - # docker-compose build ubuntu-csharp - # docker-compose run ubuntu-csharp - image: ${REPO}:${ARCH}-ubuntu-18.04-csharp-${DOTNET} - build: - context: . - dockerfile: ci/docker/ubuntu-18.04-csharp.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-18.04-csharp-${DOTNET} - args: - dotnet: ${DOTNET} - platform: bionic # use bionic-arm64v8 for ARM - shm_size: *shm-size - volumes: *ubuntu-volumes - command: &csharp-command > - /bin/bash -c " - /arrow/ci/scripts/csharp_build.sh /arrow && - /arrow/ci/scripts/csharp_test.sh /arrow && - /arrow/ci/scripts/csharp_pack.sh /arrow" - - ################################ Java ####################################### - - debian-java: - # Usage: - # docker-compose build debian-java - # docker-compose run debian-java - image: ${REPO}:${ARCH}-debian-9-java-${JDK}-maven-${MAVEN} - build: - context: . 
- dockerfile: ci/docker/debian-9-java.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-9-java-${JDK}-maven-${MAVEN} - args: - arch: ${ARCH} - jdk: ${JDK} - maven: ${MAVEN} - shm_size: *shm-size - volumes: &java-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated - command: &java-command > - /bin/bash -c " - /arrow/ci/scripts/java_build.sh /arrow /build && - /arrow/ci/scripts/java_test.sh /arrow /build" - - debian-java-jni: - # Includes plasma test and jni for gandiva and orc. - # Usage: - # docker-compose build debian-java - # docker-compose build debian-java-jni - # docker-compose run debian-java-jni - image: ${REPO}:${ARCH}-debian-9-java-jni - build: - context: . - dockerfile: ci/docker/linux-apt-jni.dockerfile - cache_from: - - ${REPO}:${ARCH}-debian-9-java-jni - args: - base: ${REPO}:${ARCH}-debian-9-java-${JDK}-maven-${MAVEN} - llvm: ${LLVM} - shm_size: *shm-size - environment: - <<: *ccache - volumes: - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated - - ${DOCKER_VOLUME_PREFIX}debian-ccache:/ccache:delegated - command: - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/java_build.sh /arrow /build && - /arrow/ci/scripts/java_test.sh /arrow /build" - - ############################## Integration ################################## - - conda-integration: - # Usage: - # docker-compose build conda-cpp - # docker-compose build conda-integration - # docker-compose run conda-integration - image: ${REPO}:${ARCH}-conda-integration - build: - context: . - dockerfile: ci/docker/conda-integration.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-integration - args: - repo: ${REPO} - arch: ${ARCH} - jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 - node: ${NODE} - go: ${GO} - volumes: *conda-volumes - environment: - <<: *ccache - # tell archery where the arrow binaries are located - ARROW_CPP_EXE_PATH: /build/cpp/debug - command: - ["/arrow/ci/scripts/rust_build.sh /arrow /build && - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/go_build.sh /arrow && - /arrow/ci/scripts/java_build.sh /arrow /build && - /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/integration_arrow.sh /arrow /build"] - - ################################ Docs ####################################### - - ubuntu-docs: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose build ubuntu-docs - # docker-compose run --rm ubuntu-docs - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs - build: - context: . 
- dockerfile: ci/docker/linux-apt-docs.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs - args: - jdk: ${JDK} - node: ${NODE} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - environment: - <<: *ccache - ARROW_CUDA: "ON" - ARROW_GLIB_GTK_DOC: "true" - volumes: *ubuntu-volumes - command: &docs-command > - /bin/bash -c " - /arrow/ci/scripts/cpp_build.sh /arrow /build true && - /arrow/ci/scripts/c_glib_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/java_build.sh /arrow /build true && - /arrow/ci/scripts/js_build.sh /arrow true && - /arrow/ci/scripts/r_build.sh /arrow true && - /arrow/ci/scripts/docs_build.sh /arrow /build" - - ################################# Tools ##################################### - - ubuntu-lint: - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-lint - # docker-compose run ubuntu-lint - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-lint - build: - context: . - dockerfile: ci/docker/linux-apt-lint.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-lint - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - clang_tools: ${CLANG_TOOLS} - rust: ${RUST} - environment: - <<: *ccache - volumes: *ubuntu-volumes - command: > - /bin/bash -c " - pip install -e /arrow/dev/archery && - archery lint --all --no-clang-tidy --no-iwyu --no-numpydoc" - - ######################### Integration Tests ################################# - - postgres: - # required for the impala service - image: postgres - ports: - - 5432:5432 - environment: - POSTGRES_PASSWORD: postgres - - impala: - # required for the hiveserver and hdfs tests - image: ibisproject/impala:latest - hostname: impala - links: - - postgres:postgres - environment: - PGPASSWORD: postgres - ports: - # HDFS - - 9020:9020 - - 50070:50070 - - 50075:50075 - - 8020:8020 - - 8042:8042 - # Hive - - 9083:9083 - # Impala - - 21000:21000 - - 21050:21050 - - 25000:25000 - - 25010:25010 - - 25020:25020 - - conda-cpp-hiveserver2: - # Usage: - # docker-compose build conda-cpp - # docker-compose build conda-cpp-hiveserver2 - # docker-compose run conda-cpp-hiveserver2 - image: ${REPO}:${ARCH}-conda-cpp - links: - - impala:impala - environment: - <<: *ccache - ARROW_FLIGHT: "OFF" - ARROW_GANDIVA: "OFF" - ARROW_PLASMA: "OFF" - ARROW_HIVESERVER2: "ON" - ARROW_HIVESERVER2_TEST_HOST: impala - shm_size: *shm-size - volumes: *conda-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/integration_hiveserver2.sh /arrow /build"] - - conda-python-hdfs: - # Usage: - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-hdfs - # docker-compose run conda-python-hdfs - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-hdfs-${HDFS} - build: - context: . 
- dockerfile: ci/docker/conda-python-hdfs.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-hdfs-${HDFS} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 - hdfs: ${HDFS} - links: - - impala:impala - environment: - <<: *ccache - ARROW_HDFS: "ON" - ARROW_HDFS_TEST_HOST: impala - ARROW_HDFS_TEST_PORT: 8020 - ARROW_HDFS_TEST_USER: hdfs - ARROW_S3: "OFF" - CMAKE_UNITY_BUILD: "ON" - shm_size: *shm-size - volumes: &conda-maven-volumes - - .:/arrow:delegated - - ${DOCKER_VOLUME_PREFIX}maven-cache:/root/.m2:delegated - - ${DOCKER_VOLUME_PREFIX}conda-ccache:/ccache:delegated - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/integration_hdfs.sh /arrow /build"] - - conda-python-spark: - # Usage: - # docker-compose build conda-cpp - # docker-compose build conda-python - # docker-compose build conda-python-spark - # docker-compose run conda-python-spark - image: ${REPO}:${ARCH}-conda-python-${PYTHON}-spark-${SPARK} - build: - context: . - dockerfile: ci/docker/conda-python-spark.dockerfile - cache_from: - - ${REPO}:${ARCH}-conda-python-${PYTHON}-spark-${SPARK} - args: - repo: ${REPO} - arch: ${ARCH} - python: ${PYTHON} - jdk: ${JDK} - # conda-forge doesn't have 3.5.4 so pinning explicitly, but this should - # be set to ${MAVEN} - maven: 3.5 - spark: ${SPARK} - shm_size: *shm-size - environment: - <<: *ccache - volumes: *conda-maven-volumes - command: - ["/arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/java_build.sh /arrow /build && - /arrow/ci/scripts/integration_spark.sh /arrow /spark ${TEST_PYARROW_ONLY:-false}"] diff --git a/run-cmake-format.py b/run-cmake-format.py deleted file mode 100755 index 1ff103868d84b..0000000000000 --- a/run-cmake-format.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import fnmatch -import hashlib -import pathlib -import subprocess -import sys - -# Keep an explicit list of files to format as we don't want to reformat -# files we imported from other location. 
-PATTERNS = [ - 'ci/**/*.cmake', - 'cpp/CMakeLists.txt', - 'cpp/src/**/CMakeLists.txt', - 'cpp/cmake_modules/*.cmake', - 'go/**/CMakeLists.txt', - 'java/**/CMakeLists.txt', - 'matlab/**/CMakeLists.txt', -] -EXCLUDE = [ - 'cpp/cmake_modules/FindNumPy.cmake', - 'cpp/cmake_modules/FindPythonLibsNew.cmake', - 'cpp/cmake_modules/UseCython.cmake', - 'cpp/src/arrow/util/config.h.cmake', -] - -here = pathlib.Path(__file__).parent - - -def find_cmake_files(): - for pat in PATTERNS: - yield from here.glob(pat) - - -def run_cmake_format(paths): - # cmake-format is fast enough that running in parallel doesn't seem - # necessary - # autosort is off because it breaks in cmake_format 5.1 - # See: https://github.com/cheshirekow/cmake_format/issues/111 - cmd = ['cmake-format', '--in-place', '--autosort=false'] + paths - try: - subprocess.run(cmd, check=True) - except FileNotFoundError: - try: - import cmake_format - except ImportError: - raise ImportError( - "Please install cmake-format: `pip install cmake_format`") - else: - # Other error, re-raise - raise - - -def check_cmake_format(paths): - hashes = {} - for p in paths: - contents = p.read_bytes() - hashes[p] = hashlib.sha256(contents).digest() - - run_cmake_format(paths) - - # Check contents didn't change - changed = [] - for p in paths: - contents = p.read_bytes() - if hashes[p] != hashlib.sha256(contents).digest(): - changed.append(p) - - if changed: - items = "\n".join("- %s" % p for p in sorted(changed)) - print("The following cmake files need re-formatting:\n%s" % (items,)) - print() - print("Consider running `run-cmake-format.py`") - sys.exit(1) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--check', action='store_true') - parser.add_argument('paths', nargs='*', type=pathlib.Path) - args = parser.parse_args() - - paths = find_cmake_files() - if args.paths: - paths = set(paths) & set([path.resolve() for path in args.paths]) - paths = [ - path for path in paths - if path.relative_to(here).as_posix() not in EXCLUDE - ] - if args.check: - check_cmake_format(paths) - else: - run_cmake_format(paths) From 8b89427c6588db5b02b8156313c7ec69ae478737 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 10:58:27 -0600 Subject: [PATCH 010/329] duplicate DataFusion README for now to fix build --- datafusion/README.md | 356 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 datafusion/README.md diff --git a/datafusion/README.md b/datafusion/README.md new file mode 100644 index 0000000000000..9e6b7a2a78b5b --- /dev/null +++ b/datafusion/README.md @@ -0,0 +1,356 @@ + + +# DataFusion + + + +DataFusion is an extensible query execution framework, written in +Rust, that uses [Apache Arrow](https://arrow.apache.org) as its +in-memory format. + +DataFusion supports both an SQL and a DataFrame API for building +logical query plans as well as a query optimizer and execution engine +capable of parallel execution against partitioned data sources (CSV +and Parquet) using threads. + +## Use Cases + +DataFusion is used to create modern, fast and efficient data +pipelines, ETL processes, and database systems, which need the +performance of Rust and Apache Arrow and want to provide their users +the convenience of an SQL interface or a DataFrame API. + +## Why DataFusion? 
+ +* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific use case +* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. + +## Known Uses + +Here are some of the projects known to use DataFusion: + +* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform +* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) +* [Cube.js](https://github.com/cube-js/cube.js) +* [datafusion-python](https://pypi.org/project/datafusion) +* [delta-rs](https://github.com/delta-io/delta-rs) +* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database +* [ROAPI](https://github.com/roapi/roapi) + +(if you know of another project, please submit a PR to add a link!) + +## Example Usage + +Run a SQL query against data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // register the table + let mut ctx = ExecutionContext::new(); + ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; + + // execute and print results + let results: Vec<RecordBatch> = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Use the DataFrame API to process data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let mut ctx = ExecutionContext::new(); + let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + + let df = df.filter(col("a").lt_eq(col("b")))? + .aggregate(vec![col("a")], vec![min(col("b"))])? + .limit(100)?; + + // execute and print results + let results: Vec<RecordBatch> = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Both of these examples will produce + +```text ++---+--------+ +| a | MIN(b) | ++---+--------+ +| 1 | 2 | ++---+--------+ +``` + + + +## Using DataFusion as a library + +DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). + +To get started, add the following to your `Cargo.toml` file: + +```toml +[dependencies] +datafusion = "4.0.0-SNAPSHOT" +``` + +## Using DataFusion as a binary + +DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information.
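Parquet files can be registered and queried in the same way as the CSV examples above. The following is a minimal sketch, assuming `register_parquet` takes a table name and a file path (mirroring `register_csv` without the options argument); the path `data/example.parquet` and the column `a` are only illustrative.

```rust
use datafusion::prelude::*;
use arrow::util::pretty::print_batches;
use arrow::record_batch::RecordBatch;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // register a Parquet file as a table (hypothetical path)
    let mut ctx = ExecutionContext::new();
    ctx.register_parquet("example", "data/example.parquet")?;

    // run a SQL query against it, just like the CSV examples above
    let df = ctx.sql("SELECT a, COUNT(a) FROM example GROUP BY a")?;

    // execute and print results
    let results: Vec<RecordBatch> = df.collect().await?;
    print_batches(&results)?;
    Ok(())
}
```

As with CSV, the resulting scan is partitioned and executed in parallel by the query engine.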
+ +# Status + +## General + +- [x] SQL Parser +- [x] SQL Query Planner +- [x] Query Optimizer + - [x] Constant folding + - [x] Join Reordering + - [x] Limit Pushdown + - [x] Projection push down + - [x] Predicate push down +- [x] Type coercion +- [x] Parallel query execution + +## SQL Support + +- [x] Projection +- [x] Filter (WHERE) +- [x] Filter post-aggregate (HAVING) +- [x] Limit +- [x] Aggregate +- [x] Common math functions +- [x] cast +- [x] try_cast +- Postgres compatible String functions + - [x] ascii + - [x] bit_length + - [x] btrim + - [x] char_length + - [x] character_length + - [x] chr + - [x] concat + - [x] concat_ws + - [x] initcap + - [x] left + - [x] length + - [x] lpad + - [x] ltrim + - [x] octet_length + - [x] regexp_replace + - [x] repeat + - [x] replace + - [x] reverse + - [x] right + - [x] rpad + - [x] rtrim + - [x] split_part + - [x] starts_with + - [x] strpos + - [x] substr + - [x] to_hex + - [x] translate + - [x] trim +- Miscellaneous/Boolean functions + - [x] nullif +- Common date/time functions + - [ ] Basic date functions + - [ ] Basic time functions + - [x] Basic timestamp functions +- nested functions + - [x] Array of columns +- [x] Schema Queries + - [x] SHOW TABLES + - [x] SHOW COLUMNS + - [x] information_schema.{tables, columns} + - [ ] information_schema other views +- [x] Sorting +- [ ] Nested types +- [ ] Lists +- [x] Subqueries +- [x] Common table expressions +- [ ] Set Operations + - [x] UNION ALL + - [ ] UNION + - [ ] INTERSECT + - [ ] MINUS +- [x] Joins + - [x] INNER JOIN + - [ ] CROSS JOIN + - [ ] OUTER JOIN +- [ ] Window + +## Data Sources + +- [x] CSV +- [x] Parquet primitive types +- [ ] Parquet nested types + + +## Extensibility + +DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: + +- [x] User Defined Functions (UDFs) +- [x] User Defined Aggregate Functions (UDAFs) +- [x] User Defined Table Source (`TableProvider`) for tables +- [x] User Defined `Optimizer` passes (plan rewrites) +- [x] User Defined `LogicalPlan` nodes +- [x] User Defined `ExecutionPlan` nodes + + +# Supported SQL + +This library currently supports many SQL constructs, including + +* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations +* `SELECT ... FROM ...` together with any expression +* `ALIAS` to name an expression +* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` +* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. +* `WHERE` to filter +* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` +* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` + + +## Supported Functions + +DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. + +Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations. + +## Schema Metadata / Information Schema Support + +DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. 
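From Rust, the `information_schema` views are opt-in. The snippet below is a minimal sketch, assuming the `ExecutionConfig::with_information_schema` builder option is available; the table name `t` and CSV path are illustrative.

```rust
use datafusion::prelude::*;
use datafusion::execution::context::ExecutionConfig;
use arrow::util::pretty::print_batches;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // enable the information_schema views (assumed to be off by default)
    let config = ExecutionConfig::new().with_information_schema(true);
    let mut ctx = ExecutionContext::with_config(config);
    ctx.register_csv("t", "tests/example.csv", CsvReadOptions::new())?;

    // the metadata views can then be queried like any other table
    let df = ctx.sql("SELECT table_name, table_type FROM information_schema.tables")?;
    print_batches(&df.collect().await?)?;
    Ok(())
}
```

The SQL examples below show the equivalent `SHOW TABLES` and `SHOW COLUMNS` output.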
+ +More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). + + +To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: + +```sql +> show tables; ++---------------+--------------------+------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+------------+------------+ +| datafusion | public | t | BASE TABLE | +| datafusion | information_schema | tables | VIEW | ++---------------+--------------------+------------+------------+ + +> select * from information_schema.tables; + ++---------------+--------------------+------------+--------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+------------+--------------+ +| datafusion | public | t | BASE TABLE | +| datafusion | information_schema | TABLES | SYSTEM TABLE | ++---------------+--------------------+------------+--------------+ +``` + +To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: + +```sql +> show columns from t; ++---------------+--------------+------------+-------------+-----------+-------------+ +| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | ++---------------+--------------+------------+-------------+-----------+-------------+ +| datafusion | public | t | a | Int32 | NO | +| datafusion | public | t | b | Utf8 | NO | +| datafusion | public | t | c | Float32 | NO | ++---------------+--------------+------------+-------------+-----------+-------------+ + +> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; ++------------+-------------+------------------+-------------+-----------+ +| table_name | column_name | ordinal_position | is_nullable | data_type | ++------------+-------------+------------------+-------------+-----------+ +| t | a | 0 | NO | Int32 | +| t | b | 1 | NO | Utf8 | +| t | c | 2 | NO | Float32 | ++------------+-------------+------------------+-------------+-----------+ +``` + + + +## Supported Data Types + +DataFusion uses Arrow, and thus the Arrow type system, for query +execution. The SQL types from +[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) +are mapped to Arrow types according to the following table + + +| SQL Data Type | Arrow DataType | +| --------------- | -------------------------------- | +| `CHAR` | `Utf8` | +| `VARCHAR` | `Utf8` | +| `UUID` | *Not yet supported* | +| `CLOB` | *Not yet supported* | +| `BINARY` | *Not yet supported* | +| `VARBINARY` | *Not yet supported* | +| `DECIMAL` | `Float64` | +| `FLOAT` | `Float32` | +| `SMALLINT` | `Int16` | +| `INT` | `Int32` | +| `BIGINT` | `Int64` | +| `REAL` | `Float64` | +| `DOUBLE` | `Float64` | +| `BOOLEAN` | `Boolean` | +| `DATE` | `Date32` | +| `TIME` | `Time64(TimeUnit::Millisecond)` | +| `TIMESTAMP` | `Date64` | +| `INTERVAL` | *Not yet supported* | +| `REGCLASS` | *Not yet supported* | +| `TEXT` | *Not yet supported* | +| `BYTEA` | *Not yet supported* | +| `CUSTOM` | *Not yet supported* | +| `ARRAY` | *Not yet supported* | + + +# Architecture Overview + +There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. 
+ +* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) +* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) + + +# Developer's guide + +Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion. From 9d89bf2837dc34020939639eb321b1549adbd260 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 11:00:35 -0600 Subject: [PATCH 011/329] Remove format dir --- format/File.fbs | 52 ----- format/Flight.proto | 335 --------------------------------- format/Message.fbs | 140 -------------- format/README.rst | 25 --- format/Schema.fbs | 407 ---------------------------------------- format/SparseTensor.fbs | 228 ---------------------- format/Tensor.fbs | 54 ------ 7 files changed, 1241 deletions(-) delete mode 100644 format/File.fbs delete mode 100644 format/Flight.proto delete mode 100644 format/Message.fbs delete mode 100644 format/README.rst delete mode 100644 format/Schema.fbs delete mode 100644 format/SparseTensor.fbs delete mode 100644 format/Tensor.fbs diff --git a/format/File.fbs b/format/File.fbs deleted file mode 100644 index 906d494f272df..0000000000000 --- a/format/File.fbs +++ /dev/null @@ -1,52 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -include "Schema.fbs"; - -namespace org.apache.arrow.flatbuf; - -/// ---------------------------------------------------------------------- -/// Arrow File metadata -/// - -table Footer { - version: org.apache.arrow.flatbuf.MetadataVersion; - - schema: org.apache.arrow.flatbuf.Schema; - - dictionaries: [ Block ]; - - recordBatches: [ Block ]; - - /// User-defined metadata - custom_metadata: [ KeyValue ]; -} - -struct Block { - - /// Index to the start of the RecordBlock (note this is past the Message header) - offset: long; - - /// Length of the metadata - metaDataLength: int; - - /// Length of the data (this is aligned so there can be a gap between this and - /// the metadata). - bodyLength: long; -} - -root_type Footer; diff --git a/format/Flight.proto b/format/Flight.proto deleted file mode 100644 index b291d9dbd9aa3..0000000000000 --- a/format/Flight.proto +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -syntax = "proto3"; - -option java_package = "org.apache.arrow.flight.impl"; -option go_package = "github.com/apache/arrow/go/flight;flight"; -option csharp_namespace = "Apache.Arrow.Flight.Protocol"; - -package arrow.flight.protocol; - -/* - * A flight service is an endpoint for retrieving or storing Arrow data. A - * flight service can expose one or more predefined endpoints that can be - * accessed using the Arrow Flight Protocol. Additionally, a flight service - * can expose a set of actions that are available. - */ -service FlightService { - - /* - * Handshake between client and server. Depending on the server, the - * handshake may be required to determine the token that should be used for - * future operations. Both request and response are streams to allow multiple - * round-trips depending on auth mechanism. - */ - rpc Handshake(stream HandshakeRequest) returns (stream HandshakeResponse) {} - - /* - * Get a list of available streams given a particular criteria. Most flight - * services will expose one or more streams that are readily available for - * retrieval. This api allows listing the streams available for - * consumption. A user can also provide a criteria. The criteria can limit - * the subset of streams that can be listed via this interface. Each flight - * service allows its own definition of how to consume criteria. - */ - rpc ListFlights(Criteria) returns (stream FlightInfo) {} - - /* - * For a given FlightDescriptor, get information about how the flight can be - * consumed. This is a useful interface if the consumer of the interface - * already can identify the specific flight to consume. This interface can - * also allow a consumer to generate a flight stream through a specified - * descriptor. For example, a flight descriptor might be something that - * includes a SQL statement or a Pickled Python operation that will be - * executed. In those cases, the descriptor will not be previously available - * within the list of available streams provided by ListFlights but will be - * available for consumption for the duration defined by the specific flight - * service. - */ - rpc GetFlightInfo(FlightDescriptor) returns (FlightInfo) {} - - /* - * For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema - * This is used when a consumer needs the Schema of flight stream. Similar to - * GetFlightInfo this interface may generate a new flight that was not previously - * available in ListFlights. - */ - rpc GetSchema(FlightDescriptor) returns (SchemaResult) {} - - /* - * Retrieve a single stream associated with a particular descriptor - * associated with the referenced ticket. A Flight can be composed of one or - * more streams where each stream can be retrieved using a separate opaque - * ticket that the flight service uses for managing a collection of streams. - */ - rpc DoGet(Ticket) returns (stream FlightData) {} - - /* - * Push a stream to the flight service associated with a particular - * flight stream. This allows a client of a flight service to upload a stream - * of data. Depending on the particular flight service, a client consumer - * could be allowed to upload a single stream per descriptor or an unlimited - * number. 
In the latter, the service might implement a 'seal' action that - * can be applied to a descriptor once all streams are uploaded. - */ - rpc DoPut(stream FlightData) returns (stream PutResult) {} - - /* - * Open a bidirectional data channel for a given descriptor. This - * allows clients to send and receive arbitrary Arrow data and - * application-specific metadata in a single logical stream. In - * contrast to DoGet/DoPut, this is more suited for clients - * offloading computation (rather than storage) to a Flight service. - */ - rpc DoExchange(stream FlightData) returns (stream FlightData) {} - - /* - * Flight services can support an arbitrary number of simple actions in - * addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut - * operations that are potentially available. DoAction allows a flight client - * to do a specific action against a flight service. An action includes - * opaque request and response objects that are specific to the type action - * being undertaken. - */ - rpc DoAction(Action) returns (stream Result) {} - - /* - * A flight service exposes all of the available action types that it has - * along with descriptions. This allows different flight consumers to - * understand the capabilities of the flight service. - */ - rpc ListActions(Empty) returns (stream ActionType) {} - -} - -/* - * The request that a client provides to a server on handshake. - */ -message HandshakeRequest { - - /* - * A defined protocol version - */ - uint64 protocol_version = 1; - - /* - * Arbitrary auth/handshake info. - */ - bytes payload = 2; -} - -message HandshakeResponse { - - /* - * A defined protocol version - */ - uint64 protocol_version = 1; - - /* - * Arbitrary auth/handshake info. - */ - bytes payload = 2; -} - -/* - * A message for doing simple auth. - */ -message BasicAuth { - string username = 2; - string password = 3; -} - -message Empty {} - -/* - * Describes an available action, including both the name used for execution - * along with a short description of the purpose of the action. - */ -message ActionType { - string type = 1; - string description = 2; -} - -/* - * A service specific expression that can be used to return a limited set - * of available Arrow Flight streams. - */ -message Criteria { - bytes expression = 1; -} - -/* - * An opaque action specific for the service. - */ -message Action { - string type = 1; - bytes body = 2; -} - -/* - * An opaque result returned after executing an action. - */ -message Result { - bytes body = 1; -} - -/* - * Wrap the result of a getSchema call - */ -message SchemaResult { - // schema of the dataset as described in Schema.fbs::Schema. - bytes schema = 1; -} - -/* - * The name or tag for a Flight. May be used as a way to retrieve or generate - * a flight or be used to expose a set of previously defined flights. - */ -message FlightDescriptor { - - /* - * Describes what type of descriptor is defined. - */ - enum DescriptorType { - - // Protobuf pattern, not used. - UNKNOWN = 0; - - /* - * A named path that identifies a dataset. A path is composed of a string - * or list of strings describing a particular dataset. This is conceptually - * similar to a path inside a filesystem. - */ - PATH = 1; - - /* - * An opaque command to generate a dataset. - */ - CMD = 2; - } - - DescriptorType type = 1; - - /* - * Opaque value used to express a command. Should only be defined when - * type = CMD. - */ - bytes cmd = 2; - - /* - * List of strings identifying a particular dataset. Should only be defined - * when type = PATH. 
- */ - repeated string path = 3; -} - -/* - * The access coordinates for retrieval of a dataset. With a FlightInfo, a - * consumer is able to determine how to retrieve a dataset. - */ -message FlightInfo { - // schema of the dataset as described in Schema.fbs::Schema. - bytes schema = 1; - - /* - * The descriptor associated with this info. - */ - FlightDescriptor flight_descriptor = 2; - - /* - * A list of endpoints associated with the flight. To consume the whole - * flight, all endpoints must be consumed. - */ - repeated FlightEndpoint endpoint = 3; - - // Set these to -1 if unknown. - int64 total_records = 4; - int64 total_bytes = 5; -} - -/* - * A particular stream or split associated with a flight. - */ -message FlightEndpoint { - - /* - * Token used to retrieve this stream. - */ - Ticket ticket = 1; - - /* - * A list of URIs where this ticket can be redeemed. If the list is - * empty, the expectation is that the ticket can only be redeemed on the - * current service where the ticket was generated. - */ - repeated Location location = 2; -} - -/* - * A location where a Flight service will accept retrieval of a particular - * stream given a ticket. - */ -message Location { - string uri = 1; -} - -/* - * An opaque identifier that the service can use to retrieve a particular - * portion of a stream. - */ -message Ticket { - bytes ticket = 1; -} - -/* - * A batch of Arrow data as part of a stream of batches. - */ -message FlightData { - - /* - * The descriptor of the data. This is only relevant when a client is - * starting a new DoPut stream. - */ - FlightDescriptor flight_descriptor = 1; - - /* - * Header for message data as described in Message.fbs::Message. - */ - bytes data_header = 2; - - /* - * Application-defined metadata. - */ - bytes app_metadata = 3; - - /* - * The actual batch of Arrow data. Preferably handled with minimal-copies - * coming last in the definition to help with sidecar patterns (it is - * expected that some implementations will fetch this field off the wire - * with specialized code to avoid extra memory copies). - */ - bytes data_body = 1000; -} - -/** - * The response message associated with the submission of a DoPut. - */ -message PutResult { - bytes app_metadata = 1; -} diff --git a/format/Message.fbs b/format/Message.fbs deleted file mode 100644 index f1c18d765d465..0000000000000 --- a/format/Message.fbs +++ /dev/null @@ -1,140 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -include "Schema.fbs"; -include "SparseTensor.fbs"; -include "Tensor.fbs"; - -namespace org.apache.arrow.flatbuf; - -/// ---------------------------------------------------------------------- -/// Data structures for describing a table row batch (a collection of -/// equal-length Arrow arrays) - -/// Metadata about a field at some level of a nested type tree (but not -/// its children). -/// -/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` -/// would have {length: 5, null_count: 2} for its List node, and {length: 6, -/// null_count: 0} for its Int16 node, as separate FieldNode structs -struct FieldNode { - /// The number of value slots in the Arrow array at this level of a nested - /// tree - length: long; - - /// The number of observed nulls. Fields with null_count == 0 may choose not - /// to write their physical validity bitmap out as a materialized buffer, - /// instead setting the length of the bitmap buffer to 0. - null_count: long; -} - -enum CompressionType:byte { - // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers - // thereof. Not to be confused with "raw" (also called "block") format - // provided by lz4.h - LZ4_FRAME, - - // Zstandard - ZSTD -} - -/// Provided for forward compatibility in case we need to support different -/// strategies for compressing the IPC message body (like whole-body -/// compression rather than buffer-level) in the future -enum BodyCompressionMethod:byte { - /// Each constituent buffer is first compressed with the indicated - /// compressor, and then written with the uncompressed length in the first 8 - /// bytes as a 64-bit little-endian signed integer followed by the compressed - /// buffer bytes (and then padding as required by the protocol). The - /// uncompressed length may be set to -1 to indicate that the data that - /// follows is not compressed, which can be useful for cases where - /// compression does not yield appreciable savings. - BUFFER -} - -/// Optional compression for the memory buffers constituting IPC message -/// bodies. Intended for use with RecordBatch but could be used for other -/// message types -table BodyCompression { - /// Compressor library - codec: CompressionType = LZ4_FRAME; - - /// Indicates the way the record batch body was compressed - method: BodyCompressionMethod = BUFFER; -} - -/// A data header describing the shared memory layout of a "record" or "row" -/// batch. Some systems call this a "row batch" internally and others a "record -/// batch". -table RecordBatch { - /// number of records / rows. The arrays in the batch should all have this - /// length - length: long; - - /// Nodes correspond to the pre-ordered flattened logical schema - nodes: [FieldNode]; - - /// Buffers correspond to the pre-ordered flattened buffer tree - /// - /// The number of buffers appended to this list depends on the schema. For - /// example, most primitive arrays will have 2 buffers, 1 for the validity - /// bitmap and 1 for the values. For struct arrays, there will only be a - /// single buffer for the validity (nulls) bitmap - buffers: [Buffer]; - - /// Optional compression of the message body - compression: BodyCompression; -} - -/// For sending dictionary encoding information. Any Field can be -/// dictionary-encoded, but in this case none of its children may be -/// dictionary-encoded. 
-/// There is one vector / column per dictionary, but that vector / column -/// may be spread across multiple dictionary batches by using the isDelta -/// flag - -table DictionaryBatch { - id: long; - data: RecordBatch; - - /// If isDelta is true the values in the dictionary are to be appended to a - /// dictionary with the indicated id. If isDelta is false this dictionary - /// should replace the existing dictionary. - isDelta: bool = false; -} - -/// ---------------------------------------------------------------------- -/// The root Message type - -/// This union enables us to easily send different message types without -/// redundant storage, and in the future we can easily add new message types. -/// -/// Arrow implementations do not need to implement all of the message types, -/// which may include experimental metadata types. For maximum compatibility, -/// it is best to send data using RecordBatch -union MessageHeader { - Schema, DictionaryBatch, RecordBatch, Tensor, SparseTensor -} - -table Message { - version: org.apache.arrow.flatbuf.MetadataVersion; - header: MessageHeader; - bodyLength: long; - custom_metadata: [ KeyValue ]; -} - -root_type Message; diff --git a/format/README.rst b/format/README.rst deleted file mode 100644 index 0eaad49b7e394..0000000000000 --- a/format/README.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Arrow Protocol Files -==================== - -This folder contains binary protocol definitions for the Arrow columnar format -and other parts of the project, like the Flight RPC framework. - -For documentation about the Arrow format, see the `docs/source/format` -directory. diff --git a/format/Schema.fbs b/format/Schema.fbs deleted file mode 100644 index 3b00dd4780d66..0000000000000 --- a/format/Schema.fbs +++ /dev/null @@ -1,407 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -/// Logical types, vector layouts, and schemas - -namespace org.apache.arrow.flatbuf; - -enum MetadataVersion:short { - /// 0.1.0 (October 2016). - V1, - - /// 0.2.0 (February 2017). Non-backwards compatible with V1. - V2, - - /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. - V3, - - /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. - V4, - - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 - /// metadata and IPC messages). Implementations are recommended to provide a - /// V4 compatibility mode with V5 format changes disabled. - /// - /// Incompatible changes between V4 and V5: - /// - Union buffer layout has changed. In V5, Unions don't have a validity - /// bitmap buffer. - V5, -} - -/// Represents Arrow Features that might not have full support -/// within implementations. This is intended to be used in -/// two scenarios: -/// 1. A mechanism for readers of Arrow Streams -/// and files to understand that the stream or file makes -/// use of a feature that isn't supported or unknown to -/// the implementation (and therefore can meet the Arrow -/// forward compatibility guarantees). -/// 2. A means of negotiating between a client and server -/// what features a stream is allowed to use. The enums -/// values here are intented to represent higher level -/// features, additional details maybe negotiated -/// with key-value pairs specific to the protocol. -/// -/// Enums added to this list should be assigned power-of-two values -/// to facilitate exchanging and comparing bitmaps for supported -/// features. -enum Feature : long { - /// Needed to make flatbuffers happy. - UNUSED = 0, - /// The stream makes use of multiple full dictionaries with the - /// same ID and assumes clients implement dictionary replacement - /// correctly. - DICTIONARY_REPLACEMENT = 1, - /// The stream makes use of compressed bodies as described - /// in Message.fbs. - COMPRESSED_BODY = 2 -} - -/// These are stored in the flatbuffer in the Type union below - -table Null { -} - -/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct -/// (according to the physical memory layout). We used Struct_ here as -/// Struct is a reserved word in Flatbuffers -table Struct_ { -} - -table List { -} - -/// Same as List, but with 64-bit offsets, allowing to represent -/// extremely large data values. -table LargeList { -} - -table FixedSizeList { - /// Number of list items per value - listSize: int; -} - -/// A Map is a logical nested type that is represented as -/// -/// List> -/// -/// In this layout, the keys and values are each respectively contiguous. We do -/// not constrain the key and value types, so the application is responsible -/// for ensuring that the keys are hashable and unique. Whether the keys are sorted -/// may be set in the metadata for this field. -/// -/// In a field with Map type, the field has a child Struct field, which then -/// has two children: key type and the second the value type. The names of the -/// child fields may be respectively "entries", "key", and "value", but this is -/// not enforced. -/// -/// Map -/// ```text -/// - child[0] entries: Struct -/// - child[0] key: K -/// - child[1] value: V -/// ``` -/// Neither the "entries" field nor the "key" field may be nullable. -/// -/// The metadata is structured so that Arrow systems without special handling -/// for Map can make Map an alias for List. The "layout" attribute for the Map -/// field must have the same contents as a List. 
-table Map { - /// Set to true if the keys within each value are sorted - keysSorted: bool; -} - -enum UnionMode:short { Sparse, Dense } - -/// A union is a complex type with children in Field -/// By default ids in the type vector refer to the offsets in the children -/// optionally typeIds provides an indirection between the child offset and the type id -/// for each child `typeIds[offset]` is the id used in the type vector -table Union { - mode: UnionMode; - typeIds: [ int ]; // optional, describes typeid of each child. -} - -table Int { - bitWidth: int; // restricted to 8, 16, 32, and 64 in v1 - is_signed: bool; -} - -enum Precision:short {HALF, SINGLE, DOUBLE} - -table FloatingPoint { - precision: Precision; -} - -/// Unicode with UTF-8 encoding -table Utf8 { -} - -/// Opaque binary data -table Binary { -} - -/// Same as Utf8, but with 64-bit offsets, allowing to represent -/// extremely large data values. -table LargeUtf8 { -} - -/// Same as Binary, but with 64-bit offsets, allowing to represent -/// extremely large data values. -table LargeBinary { -} - -table FixedSizeBinary { - /// Number of bytes per value - byteWidth: int; -} - -table Bool { -} - -/// Exact decimal value represented as an integer value in two's -/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers -/// are used. The representation uses the endianness indicated -/// in the Schema. -table Decimal { - /// Total number of decimal digits - precision: int; - - /// Number of digits after the decimal point "." - scale: int; - - /// Number of bits per value. The only accepted widths are 128 and 256. - /// We use bitWidth for consistency with Int::bitWidth. - bitWidth: int = 128; -} - -enum DateUnit: short { - DAY, - MILLISECOND -} - -/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX -/// epoch (1970-01-01), stored in either of two units: -/// -/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no -/// leap seconds), where the values are evenly divisible by 86400000 -/// * Days (32 bits) since the UNIX epoch -table Date { - unit: DateUnit = MILLISECOND; -} - -enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND } - -/// Time type. The physical storage type depends on the unit -/// - SECOND and MILLISECOND: 32 bits -/// - MICROSECOND and NANOSECOND: 64 bits -table Time { - unit: TimeUnit = MILLISECOND; - bitWidth: int = 32; -} - -/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding -/// leap seconds, as a 64-bit integer. Note that UNIX time does not include -/// leap seconds. -/// -/// The Timestamp metadata supports both "time zone naive" and "time zone -/// aware" timestamps. Read about the timezone attribute for more detail -table Timestamp { - unit: TimeUnit; - - /// The time zone is a string indicating the name of a time zone, one of: - /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - /// - /// Whether a timezone string is present indicates different semantics about - /// the data: - /// - /// * If the time zone is null or equal to an empty string, the data is "time - /// zone naive" and shall be displayed *as is* to the user, not localized - /// to the locale of the user. 
This data can be though of as UTC but - /// without having "UTC" as the time zone, it is not considered to be - /// localized to any time zone - /// - /// * If the time zone is set to a valid value, values can be displayed as - /// "localized" to that time zone, even though the underlying 64-bit - /// integers are identical to the same data stored in UTC. Converting - /// between time zones is a metadata-only operation and does not change the - /// underlying values - timezone: string; -} - -enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} -// A "calendar" interval which models types that don't necessarily -// have a precise duration without the context of a base timestamp (e.g. -// days can differ in length during day light savings time transitions). -// YEAR_MONTH - Indicates the number of elapsed whole months, stored as -// 4-byte integers. -// DAY_TIME - Indicates the number of elapsed days and milliseconds, -// stored as 2 contiguous 32-bit integers (8-bytes in total). Support -// of this IntervalUnit is not required for full arrow compatibility. -table Interval { - unit: IntervalUnit; -} - -// An absolute length of time unrelated to any calendar artifacts. -// -// For the purposes of Arrow Implementations, adding this value to a Timestamp -// ("t1") naively (i.e. simply summing the two number) is acceptable even -// though in some cases the resulting Timestamp (t2) would not account for -// leap-seconds during the elapsed time between "t1" and "t2". Similarly, -// representing the difference between two Unix timestamp is acceptable, but -// would yield a value that is possibly a few seconds off from the true elapsed -// time. -// -// The resolution defaults to millisecond, but can be any of the other -// supported TimeUnit values as with Timestamp and Time types. This type is -// always represented as an 8-byte integer. -table Duration { - unit: TimeUnit = MILLISECOND; -} - -/// ---------------------------------------------------------------------- -/// Top-level Type value, enabling extensible type-specific metadata. We can -/// add new logical types to Type without breaking backwards compatibility - -union Type { - Null, - Int, - FloatingPoint, - Binary, - Utf8, - Bool, - Decimal, - Date, - Time, - Timestamp, - Interval, - List, - Struct_, - Union, - FixedSizeBinary, - FixedSizeList, - Map, - Duration, - LargeBinary, - LargeUtf8, - LargeList, -} - -/// ---------------------------------------------------------------------- -/// user defined key value pairs to add custom metadata to arrow -/// key namespacing is the responsibility of the user - -table KeyValue { - key: string; - value: string; -} - -/// ---------------------------------------------------------------------- -/// Dictionary encoding metadata -/// Maintained for forwards compatibility, in the future -/// Dictionaries might be explicit maps between integers and values -/// allowing for non-contiguous index values -enum DictionaryKind : short { DenseArray } -table DictionaryEncoding { - /// The known dictionary id in the application where this data is used. In - /// the file or streaming formats, the dictionary ids are found in the - /// DictionaryBatch messages - id: long; - - /// The dictionary indices are constrained to be non-negative integers. If - /// this field is null, the indices must be signed int32. 
To maximize - /// cross-language compatibility and performance, implementations are - /// recommended to prefer signed integer types over unsigned integer types - /// and to avoid uint64 indices unless they are required by an application. - indexType: Int; - - /// By default, dictionaries are not ordered, or the order does not have - /// semantic meaning. In some statistical, applications, dictionary-encoding - /// is used to represent ordered categorical data, and we provide a way to - /// preserve that metadata here - isOrdered: bool; - - dictionaryKind: DictionaryKind; -} - -/// ---------------------------------------------------------------------- -/// A field represents a named column in a record / row batch or child of a -/// nested type. - -table Field { - /// Name is not required, in i.e. a List - name: string; - - /// Whether or not this field can contain nulls. Should be true in general. - nullable: bool; - - /// This is the type of the decoded value if the field is dictionary encoded. - type: Type; - - /// Present only if the field is dictionary encoded. - dictionary: DictionaryEncoding; - - /// children apply only to nested data types like Struct, List and Union. For - /// primitive types children will have length 0. - children: [ Field ]; - - /// User-defined metadata - custom_metadata: [ KeyValue ]; -} - -/// ---------------------------------------------------------------------- -/// Endianness of the platform producing the data - -enum Endianness:short { Little, Big } - -/// ---------------------------------------------------------------------- -/// A Buffer represents a single contiguous memory segment -struct Buffer { - /// The relative offset into the shared memory page where the bytes for this - /// buffer starts - offset: long; - - /// The absolute length (in bytes) of the memory buffer. The memory is found - /// from offset (inclusive) to offset + length (non-inclusive). When building - /// messages using the encapsulated IPC message, padding bytes may be written - /// after a buffer, but such padding bytes do not need to be accounted for in - /// the size here. - length: long; -} - -/// ---------------------------------------------------------------------- -/// A Schema describes the columns in a row batch - -table Schema { - - /// endianness of the buffer - /// it is Little Endian by default - /// if endianness doesn't match the underlying system then the vectors need to be converted - endianness: Endianness=Little; - - fields: [Field]; - // User-defined metadata - custom_metadata: [ KeyValue ]; - - /// Features used in the stream/file. - features : [ Feature ]; -} - -root_type Schema; diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs deleted file mode 100644 index a6fd2f9e74816..0000000000000 --- a/format/SparseTensor.fbs +++ /dev/null @@ -1,228 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -/// EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse tensors". -/// Arrow implementations in general are not required to implement this type - -include "Tensor.fbs"; - -namespace org.apache.arrow.flatbuf; - -/// ---------------------------------------------------------------------- -/// EXPERIMENTAL: Data structures for sparse tensors - -/// Coordinate (COO) format of sparse tensor index. -/// -/// COO's index list are represented as a NxM matrix, -/// where N is the number of non-zero values, -/// and M is the number of dimensions of a sparse tensor. -/// -/// indicesBuffer stores the location and size of the data of this indices -/// matrix. The value type and the stride of the indices matrix is -/// specified in indicesType and indicesStrides fields. -/// -/// For example, let X be a 2x3x4x5 tensor, and it has the following -/// 6 non-zero values: -/// ```text -/// X[0, 1, 2, 0] := 1 -/// X[1, 1, 2, 3] := 2 -/// X[0, 2, 1, 0] := 3 -/// X[0, 1, 3, 0] := 4 -/// X[0, 1, 2, 1] := 5 -/// X[1, 2, 0, 4] := 6 -/// ``` -/// In COO format, the index matrix of X is the following 4x6 matrix: -/// ```text -/// [[0, 0, 0, 0, 1, 1], -/// [1, 1, 1, 2, 1, 2], -/// [2, 2, 3, 1, 2, 0], -/// [0, 1, 0, 0, 3, 4]] -/// ``` -/// When isCanonical is true, the indices is sorted in lexicographical order -/// (row-major order), and it does not have duplicated entries. Otherwise, -/// the indices may not be sorted, or may have duplicated entries. -table SparseTensorIndexCOO { - /// The type of values in indicesBuffer - indicesType: Int (required); - - /// Non-negative byte offsets to advance one value cell along each dimension - /// If omitted, default to row-major order (C-like). - indicesStrides: [long]; - - /// The location and size of the indices matrix's data - indicesBuffer: Buffer (required); - - /// This flag is true if and only if the indices matrix is sorted in - /// row-major order, and does not have duplicated entries. - /// This sort order is the same as of Tensorflow's SparseTensor, - /// but it is inverse order of SciPy's canonical coo_matrix - /// (SciPy employs column-major order for its coo_matrix). - isCanonical: bool; -} - -enum SparseMatrixCompressedAxis: short { Row, Column } - -/// Compressed Sparse format, that is matrix-specific. -table SparseMatrixIndexCSX { - /// Which axis, row or column, is compressed - compressedAxis: SparseMatrixCompressedAxis; - - /// The type of values in indptrBuffer - indptrType: Int (required); - - /// indptrBuffer stores the location and size of indptr array that - /// represents the range of the rows. - /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - /// The length of this array is 1 + (the number of rows), and the type - /// of index value is long. - /// - /// For example, let X be the following 6x4 matrix: - /// ```text - /// X := [[0, 1, 2, 0], - /// [0, 0, 3, 0], - /// [0, 4, 0, 5], - /// [0, 0, 0, 0], - /// [6, 0, 7, 8], - /// [0, 9, 0, 0]]. - /// ``` - /// The array of non-zero values in X is: - /// ```text - /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. - /// ``` - /// And the indptr of X is: - /// ```text - /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. - /// ``` - indptrBuffer: Buffer (required); - - /// The type of values in indicesBuffer - indicesType: Int (required); - - /// indicesBuffer stores the location and size of the array that - /// contains the column indices of the corresponding non-zero values. 
- /// The type of index value is long. - /// - /// For example, the indices of the above X is: - /// ```text - /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. - /// ``` - /// Note that the indices are sorted in lexicographical order for each row. - indicesBuffer: Buffer (required); -} - -/// Compressed Sparse Fiber (CSF) sparse tensor index. -table SparseTensorIndexCSF { - /// CSF is a generalization of compressed sparse row (CSR) index. - /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf) - /// - /// CSF index recursively compresses each dimension of a tensor into a set - /// of prefix trees. Each path from a root to leaf forms one tensor - /// non-zero index. CSF is implemented with two arrays of buffers and one - /// arrays of integers. - /// - /// For example, let X be a 2x3x4x5 tensor and let it have the following - /// 8 non-zero values: - /// ```text - /// X[0, 0, 0, 1] := 1 - /// X[0, 0, 0, 2] := 2 - /// X[0, 1, 0, 0] := 3 - /// X[0, 1, 0, 2] := 4 - /// X[0, 1, 1, 0] := 5 - /// X[1, 1, 1, 0] := 6 - /// X[1, 1, 1, 1] := 7 - /// X[1, 1, 1, 2] := 8 - /// ``` - /// As a prefix tree this would be represented as: - /// ```text - /// 0 1 - /// / \ | - /// 0 1 1 - /// / / \ | - /// 0 0 1 1 - /// /| /| | /| | - /// 1 2 0 2 0 0 1 2 - /// ``` - /// The type of values in indptrBuffers - indptrType: Int (required); - - /// indptrBuffers stores the sparsity structure. - /// Each two consecutive dimensions in a tensor correspond to a buffer in - /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]` - /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in - /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node. - /// - /// For example, the indptrBuffers for the above X is: - /// ```text - /// indptrBuffer(X) = [ - /// [0, 2, 3], - /// [0, 1, 3, 4], - /// [0, 2, 4, 5, 8] - /// ]. - /// ``` - indptrBuffers: [Buffer] (required); - - /// The type of values in indicesBuffers - indicesType: Int (required); - - /// indicesBuffers stores values of nodes. - /// Each tensor dimension corresponds to a buffer in indicesBuffers. - /// For example, the indicesBuffers for the above X is: - /// ```text - /// indicesBuffer(X) = [ - /// [0, 1], - /// [0, 1, 1], - /// [0, 0, 1, 1], - /// [1, 2, 0, 2, 0, 0, 1, 2] - /// ]. - /// ``` - indicesBuffers: [Buffer] (required); - - /// axisOrder stores the sequence in which dimensions were traversed to - /// produce the prefix tree. - /// For example, the axisOrder for the above X is: - /// ```text - /// axisOrder(X) = [0, 1, 2, 3]. - /// ``` - axisOrder: [int] (required); -} - -union SparseTensorIndex { - SparseTensorIndexCOO, - SparseMatrixIndexCSX, - SparseTensorIndexCSF -} - -table SparseTensor { - /// The type of data contained in a value cell. - /// Currently only fixed-width value types are supported, - /// no strings or nested types. - type: Type (required); - - /// The dimensions of the tensor, optionally named. - shape: [TensorDim] (required); - - /// The number of non-zero values in a sparse tensor. - non_zero_length: long; - - /// Sparse tensor index - sparseIndex: SparseTensorIndex (required); - - /// The location and size of the tensor's data - data: Buffer (required); -} - -root_type SparseTensor; diff --git a/format/Tensor.fbs b/format/Tensor.fbs deleted file mode 100644 index 409297ccf8264..0000000000000 --- a/format/Tensor.fbs +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or -/// "ndarrays". Arrow implementations in general are not required to implement -/// this type - -include "Schema.fbs"; - -namespace org.apache.arrow.flatbuf; - -/// ---------------------------------------------------------------------- -/// Data structures for dense tensors - -/// Shape data for a single axis in a tensor -table TensorDim { - /// Length of dimension - size: long; - - /// Name of the dimension, optional - name: string; -} - -table Tensor { - /// The type of data contained in a value cell. Currently only fixed-width - /// value types are supported, no strings or nested types - type: Type (required); - - /// The dimensions of the tensor, optionally named - shape: [TensorDim] (required); - - /// Non-negative byte offsets to advance one value cell along each dimension - /// If omitted, default to row-major order (C-like). - strides: [long]; - - /// The location and size of the tensor's data - data: Buffer (required); -} - -root_type Tensor; From e4961431baf58ce0acaab8ca42df1617e8d4448b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 11:06:22 -0600 Subject: [PATCH 012/329] Removed submodule --- .gitmodules | 6 ------ testing | 1 - 2 files changed, 7 deletions(-) delete mode 100644 .gitmodules delete mode 160000 testing diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 6efc4871542cb..0000000000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "cpp/submodules/parquet-testing"] - path = cpp/submodules/parquet-testing - url = https://github.com/apache/parquet-testing.git -[submodule "testing"] - path = testing - url = https://github.com/apache/arrow-testing diff --git a/testing b/testing deleted file mode 160000 index b658b087767b0..0000000000000 --- a/testing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b658b087767b041b2081766814655b4dd5a9a439 From db4d00cc1d59ff2afae23aa9177446de00e28993 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 11:09:01 -0600 Subject: [PATCH 013/329] remove clang files --- .clang-format | 20 -------------------- .clang-tidy | 31 ------------------------------- .clang-tidy-ignore | 18 ------------------ 3 files changed, 69 deletions(-) delete mode 100644 .clang-format delete mode 100644 .clang-tidy delete mode 100644 .clang-tidy-ignore diff --git a/.clang-format b/.clang-format deleted file mode 100644 index 06453dfbb25b7..0000000000000 --- a/.clang-format +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. ---- -BasedOnStyle: Google -DerivePointerAlignment: false -ColumnLimit: 90 diff --git a/.clang-tidy b/.clang-tidy deleted file mode 100644 index 8b2c16746f583..0000000000000 --- a/.clang-tidy +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. ---- -Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-*,modernize-*,-modernize-use-trailing-return-type' -# produce HeaderFilterRegex from cpp/build-support/lint_exclusions.txt with: -# echo -n '^('; sed -e 's/*/\.*/g' cpp/build-support/lint_exclusions.txt | tr '\n' '|'; echo ')$' -HeaderFilterRegex: '^(.*codegen.*|.*_generated.*|.*windows_compatibility.h|.*pyarrow_api.h|.*pyarrow_lib.h|.*python/config.h|.*python/platform.h|.*thirdparty/ae/.*|.*vendored/.*|.*RcppExports.cpp.*|)$' -AnalyzeTemporaryDtors: true -CheckOptions: - - key: google-readability-braces-around-statements.ShortStatementLines - value: '1' - - key: google-readability-function-size.StatementThreshold - value: '800' - - key: google-readability-namespace-comments.ShortNamespaceLines - value: '10' - - key: google-readability-namespace-comments.SpacesBeforeComments - value: '2' diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore deleted file mode 100644 index 3270b973f2e9a..0000000000000 --- a/.clang-tidy-ignore +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-ipc-adapter-test.cc -memory-pool-test.cc From fb1add1a0ae6c02d883fff9947a97101e074acf6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 12:08:33 -0600 Subject: [PATCH 014/329] Speficy GitHub options --- .asf.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index 4bd5191a7a659..c8e01478ef028 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -20,3 +20,10 @@ notifications: issues: github@arrow.apache.org pullrequests: github@arrow.apache.org jira_options: link label worklog +github: + enabled_merge_buttons: + squash: true + merge: false + rebase: false + features: + issues: true From 00d19fa43288ecc6908ebe6ee347570e8e5154ac Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 12:41:39 -0600 Subject: [PATCH 015/329] Set GitHub description and labels (#1) --- .asf.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index c8e01478ef028..f50ae9a468ab3 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,6 +21,13 @@ notifications: pullrequests: github@arrow.apache.org jira_options: link label worklog github: + description: "Apache Arrow DataFusion and Ballista query engines" + homepage: https://arrow.apache.org/ + labels: + datafusion + ballista + bug + performance enabled_merge_buttons: squash: true merge: false From 05c40751822c4fbae8a1119e7db3cae199fbc06c Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Sun, 18 Apr 2021 21:20:07 +0200 Subject: [PATCH 016/329] Bumped arrow. (#7) --- ballista/rust/benchmarks/tpch/Cargo.toml | 4 ++-- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- benchmarks/Cargo.toml | 4 ++-- datafusion-examples/Cargo.toml | 4 ++-- datafusion/Cargo.toml | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ballista/rust/benchmarks/tpch/Cargo.toml b/ballista/rust/benchmarks/tpch/Cargo.toml index 9a7d65199266e..8d62e20e17e17 100644 --- a/ballista/rust/benchmarks/tpch/Cargo.toml +++ b/ballista/rust/benchmarks/tpch/Cargo.toml @@ -28,8 +28,8 @@ edition = "2018" ballista = { path="../../client" } datafusion = { path = "../../../../datafusion" } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } env_logger = "0.8" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index bf40cdb499ace..6ac86875169b4 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 58e6d22734acd..e9d7682473f17 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } 
+arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index ccf30cf16eb7b..79ceabe2dd666 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -45,8 +45,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 197a2319154d2..ce8ca09e15b2f 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 7fd8444716865..66a81be26b36c 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -31,8 +31,8 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../datafusion" } structopt = { version = "0.3", default-features = false } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 1a060504df721..7f7c239d0f549 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,8 +29,8 @@ publish = false [dev-dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 88a50dd5a4432..443bd7e020414 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -50,8 +50,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "e023b4c", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f", features = ["arrow"] } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} From 929e9247ff38d0a66aeead4b0ed9edf0442cca92 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 13:47:39 -0600 Subject: [PATCH 017/329] Add Arrow and Parquet submodules (#9) --- 
.gitmodules | 6 ++++++ arrow-testing | 1 + parquet-testing | 1 + 3 files changed, 8 insertions(+) create mode 100644 .gitmodules create mode 160000 arrow-testing create mode 160000 parquet-testing diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..86e2f0638efe9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "testing"] + path = arrow-testing + url = https://github.com/apache/arrow-testing +[submodule "parquet-testing"] + path = parquet-testing + url = https://github.com/apache/parquet-testing.git diff --git a/arrow-testing b/arrow-testing new file mode 160000 index 0000000000000..b658b087767b0 --- /dev/null +++ b/arrow-testing @@ -0,0 +1 @@ +Subproject commit b658b087767b041b2081766814655b4dd5a9a439 diff --git a/parquet-testing b/parquet-testing new file mode 160000 index 0000000000000..8e7badc6a3817 --- /dev/null +++ b/parquet-testing @@ -0,0 +1 @@ +Subproject commit 8e7badc6a3817a02e06d17b5d8ab6b6dc356e890 From 11679f67709044a7d89bc7579719e99986cb1575 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 18 Apr 2021 20:04:40 -0600 Subject: [PATCH 018/329] Fix CI (#10) * Fixed CI. * More cleanup. * More cleanups. * More cleanups. * Bumped arrow to latest. * Always test. * trigger CI * Remove arrow-testing submodule * Add arrow-testing submodule * fix env var for AMD64 build * delete travis ci yaml Co-authored-by: Jorge C. Leitao --- .github/workflows/archery.yml | 64 ---- .github/workflows/cancel.yml | 77 ----- .github/workflows/cpp.yml | 395 ------------------------ .github/workflows/cpp_cron.yml | 149 --------- .github/workflows/csharp.yml | 121 -------- .github/workflows/dev.yml | 63 +--- .github/workflows/dev_pr/labeler.yml | 52 ---- .github/workflows/go.yml | 125 -------- .github/workflows/integration.yml | 83 ----- .github/workflows/java.yml | 112 ------- .github/workflows/java_jni.yml | 83 ----- .github/workflows/js.yml | 122 -------- .github/workflows/julia.yml | 53 ---- .github/workflows/python.yml | 154 --------- .github/workflows/python_cron.yml | 141 --------- .github/workflows/r.yml | 255 --------------- .github/workflows/ruby.yml | 290 ----------------- .github/workflows/rust.yml | 228 ++------------ .gitignore | 6 + .gitmodules | 6 +- .travis.yml | 165 ---------- ballista/rust/benchmarks/tpch/README.md | 6 +- dev/.gitignore | 1 + dev/archery/archery/cli.py | 5 +- dev/archery/archery/utils/source.py | 8 +- dev/release/rat_exclude_files.txt | 111 +------ arrow-testing => testing | 0 27 files changed, 48 insertions(+), 2827 deletions(-) delete mode 100644 .github/workflows/archery.yml delete mode 100644 .github/workflows/cpp.yml delete mode 100644 .github/workflows/cpp_cron.yml delete mode 100644 .github/workflows/csharp.yml delete mode 100644 .github/workflows/go.yml delete mode 100644 .github/workflows/integration.yml delete mode 100644 .github/workflows/java.yml delete mode 100644 .github/workflows/java_jni.yml delete mode 100644 .github/workflows/js.yml delete mode 100644 .github/workflows/julia.yml delete mode 100644 .github/workflows/python.yml delete mode 100644 .github/workflows/python_cron.yml delete mode 100644 .github/workflows/r.yml delete mode 100644 .github/workflows/ruby.yml delete mode 100644 .travis.yml rename arrow-testing => testing (100%) diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml deleted file mode 100644 index 761e045954367..0000000000000 --- a/.github/workflows/archery.yml +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more 
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Archery & Crossbow - -on: - push: - paths: - - '.github/workflows/archery.yml' - - 'dev/archery/**' - - 'dev/tasks/**' - - 'docker-compose.yml' - pull_request: - paths: - - '.github/workflows/archery.yml' - - 'dev/archery/**' - - 'dev/tasks/**' - - 'docker-compose.yml' - -jobs: - - test: - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - name: Archery Unittests and Crossbow Check Config - runs-on: ubuntu-latest - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Git Fixup - if: ${{ github.event_name == 'pull_request' }} - shell: bash - run: git branch master origin/master - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: '3.6' - - name: Install Archery, Crossbow- and Test Dependencies - run: pip install pytest responses -e dev/archery[all] - - name: Archery Unittests - working-directory: dev/archery - run: pytest -v archery - - name: Archery Docker Validation - run: archery docker - - name: Crossbow Check Config - working-directory: dev/tasks - run: archery crossbow check-config diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml index de980eb6d05b5..e1c6ed98ae8b3 100644 --- a/.github/workflows/cancel.yml +++ b/.github/workflows/cancel.yml @@ -30,20 +30,6 @@ jobs: steps: # Unfortunately, we need to define a separate cancellation step for # each workflow where we want to cancel stale runs. 
- - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale C++ runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: cpp.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale C# runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: csharp.yml - skipEventTypes: '["push", "schedule"]' - uses: potiuk/cancel-workflow-runs@master name: "Cancel stale Dev runs" with: @@ -51,69 +37,6 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} workflowFileName: dev.yml skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Go runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: go.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Integration runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: integration.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Java JNI runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: java_jni.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Java runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: java.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale JS runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: js.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Julia runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: julia.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Python runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: python.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale R runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: r.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Ruby runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: ruby.yml - skipEventTypes: '["push", "schedule"]' - uses: potiuk/cancel-workflow-runs@master name: "Cancel stale Rust runs" with: diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml deleted file mode 100644 index 0bcf3460ad4b0..0000000000000 --- a/.github/workflows/cpp.yml +++ /dev/null @@ -1,395 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: C++ - -on: - push: - paths: - - '.github/workflows/cpp.yml' - - 'ci/docker/**' - - 'ci/scripts/cpp_*' - - 'ci/scripts/msys2_*' - - 'ci/scripts/util_*' - - 'cpp/**' - - 'format/Flight.proto' - pull_request: - paths: - - '.github/workflows/cpp.yml' - - 'ci/docker/**' - - 'ci/scripts/cpp_*' - - 'ci/scripts/msys2_*' - - 'ci/scripts/util_*' - - 'cpp/**' - - 'format/Flight.proto' - -env: - ARROW_ENABLE_TIMING_TESTS: OFF - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - image: - - conda-cpp - - ubuntu-cpp-sanitizer - include: - - image: conda-cpp - title: AMD64 Conda C++ - - image: ubuntu-cpp-sanitizer - title: AMD64 Ubuntu 20.04 C++ ASAN UBSAN - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.image }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} - - docker-arm: - # NOTE: this job is specific for self-hosted runners - # CACHING: don't use the cache plugin because of various permission - # issues and keep the cached docker volumes permanently on the - # host - # PYTHON: no distributions are built for arm machines by the github - # actions team, so python>3.6 must be preinstalled on the self - # hosted machines - name: ${{ matrix.title }} - runs-on: ${{ matrix.runner }} - # TODO(kszucs): re-enable once the self-hosted workers are properly - # registered to github - if: false && github.event_name == 'push' - defaults: - # To use certain environment variables set by .bashrc, an interactive - # bash shell must be used - run: - shell: bash -i {0} - strategy: - fail-fast: false - matrix: - name: - - arm32v7-debian-10-cpp - - arm64v8-ubuntu-20.04-cpp - include: - - name: arm32v7-debian-10-cpp - debian: 10 - title: ARM32v7 Debian 10 C++ - image: | - -e CPP_MAKE_PARALLELISM=2 \ - -e CXXFLAGS=-Wno-psabi \ - -e ARROW_PARQUET=OFF \ - -e ARROW_FLIGHT=OFF \ - -e ARROW_GANDIVA=OFF \ - -e ARROW_ORC=OFF \ - -e CMAKE_ARGS=-DARROW_CPU_FLAG=armv7 \ - debian-cpp - arch: 'arm32v7' - runner: [self-hosted, linux, ARM] - - name: arm64v8-ubuntu-20.04-cpp - ubuntu: 20.04 - title: ARM64v8 Ubuntu 20.04 C++ - image: | - -e CPP_MAKE_PARALLELISM=1 \ - -e ARROW_PARQUET=OFF \ - 
ubuntu-cpp - arch: 'arm64v8' - runner: [self-hosted, linux, ARM64] - env: - # the defaults here should correspond to the values in .env - ARCH: ${{ matrix.arch || 'arm64v8' }} - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 32 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - LLVM: 8 - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Setup Archery - run: pip install -U -e dev/archery[docker] - - name: Execute Docker Build - # parallelism is reduced because the ARM builders are low on memory - run: | - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} - - build-example: - name: C++ Minimal Build Example - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Run - run: | - cd cpp/examples/minimal_build - docker-compose run --rm minimal - - macos: - name: AMD64 MacOS 10.15 C++ - runs-on: macos-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - env: - ARROW_BUILD_TESTS: ON - ARROW_DATASET: ON - ARROW_FLIGHT: ON - ARROW_GANDIVA: ON - ARROW_HDFS: ON - ARROW_HOME: /usr/local - ARROW_JEMALLOC: ON - # TODO(kszucs): link error in the tests - ARROW_ORC: OFF - ARROW_PARQUET: ON - ARROW_PLASMA: ON - ARROW_S3: ON - ARROW_WITH_BROTLI: ON - ARROW_WITH_BZ2: ON - ARROW_WITH_LZ4: ON - ARROW_WITH_SNAPPY: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_ZSTD: ON - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Dependencies - shell: bash - run: | - rm -f /usr/local/bin/2to3 - brew update --preinstall - brew unlink gcc@8 gcc@9 - brew bundle --file=cpp/Brewfile - - name: Build - shell: bash - run: ci/scripts/cpp_build.sh $(pwd) $(pwd)/build - - name: Test - shell: bash - run: | - sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile=core.%N.%P - ulimit -c unlimited # must enable within the same shell - ci/scripts/cpp_test.sh $(pwd) $(pwd)/build - - windows: - name: AMD64 ${{ matrix.name }} C++ - runs-on: ${{ matrix.os }} - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - os: - - windows-latest - include: - - os: windows-latest - name: Windows 2019 - generator: Visual Studio 16 2019 - env: - ARROW_BOOST_USE_SHARED: OFF - ARROW_BUILD_BENCHMARKS: ON - ARROW_BUILD_SHARED: ON - ARROW_BUILD_STATIC: OFF - ARROW_BUILD_TESTS: ON - ARROW_DATASET: ON - ARROW_FLIGHT: OFF - ARROW_HDFS: ON - ARROW_HOME: /usr - ARROW_JEMALLOC: OFF - ARROW_MIMALLOC: ON - ARROW_PARQUET: ON - ARROW_USE_GLOG: OFF - ARROW_VERBOSE_THIRDPARTY_BUILD: OFF - ARROW_WITH_BROTLI: OFF - ARROW_WITH_BZ2: OFF - ARROW_WITH_LZ4: OFF - ARROW_WITH_SNAPPY: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_ZSTD: ON - BOOST_SOURCE: BUNDLED - CMAKE_ARGS: '-A x64 -DOPENSSL_ROOT_DIR=C:\Program Files\OpenSSL-Win64' - CMAKE_GENERATOR: ${{ matrix.generator }} - CMAKE_INSTALL_LIBDIR: bin - CMAKE_INSTALL_PREFIX: /usr - CMAKE_UNITY_BUILD: ON - NPROC: 2 - steps: - - name: Disable Crash Dialogs - run: | - reg add ` - "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" 
` - /v DontShowUI ` - /t REG_DWORD ` - /d 1 ` - /f - - name: Installed Packages - run: choco list -l - - name: Install Dependencies - run: choco install -y --no-progress openssl - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Build - shell: bash - run: | - ci/scripts/cpp_build.sh $(pwd) $(pwd)/build - - name: Test - shell: bash - run: ci/scripts/cpp_test.sh $(pwd) $(pwd)/build - - windows-mingw: - name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} C++ - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - mingw-n-bits: - - 32 - - 64 - env: - ARROW_BUILD_SHARED: ON - ARROW_BUILD_STATIC: OFF - ARROW_BUILD_TESTS: ON - ARROW_BUILD_TYPE: release - ARROW_DATASET: ON - ARROW_FLIGHT: ON - ARROW_GANDIVA: ON - ARROW_HDFS: OFF - ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} - ARROW_JEMALLOC: OFF - ARROW_PARQUET: ON - ARROW_PYTHON: ON - ARROW_S3: ON - ARROW_USE_GLOG: OFF - ARROW_VERBOSE_THIRDPARTY_BUILD: OFF - ARROW_WITH_BROTLI: ON - ARROW_WITH_BZ2: ON - ARROW_WITH_LZ4: ON - ARROW_WITH_SNAPPY: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_ZSTD: ON - # Don't use preinstalled Boost by empty BOOST_ROOT and - # -DBoost_NO_BOOST_CMAKE=ON - BOOST_ROOT: "" - CMAKE_ARGS: >- - -DARROW_PACKAGE_PREFIX=/mingw${{ matrix.mingw-n-bits }} - -DBoost_NO_BOOST_CMAKE=ON - CMAKE_UNITY_BUILD: ON - steps: - - name: Disable Crash Dialogs - run: | - reg add ` - "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" ` - /v DontShowUI ` - /t REG_DWORD ` - /d 1 ` - /f - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - uses: msys2/setup-msys2@v2 - with: - msystem: MINGW${{ matrix.mingw-n-bits }} - update: true - - name: Setup MSYS2 - shell: msys2 {0} - run: | - ci/scripts/msys2_setup.sh cpp - - name: Cache ccache - uses: actions/cache@v2 - with: - path: ccache - key: cpp-ccache-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('cpp/**') }} - restore-keys: cpp-ccache-mingw${{ matrix.mingw-n-bits }}- - - name: Build - shell: msys2 {0} - run: | - export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS - ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build" - - name: Download MinIO - shell: msys2 {0} - run: | - mkdir -p /usr/local/bin - wget \ - --output-document /usr/local/bin/minio.exe \ - https://dl.min.io/server/minio/release/windows-amd64/minio.exe - chmod +x /usr/local/bin/minio.exe - - name: Test - shell: msys2 {0} - run: | - python_version=$(python -c "import sys; print('.'.join(map(str, sys.version_info[0:2])))") - export PYTHONHOME="$(cygpath --windows ${MINGW_PREFIX})\lib\python${python_version}" - PYTHONPATH="${PYTHONHOME}" - PYTHONPATH="${PYTHONPATH};${PYTHONHOME}\lib-dynload" - PYTHONPATH="${PYTHONPATH};${PYTHONHOME}\site-packages" - export PYTHONPATH - ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/.github/workflows/cpp_cron.yml b/.github/workflows/cpp_cron.yml deleted file mode 100644 index 9e4f3cf388b36..0000000000000 --- a/.github/workflows/cpp_cron.yml +++ /dev/null @@ -1,149 +0,0 @@ - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: C++ Cron - -on: - push: - paths: - - '.github/workflows/cpp_cron.yml' - pull_request: - paths: - - '.github/workflows/cpp_cron.yml' - schedule: - - cron: | - 0 */12 * * * - -env: - ARROW_ENABLE_TIMING_TESTS: OFF - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }} - strategy: - fail-fast: false - matrix: - name: - - amd64-debian-10-cpp - - amd64-fedora-33-cpp - - amd64-ubuntu-18.04-cpp - - amd64-ubuntu-20.04-cpp - include: - - name: amd64-debian-10-cpp - image: debian-cpp - title: AMD64 Debian 10 C++ - debian: 10 - - name: amd64-fedora-33-cpp - image: fedora-cpp - title: AMD64 Fedora 33 C++ - fedora: 33 - - name: amd64-ubuntu-18.04-cpp - image: ubuntu-cpp - title: AMD64 Ubuntu 18.04 C++ - ubuntu: 18.04 - - name: amd64-ubuntu-20.04-cpp - image: ubuntu-cpp - title: AMD64 Ubuntu 20.04 C++ - ubuntu: 20.04 - env: - # the defaults here should correspond to the values in .env - ARCH: 'amd64' - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 33 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.name }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.name }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} - - oss-fuzz: - name: OSS-Fuzz build check - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }} - strategy: - fail-fast: false - matrix: - ubuntu: [18.04] - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - shell: bash - run: ci/scripts/util_cleanup.sh - - name: Checkout OSS-Fuzz - uses: actions/checkout@v1 - with: - path: oss-fuzz - repository: google/oss-fuzz - ref: master - - name: Install dependencies - working-directory: ../oss-fuzz - run: | - python3 -m pip install setuptools - python3 -m pip install -r infra/ci/requirements.txt - - name: Build image - shell: bash - 
working-directory: ../oss-fuzz - run: | - python3 infra/helper.py build_image --pull arrow - - name: Build fuzzers - shell: bash - working-directory: ../oss-fuzz - run: | - python3 infra/helper.py build_fuzzers arrow `pwd`/../arrow - - name: Check build - shell: bash - working-directory: ../oss-fuzz - run: | - python3 infra/helper.py check_build arrow diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml deleted file mode 100644 index 03a297bb91423..0000000000000 --- a/.github/workflows/csharp.yml +++ /dev/null @@ -1,121 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: C# - -on: - push: - paths: - - '.github/workflows/csharp.yml' - - 'ci/scripts/csharp_*' - - 'csharp/**' - pull_request: - paths: - - '.github/workflows/csharp.yml' - - 'ci/scripts/csharp_*' - - 'csharp/**' - -jobs: - - ubuntu: - name: AMD64 Ubuntu 18.04 C# ${{ matrix.dotnet }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - dotnet: ['3.1.x'] - steps: - - name: Install C# - uses: actions/setup-dotnet@v1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Source Link - shell: bash - run: dotnet tool install --global sourcelink - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) - - windows: - name: AMD64 Windows 2019 18.04 C# ${{ matrix.dotnet }} - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - dotnet: ['3.1.x'] - steps: - - name: Install C# - uses: actions/setup-dotnet@v1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Source Link - run: dotnet tool install --global sourcelink - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) - - macos: - name: AMD64 MacOS 10.15 C# ${{ matrix.dotnet }} - runs-on: macos-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - dotnet: ['3.1.x'] - steps: - - name: Install C# - uses: actions/setup-dotnet@v1 - with: - dotnet-version: ${{ matrix.dotnet }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Install Source Link - shell: bash - run: dotnet tool install --global sourcelink - - name: Fetch Submodules and Tags - shell: bash - run: 
ci/scripts/util_checkout.sh - - name: Build - shell: bash - run: ci/scripts/csharp_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/csharp_test.sh $(pwd) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index d1b018480047e..6c6dd830e1610 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -22,75 +22,18 @@ on: push: pull_request: -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - jobs: lint: name: Lint C++, Python, R, Rust, Docker, RAT runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh + - uses: actions/checkout@v2 - name: Setup Python uses: actions/setup-python@v1 with: python-version: 3.8 - name: Setup Archery run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ubuntu-lint - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ubuntu-lint - - release: - name: Source Release and Merge Script - runs-on: ubuntu-20.04 - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - env: - GIT_AUTHOR_NAME: Github Actions - GIT_AUTHOR_EMAIL: github@actions - GIT_COMMITTER_NAME: Github Actions - GIT_COMMITTER_EMAIL: github@actions - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Python - uses: actions/setup-python@v1 - with: - python-version: '3.6' - - name: Install Ruby - uses: ruby/setup-ruby@v1 - with: - ruby-version: '2.6' - - name: Install Dependencies - shell: bash - run: | - pip install cython setuptools pytest jira - - name: Run Release Test - shell: bash - run: | - ci/scripts/release_test.sh $(pwd) - - name: Run Merge Script Test - shell: bash - run: | - pytest -v dev/test_merge_arrow_pr.py + - name: Lint + run: archery lint --rat diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 098e1bad7f47b..5eb722da41867 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -15,60 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-"lang-c++": - - cpp/**/* - -lang-c-glib: - - c_glib/**/* - -lang-csharp: - - csharp/**/* - -lang-go: - - go/**/* - -lang-java: - - java/**/* - -lang-js: - - js/**/* - -lang-julia: - - julia/**/* - -lang-python: - - python/**/* - -lang-R: - - r/**/* - -lang-ruby: - - ruby/**/* - -lang-rust: - - rust/**/* - datafusion: - rust/datafusion/**/* ballista: - rust/ballista/**/* - -flight: - - cpp/src/arrow/flight/**/* - - r/R/flight.* - - rust/arrow-flight/**/* - - python/pyarrow/*flight.* - -gandiva: - - c_glib/gandiva-glib/**/* - - cpp/src/gandiva/**/* - - ruby/red-gandiva/**/* - - python/pyarrow/gandiva.* - -parquet: - - c_glib/parquet-glib/**/* - - cpp/src/parquet/**/* - - r/R/parquet.* - - ruby/red-parquet/**/* - - rust/parquet*/**/* diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml deleted file mode 100644 index 574795f5e9b16..0000000000000 --- a/.github/workflows/go.yml +++ /dev/null @@ -1,125 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Go - -on: - push: - paths: - - '.github/workflows/go.yml' - - 'ci/docker/*_go.dockerfile' - - 'ci/scripts/go_*' - - 'go/**' - pull_request: - paths: - - '.github/workflows/go.yml' - - 'ci/docker/*_go.dockerfile' - - 'ci/docker/**' - - 'ci/scripts/go_*' - - 'go/**' - -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: AMD64 Debian 10 Go ${{ matrix.go }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - go: [1.15] - env: - GO: ${{ matrix.go }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: archery docker run debian-go - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push debian-go - - windows: - name: AMD64 Windows 2019 Go ${{ matrix.go }} - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - go: [1.15] - steps: - - name: Install go - uses: actions/setup-go@v1 - with: - go-version: ${{ matrix.go }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Build - shell: bash - run: ci/scripts/go_build.sh . - - name: Test - shell: bash - run: ci/scripts/go_test.sh . 
- - macos: - name: AMD64 MacOS 10.15 Go ${{ matrix.go }} - runs-on: macos-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - go: [1.15] - steps: - - name: Install go - uses: actions/setup-go@v1 - with: - go-version: ${{ matrix.go }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Build - shell: bash - run: ci/scripts/go_build.sh . - - name: Test - shell: bash - run: ci/scripts/go_test.sh . diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml deleted file mode 100644 index 20112553ea25d..0000000000000 --- a/.github/workflows/integration.yml +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Integration - -on: - push: - paths: - - '.github/workflows/integration.yml' - - 'ci/**' - - 'dev/archery/**' - - 'go/**' - - 'integration/**' - - 'js/**' - - 'cpp/**' - - 'java/**' - - 'format/**' - - 'rust/**' - pull_request: - paths: - - '.github/workflows/integration.yml' - - 'ci/**' - - 'dev/archery/**' - - 'go/**' - - 'integration/**' - - 'js/**' - - 'cpp/**' - - 'java/**' - - 'format/**' - - 'rust/**' - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: AMD64 Conda Integration Test - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: conda-${{ hashFiles('cpp/**') }} - restore-keys: conda- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: archery docker run conda-integration - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push conda-integration diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml deleted file mode 100644 index 7f6f29f0f4440..0000000000000 --- a/.github/workflows/java.yml +++ /dev/null @@ -1,112 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Java - -on: - push: - paths: - - '.github/workflows/java.yml' - - 'ci/docker/*java*' - - 'ci/scripts/java*.sh' - - 'ci/scripts/util_*.sh' - - 'format/Flight.proto' - - 'java/**' - pull_request: - paths: - - '.github/workflows/java.yml' - - 'ci/docker/*java*' - - 'ci/scripts/java*.sh' - - 'ci/scripts/util_*.sh' - - 'format/Flight.proto' - - 'java/**' - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - debian: - name: AMD64 Debian 9 Java JDK ${{ matrix.jdk }} Maven ${{ matrix.maven }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - jdk: [11] - maven: [3.6.2] - env: - JDK: ${{ matrix.jdk }} - MAVEN: ${{ matrix.maven }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - shell: bash - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: maven-${{ hashFiles('java/**') }} - restore-keys: maven- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: archery docker run debian-java - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push debian-java - - macos: - name: AMD64 MacOS 10.15 Java JDK ${{ matrix.jdk }} - runs-on: macos-latest - if: github.event_name == 'push' - strategy: - fail-fast: false - matrix: - jdk: [11] - steps: - - name: Set up Java - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.jdk }} - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Build - shell: bash - run: ci/scripts/java_build.sh $(pwd) $(pwd)/build - - name: Test - shell: bash - run: ci/scripts/java_test.sh $(pwd) $(pwd)/build diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml deleted file mode 100644 index 5f25e8c053d8c..0000000000000 --- a/.github/workflows/java_jni.yml +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Java JNI - -on: - push: - paths: - - '.github/workflows/java_jni.yml' - - 'ci/docker/**' - - 'ci/scripts/cpp_build.sh' - - 'ci/scripts/java_*' - - 'cpp/**' - - 'java/**' - pull_request: - paths: - - '.github/workflows/java_jni.yml' - - 'ci/docker/**' - - 'ci/scripts/cpp_build.sh' - - 'ci/scripts/java_*' - - 'cpp/**' - - 'java/**' - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: AMD64 Debian 9 Java JNI (Gandiva, Plasma, ORC, Dataset) - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - jdk: [8] - maven: [3.5.2] - env: - JDK: ${{ matrix.jdk }} - MAVEN: ${{ matrix.maven }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: maven-${{ hashFiles('java/**') }} - restore-keys: maven- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: archery docker run debian-java-jni - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push debian-java-jni diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml deleted file mode 100644 index 354c45c60d308..0000000000000 --- a/.github/workflows/js.yml +++ /dev/null @@ -1,122 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: NodeJS - -on: - push: - paths: - - '.github/workflows/js.yml' - - 'ci/docker/*js.dockerfile' - - 'ci/scripts/js_*' - - 'js/**' - pull_request: - paths: - - '.github/workflows/js.yml' - - 'ci/docker/*js.dockerfile' - - 'ci/scripts/js_*' - - 'js/**' - -env: - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: AMD64 Debian 10 NodeJS 14 - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run debian-js - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push debian-js - - macos: - name: AMD64 MacOS 10.15 NodeJS ${{ matrix.node }} - runs-on: macos-latest - if: github.event_name == 'push' - strategy: - fail-fast: false - matrix: - node: [14] - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install NodeJS - uses: actions/setup-node@v1 - with: - node-version: ${{ matrix.node }} - - name: Build - shell: bash - run: ci/scripts/js_build.sh $(pwd) - - name: Test - shell: bash - run: ci/scripts/js_test.sh $(pwd) - - # TODO(kszucs): the windows build fails with platform specific npm error - # windows: - # name: AMD64 Windows 2019 NodeJS ${{ matrix.node }} - # runs-on: windows-latest - # if: github.event_name == 'push' - # strategy: - # fail-fast: false - # matrix: - # node: [14] - # steps: - # - name: Checkout Arrow - # uses: actions/checkout@v1 - # with: - # submodules: true - # - name: Install NodeJS - # uses: actions/setup-node@v1 - # with: - # node-version: ${{ matrix.node }} - # - name: Install Platform Dependencies - # shell: bash - # run: yarn add -g cross-env - # - name: Build - # shell: bash - # run: ci/scripts/js_build.sh $(pwd) - # - name: Test - # shell: bash - # run: ci/scripts/js_test.sh $(pwd) diff --git a/.github/workflows/julia.yml b/.github/workflows/julia.yml deleted file mode 100644 index 64ea6c947a15b..0000000000000 --- a/.github/workflows/julia.yml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Julia -on: - push: - paths: - - '.github/workflows/julia.yml' - - 'julia/**' - pull_request: - paths: - - '.github/workflows/julia.yml' - - 'julia/**' - -jobs: - test: - name: AMD64 ${{ matrix.os }} Julia ${{ matrix.version }} - env: - JULIA_NUM_THREADS: 2 - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - version: - - '1.3' - - '1' # automatically expands to the latest stable 1.x release of Julia - - 'nightly' - os: - - ubuntu-latest - - windows-latest - steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 - with: - version: ${{ matrix.version }} - arch: x64 - - uses: julia-actions/julia-runtest@v1 - with: - project: julia/Arrow diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml deleted file mode 100644 index 9062e93e66515..0000000000000 --- a/.github/workflows/python.yml +++ /dev/null @@ -1,154 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Python - -on: - push: - paths: - - '.github/workflows/python.yml' - - 'ci/**' - - 'cpp/**' - - 'python/**' - pull_request: - paths: - - '.github/workflows/python.yml' - - 'ci/**' - - 'cpp/**' - - 'python/**' - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - name: - - conda-python-3.8-nopandas - - conda-python-3.6-pandas-0.23 - - conda-python-3.7-pandas-latest - include: - - name: conda-python-3.8-nopandas - cache: conda-python-3.8 - image: conda-python - title: AMD64 Conda Python 3.8 Without Pandas - python: 3.8 - - name: conda-python-3.6-pandas-0.23 - cache: conda-python-3.6 - image: conda-python-pandas - title: AMD64 Conda Python 3.6 Pandas 0.23 - python: 3.6 - pandas: 0.23 - - name: conda-python-3.7-pandas-latest - cache: conda-python-3.7 - image: conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas latest - python: 3.7 - pandas: latest - env: - PYTHON: ${{ matrix.python || 3.7 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - PANDAS: ${{ matrix.pandas || 'latest' }} - NUMPY: ${{ matrix.numpy || 'latest' }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.cache }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - 
name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} - - macos: - name: AMD64 MacOS 10.15 Python 3 - runs-on: macos-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - env: - ARROW_HOME: /usr/local - ARROW_DATASET: ON - ARROW_JEMALLOC: ON - ARROW_ORC: ON - ARROW_PYTHON: ON - ARROW_PLASMA: ON - ARROW_GANDIVA: ON - ARROW_PARQUET: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_LZ4: ON - ARROW_WITH_BZ2: ON - ARROW_WITH_ZSTD: ON - ARROW_WITH_SNAPPY: ON - ARROW_WITH_BROTLI: ON - ARROW_BUILD_TESTS: OFF - CMAKE_ARGS: "-DPython3_EXECUTABLE=/usr/local/bin/python3" - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Dependencies - shell: bash - run: | - rm -f /usr/local/bin/2to3 - brew update --preinstall - brew unlink gcc@8 gcc@9 - brew bundle --file=cpp/Brewfile - brew install coreutils - python3 -mpip install \ - -r python/requirements-build.txt \ - -r python/requirements-test.txt - - name: Build - shell: bash - run: | - export PYTHON=python3 - ci/scripts/cpp_build.sh $(pwd) $(pwd)/build - ci/scripts/python_build.sh $(pwd) $(pwd)/build - - name: Test - shell: bash - run: ci/scripts/python_test.sh $(pwd) $(pwd)/build diff --git a/.github/workflows/python_cron.yml b/.github/workflows/python_cron.yml deleted file mode 100644 index 7a4401af1c3bf..0000000000000 --- a/.github/workflows/python_cron.yml +++ /dev/null @@ -1,141 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Python Cron - -on: - push: - paths: - - '.github/workflows/python_cron.yml' - pull_request: - paths: - - '.github/workflows/python_cron.yml' - schedule: - - cron: | - 0 */12 * * * - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }} - strategy: - fail-fast: false - matrix: - name: - - debian-10-python-3 - - fedora-33-python-3 - - ubuntu-18.04-python-3 - - conda-python-3.7-dask-latest - - conda-python-3.7-turbodbc-latest - - conda-python-3.7-kartothek-latest - - conda-python-3.7-pandas-0.24 - - conda-python-3.7-pandas-master - - conda-python-3.7-hdfs-2.9.2 - include: - - name: debian-10-python-3 - cache: debian-10-python-3 - image: debian-python - title: AMD64 Debian 10 Python 3 - debian: 10 - - name: fedora-33-python-3 - cache: fedora-33-python-3 - image: fedora-python - title: AMD64 Fedora 33 Python 3 - fedora: 33 - - name: ubuntu-18.04-python-3 - cache: ubuntu-18.04-python-3 - image: ubuntu-python - title: AMD64 Ubuntu 18.04 Python 3 - ubuntu: 18.04 - - name: conda-python-3.7-dask-latest - cache: conda-python-3.7 - image: conda-python-dask - title: AMD64 Conda Python 3.7 Dask latest - dask: latest - - name: conda-python-3.7-turbodbc-latest - cache: conda-python-3.7 - image: conda-python-turbodbc - title: AMD64 Conda Python 3.7 Turbodbc latest - turbodbc: latest - - name: conda-python-3.7-kartothek-latest - cache: conda-python-3.7 - image: conda-python-kartothek - title: AMD64 Conda Python 3.7 Kartothek latest - kartothek: latest - - name: conda-python-3.7-pandas-0.24 - cache: conda-python-3.7 - image: conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas 0.24 - pandas: 0.24 - - name: conda-python-3.7-pandas-master - cache: conda-python-3.7 - image: --no-leaf-cache conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas master - pandas: master - - name: conda-python-3.7-hdfs-2.9.2 - cache: conda-python-3.7 - image: conda-python-hdfs - title: AMD64 Conda Python 3.7 HDFS 2.9.2 - hdfs: 2.9.2 - env: - # the defaults here should correspond to the values in .env - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 33 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - PYTHON: ${{ matrix.python || 3.7 }} - HDFS: ${{ matrix.hdfs || '2.9.2' }} - DASK: ${{ matrix.dask || 'latest' }} - TURBODBC: ${{ matrix.turbodbc || 'latest' }} - PANDAS: ${{ matrix.pandas || 'latest' }} - KARTOTHEK: ${{ matrix.kartothek || 'latest' }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.cache }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} diff --git 
a/.github/workflows/r.yml b/.github/workflows/r.yml deleted file mode 100644 index 7851b6b1915e4..0000000000000 --- a/.github/workflows/r.yml +++ /dev/null @@ -1,255 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: R - -on: - push: - paths: - - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - - "ci/docker/**" - - "cpp/**" - - "r/**" - pull_request: - paths: - - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - - "ci/docker/**" - - "cpp/**" - - "r/**" - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - ubuntu: - name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - r: ["3.6"] - ubuntu: [18.04] - env: - R: ${{ matrix.r }} - UBUNTU: ${{ matrix.ubuntu }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - - name: Check pkgdown reference sections - run: ci/scripts/r_pkgdown_check.sh - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ubuntu-r - - name: Dump install logs - run: cat r/check/arrow.Rcheck/00install.out - if: always() - - name: Dump test logs - run: cat r/check/arrow.Rcheck/tests/testthat.Rout* - if: always() - - name: Save the test output - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-output - path: r/check/arrow.Rcheck/tests/testthat.Rout* - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ubuntu-r - - bundled: - name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}" - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - config: - - { org: "rstudio", image: "r-base", tag: "4.0-centos7" } - - { org: "rhub", image: "debian-gcc-devel", tag: "latest" } - env: - R_ORG: ${{ matrix.config.org }} - 
R_IMAGE: ${{ matrix.config.image }} - R_TAG: ${{ matrix.config.tag }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.config.image }}-r-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.config.image }}-r- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run r - - name: Dump install logs - run: cat r/check/arrow.Rcheck/00install.out - if: always() - - name: Dump test logs - run: cat r/check/arrow.Rcheck/tests/testthat.Rout* - if: always() - - name: Save the test output - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-output - path: r/check/arrow.Rcheck/tests/testthat.Rout* - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push r - - windows: - name: AMD64 Windows RTools ${{ matrix.rtools }} - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - rtools: [35, 40] - env: - TEST_R_WITH_ARROW: "TRUE" - ARROW_R_CXXFLAGS: "-Werror" - _R_CHECK_TESTS_NLINES_: 0 - steps: - - run: git config --global core.autocrlf false - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Make R tests verbose - # If you get a segfault/mysterious test Execution halted, - # make this `true` to see where it dies. - if: false - shell: cmd - run: | - cd r/tests - sed -i.bak -E -e 's/"arrow"/"arrow", reporter = "location"/' testthat.R - rm -f testthat.R.bak - - name: Setup ccache - shell: bash - run: | - ci/scripts/ccache_setup.sh - echo "CCACHE_DIR=$(cygpath --absolute --windows ccache)" >> $GITHUB_ENV - # We must enable actions/cache before r-lib/actions/setup-r to ensure - # using system tar instead of tar provided by Rtools. - # We can use tar provided by Rtools when we drop support for Rtools 3.5. - # Because Rtools 4.0 or later has zstd. actions/cache requires zstd - # when tar is GNU tar. 
- - name: Cache ccache - uses: actions/cache@v2 - with: - path: ccache - key: r-${{ matrix.rtools }}-ccache-mingw-${{ hashFiles('cpp/**') }} - restore-keys: r-${{ matrix.rtools }}-ccache-mingw- - # We use the makepkg-mingw setup that is included in rtools40 even when - # we use the rtools35 compilers, so we always install R 4.0/Rtools40 - - uses: r-lib/actions/setup-r@master - with: - rtools-version: 40 - r-version: "4.0" - Ncpus: 2 - - uses: r-lib/actions/setup-r@master - if: ${{ matrix.rtools == 35 }} - with: - rtools-version: 35 - r-version: "3.6" - Ncpus: 2 - - name: Build Arrow C++ - shell: bash - env: - RTOOLS_VERSION: ${{ matrix.rtools }} - run: ci/scripts/r_windows_build.sh - - uses: actions/upload-artifact@v1 - with: - name: Rtools ${{ matrix.rtools }} Arrow C++ - path: libarrow.zip - - name: Install R package dependencies - shell: Rscript {0} - run: | - options(pkgType="win.binary") - install.packages(c("remotes", "rcmdcheck")) - remotes::install_deps("r", dependencies = TRUE) - - name: Check - shell: Rscript {0} - run: | - Sys.setenv( - RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "libarrow.zip"), - MAKEFLAGS = paste0("-j", parallel::detectCores()) - ) - rcmdcheck::rcmdcheck("r", - build_args = '--no-build-vignettes', - args = c('--no-manual', '--as-cran', '--ignore-vignettes', '--run-donttest'), - error_on = 'warning', - check_dir = 'check', - timeout = 3600 - ) - - name: Dump install logs - shell: cmd - run: cat check/arrow.Rcheck/00install.out - if: always() - # We can remove this when we drop support for Rtools 3.5. - - name: Ensure using system tar in actions/cache - run: | - Write-Output "${Env:windir}\System32" | ` - Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml deleted file mode 100644 index 2b99cddf8da11..0000000000000 --- a/.github/workflows/ruby.yml +++ /dev/null @@ -1,290 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: C GLib & Ruby - -on: - push: - paths: - - '.github/workflows/ruby.yml' - - 'ci/docker/**' - - 'ci/scripts/c_glib_*' - - 'ci/scripts/cpp_*' - - 'ci/scripts/msys2_*' - - 'ci/scripts/ruby_*' - - 'ci/scripts/util_*' - - 'c_glib/**' - - 'cpp/**' - - 'ruby/**' - pull_request: - paths: - - '.github/workflows/ruby.yml' - - 'ci/docker/**' - - 'ci/scripts/c_glib_*' - - 'ci/scripts/cpp_*' - - 'ci/scripts/msys2_*' - - 'ci/scripts/ruby_*' - - 'ci/scripts/util_*' - - 'c_glib/**' - - 'cpp/**' - - 'ruby/**' - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - ubuntu: - name: AMD64 Ubuntu ${{ matrix.ubuntu }} GLib & Ruby - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - ubuntu: - - 18.04 - - 20.04 - env: - UBUNTU: ${{ matrix.ubuntu }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - shell: bash - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ubuntu-ruby - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - shell: bash - run: archery docker push ubuntu-ruby - - macos: - name: AMD64 MacOS 10.15 GLib & Ruby - runs-on: macos-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - env: - ARROW_BUILD_TESTS: OFF - ARROW_GANDIVA: ON - ARROW_GLIB_DEVELOPMENT_MODE: true - ARROW_GLIB_GTK_DOC: true - ARROW_HOME: /usr/local - ARROW_JEMALLOC: OFF - ARROW_ORC: OFF - ARROW_PARQUET: ON - ARROW_WITH_BROTLI: ON - ARROW_WITH_LZ4: ON - ARROW_WITH_SNAPPY: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_ZSTD: ON - XML_CATALOG_FILES: /usr/local/etc/xml/catalog - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Install Homebrew Dependencies - shell: bash - run: | - rm -f /usr/local/bin/2to3 - brew update --preinstall - brew unlink gcc@8 gcc@9 - brew bundle --file=cpp/Brewfile - brew bundle --file=c_glib/Brewfile - - name: Install Ruby Dependencies - run: | - export MAKEFLAGS="-j$(sysctl -n hw.ncpu)" - bundle install --gemfile c_glib/Gemfile - bundle install --gemfile ruby/Gemfile - for ruby_package_gemfile in ruby/*/Gemfile; do \ - bundle install --gemfile ${ruby_package_gemfile} - done - - name: Setup ccache - run: | - ci/scripts/ccache_setup.sh - - name: ccache info - id: ccache-info - run: | - echo "::set-output name=cache-dir::$(ccache --get-config cache_dir)" - - name: Cache ccache - uses: actions/cache@v2 - with: - path: ${{ steps.ccache-info.outputs.cache-dir }} - key: ruby-ccache-macos-${{ hashFiles('cpp/**') }} - restore-keys: ruby-ccache-macos- - - name: Build C++ - run: | - ci/scripts/cpp_build.sh $(pwd) $(pwd)/build - - name: Build GLib - run: | - 
ci/scripts/c_glib_build.sh $(pwd) $(pwd)/build - - name: Test GLib - shell: bash - run: ci/scripts/c_glib_test.sh $(pwd) $(pwd)/build - - name: Test Ruby - shell: bash - run: ci/scripts/ruby_test.sh $(pwd) $(pwd)/build - - windows: - name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} GLib & Ruby - runs-on: windows-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - mingw-n-bits: - - 64 - ruby-version: - - 2.6 - env: - ARROW_BUILD_SHARED: ON - ARROW_BUILD_STATIC: OFF - ARROW_BUILD_TESTS: OFF - ARROW_BUILD_TYPE: release - ARROW_DATASET: ON - ARROW_FLIGHT: ON - ARROW_GANDIVA: ON - ARROW_HDFS: OFF - ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} - ARROW_JEMALLOC: OFF - ARROW_PARQUET: ON - ARROW_PYTHON: OFF - ARROW_S3: ON - ARROW_USE_GLOG: OFF - ARROW_WITH_BROTLI: ON - ARROW_WITH_BZ2: ON - ARROW_WITH_LZ4: ON - ARROW_WITH_SNAPPY: ON - ARROW_WITH_ZLIB: ON - ARROW_WITH_ZSTD: ON - # Don't use preinstalled Boost by empty BOOST_ROOT and - # -DBoost_NO_BOOST_CMAKE=ON - BOOST_ROOT: "" - CMAKE_ARGS: >- - -DARROW_PACKAGE_PREFIX=/mingw${{ matrix.mingw-n-bits }} - -DBoost_NO_BOOST_CMAKE=ON - CMAKE_UNITY_BUILD: ON - steps: - - name: Disable Crash Dialogs - run: | - reg add ` - "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" ` - /v DontShowUI ` - /t REG_DWORD ` - /d 1 ` - /f - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - shell: bash - run: ci/scripts/util_checkout.sh - - name: Setup Ruby - uses: ruby/setup-ruby@v1 - with: - ruby-version: ${{ matrix.ruby-version }} - - name: Upgrade MSYS2 - run: | - ridk exec bash ci\scripts\msys2_system_upgrade.sh - taskkill /F /FI "MODULES eq msys-2.0.dll" - - name: Clean MSYS2 - run: | - ridk exec bash ci\scripts\msys2_system_clean.sh - - name: Setup MSYS2 - run: | - ridk exec bash ci\scripts\msys2_setup.sh ruby - - name: Cache ccache - uses: actions/cache@v2 - with: - path: ccache - key: ruby-ccache-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('cpp/**') }} - restore-keys: ruby-ccache-mingw${{ matrix.mingw-n-bits }}- - - name: Build C++ - run: | - $Env:CMAKE_BUILD_PARALLEL_LEVEL = $Env:NUMBER_OF_PROCESSORS - $source_dir = "$(ridk exec cygpath --unix "$(Get-Location)")" - $build_dir = "$(ridk exec cygpath --unix "$(Get-Location)\build")" - $ErrorActionPreference = "Continue" - ridk exec bash ci\scripts\cpp_build.sh "${source_dir}" "${build_dir}" - - name: Build GLib - run: | - $Env:CMAKE_BUILD_PARALLEL_LEVEL = $Env:NUMBER_OF_PROCESSORS - $source_dir = "$(ridk exec cygpath --unix "$(Get-Location)")" - $build_dir = "$(ridk exec cygpath --unix "$(Get-Location)\build")" - $ErrorActionPreference = "Continue" - ridk exec bash ci\scripts\c_glib_build.sh "${source_dir}" "${build_dir}" - - name: RubyGems info - id: rubygems-info - run: | - Write-Output "::set-output name=gem-dir::$(ridk exec gem env gemdir)" - - name: Cache RubyGems - uses: actions/cache@v2 - with: - path: ${{ steps.rubygems-info.outputs.gem-dir }} - key: ruby-rubygems-mingw${{ matrix.mingw-n-bits }}-${{ hashFiles('**/Gemfile', 'ruby/*/*.gemspec') }} - restore-keys: ruby-rubygems-mingw${{ matrix.mingw-n-bits }}- - - name: Install test dependencies - run: | - bundle install --gemfile c_glib\Gemfile - bundle install --gemfile ruby\Gemfile - Get-ChildItem ruby\*\Gemfile | ` - ForEach-Object {bundle install --gemfile $_} - - name: Test GLib - run: | - $source_dir = "$(ridk exec cygpath --unix "$(Get-Location)")" - $build_dir = "$(ridk exec cygpath --unix "$(Get-Location)\build")" - 
$ErrorActionPreference = "Continue" - ridk exec bash ci\scripts\c_glib_test.sh "${source_dir}" "${build_dir}" - - name: Test Ruby - run: | - $Env:PKG_CONFIG_PATH = ` - "$(ridk exec cygpath --absolute --windows "${Env:ARROW_HOME}/lib/pkgconfig")" - $Env:GI_TYPELIB_PATH = ` - "$(ridk exec cygpath --absolute --windows "${Env:ARROW_HOME}/lib/girepository-1.0")" - $Env:RUBYOPTS = "-rdevkit" - $Env:MAKE = "ridk exec make" - $ErrorActionPreference = "Continue" - rake -f ruby\Rakefile diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4bb17a2ecafa5..ac2003608bb3f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -18,16 +18,9 @@ name: Rust on: + # always trigger push: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' pull_request: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' jobs: @@ -70,14 +63,13 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cd rust cargo build # Ballista is currently not part of the main workspace so requires a separate build step - name: Build Ballista run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust + cd ballista/rust # snmalloc requires cmake so build without default features cargo build --no-default-features @@ -96,8 +88,6 @@ jobs: # Disable full debug symbol generation to speed up CI build and keep memory down # "1" means line tables only, which is useful for panic tracebacks. RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data steps: - uses: actions/checkout@v2 with: @@ -123,7 +113,8 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cd rust + export ARROW_TEST_DATA=$(pwd)/testing/data + export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data # run tests on all workspace members with default feature list cargo test # test datafusion examples @@ -131,66 +122,17 @@ jobs: cargo test --no-default-features cargo run --example csv_sql cargo run --example parquet_sql - cd .. - cd arrow - # re-run tests on arrow workspace with additional features - cargo test --features=prettyprint - cargo run --example builders - cargo run --example dynamic_types - cargo run --example read_csv - cargo run --example read_csv_infer_schema # Ballista is currently not part of the main workspace so requires a separate test step - name: Run Ballista tests run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust + export ARROW_TEST_DATA=$(pwd)/testing/data + export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data + cd ballista/rust # snmalloc requires cmake so build without default features cargo test --no-default-features - # test the --features "simd" of the arrow crate. This requires nightly. - linux-test-simd: - name: Test SIMD on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-03-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo test --features "simd" - windows-and-macos: name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} runs-on: ${{ matrix.os }} @@ -213,12 +155,26 @@ jobs: shell: bash run: | export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data + export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data # do not produce debug symbols to keep memory usage down export RUSTFLAGS="-C debuginfo=0" - cd rust cargo test + lint: + name: Lint + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v2 + - name: Setup toolchain + run: | + rustup toolchain install stable + rustup default stable + rustup component add rustfmt + - name: Run + run: cargo fmt --all -- --check + clippy: name: Clippy needs: [linux-build-lib] @@ -258,7 +214,6 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cd rust cargo clippy --all-targets --workspace -- -D warnings -A clippy::redundant_field_names miri-checks: @@ -290,7 +245,6 @@ jobs: RUST_LOG: 'trace' run: | export MIRIFLAGS="-Zmiri-disable-isolation" - cd rust cargo miri setup cargo clean # Ignore MIRI errors until we can get a clean run @@ -325,146 +279,12 @@ jobs: export CARGO_TARGET_DIR="/home/runner/target" export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data + export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data # 2020-11-15: There is a cargo-tarpaulin regression in 0.17.0 # see https://github.com/xd009642/tarpaulin/issues/618 cargo install --version 0.16.0 cargo-tarpaulin - cd rust cargo tarpaulin --out Xml - name: Report coverage continue-on-error: true run: bash <(curl -s https://codecov.io/bash) - - # test FFI against the C-Data interface exposed by pyarrow - pyarrow-integration-test: - name: Test Pyarrow C Data Interface - runs-on: ubuntu-latest - strategy: - matrix: - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - key: cargo-maturin-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because maturin uses different compilation flags. 
- key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests - run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - cd rust/arrow-pyarrow-integration-testing - - python -m venv venv - source venv/bin/activate - - pip install maturin==0.8.2 toml==0.10.1 pyarrow==1.0.0 - maturin develop - python -m unittest discover tests - - # test the arrow crate builds against wasm32 in stable rust - wasm32-build: - name: Build wasm32 on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-03-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - rustup target add wasm32-unknown-unknown - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo build --target wasm32-unknown-unknown - - # test the projects can build without default features - default-build: - name: Check No Defaults on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo check --all-targets --no-default-features diff --git a/.gitignore b/.gitignore index 5397fe371dfdb..31bdf49ce43ef 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,9 @@ cpp/Brewfile.lock.json target Cargo.lock +rusty-tags.vi +.history +.flatbuffers/ + +.vscode +venv/* diff --git a/.gitmodules b/.gitmodules index 86e2f0638efe9..ec5d6208b8ddb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "testing"] - path = arrow-testing - url = https://github.com/apache/arrow-testing [submodule "parquet-testing"] path = parquet-testing url = https://github.com/apache/parquet-testing.git +[submodule "testing"] + path = testing + url = https://github.com/apache/arrow-testing diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2cf70cca982ff..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,165 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -dist: bionic - -language: minimal - -cache: - directories: - - $TRAVIS_BUILD_DIR/.docker - -addons: - apt: - packages: - - python3-pip - -services: - - docker - -# Note that the global "env" setting isn't inherited automatically by -# matrix entries with their own "env", so we have to insert it explicitly. -env: &global_env - ARROW_ENABLE_TIMING_TESTS: "OFF" - COMPOSE_DOCKER_CLI_BUILD: 1 - DOCKER_BUILDKIT: 0 - DOCKER_VOLUME_PREFIX: $TRAVIS_BUILD_DIR/.docker/ - -jobs: - include: - - name: "C++ on ARM" - os: linux - arch: arm64 - env: - <<: *global_env - ARCH: arm64v8 - ARROW_CI_MODULES: "CPP" - DOCKER_IMAGE_ID: ubuntu-cpp - # ARROW_USE_GLOG=OFF is needed to avoid build error caused by - # glog and CMAKE_UNITY_BUILD=ON. - # - # Disable ARROW_S3 because it often causes "No output has - # been received in the last 10m0s, this potentially indicates - # a stalled build or something wrong with the build itself." - # on Travis CI. - # - # Limiting CPP_MAKE_PARALLELISM is required to avoid random compiler - # crashes. 
- DOCKER_RUN_ARGS: >- - " - -e ARROW_BUILD_STATIC=OFF - -e ARROW_ORC=OFF - -e ARROW_S3=OFF - -e ARROW_USE_GLOG=OFF - -e CMAKE_UNITY_BUILD=ON - -e CPP_MAKE_PARALLELISM=4 - " - # The LLVM's APT repository provides only arm64 binaries. - # We should use LLVM provided by Ubuntu. - LLVM: "10" - UBUNTU: "20.04" - - - name: "C++ on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "CPP" - DOCKER_IMAGE_ID: ubuntu-cpp - # Can't enable ARROW_MIMALLOC because of failures in memory pool tests. - # Can't enable ARROW_S3 because compiler is killed while compiling - # aws-sdk-cpp. - DOCKER_RUN_ARGS: >- - " - -e ARROW_BUILD_STATIC=OFF - -e ARROW_FLIGHT=ON - -e ARROW_MIMALLOC=OFF - -e ARROW_ORC=OFF - -e ARROW_PARQUET=OFF - -e ARROW_S3=OFF - -e CMAKE_UNITY_BUILD=ON - -e CPP_MAKE_PARALLELISM=4 - -e PARQUET_BUILD_EXAMPLES=OFF - -e PARQUET_BUILD_EXECUTABLES=OFF - -e Protobuf_SOURCE=BUNDLED - -e cares_SOURCE=BUNDLED - -e gRPC_SOURCE=BUNDLED - " - # The LLVM's APT repository provides only arm64 binaries. - # We should use LLVM provided by Ubuntu. - LLVM: "10" - UBUNTU: "20.04" - - - name: "Go on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "GO" - DOCKER_IMAGE_ID: debian-go - - - name: "Java on s390x" - os: linux - arch: s390x - env: - <<: *global_env - ARCH: s390x - ARROW_CI_MODULES: "JAVA" - DOCKER_IMAGE_ID: debian-java - JDK: 11 - - allow_failures: - - name: "Go on s390x" - - name: "Java on s390x" - -before_install: - - eval "$(python ci/detect-changes.py)" - - | - arrow_ci_affected=no - for arrow_ci_module in ${ARROW_CI_MODULES}; do - arrow_ci_affected_variable=ARROW_CI_${arrow_ci_module}_AFFECTED - if [ "$(eval "echo \$${arrow_ci_affected_variable}")" = "1" ]; then - arrow_ci_affected=yes - fi - done - if [ "${arrow_ci_affected}" = "no" ]; then - travis_terminate 0 - fi - -install: - - pip3 install -e dev/archery[docker] - -script: - - sudo sysctl -w kernel.core_pattern="core.%e.%p" - # This isn't allowed on Travis CI: - # /home/travis/.travis/functions: line 109: ulimit: core file size: cannot modify limit: Operation not permitted - - | - ulimit -c unlimited || : - - | - archery docker run \ - ${DOCKER_RUN_ARGS} \ - --volume ${PWD}/build:/build \ - ${DOCKER_IMAGE_ID} - -after_success: - - | - if [ "${TRAVIS_EVENT_TYPE}" = "push" -a \ - "${TRAVIS_REPO_SLUG}" = "apache/arrow" ]; then - archery docker push ${DOCKER_IMAGE_ID} || : - fi diff --git a/ballista/rust/benchmarks/tpch/README.md b/ballista/rust/benchmarks/tpch/README.md index 6d77694b91b64..20c4fc71de35d 100644 --- a/ballista/rust/benchmarks/tpch/README.md +++ b/ballista/rust/benchmarks/tpch/README.md @@ -41,7 +41,7 @@ To run the benchmarks it is necessary to have at least one Ballista scheduler an To run the scheduler from source: ```bash -cd $ARROW_HOME/rust/ballista/rust/scheduler +cd $ARROW_HOME/ballista/rust/scheduler RUST_LOG=info cargo run --release ``` @@ -50,7 +50,7 @@ By default the scheduler will bind to `0.0.0.0` and listen on port 50050. 
To run the executor from source: ```bash -cd $ARROW_HOME/rust/ballista/rust/executor +cd $ARROW_HOME/ballista/rust/executor RUST_LOG=info cargo run --release ``` @@ -65,7 +65,7 @@ RUST_LOG=info RUSTFLAGS='-C target-cpu=native -C lto -C codegen-units=1 -C embed To run the benchmarks: ```bash -cd $ARROW_HOME/rust/ballista/rust/benchmarks/tpch +cd $ARROW_HOME/ballista/rust/benchmarks/tpch cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl ``` diff --git a/dev/.gitignore b/dev/.gitignore index b0792939f2ad2..399c30926260c 100644 --- a/dev/.gitignore +++ b/dev/.gitignore @@ -18,3 +18,4 @@ # Python virtual environments for dev tools .venv*/ +__pycache__ diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index bcaddf1c795d6..4bbde75b74cf4 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -289,8 +289,7 @@ def decorate_lint_command(cmd): @archery.command(short_help="Check Arrow source tree for errors") -@click.option("--src", metavar="", default=None, - callback=validate_arrow_sources, +@click.option("--src", metavar="", default=".", help="Specify Arrow source directory") @click.option("--fix", is_flag=True, type=BOOL, default=False, help="Toggle fixing the lint errors if the linter supports it.") @@ -301,6 +300,8 @@ def decorate_lint_command(cmd): @decorate_lint_command @click.pass_context def lint(ctx, src, fix, iwyu_all, **checks): + src = ArrowSources(src) + if checks.pop('all'): # "--all" is given => enable all non-selected checks for k, v in checks.items(): diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index d30b4f152e543..1ae0fe025049a 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -45,13 +45,7 @@ def __init__(self, path): ---------- path : src """ - path = Path(path) - # validate by checking a specific path in the arrow source tree - if not (path / 'cpp' / 'CMakeLists.txt').exists(): - raise InvalidArrowSource( - "No Arrow C++ sources found in {}.".format(path) - ) - self.path = path + self.path = Path(path) @property def archery(self): diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 68f5668098e7d..ead9c8db16f4b 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -12,37 +12,6 @@ ci/etc/rprofile ci/etc/*.patch ci/vcpkg/*.patch CHANGELOG.md -cpp/CHANGELOG_PARQUET.md -cpp/src/arrow/io/mman.h -cpp/src/arrow/util/random.h -cpp/src/arrow/status.cc -cpp/src/arrow/status.h -cpp/src/arrow/vendored/* -cpp/build-support/asan_symbolize.py -cpp/build-support/cpplint.py -cpp/build-support/lint_exclusions.txt -cpp/build-support/iwyu/* -cpp/cmake_modules/FindPythonLibsNew.cmake -cpp/cmake_modules/SnappyCMakeLists.txt -cpp/cmake_modules/SnappyConfig.h -cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake -cpp/src/parquet/.parquetcppversion -cpp/src/generated/parquet_constants.cpp -cpp/src/generated/parquet_constants.h -cpp/src/generated/parquet_types.cpp -cpp/src/generated/parquet_types.h -cpp/src/plasma/thirdparty/ae/ae.c -cpp/src/plasma/thirdparty/ae/ae.h -cpp/src/plasma/thirdparty/ae/ae_epoll.c -cpp/src/plasma/thirdparty/ae/ae_evport.c -cpp/src/plasma/thirdparty/ae/ae_kqueue.c -cpp/src/plasma/thirdparty/ae/ae_select.c -cpp/src/plasma/thirdparty/ae/config.h -cpp/src/plasma/thirdparty/ae/zmalloc.h -cpp/src/plasma/thirdparty/dlmalloc.c -cpp/thirdparty/flatbuffers/include/flatbuffers/base.h 
-cpp/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h -cpp/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h dev/requirements*.txt dev/archery/MANIFEST.in dev/archery/requirements*.txt @@ -115,56 +84,11 @@ dev/tasks/linux-packages/apache-arrow/debian/source/format dev/tasks/linux-packages/apache-arrow/debian/watch dev/tasks/requirements*.txt dev/tasks/conda-recipes/* -docs/requirements.txt -go/arrow/flight/Flight_grpc.pb.go -go/arrow/go.sum -go/arrow/Gopkg.lock -go/arrow/flight/Flight.pb.go -go/arrow/flight/Flight_grpc.pb.go -go/arrow/internal/cpu/* -go/arrow/type_string.go -go/*.tmpldata -go/*.s -go/parquet/go.sum -go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go -go/parquet/internal/gen-go/parquet/parquet-consts.go -go/parquet/internal/gen-go/parquet/parquet.go -js/.npmignore -js/closure-compiler-scripts/* -js/src/fb/*.ts -js/yarn.lock -js/.eslintignore -python/cmake_modules -python/cmake_modules/FindPythonLibsNew.cmake -python/cmake_modules/SnappyCMakeLists.txt -python/cmake_modules/SnappyConfig.h -python/MANIFEST.in -python/manylinux1/.dockerignore -python/pyarrow/includes/__init__.pxd -python/pyarrow/tests/__init__.py -python/pyarrow/vendored/* -python/requirements*.txt pax_global_header MANIFEST.in __init__.pxd __init__.py requirements.txt -csharp/.gitattributes -csharp/dummy.git/* -csharp/src/Apache.Arrow/Flatbuf/* -csharp/Apache.Arrow.sln -csharp/examples/FluentBuilderExample/FluentBuilderExample.csproj -csharp/examples/Examples.sln -csharp/src/Apache.Arrow/Apache.Arrow.csproj -csharp/src/Apache.Arrow/Properties/Resources.Designer.cs -csharp/src/Apache.Arrow/Properties/Resources.resx -csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj -csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -csharp/test/Apache.Arrow.Tests/app.config -csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj -csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj -csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj -csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj *.html *.sgml *.css @@ -173,35 +97,8 @@ csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj *.svg *.devhelp2 *.scss -r/R/arrowExports.R -r/src/arrowExports.cpp -r/DESCRIPTION -r/LICENSE.md -r/NAMESPACE -r/.Rbuildignore -r/arrow.Rproj -r/README.md -r/README.Rmd -r/man/*.Rd -r/cran-comments.md -r/vignettes/*.Rmd -r/tests/testthat/test-*.txt -r/inst/include/cpp11.hpp -r/inst/include/cpp11/*.hpp .gitattributes -ruby/red-arrow/.yardopts -rust/arrow/test/data/*.csv -rust/rust-toolchain -rust/arrow-flight/src/arrow.flight.protocol.rs -julia/Arrow/Project.toml -julia/Arrow/README.md -julia/Arrow/docs/Manifest.toml -julia/Arrow/docs/Project.toml -julia/Arrow/docs/make.jl -julia/Arrow/docs/mkdocs.yml -julia/Arrow/docs/src/index.md -julia/Arrow/docs/src/manual.md -julia/Arrow/docs/src/reference.md -rust/ballista/rust/benchmarks/tpch/queries/q*.sql -rust/ballista/rust/scheduler/testdata/* -rust/ballista/ui/scheduler/yarn.lock +rust-toolchain +ballista/rust/benchmarks/tpch/queries/q*.sql +ballista/rust/scheduler/testdata/* +ballista/ui/scheduler/yarn.lock diff --git a/arrow-testing b/testing similarity index 100% rename from arrow-testing rename to testing From 4123600d2708241d2c6bb70e02726c3e06e0555e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 19 Apr 2021 22:44:10 -0600 Subject: [PATCH 019/329] [Ballista] Fix Ballista IT (#5) --- .dockerignore | 42 ++----------------- ballista/.dockerignore | 18 
-------- ballista/docker/README.md | 29 ------------- {ballista/dev => dev}/build-rust-base.sh | 2 +- {ballista/dev => dev}/build-rust.sh | 2 +- {ballista => dev}/docker/rust-base.dockerfile | 2 +- {ballista => dev}/docker/rust.dockerfile | 30 ++++++++++--- {ballista/dev => dev}/integration-tests.sh | 3 +- 8 files changed, 32 insertions(+), 96 deletions(-) delete mode 100644 ballista/.dockerignore delete mode 100644 ballista/docker/README.md rename {ballista/dev => dev}/build-rust-base.sh (96%) rename {ballista/dev => dev}/build-rust.sh (96%) rename {ballista => dev}/docker/rust-base.dockerfile (99%) rename {ballista => dev}/docker/rust.dockerfile (64%) rename {ballista/dev => dev}/integration-tests.sh (93%) diff --git a/.dockerignore b/.dockerignore index eb71138c679a5..9a64a123f7353 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,42 +21,6 @@ # This setup requires to all of our docker containers have arrow's source # as a mounted directory. -# exclude everything -** - -# include explicitly -!ci/** -!c_glib/Gemfile -!dev/archery/requirements*.txt -!python/requirements*.txt -!python/manylinux1/** -!python/manylinux2010/** -!r/DESCRIPTION -!ruby/Gemfile -!ruby/red-arrow/Gemfile -!ruby/red-arrow/lib/arrow/version.rb -!ruby/red-arrow/red-arrow.gemspec -!ruby/red-arrow-cuda/Gemfile -!ruby/red-arrow-cuda/lib/arrow-cuda/version.rb -!ruby/red-arrow-cuda/red-arrow-cuda.gemspec -!ruby/red-gandiva/Gemfile -!ruby/red-gandiva/lib/gandiva/version.rb -!ruby/red-gandiva/red-gandiva.gemspec -!ruby/red-parquet/Gemfile -!ruby/red-parquet/lib/parquet/version.rb -!ruby/red-parquet/red-parquet.gemspec -!ruby/red-plasma/Gemfile -!ruby/red-plasma/lib/plasma/version.rb -!ruby/red-plasma/red-plasma.gemspec -!rust/Cargo.toml -!rust/benchmarks/Cargo.toml -!rust/arrow/Cargo.toml -!rust/arrow/benches -!rust/arrow-flight/Cargo.toml -!rust/parquet/Cargo.toml -!rust/parquet/build.rs -!rust/parquet_derive/Cargo.toml -!rust/parquet_derive_test/Cargo.toml -!rust/datafusion/Cargo.toml -!rust/datafusion/benches -!rust/integration-testing/Cargo.toml +ci +dev +**/target/* diff --git a/ballista/.dockerignore b/ballista/.dockerignore deleted file mode 100644 index 3cde49e0a0c4c..0000000000000 --- a/ballista/.dockerignore +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -rust/**/target diff --git a/ballista/docker/README.md b/ballista/docker/README.md deleted file mode 100644 index 8417d04c49220..0000000000000 --- a/ballista/docker/README.md +++ /dev/null @@ -1,29 +0,0 @@ - - -# Ballista Docker Images - -Pre-built docker images are available from [Docker Hub](https://hub.docker.com/orgs/ballistacompute/repositories) but here are the commands to build the images from source. - -Run these commands from the root directory of the project. 
- -```bash -./dev/build-all.sh -``` - diff --git a/ballista/dev/build-rust-base.sh b/dev/build-rust-base.sh similarity index 96% rename from ballista/dev/build-rust-base.sh rename to dev/build-rust-base.sh index ee4b32c8e690a..e424909fb6f10 100755 --- a/ballista/dev/build-rust-base.sh +++ b/dev/build-rust-base.sh @@ -18,4 +18,4 @@ # under the License. BALLISTA_VERSION=0.4.2-SNAPSHOT set -e -docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f docker/rust-base.dockerfile . +docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f dev/docker/rust-base.dockerfile . diff --git a/ballista/dev/build-rust.sh b/dev/build-rust.sh similarity index 96% rename from ballista/dev/build-rust.sh rename to dev/build-rust.sh index 1916f8efbefb1..d31c5241c6f13 100755 --- a/ballista/dev/build-rust.sh +++ b/dev/build-rust.sh @@ -21,4 +21,4 @@ BALLISTA_VERSION=0.4.2-SNAPSHOT set -e -docker build -t ballistacompute/ballista-rust:$BALLISTA_VERSION -f docker/rust.dockerfile . +docker build -t ballistacompute/ballista-rust:$BALLISTA_VERSION -f dev/docker/rust.dockerfile . diff --git a/ballista/docker/rust-base.dockerfile b/dev/docker/rust-base.dockerfile similarity index 99% rename from ballista/docker/rust-base.dockerfile rename to dev/docker/rust-base.dockerfile index 4519225d2197c..31620b38cf39b 100644 --- a/ballista/docker/rust-base.dockerfile +++ b/dev/docker/rust-base.dockerfile @@ -23,7 +23,7 @@ # Base image extends debian:buster-slim -FROM rust:1.49.0-buster AS builder +FROM rust:1.51.0-buster AS builder RUN apt update && apt -y install musl musl-dev musl-tools libssl-dev openssl diff --git a/ballista/docker/rust.dockerfile b/dev/docker/rust.dockerfile similarity index 64% rename from ballista/docker/rust.dockerfile rename to dev/docker/rust.dockerfile index 8b06af3dc78fc..19dd4879eab6f 100644 --- a/ballista/docker/rust.dockerfile +++ b/dev/docker/rust.dockerfile @@ -22,13 +22,21 @@ # as a mounted directory. ARG RELEASE_FLAG=--release -FROM ballistacompute/rust-base:0.4.0-20210213 AS base +FROM ballistacompute/rust-base:0.4.2-SNAPSHOT AS base WORKDIR /tmp/ballista RUN apt-get -y install cmake RUN cargo install cargo-chef FROM base as planner -COPY rust . +RUN mkdir /tmp/ballista/ballista +RUN mkdir /tmp/ballista/benchmarks +RUN mkdir /tmp/ballista/datafusion +RUN mkdir /tmp/ballista/datafusion-examples +ADD Cargo.toml . +COPY benchmarks ./benchmarks/ +COPY datafusion ./datafusion/ +COPY datafusion-examples ./datafusion-examples/ +COPY ballista ./ballista/ RUN cargo chef prepare --recipe-path recipe.json FROM base as cacher @@ -36,25 +44,35 @@ COPY --from=planner /tmp/ballista/recipe.json recipe.json RUN cargo chef cook $RELEASE_FLAG --recipe-path recipe.json FROM base as builder -COPY rust . +RUN mkdir /tmp/ballista/ballista +RUN mkdir /tmp/ballista/benchmarks +RUN mkdir /tmp/ballista/datafusion +RUN mkdir /tmp/ballista/datafusion-examples +ADD Cargo.toml . +COPY benchmarks ./benchmarks/ +COPY datafusion ./datafusion/ +COPY ballista ./ballista/ +COPY datafusion-examples ./datafusion-examples/ COPY --from=cacher /tmp/ballista/target target ARG RELEASE_FLAG=--release # force build.rs to run to generate configure_me code. 
ENV FORCE_REBUILD='true' RUN cargo build $RELEASE_FLAG +RUN cd ballista/rust && \ + cargo build $RELEASE_FLAG # put the executor on /executor (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-executor /executor; else mv /tmp/ballista/target/release/ballista-executor /executor; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/ballista-executor /executor; else mv /tmp/ballista/ballista/rust/target/release/ballista-executor /executor; fi # put the scheduler on /scheduler (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-scheduler /scheduler; else mv /tmp/ballista/target/release/ballista-scheduler /scheduler; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/ballista-scheduler /scheduler; else mv /tmp/ballista/ballista/rust/target/release/ballista-scheduler /scheduler; fi # put the tpch on /tpch (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/tpch /tpch; else mv /tmp/ballista/target/release/tpch /tpch; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/tpch /tpch; else mv /tmp/ballista/ballista/rust/target/release/tpch /tpch; fi # Copy the binary into a new container for a smaller docker image FROM ballistacompute/rust-base:0.4.0-20210213 diff --git a/ballista/dev/integration-tests.sh b/dev/integration-tests.sh similarity index 93% rename from ballista/dev/integration-tests.sh rename to dev/integration-tests.sh index cc34a5ce91f53..6ed764ecda8ad 100755 --- a/ballista/dev/integration-tests.sh +++ b/dev/integration-tests.sh @@ -17,8 +17,9 @@ # specific language governing permissions and limitations # under the License. set -e +./dev/build-rust-base.sh ./dev/build-rust.sh -pushd rust/benchmarks/tpch +pushd ballista/rust/benchmarks/tpch ./tpch-gen.sh docker-compose up -d From 7136cad2156ee588e43e145fd113d36ad7ed316b Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 20 Apr 2021 13:11:05 +0200 Subject: [PATCH 020/329] Removed requirement of JIRA to PRs. (#14) --- .github/workflows/dev_pr.yml | 37 ++----------- .github/workflows/dev_pr/link.js | 69 ------------------------- .github/workflows/dev_pr/title_check.js | 56 -------------------- .github/workflows/dev_pr/title_check.md | 37 ------------- 4 files changed, 3 insertions(+), 196 deletions(-) delete mode 100644 .github/workflows/dev_pr/link.js delete mode 100644 .github/workflows/dev_pr/title_check.js delete mode 100644 .github/workflows/dev_pr/title_check.md diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 7b92b89705175..7644a3313299a 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -15,18 +15,11 @@ # specific language governing permissions and limitations # under the License. -name: Dev PR +name: Labeler on: - # TODO: Enable this when eps1lon/actions-label-merge-conflict is available. 
- # push: - # branches: - # - master - pull_request_target: - types: - - opened - - edited - - synchronize + push: + pull_request: jobs: process: @@ -35,30 +28,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Comment JIRA link - if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'edited') - uses: actions/github-script@v3 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const script = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/dev_pr/link.js`); - script({github, context}); - - - name: Check title - if: | - github.event_name == 'pull_request_target' && - (github.event.action == 'opened' || - github.event.action == 'edited') - uses: actions/github-script@v3 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const script = require(`${process.env.GITHUB_WORKSPACE}/.github/workflows/dev_pr/title_check.js`); - script({github, context}); - - name: Assign GitHub labels if: | github.event_name == 'pull_request_target' && diff --git a/.github/workflows/dev_pr/link.js b/.github/workflows/dev_pr/link.js deleted file mode 100644 index 550a9cd396d71..0000000000000 --- a/.github/workflows/dev_pr/link.js +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -function detectJIRAID(title) { - if (!title) { - return null; - } - const matched = /^(WIP:?\s*)?((ARROW|PARQUET)-\d+)/.exec(title); - if (!matched) { - return null; - } - return matched[2]; -} - -async function haveComment(github, context, pullRequestNumber, body) { - const options = { - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pullRequestNumber, - page: 1 - }; - while (true) { - const response = await github.issues.listComments(options); - if (response.data.some(comment => comment.body === body)) { - return true; - } - if (!/;\s*rel="next"/.test(response.headers.link || "")) { - break; - } - options.page++; - } - return false; -} - -async function commentJIRAURL(github, context, pullRequestNumber, jiraID) { - const jiraURL = `https://issues.apache.org/jira/browse/${jiraID}`; - if (await haveComment(github, context, pullRequestNumber, jiraURL)) { - return; - } - await github.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pullRequestNumber, - body: jiraURL - }); -} - -module.exports = async ({github, context}) => { - const pullRequestNumber = context.payload.number; - const title = context.payload.pull_request.title; - const jiraID = detectJIRAID(title); - if (jiraID) { - await commentJIRAURL(github, context, pullRequestNumber, jiraID); - } -}; diff --git a/.github/workflows/dev_pr/title_check.js b/.github/workflows/dev_pr/title_check.js deleted file mode 100644 index c1ebd9d3e4d5e..0000000000000 --- a/.github/workflows/dev_pr/title_check.js +++ /dev/null @@ -1,56 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -const fs = require("fs"); - -function haveJIRAID(title) { - if (!title) { - return false; - } - if (title.startsWith("MINOR: ")) { - return true; - } - return /^(WIP:?\s*)?(ARROW|PARQUET)-\d+/.test(title); -} - -async function commentOpenJIRAIssue(github, context, pullRequestNumber) { - const {data: comments} = await github.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pullRequestNumber, - per_page: 1 - }); - if (comments.length > 0) { - return; - } - const commentPath = ".github/workflows/dev_pr/title_check.md"; - const comment = fs.readFileSync(commentPath).toString(); - await github.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: pullRequestNumber, - body: comment - }); -} - -module.exports = async ({github, context}) => { - const pullRequestNumber = context.payload.number; - const title = context.payload.pull_request.title; - if (!haveJIRAID(title)) { - await commentOpenJIRAIssue(github, context, pullRequestNumber); - } -}; diff --git a/.github/workflows/dev_pr/title_check.md b/.github/workflows/dev_pr/title_check.md deleted file mode 100644 index 1db9fcf637bb0..0000000000000 --- a/.github/workflows/dev_pr/title_check.md +++ /dev/null @@ -1,37 +0,0 @@ - - -Thanks for opening a pull request! - -If this is not a [minor PR](https://github.com/apache/arrow/blob/master/CONTRIBUTING.md#Minor-Fixes). Could you open an issue for this pull request on JIRA? https://issues.apache.org/jira/browse/ARROW - -Opening JIRAs ahead of time contributes to the [Openness](http://theapacheway.com/open/#:~:text=Openness%20allows%20new%20users%20the,must%20happen%20in%20the%20open.) of the Apache Arrow project. - -Then could you also rename pull request title in the following format? - - ARROW-${JIRA_ID}: [${COMPONENT}] ${SUMMARY} - -or - - MINOR: [${COMPONENT}] ${SUMMARY} - -See also: - - * [Other pull requests](https://github.com/apache/arrow/pulls/) - * [Contribution Guidelines - How to contribute patches](https://arrow.apache.org/docs/developers/contributing.html#how-to-contribute-patches) From 2a8e2bb0a5097a894313f824453add2274436034 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 20 Apr 2021 13:12:45 +0200 Subject: [PATCH 021/329] Fixed labeler. (#13) --- .github/workflows/dev_pr/labeler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 5eb722da41867..df9d41254a932 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -16,7 +16,7 @@ # under the License. datafusion: - - rust/datafusion/**/* + - datafusion/**/* ballista: - - rust/ballista/**/* + - ballista/**/* From c365a4f59d16d39cf27b19fd2bf34a27d590db4d Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 20 Apr 2021 13:21:03 +0200 Subject: [PATCH 022/329] Allow adding issues. (#12) Also removed contributing, as that requires a re-write. 
--- .github/.dir-locals.el | 19 -------- .github/CONTRIBUTING.md | 72 ------------------------------ .github/ISSUE_TEMPLATE/config.yml | 22 --------- .github/ISSUE_TEMPLATE/question.md | 26 ----------- 4 files changed, 139 deletions(-) delete mode 100644 .github/.dir-locals.el delete mode 100644 .github/CONTRIBUTING.md delete mode 100644 .github/ISSUE_TEMPLATE/config.yml delete mode 100644 .github/ISSUE_TEMPLATE/question.md diff --git a/.github/.dir-locals.el b/.github/.dir-locals.el deleted file mode 100644 index a880e4a6bb697..0000000000000 --- a/.github/.dir-locals.el +++ /dev/null @@ -1,19 +0,0 @@ -;;; Licensed to the Apache Software Foundation (ASF) under one -;;; or more contributor license agreements. See the NOTICE file -;;; distributed with this work for additional information -;;; regarding copyright ownership. The ASF licenses this file -;;; to you under the Apache License, Version 2.0 (the -;;; "License"); you may not use this file except in compliance -;;; with the License. You may obtain a copy of the License at -;;; -;;; http://www.apache.org/licenses/LICENSE-2.0 -;;; -;;; Unless required by applicable law or agreed to in writing, -;;; software distributed under the License is distributed on an -;;; "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -;;; KIND, either express or implied. See the License for the -;;; specific language governing permissions and limitations -;;; under the License. - -((js-mode . ((indent-tabs-mode . nil) - (js-indent-level . 2)))) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md deleted file mode 100644 index bbabe35857939..0000000000000 --- a/.github/CONTRIBUTING.md +++ /dev/null @@ -1,72 +0,0 @@ - - -## Contributing to Apache Arrow - -There are many ways to contribute to Apache Arrow: - -* Contributing code (we call them "patches") -* Writing documentation (another form of code, in a way) -* Participating in discussions on JIRA or the mailing list -* Helping users of the libraries - -## Reporting bugs and asking questions - -We support GitHub issues as a lightweight way to ask questions and engage with -the Arrow developer community. We use [JIRA][3] for maintaining a queue of -development work and as the public record for work on the project. So, feel -free to open GitHub issues, but bugs and feature requests will eventually need -to end up in JIRA, either before or after completing a pull request. - -## How to contribute patches - -We prefer to receive contributions in the form of GitHub pull requests. Please -send pull requests against the [github.com/apache/arrow][4] repository following -the procedure below. - -If you are looking for some ideas on what to contribute, check out the [JIRA -issues][3] for the Apache Arrow project. Comment on the issue and/or contact -[dev@arrow.apache.org](https://lists.apache.org/list.html?dev@arrow.apache.org) -with your questions and ideas. - -If you’d like to report a bug but don’t have time to fix it, you can still post -it on JIRA, or email the mailing list -[dev@arrow.apache.org](https://lists.apache.org/list.html?dev@arrow.apache.org) - -To contribute a patch: - -1. Break your work into small, single-purpose patches if possible. It’s much -harder to merge in a large change with a lot of disjoint features. -2. If one doesn't already exist, create a JIRA for your patch on the [Arrow Project -JIRA](https://issues.apache.org/jira/browse/ARROW). -3. Submit the patch as a GitHub pull request against the master branch. 
For a -tutorial, see the GitHub guides on [forking a repo](https://help.github.com/en/articles/fork-a-repo) -and [sending a pull request](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork). So that your pull request syncs with the JIRA issue, prefix your pull request -name with the JIRA issue id (ex: [ARROW-767: [C++] Filesystem abstraction](https://github.com/apache/arrow/pull/4225)) -4. Make sure that your code passes the unit tests. You can find instructions -how to run the unit tests for each Arrow component in its respective README -file. -5. Add new unit tests for your code. - -Thank you in advance for your contributions! - -[1]: mailto:dev-subscribe@arrow.apache.org -[2]: https://github.com/apache/arrow/tree/master/format -[3]: https://issues.apache.org/jira/browse/ARROW -[4]: https://github.com/apache/arrow diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 5a05012136279..0000000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -blank_issues_enabled: false -contact_links: - - name: Report an issue - url: https://issues.apache.org/jira/browse/ARROW - about: Please report bugs and request features on JIRA. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md deleted file mode 100644 index 9c4b89c56977c..0000000000000 --- a/.github/ISSUE_TEMPLATE/question.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: Ask a question -about: Please ask questions at user@arrow.apache.org ---- - -STOP! Are you reporting a bug, a possible bug, or requesting a -feature? If so, please report under the ARROW project on the ASF JIRA -server https://issues.apache.org/jira/browse/ARROW. This JIRA server -is free to use and open to the public, but you must create an account -if it is your first time. - -See our contribution guidelines for more information: -http://arrow.apache.org/docs/developers/contributing.html - -We have GitHub issues available as a way for new contributors and -passers-by who are unfamiliar with Apache Software Foundation projects -to ask questions and interact with the project. Do not be surprised if -the first response is to open a JIRA issue or to write an e-mail to -one of the public mailing lists: - -* Development discussions: dev@arrow.apache.org (first subscribe by - sending an e-mail to dev-subscribe@arrow.apache.org). -* User discussions: user@arrow.apache.org (first subscribe by - sending an e-mail to user-subscribe@arrow.apache.org). - -Thank you! 
From abe84cfbfb6cc3e80ed314bc343ef78eae15ed9b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 21 Apr 2021 07:42:37 -0600 Subject: [PATCH 023/329] Create starting point for combined user guide for DataFusion and Ballista (#20) --- ballista/docs/user-guide/.gitignore | 2 - docs/user-guide/.gitignore | 1 + {ballista/docs => docs}/user-guide/README.md | 14 +--- {ballista/docs => docs}/user-guide/book.toml | 4 +- docs/user-guide/src/SUMMARY.md | 33 ++++++++ .../src/distributed/client-python.md | 13 +-- .../src/distributed}/client-rust.md | 0 .../user-guide/src/distributed}/clients.md | 0 .../src/distributed}/configuration.md | 0 .../user-guide/src/distributed}/deployment.md | 0 .../src/distributed}/docker-compose.md | 0 .../src/distributed}/introduction.md | 0 .../user-guide/src/distributed}/kubernetes.md | 3 +- .../user-guide/src/distributed}/standalone.md | 0 docs/user-guide/src/example-usage.md | 76 ++++++++++++++++++ {ballista/docs => docs}/user-guide/src/faq.md | 0 .../src/img/ballista-architecture.png | Bin docs/user-guide/src/introduction.md | 44 ++++++++++ docs/user-guide/src/library.md | 28 +++++++ 19 files changed, 191 insertions(+), 27 deletions(-) delete mode 100644 ballista/docs/user-guide/.gitignore create mode 100644 docs/user-guide/.gitignore rename {ballista/docs => docs}/user-guide/README.md (78%) rename {ballista/docs => docs}/user-guide/book.toml (93%) create mode 100644 docs/user-guide/src/SUMMARY.md rename ballista/docs/user-guide/src/SUMMARY.md => docs/user-guide/src/distributed/client-python.md (69%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/client-rust.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/clients.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/configuration.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/deployment.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/docker-compose.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/introduction.md (100%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/kubernetes.md (97%) rename {ballista/docs/user-guide/src => docs/user-guide/src/distributed}/standalone.md (100%) create mode 100644 docs/user-guide/src/example-usage.md rename {ballista/docs => docs}/user-guide/src/faq.md (100%) rename {ballista/docs => docs}/user-guide/src/img/ballista-architecture.png (100%) create mode 100644 docs/user-guide/src/introduction.md create mode 100644 docs/user-guide/src/library.md diff --git a/ballista/docs/user-guide/.gitignore b/ballista/docs/user-guide/.gitignore deleted file mode 100644 index e662f99e3281a..0000000000000 --- a/ballista/docs/user-guide/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -ballista-book.tgz -book \ No newline at end of file diff --git a/docs/user-guide/.gitignore b/docs/user-guide/.gitignore new file mode 100644 index 0000000000000..e9c072897d554 --- /dev/null +++ b/docs/user-guide/.gitignore @@ -0,0 +1 @@ +book \ No newline at end of file diff --git a/ballista/docs/user-guide/README.md b/docs/user-guide/README.md similarity index 78% rename from ballista/docs/user-guide/README.md rename to docs/user-guide/README.md index 9ee3e90fcf6dd..0b9278c593b1e 100644 --- a/ballista/docs/user-guide/README.md +++ b/docs/user-guide/README.md @@ -16,21 +16,15 @@ specific language governing permissions and limitations under the License. 
--> -# Ballista User Guide Source +# DataFusion User Guide Source -This directory contains the sources for the user guide that is published at https://ballistacompute.org/docs/. +This directory contains the sources for the DataFusion user guide. ## Generate HTML +To generate the user guide in HTML format, run the following commands: + ```bash cargo install mdbook mdbook build -``` - -## Deploy User Guide to Web Site - -Requires ssh certificate to be available. - -```bash -./deploy.sh ``` \ No newline at end of file diff --git a/ballista/docs/user-guide/book.toml b/docs/user-guide/book.toml similarity index 93% rename from ballista/docs/user-guide/book.toml rename to docs/user-guide/book.toml index cf1653d74554d..efb9212dfdfda 100644 --- a/ballista/docs/user-guide/book.toml +++ b/docs/user-guide/book.toml @@ -16,8 +16,8 @@ # under the License. [book] -authors = ["Andy Grove"] +authors = ["Apache Arrow"] language = "en" multilingual = false src = "src" -title = "Ballista User Guide" +title = "DataFusion User Guide" diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md new file mode 100644 index 0000000000000..e2ddcb0a4e89c --- /dev/null +++ b/docs/user-guide/src/SUMMARY.md @@ -0,0 +1,33 @@ + +# Summary + +- [Introduction](introduction.md) +- [Example Usage](example-usage.md) +- [Use as a Library](library.md) +- [Distributed](distributed/introduction.md) + - [Create a Ballista Cluster](distributed/deployment.md) + - [Docker](distributed/standalone.md) + - [Docker Compose](distributed/docker-compose.md) + - [Kubernetes](distributed/kubernetes.md) + - [Ballista Configuration](distributed/configuration.md) + - [Clients](distributed/clients.md) + - [Rust](distributed/client-rust.md) + - [Python](distributed/client-python.md) +- [Frequently Asked Questions](faq.md) \ No newline at end of file diff --git a/ballista/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/distributed/client-python.md similarity index 69% rename from ballista/docs/user-guide/src/SUMMARY.md rename to docs/user-guide/src/distributed/client-python.md index c8fc2c8bd6a67..7525c608ad233 100644 --- a/ballista/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/distributed/client-python.md @@ -16,15 +16,6 @@ specific language governing permissions and limitations under the License. --> -# Summary +# Python -- [Introduction](introduction.md) -- [Create a Ballista Cluster](deployment.md) - - [Docker](standalone.md) - - [Docker Compose](docker-compose.md) - - [Kubernetes](kubernetes.md) - - [Ballista Configuration](configuration.md) -- [Clients](clients.md) - - [Rust](client-rust.md) - - [Python](client-python.md) -- [Frequently Asked Questions](faq.md) \ No newline at end of file +Coming soon. 
\ No newline at end of file diff --git a/ballista/docs/user-guide/src/client-rust.md b/docs/user-guide/src/distributed/client-rust.md similarity index 100% rename from ballista/docs/user-guide/src/client-rust.md rename to docs/user-guide/src/distributed/client-rust.md diff --git a/ballista/docs/user-guide/src/clients.md b/docs/user-guide/src/distributed/clients.md similarity index 100% rename from ballista/docs/user-guide/src/clients.md rename to docs/user-guide/src/distributed/clients.md diff --git a/ballista/docs/user-guide/src/configuration.md b/docs/user-guide/src/distributed/configuration.md similarity index 100% rename from ballista/docs/user-guide/src/configuration.md rename to docs/user-guide/src/distributed/configuration.md diff --git a/ballista/docs/user-guide/src/deployment.md b/docs/user-guide/src/distributed/deployment.md similarity index 100% rename from ballista/docs/user-guide/src/deployment.md rename to docs/user-guide/src/distributed/deployment.md diff --git a/ballista/docs/user-guide/src/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md similarity index 100% rename from ballista/docs/user-guide/src/docker-compose.md rename to docs/user-guide/src/distributed/docker-compose.md diff --git a/ballista/docs/user-guide/src/introduction.md b/docs/user-guide/src/distributed/introduction.md similarity index 100% rename from ballista/docs/user-guide/src/introduction.md rename to docs/user-guide/src/distributed/introduction.md diff --git a/ballista/docs/user-guide/src/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md similarity index 97% rename from ballista/docs/user-guide/src/kubernetes.md rename to docs/user-guide/src/distributed/kubernetes.md index 8cd8beeb267e6..027a44d469682 100644 --- a/ballista/docs/user-guide/src/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -33,8 +33,7 @@ The k8s deployment consists of: Ballista is at an early stage of development and therefore has some significant limitations: - There is no support for shared object stores such as S3. All data must exist locally on each node in the - cluster, including where any client process runs (until - [#473](https://github.com/ballista-compute/ballista/issues/473) is resolved). + cluster, including where any client process runs. - Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a backing store. 
diff --git a/ballista/docs/user-guide/src/standalone.md b/docs/user-guide/src/distributed/standalone.md similarity index 100% rename from ballista/docs/user-guide/src/standalone.md rename to docs/user-guide/src/distributed/standalone.md diff --git a/docs/user-guide/src/example-usage.md b/docs/user-guide/src/example-usage.md new file mode 100644 index 0000000000000..ff23c96de362e --- /dev/null +++ b/docs/user-guide/src/example-usage.md @@ -0,0 +1,76 @@ + +# Example Usage + +Run a SQL query against data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // register the table + let mut ctx = ExecutionContext::new(); + ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; + + // execute and print results + let results: Vec = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Use the DataFrame API to process data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let mut ctx = ExecutionContext::new(); + let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + + let df = df.filter(col("a").lt_eq(col("b")))? + .aggregate(vec![col("a")], vec![min(col("b"))])? + .limit(100)?; + + // execute and print results + let results: Vec = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Both of these examples will produce + +```text ++---+--------+ +| a | MIN(b) | ++---+--------+ +| 1 | 2 | ++---+--------+ +``` diff --git a/ballista/docs/user-guide/src/faq.md b/docs/user-guide/src/faq.md similarity index 100% rename from ballista/docs/user-guide/src/faq.md rename to docs/user-guide/src/faq.md diff --git a/ballista/docs/user-guide/src/img/ballista-architecture.png b/docs/user-guide/src/img/ballista-architecture.png similarity index 100% rename from ballista/docs/user-guide/src/img/ballista-architecture.png rename to docs/user-guide/src/img/ballista-architecture.png diff --git a/docs/user-guide/src/introduction.md b/docs/user-guide/src/introduction.md new file mode 100644 index 0000000000000..c67fb90103d88 --- /dev/null +++ b/docs/user-guide/src/introduction.md @@ -0,0 +1,44 @@ + + +# DataFusion + +DataFusion is an extensible query execution framework, written in +Rust, that uses [Apache Arrow](https://arrow.apache.org) as its +in-memory format. + +DataFusion supports both an SQL and a DataFrame API for building +logical query plans as well as a query optimizer and execution engine +capable of parallel execution against partitioned data sources (CSV +and Parquet) using threads. + +## Use Cases + +DataFusion is used to create modern, fast and efficient data +pipelines, ETL processes, and database systems, which need the +performance of Rust and Apache Arrow and want to provide their users +the convenience of an SQL interface or a DataFrame API. + +## Why DataFusion? 
+ +* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. + diff --git a/docs/user-guide/src/library.md b/docs/user-guide/src/library.md new file mode 100644 index 0000000000000..12879b160c8f1 --- /dev/null +++ b/docs/user-guide/src/library.md @@ -0,0 +1,28 @@ + +# Using DataFusion as a library + +DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). + +To get started, add the following to your `Cargo.toml` file: + +```toml +[dependencies] +datafusion = "4.0.0-SNAPSHOT" +``` From 434fbf7ba75ee91bfdb5b8270b602959694199c1 Mon Sep 17 00:00:00 2001 From: Ruan Pearce-Authers Date: Thu, 22 Apr 2021 14:05:43 +0100 Subject: [PATCH 024/329] Use atomics for SQLMetric implementation, remove unused names (#25) --- .../src/physical_plan/hash_aggregate.rs | 16 +++----- datafusion/src/physical_plan/mod.rs | 40 ++++++++++++------- datafusion/src/physical_plan/sort.rs | 32 ++++++--------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 234265022ef79..fd20b5c65ef2a 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -18,7 +18,7 @@ //! Defines the execution plan for the hash aggregate operation use std::any::Any; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::task::{Context, Poll}; use ahash::RandomState; @@ -95,7 +95,7 @@ pub struct HashAggregateExec { /// to the partial aggregate input_schema: SchemaRef, /// Metric to track number of output rows - output_rows: Arc>, + output_rows: Arc, } fn create_schema( @@ -144,7 +144,7 @@ impl HashAggregateExec { let schema = Arc::new(schema); - let output_rows = SQLMetric::counter("outputRows"); + let output_rows = SQLMetric::counter(); Ok(HashAggregateExec { mode, @@ -253,10 +253,7 @@ impl ExecutionPlan for HashAggregateExec { fn metrics(&self) -> HashMap { let mut metrics = HashMap::new(); - metrics.insert( - "outputRows".to_owned(), - self.output_rows.lock().unwrap().clone(), - ); + metrics.insert("outputRows".to_owned(), (*self.output_rows).clone()); metrics } } @@ -292,7 +289,7 @@ pin_project! { #[pin] output: futures::channel::oneshot::Receiver>, finished: bool, - output_rows: Arc>, + output_rows: Arc, } } @@ -644,7 +641,7 @@ impl GroupedHashAggregateStream { group_expr: Vec>, aggr_expr: Vec>, input: SendableRecordBatchStream, - output_rows: Arc>, + output_rows: Arc, ) -> Self { let (tx, rx) = futures::channel::oneshot::channel(); @@ -702,7 +699,6 @@ impl Stream for GroupedHashAggregateStream { }; if let Ok(batch) = &result { - let mut output_rows = output_rows.lock().unwrap(); output_rows.add(batch.num_rows()) } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 5036dcb921bb6..80dfe6e473b68 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -18,7 +18,8 @@ //! 
Traits for physical query plan, supporting parallel execution for partitioned relations. use std::fmt::{Debug, Display}; -use std::sync::{Arc, Mutex}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; use std::{any::Any, pin::Pin}; use crate::execution::context::ExecutionContextState; @@ -58,44 +59,53 @@ pub enum MetricType { /// SQL metric such as counter (number of input or output rows) or timing information about /// a physical operator. -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct SQLMetric { - /// Metric name - name: String, /// Metric value - value: usize, + value: AtomicUsize, /// Metric type metric_type: MetricType, } +impl Clone for SQLMetric { + fn clone(&self) -> Self { + Self { + value: AtomicUsize::new(self.value.load(Ordering::Relaxed)), + metric_type: self.metric_type.clone(), + } + } +} + impl SQLMetric { + // relaxed ordering for operations on `value` poses no issues + // we're purely using atomic ops with no associated memory ops + /// Create a new metric for tracking a counter - pub fn counter(name: &str) -> Arc> { - Arc::new(Mutex::new(SQLMetric::new(name, MetricType::Counter))) + pub fn counter() -> Arc { + Arc::new(SQLMetric::new(MetricType::Counter)) } /// Create a new metric for tracking time in nanoseconds - pub fn time_nanos(name: &str) -> Arc> { - Arc::new(Mutex::new(SQLMetric::new(name, MetricType::TimeNanos))) + pub fn time_nanos() -> Arc { + Arc::new(SQLMetric::new(MetricType::TimeNanos)) } /// Create a new SQLMetric - pub fn new(name: &str, metric_type: MetricType) -> Self { + pub fn new(metric_type: MetricType) -> Self { Self { - name: name.to_owned(), - value: 0, + value: AtomicUsize::new(0), metric_type, } } /// Add to the value - pub fn add(&mut self, n: usize) { - self.value += n; + pub fn add(&self, n: usize) { + self.value.fetch_add(n, Ordering::Relaxed); } /// Get the current value pub fn value(&self) -> usize { - self.value + self.value.load(Ordering::Relaxed) } } diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 26855b354db0a..010e4068638ba 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::pin::Pin; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Instant; @@ -52,9 +52,9 @@ pub struct SortExec { /// Sort expressions expr: Vec, /// Output rows - output_rows: Arc>, + output_rows: Arc, /// Time to sort batches - sort_time_nanos: Arc>, + sort_time_nanos: Arc, } impl SortExec { @@ -66,8 +66,8 @@ impl SortExec { Ok(Self { expr, input, - output_rows: SQLMetric::counter("outputRows"), - sort_time_nanos: SQLMetric::time_nanos("sortTime"), + output_rows: SQLMetric::counter(), + sort_time_nanos: SQLMetric::time_nanos(), }) } @@ -147,14 +147,8 @@ impl ExecutionPlan for SortExec { fn metrics(&self) -> HashMap { let mut metrics = HashMap::new(); - metrics.insert( - "outputRows".to_owned(), - self.output_rows.lock().unwrap().clone(), - ); - metrics.insert( - "sortTime".to_owned(), - self.sort_time_nanos.lock().unwrap().clone(), - ); + metrics.insert("outputRows".to_owned(), (*self.output_rows).clone()); + metrics.insert("sortTime".to_owned(), (*self.sort_time_nanos).clone()); metrics } } @@ -224,7 +218,7 @@ pin_project! 
{ output: futures::channel::oneshot::Receiver>>, finished: bool, schema: SchemaRef, - output_rows: Arc>, + output_rows: Arc, } } @@ -232,8 +226,8 @@ impl SortStream { fn new( input: SendableRecordBatchStream, expr: Vec, - output_rows: Arc>, - sort_time: Arc>, + output_rows: Arc, + sort_time: Arc, ) -> Self { let (tx, rx) = futures::channel::oneshot::channel(); @@ -246,7 +240,6 @@ impl SortStream { .and_then(move |batches| { let now = Instant::now(); let result = sort_batches(&batches, &schema, &expr); - let mut sort_time = sort_time.lock().unwrap(); sort_time.add(now.elapsed().as_nanos() as usize); result }); @@ -288,7 +281,6 @@ impl Stream for SortStream { }; if let Some(Ok(batch)) = &result { - let mut output_rows = output_rows.lock().unwrap(); output_rows.add(batch.num_rows()); } @@ -431,8 +423,8 @@ mod tests { assert_eq!(DataType::Float64, *sort_exec.schema().field(1).data_type()); let result: Vec = collect(sort_exec.clone()).await?; - assert!(sort_exec.metrics().get("sortTime").unwrap().value > 0); - assert_eq!(sort_exec.metrics().get("outputRows").unwrap().value, 8); + assert!(sort_exec.metrics().get("sortTime").unwrap().value() > 0); + assert_eq!(sort_exec.metrics().get("outputRows").unwrap().value(), 8); assert_eq!(result.len(), 1); let columns = result[0].columns(); From 395d9d665800c7ea788c991b5847db828b4d88d6 Mon Sep 17 00:00:00 2001 From: Yichen Wang <18348405+Aiee@users.noreply.github.com> Date: Thu, 22 Apr 2021 21:12:08 +0800 Subject: [PATCH 025/329] Fix some typos (#31) --- datafusion/src/optimizer/constant_folding.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 2fa03eb5c7096..d63177b15908a 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Boolean comparision rule rewrites redudant comparison expression involing boolean literal into +//! Boolean comparison rule rewrites redundant comparison expression involving boolean literal into //! unary expression. use std::sync::Arc; @@ -30,7 +30,7 @@ use crate::scalar::ScalarValue; /// Optimizer that simplifies comparison expressions involving boolean literals. 
/// -/// Recursively go through all expressionss and simplify the following cases: +/// Recursively go through all expressions and simplify the following cases: /// * `expr = true` and `expr != false` to `expr` when `expr` is of boolean type /// * `expr = false` and `expr != true` to `!expr` when `expr` is of boolean type /// * `true = true` and `false = false` to `true` @@ -253,7 +253,7 @@ mod tests { } #[test] - fn optimize_expr_null_comparision() -> Result<()> { + fn optimize_expr_null_comparison() -> Result<()> { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], From 57eeb64659b9ca9c496a959f7716090fb32085b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 22 Apr 2021 15:25:45 +0200 Subject: [PATCH 026/329] [ARROW-12441] [DataFusion] Cross join implementation (#11) * Cross join implementation * Add to ballista, debug line * Add to tpch test, format * Simplify a bit * Row-by-row processing for the left side to keep memory down * Fix * Fmt * Clippy * Fix doc, don't include as much debug info in memoryexec debug * Use join * Fix doc * Add test cases with partitions * Make clear that mutex is locked for very short amount of time * Unwrap the lock --- .../core/src/serde/logical_plan/to_proto.rs | 1 + benchmarks/src/bin/tpch.rs | 5 + datafusion/README.md | 4 +- datafusion/src/logical_plan/builder.rs | 10 + datafusion/src/logical_plan/plan.rs | 27 +- datafusion/src/optimizer/constant_folding.rs | 3 +- datafusion/src/optimizer/filter_push_down.rs | 3 +- .../src/optimizer/hash_build_probe_order.rs | 27 ++ .../src/optimizer/projection_push_down.rs | 1 + datafusion/src/optimizer/utils.rs | 5 + datafusion/src/physical_plan/cross_join.rs | 318 ++++++++++++++++++ datafusion/src/physical_plan/hash_utils.rs | 5 - datafusion/src/physical_plan/memory.rs | 10 +- datafusion/src/physical_plan/mod.rs | 1 + datafusion/src/physical_plan/planner.rs | 9 +- datafusion/src/sql/planner.rs | 22 +- datafusion/tests/sql.rs | 54 ++- 17 files changed, 479 insertions(+), 26 deletions(-) create mode 100644 datafusion/src/physical_plan/cross_join.rs diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index a181f98b6eb6c..222b76739feb1 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -940,6 +940,7 @@ impl TryInto for &LogicalPlan { } LogicalPlan::Extension { .. } => unimplemented!(), LogicalPlan::Union { .. } => unimplemented!(), + LogicalPlan::CrossJoin { .. } => unimplemented!(), } } } diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 328a68dd6a6fc..b203ceb3f741a 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -1374,6 +1374,11 @@ mod tests { run_query(6).await } + #[tokio::test] + async fn run_q9() -> Result<()> { + run_query(9).await + } + #[tokio::test] async fn run_q10() -> Result<()> { run_query(10).await diff --git a/datafusion/README.md b/datafusion/README.md index 9e6b7a2a78b5b..ff0b26d7bf031 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -213,7 +213,9 @@ DataFusion also includes a simple command-line interactive SQL utility. 
See the - [ ] MINUS - [x] Joins - [x] INNER JOIN - - [ ] CROSS JOIN + - [x] LEFT JOIN + - [x] RIGHT JOIN + - [x] CROSS JOIN - [ ] OUTER JOIN - [ ] Window diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index fed82fd23b81a..b6017b743ed70 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -270,6 +270,16 @@ impl LogicalPlanBuilder { })) } } + /// Apply a cross join + pub fn cross_join(&self, right: &LogicalPlan) -> Result { + let schema = self.plan.schema().join(right.schema())?; + + Ok(Self::from(&LogicalPlan::CrossJoin { + left: Arc::new(self.plan.clone()), + right: Arc::new(right.clone()), + schema: DFSchemaRef::new(schema), + })) + } /// Repartition pub fn repartition(&self, partitioning_scheme: Partitioning) -> Result { diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index d1b9b827a5a3f..606ef1e222755 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -113,6 +113,15 @@ pub enum LogicalPlan { /// The output schema, containing fields from the left and right inputs schema: DFSchemaRef, }, + /// Apply Cross Join to two logical plans + CrossJoin { + /// Left input + left: Arc, + /// Right input + right: Arc, + /// The output schema, containing fields from the left and right inputs + schema: DFSchemaRef, + }, /// Repartition the plan based on a partitioning scheme. Repartition { /// The incoming logical plan @@ -203,6 +212,7 @@ impl LogicalPlan { LogicalPlan::Aggregate { schema, .. } => &schema, LogicalPlan::Sort { input, .. } => input.schema(), LogicalPlan::Join { schema, .. } => &schema, + LogicalPlan::CrossJoin { schema, .. } => &schema, LogicalPlan::Repartition { input, .. } => input.schema(), LogicalPlan::Limit { input, .. } => input.schema(), LogicalPlan::CreateExternalTable { schema, .. } => &schema, @@ -229,6 +239,11 @@ impl LogicalPlan { right, schema, .. + } + | LogicalPlan::CrossJoin { + left, + right, + schema, } => { let mut schemas = left.all_schemas(); schemas.extend(right.all_schemas()); @@ -290,8 +305,9 @@ impl LogicalPlan { | LogicalPlan::EmptyRelation { .. } | LogicalPlan::Limit { .. } | LogicalPlan::CreateExternalTable { .. } - | LogicalPlan::Explain { .. } => vec![], - LogicalPlan::Union { .. } => { + | LogicalPlan::CrossJoin { .. } + | LogicalPlan::Explain { .. } + | LogicalPlan::Union { .. } => { vec![] } } @@ -307,6 +323,7 @@ impl LogicalPlan { LogicalPlan::Aggregate { input, .. } => vec![input], LogicalPlan::Sort { input, .. } => vec![input], LogicalPlan::Join { left, right, .. } => vec![left, right], + LogicalPlan::CrossJoin { left, right, .. } => vec![left, right], LogicalPlan::Limit { input, .. } => vec![input], LogicalPlan::Extension { node } => node.inputs(), LogicalPlan::Union { inputs, .. } => inputs.iter().collect(), @@ -396,7 +413,8 @@ impl LogicalPlan { LogicalPlan::Repartition { input, .. } => input.accept(visitor)?, LogicalPlan::Aggregate { input, .. } => input.accept(visitor)?, LogicalPlan::Sort { input, .. } => input.accept(visitor)?, - LogicalPlan::Join { left, right, .. } => { + LogicalPlan::Join { left, right, .. } + | LogicalPlan::CrossJoin { left, right, .. } => { left.accept(visitor)? && right.accept(visitor)? } LogicalPlan::Union { inputs, .. } => { @@ -669,6 +687,9 @@ impl LogicalPlan { keys.iter().map(|(l, r)| format!("{} = {}", l, r)).collect(); write!(f, "Join: {}", join_expr.join(", ")) } + LogicalPlan::CrossJoin { .. 
} => { + write!(f, "CrossJoin:") + } LogicalPlan::Repartition { partitioning_scheme, .. diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index d63177b15908a..71c84f6153b62 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -72,7 +72,8 @@ impl OptimizerRule for ConstantFolding { | LogicalPlan::Explain { .. } | LogicalPlan::Limit { .. } | LogicalPlan::Union { .. } - | LogicalPlan::Join { .. } => { + | LogicalPlan::Join { .. } + | LogicalPlan::CrossJoin { .. } => { // apply the optimization to all inputs of the plan let inputs = plan.inputs(); let new_inputs = inputs diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index ec260a41dc574..4622e9fc62dc1 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -314,7 +314,8 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { .collect::>(); issue_filters(state, used_columns, plan) } - LogicalPlan::Join { left, right, .. } => { + LogicalPlan::Join { left, right, .. } + | LogicalPlan::CrossJoin { left, right, .. } => { let (pushable_to_left, pushable_to_right, keep) = get_join_predicates(&state, &left.schema(), &right.schema()); diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index f44050f0b72ed..086e2f03196bd 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -67,6 +67,10 @@ fn get_num_rows(logical_plan: &LogicalPlan) -> Option { // we cannot predict the cardinality of the join output None } + LogicalPlan::CrossJoin { left, right, .. } => { + // number of rows is equal to num_left * num_right + get_num_rows(left).and_then(|x| get_num_rows(right).map(|y| x * y)) + } LogicalPlan::Repartition { .. } => { // we cannot predict how rows will be repartitioned None @@ -138,6 +142,29 @@ impl OptimizerRule for HashBuildProbeOrder { }) } } + LogicalPlan::CrossJoin { + left, + right, + schema, + } => { + let left = self.optimize(left)?; + let right = self.optimize(right)?; + if should_swap_join_order(&left, &right) { + // Swap left and right + Ok(LogicalPlan::CrossJoin { + left: Arc::new(right), + right: Arc::new(left), + schema: schema.clone(), + }) + } else { + // Keep join as is + Ok(LogicalPlan::CrossJoin { + left: Arc::new(left), + right: Arc::new(right), + schema: schema.clone(), + }) + } + } // Rest: recurse into plan, apply optimization where possible LogicalPlan::Projection { .. } | LogicalPlan::Aggregate { .. } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 6b1cdfe18ca78..7243fa52d9b32 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -270,6 +270,7 @@ fn optimize_plan( | LogicalPlan::Sort { .. } | LogicalPlan::CreateExternalTable { .. } | LogicalPlan::Union { .. } + | LogicalPlan::CrossJoin { .. } | LogicalPlan::Extension { .. } => { let expr = plan.expressions(); // collect all required columns by this plan diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index fe1d023819175..0ec3fa7c02a16 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -208,6 +208,11 @@ pub fn from_plan( on: on.clone(), schema: schema.clone(), }), + LogicalPlan::CrossJoin { schema, .. 
} => Ok(LogicalPlan::CrossJoin { + left: Arc::new(inputs[0].clone()), + right: Arc::new(inputs[1].clone()), + schema: schema.clone(), + }), LogicalPlan::Limit { n, .. } => Ok(LogicalPlan::Limit { n: *n, input: Arc::new(inputs[0].clone()), diff --git a/datafusion/src/physical_plan/cross_join.rs b/datafusion/src/physical_plan/cross_join.rs new file mode 100644 index 0000000000000..4372352d6ecf9 --- /dev/null +++ b/datafusion/src/physical_plan/cross_join.rs @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the cross join plan for loading the left side of the cross join +//! and producing batches in parallel for the right partitions + +use futures::{lock::Mutex, StreamExt}; +use std::{any::Any, sync::Arc, task::Poll}; + +use crate::physical_plan::memory::MemoryStream; +use arrow::datatypes::{Schema, SchemaRef}; +use arrow::error::Result as ArrowResult; +use arrow::record_batch::RecordBatch; + +use futures::{Stream, TryStreamExt}; + +use super::{hash_utils::check_join_is_valid, merge::MergeExec}; +use crate::{ + error::{DataFusionError, Result}, + scalar::ScalarValue, +}; +use async_trait::async_trait; +use std::time::Instant; + +use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream}; +use crate::physical_plan::coalesce_batches::concat_batches; +use log::debug; + +/// Data of the left side +type JoinLeftData = RecordBatch; + +/// executes partitions in parallel and combines them into a set of +/// partitions by combining all values from the left with all values on the right +#[derive(Debug)] +pub struct CrossJoinExec { + /// left (build) side which gets loaded in memory + left: Arc<dyn ExecutionPlan>, + /// right (probe) side which are combined with left side + right: Arc<dyn ExecutionPlan>, + /// The schema once the join is applied + schema: SchemaRef, + /// Build-side data + build_side: Arc<Mutex<Option<JoinLeftData>>>, +} + +impl CrossJoinExec { + /// Tries to create a new [CrossJoinExec].
+ /// # Error + /// This function errors when left and right schema's can't be combined + pub fn try_new( + left: Arc<dyn ExecutionPlan>, + right: Arc<dyn ExecutionPlan>, + ) -> Result<Self> { + let left_schema = left.schema(); + let right_schema = right.schema(); + check_join_is_valid(&left_schema, &right_schema, &[])?; + + let left_schema = left.schema(); + let left_fields = left_schema.fields().iter(); + let right_schema = right.schema(); + + let right_fields = right_schema.fields().iter(); + + // left then right + let all_columns = left_fields.chain(right_fields).cloned().collect(); + + let schema = Arc::new(Schema::new(all_columns)); + + Ok(CrossJoinExec { + left, + right, + schema, + build_side: Arc::new(Mutex::new(None)), + }) + } + + /// left (build) side which gets loaded in memory + pub fn left(&self) -> &Arc<dyn ExecutionPlan> { + &self.left + } + + /// right side which gets combined with left side + pub fn right(&self) -> &Arc<dyn ExecutionPlan> { + &self.right + } +} + +#[async_trait] +impl ExecutionPlan for CrossJoinExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { + vec![self.left.clone(), self.right.clone()] + } + + fn with_new_children( + &self, + children: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + match children.len() { + 2 => Ok(Arc::new(CrossJoinExec::try_new( + children[0].clone(), + children[1].clone(), + )?)), + _ => Err(DataFusionError::Internal( + "CrossJoinExec wrong number of children".to_string(), + )), + } + } + + fn output_partitioning(&self) -> Partitioning { + self.right.output_partitioning() + } + + async fn execute(&self, partition: usize) -> Result<SendableRecordBatchStream> { + // we only want to compute the build side once + let left_data = { + let mut build_side = self.build_side.lock().await; + + match build_side.as_ref() { + Some(stream) => stream.clone(), + None => { + let start = Instant::now(); + + // merge all left parts into a single stream + let merge = MergeExec::new(self.left.clone()); + let stream = merge.execute(0).await?; + + // Load all batches and count the rows + let (batches, num_rows) = stream + .try_fold((Vec::new(), 0usize), |mut acc, batch| async { + acc.1 += batch.num_rows(); + acc.0.push(batch); + Ok(acc) + }) + .await?; + let merged_batch = + concat_batches(&self.left.schema(), &batches, num_rows)?; + *build_side = Some(merged_batch.clone()); + + debug!( + "Built build-side of cross join containing {} rows in {} ms", + num_rows, + start.elapsed().as_millis() + ); + + merged_batch + } + } + }; + + let stream = self.right.execute(partition).await?; + + if left_data.num_rows() == 0 { + return Ok(Box::pin(MemoryStream::try_new( + vec![], + self.schema.clone(), + None, + )?)); + } + + Ok(Box::pin(CrossJoinStream { + schema: self.schema.clone(), + left_data, + right: stream, + right_batch: Arc::new(std::sync::Mutex::new(None)), + left_index: 0, + num_input_batches: 0, + num_input_rows: 0, + num_output_batches: 0, + num_output_rows: 0, + join_time: 0, + })) + } +} + +/// A stream that issues [RecordBatch]es as they arrive from the right of the join.
+struct CrossJoinStream { + /// Input schema + schema: Arc<Schema>, + /// data from the left side + left_data: JoinLeftData, + /// right + right: SendableRecordBatchStream, + /// Current value on the left + left_index: usize, + /// Current batch being processed from the right side + right_batch: Arc<std::sync::Mutex<Option<RecordBatch>>>, + /// number of input batches + num_input_batches: usize, + /// number of input rows + num_input_rows: usize, + /// number of batches produced + num_output_batches: usize, + /// number of rows produced + num_output_rows: usize, + /// total time for joining probe-side batches to the build-side batches + join_time: usize, +} + +impl RecordBatchStream for CrossJoinStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} +fn build_batch( + left_index: usize, + batch: &RecordBatch, + left_data: &RecordBatch, + schema: &Schema, +) -> ArrowResult<RecordBatch> { + // Repeat value on the left n times + let arrays = left_data + .columns() + .iter() + .map(|arr| { + let scalar = ScalarValue::try_from_array(arr, left_index)?; + Ok(scalar.to_array_of_size(batch.num_rows())) + }) + .collect::<Result<Vec<_>>>() + .map_err(|x| x.into_arrow_external_error())?; + + RecordBatch::try_new( + Arc::new(schema.clone()), + arrays + .iter() + .chain(batch.columns().iter()) + .cloned() + .collect(), + ) +} + +#[async_trait] +impl Stream for CrossJoinStream { + type Item = ArrowResult<RecordBatch>; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Option<Self::Item>> { + if self.left_index > 0 && self.left_index < self.left_data.num_rows() { + let start = Instant::now(); + let right_batch = { + let right_batch = self.right_batch.lock().unwrap(); + right_batch.clone().unwrap() + }; + let result = + build_batch(self.left_index, &right_batch, &self.left_data, &self.schema); + self.num_input_rows += right_batch.num_rows(); + if let Ok(ref batch) = result { + self.join_time += start.elapsed().as_millis() as usize; + self.num_output_batches += 1; + self.num_output_rows += batch.num_rows(); + } + self.left_index += 1; + return Poll::Ready(Some(result)); + } + self.left_index = 0; + self.right + .poll_next_unpin(cx) + .map(|maybe_batch| match maybe_batch { + Some(Ok(batch)) => { + let start = Instant::now(); + let result = build_batch( + self.left_index, + &batch, + &self.left_data, + &self.schema, + ); + self.num_input_batches += 1; + self.num_input_rows += batch.num_rows(); + if let Ok(ref batch) = result { + self.join_time += start.elapsed().as_millis() as usize; + self.num_output_batches += 1; + self.num_output_rows += batch.num_rows(); + } + self.left_index = 1; + + let mut right_batch = self.right_batch.lock().unwrap(); + *right_batch = Some(batch); + + Some(result) + } + other => { + debug!( + "Processed {} probe-side input batches containing {} rows and \ + produced {} output batches containing {} rows in {} ms", + self.num_input_batches, + self.num_input_rows, + self.num_output_batches, + self.num_output_rows, + self.join_time + ); + other + } + }) + } +} diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index b26ff9bb5fc28..a38cc092123d4 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -52,11 +52,6 @@ fn check_join_set_is_valid( right: &HashSet<String>, on: &JoinOn, ) -> Result<()> { - if on.is_empty() { - return Err(DataFusionError::Plan( - "The 'on' clause of a join cannot be empty".to_string(), - )); - } let on_left = &on.iter().map(|on| on.0.to_string()).collect::<HashSet<_>>(); let left_missing =
on_left.difference(left).collect::<HashSet<_>>(); diff --git a/datafusion/src/physical_plan/memory.rs b/datafusion/src/physical_plan/memory.rs index bef9bcc62dff5..9022077559acf 100644 --- a/datafusion/src/physical_plan/memory.rs +++ b/datafusion/src/physical_plan/memory.rs @@ -17,6 +17,7 @@ //! Execution plan for reading in-memory batches of data +use core::fmt; use std::any::Any; use std::sync::Arc; use std::task::{Context, Poll}; @@ -31,7 +32,6 @@ use async_trait::async_trait; use futures::Stream; /// Execution plan for reading in-memory batches of data -#[derive(Debug)] pub struct MemoryExec { /// The partitions to query partitions: Vec<Vec<RecordBatch>>, @@ -41,6 +41,14 @@ pub struct MemoryExec { projection: Option<Vec<usize>>, } +impl fmt::Debug for MemoryExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "partitions: [...]")?; + write!(f, "schema: {:?}", self.schema)?; + write!(f, "projection: {:?}", self.projection) + } +} + #[async_trait] impl ExecutionPlan for MemoryExec { /// Return a reference to Any that can be used for downcasting diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 80dfe6e473b68..11f0946c91ff6 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -345,6 +345,7 @@ pub mod aggregates; pub mod array_expressions; pub mod coalesce_batches; pub mod common; +pub mod cross_join; #[cfg(feature = "crypto_expressions")] pub mod crypto_expressions; pub mod csv; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index f9279ae48f0cd..ae6ad5075d877 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -20,8 +20,8 @@ use std::sync::Arc; use super::{ - aggregates, empty::EmptyExec, expressions::binary, functions, - hash_join::PartitionMode, udaf, union::UnionExec, + aggregates, cross_join::CrossJoinExec, empty::EmptyExec, expressions::binary, + functions, hash_join::PartitionMode, udaf, union::UnionExec, }; use crate::error::{DataFusionError, Result}; use crate::execution::context::ExecutionContextState; @@ -328,6 +328,11 @@ impl DefaultPhysicalPlanner { )?)) } } + LogicalPlan::CrossJoin { left, right, ..
} => { + let left = self.create_initial_plan(left, ctx_state)?; + let right = self.create_initial_plan(right, ctx_state)?; + Ok(Arc::new(CrossJoinExec::try_new(left, right)?)) + } LogicalPlan::EmptyRelation { produce_one_row, schema, diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index f3cba232a23ab..a40d0becdcb4b 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -355,12 +355,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { JoinOperator::Inner(constraint) => { self.parse_join(left, &right, constraint, JoinType::Inner) } + JoinOperator::CrossJoin => self.parse_cross_join(left, &right), other => Err(DataFusionError::NotImplemented(format!( "Unsupported JOIN operator {:?}", other ))), } } + fn parse_cross_join( + &self, + left: &LogicalPlan, + right: &LogicalPlan, + ) -> Result { + LogicalPlanBuilder::from(&left).cross_join(&right)?.build() + } fn parse_join( &self, @@ -489,9 +497,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } if join_keys.is_empty() { - return Err(DataFusionError::NotImplemented( - "Cartesian joins are not supported".to_string(), - )); + left = + LogicalPlanBuilder::from(&left).cross_join(right)?.build()?; } else { let left_keys: Vec<_> = join_keys.iter().map(|(l, _)| *l).collect(); @@ -517,9 +524,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if plans.len() == 1 { Ok(plans[0].clone()) } else { - Err(DataFusionError::NotImplemented( - "Cartesian joins are not supported".to_string(), - )) + let mut left = plans[0].clone(); + for right in plans.iter().skip(1) { + left = + LogicalPlanBuilder::from(&left).cross_join(right)?.build()?; + } + Ok(left) } } }; diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index f4d4e65f3a4ee..70baffc700ba2 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1289,15 +1289,57 @@ async fn equijoin_implicit_syntax_reversed() -> Result<()> { } #[tokio::test] -async fn cartesian_join() -> Result<()> { - let ctx = create_join_context("t1_id", "t2_id")?; +async fn cross_join() { + let mut ctx = create_join_context("t1_id", "t2_id").unwrap(); + let sql = "SELECT t1_id, t1_name, t2_name FROM t1, t2 ORDER BY t1_id"; - let maybe_plan = ctx.create_logical_plan(&sql); + let actual = execute(&mut ctx, sql).await; + + assert_eq!(4 * 4, actual.len()); + + let sql = "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE 1=1 ORDER BY t1_id"; + let actual = execute(&mut ctx, sql).await; + + assert_eq!(4 * 4, actual.len()); + + let sql = "SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN t2"; + let actual = execute(&mut ctx, sql).await; + + assert_eq!(4 * 4, actual.len()); + assert_eq!( - "This feature is not implemented: Cartesian joins are not supported", - &format!("{}", maybe_plan.err().unwrap()) + actual, + [ + ["11", "a", "z"], + ["11", "a", "y"], + ["11", "a", "x"], + ["11", "a", "w"], + ["22", "b", "z"], + ["22", "b", "y"], + ["22", "b", "x"], + ["22", "b", "w"], + ["33", "c", "z"], + ["33", "c", "y"], + ["33", "c", "x"], + ["33", "c", "w"], + ["44", "d", "z"], + ["44", "d", "y"], + ["44", "d", "x"], + ["44", "d", "w"] + ] ); - Ok(()) + + // Two partitions (from UNION) on the left + let sql = "SELECT * FROM (SELECT t1_id, t1_name FROM t1 UNION ALL SELECT t1_id, t1_name FROM t1) t1 CROSS JOIN t2"; + let actual = execute(&mut ctx, sql).await; + + assert_eq!(4 * 4 * 2, actual.len()); + + // Two partitions (from UNION) on the right + let sql = "SELECT t1_id, t1_name, t2_name FROM t1 CROSS JOIN (SELECT t2_name FROM t2 UNION ALL SELECT t2_name FROM t2)"; + let 
actual = execute(&mut ctx, sql).await; + + assert_eq!(4 * 4 * 2, actual.len()); } fn create_join_context( From 9d720f47d3de3d756b13715c4078b8d0a1640251 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 23 Apr 2021 07:19:39 -0400 Subject: [PATCH 027/329] [MINOR] Update arrow-rs dependencies (#33) * Bring in latest arrow-rs dependencies * Update refs in other crates * update ballista deps --- ballista/rust/benchmarks/tpch/Cargo.toml | 4 ++-- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 5 ++--- ballista/rust/scheduler/Cargo.toml | 2 +- benchmarks/Cargo.toml | 4 ++-- datafusion-examples/Cargo.toml | 4 ++-- datafusion/Cargo.toml | 4 ++-- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/ballista/rust/benchmarks/tpch/Cargo.toml b/ballista/rust/benchmarks/tpch/Cargo.toml index 8d62e20e17e17..601943d867c6b 100644 --- a/ballista/rust/benchmarks/tpch/Cargo.toml +++ b/ballista/rust/benchmarks/tpch/Cargo.toml @@ -28,8 +28,8 @@ edition = "2018" ballista = { path="../../client" } datafusion = { path = "../../../../datafusion" } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } env_logger = "0.8" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 6ac86875169b4..fdce1c47a91d7 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index e9d7682473f17..5a77691359c34 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 79ceabe2dd666..3df5f6ba16355 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -45,8 +45,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../../../datafusion" } @@ -57,4 +57,3 @@ configure_me_codegen = "0.4.0" 
[package.metadata.configure_me.bin] executor = "executor_config_spec.toml" - diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index ce8ca09e15b2f..6c7fc58aaecfb 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 66a81be26b36c..6eb6ab9f89d6e 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -31,8 +31,8 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../datafusion" } structopt = { version = "0.3", default-features = false } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread"] } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 7f7c239d0f549..929bdf208305f 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,8 +29,8 @@ publish = false [dev-dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "08a662f" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 443bd7e020414..eaa7031794cf7 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -50,8 +50,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "08a662f", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "08a662f", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["arrow"] } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} From 74cdf6f667193a2d26a745632c5a6af79d15f1aa Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 23 Apr 2021 06:37:25 -0600 Subject: [PATCH 028/329] Combine Cargo workspaces (#23) --- Cargo.toml | 16 +++++----- ballista/rust/Cargo.toml | 30 ------------------- ballista/rust/benchmarks/tpch/Cargo.toml | 2 +- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/client/src/context.rs | 4 +-- ballista/rust/core/Cargo.toml | 2 +- ballista/rust/core/src/datasource.rs | 6 ++-- ballista/rust/core/src/error.rs | 1 + .../core/src/serde/logical_plan/from_proto.rs | 2 ++ .../core/src/serde/logical_plan/to_proto.rs | 7 +++-- .../core/src/serde/scheduler/from_proto.rs | 1 + 
ballista/rust/core/src/serde/scheduler/mod.rs | 1 + .../rust/core/src/serde/scheduler/to_proto.rs | 2 ++ ballista/rust/executor/Cargo.toml | 3 +- ballista/rust/scheduler/Cargo.toml | 2 +- ballista/rust/scheduler/src/planner.rs | 4 +-- dev/docker/rust.dockerfile | 8 ++--- 17 files changed, 34 insertions(+), 59 deletions(-) delete mode 100644 ballista/rust/Cargo.toml diff --git a/Cargo.toml b/Cargo.toml index 0a4ef2a7f2c1a..0947beadac0d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,12 +17,12 @@ [workspace] members = [ - "datafusion", - "datafusion-examples", + "datafusion", + "datafusion-examples", "benchmarks", -] - -# this package is excluded because it requires different compilation flags, thereby significantly changing -# how it is compiled within the workspace, causing the whole workspace to be compiled from scratch -# this way, this is a stand-alone package that compiles independently of the others. -exclude = ["ballista"] + "ballista/rust/benchmarks/tpch", + "ballista/rust/client", + "ballista/rust/core", + "ballista/rust/executor", + "ballista/rust/scheduler", +] \ No newline at end of file diff --git a/ballista/rust/Cargo.toml b/ballista/rust/Cargo.toml deleted file mode 100644 index 5e344e004b838..0000000000000 --- a/ballista/rust/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -[workspace] - -members = [ - "benchmarks/tpch", - "client", - "core", - "executor", - "scheduler", -] - -#[profile.release] -#lto = true -#codegen-units = 1 diff --git a/ballista/rust/benchmarks/tpch/Cargo.toml b/ballista/rust/benchmarks/tpch/Cargo.toml index 601943d867c6b..9311f23ad886b 100644 --- a/ballista/rust/benchmarks/tpch/Cargo.toml +++ b/ballista/rust/benchmarks/tpch/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "tpch" -version = "0.4.2-SNAPSHOT" +version = "0.5.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index fdce1c47a91d7..d29f23ada888b 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista" description = "Ballista Distributed Compute" license = "Apache-2.0" -version = "0.4.2-SNAPSHOT" +version = "0.5.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 400f6b6183ec6..a4cca7a0996cc 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -30,7 +30,7 @@ use ballista_core::serde::protobuf::{ }; use ballista_core::{ client::BallistaClient, - datasource::DFTableAdapter, + datasource::DfTableAdapter, error::{BallistaError, Result}, memory_stream::MemoryStream, utils::create_datafusion_context, @@ -151,7 +151,7 @@ impl BallistaContext { let execution_plan = ctx.create_physical_plan(&plan)?; ctx.register_table( TableReference::Bare { table: name }, - Arc::new(DFTableAdapter::new(plan, execution_plan)), + Arc::new(DfTableAdapter::new(plan, execution_plan)), )?; } let df = ctx.sql(sql)?; diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 5a77691359c34..0a600ea21cd60 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista-core" description = "Ballista Distributed Compute" license = "Apache-2.0" -version = "0.4.2-SNAPSHOT" +version = "0.5.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] diff --git a/ballista/rust/core/src/datasource.rs b/ballista/rust/core/src/datasource.rs index 8ff0df44e4be4..5b1540ac50373 100644 --- a/ballista/rust/core/src/datasource.rs +++ b/ballista/rust/core/src/datasource.rs @@ -30,20 +30,20 @@ use datafusion::{ /// TableProvider which is effectively a wrapper around a physical plan. We need to be able to /// register tables so that we can create logical plans from SQL statements that reference these /// tables. 
-pub struct DFTableAdapter { +pub struct DfTableAdapter { /// DataFusion logical plan pub logical_plan: LogicalPlan, /// DataFusion execution plan plan: Arc, } -impl DFTableAdapter { +impl DfTableAdapter { pub fn new(logical_plan: LogicalPlan, plan: Arc) -> Self { Self { logical_plan, plan } } } -impl TableProvider for DFTableAdapter { +impl TableProvider for DfTableAdapter { fn as_any(&self) -> &dyn Any { self } diff --git a/ballista/rust/core/src/error.rs b/ballista/rust/core/src/error.rs index d0155ce4b78f3..e16920e047443 100644 --- a/ballista/rust/core/src/error.rs +++ b/ballista/rust/core/src/error.rs @@ -49,6 +49,7 @@ pub enum BallistaError { TokioError(tokio::task::JoinError), } +#[allow(clippy::from_over_into)] impl Into> for BallistaError { fn into(self) -> Result { Err(self) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 93084260662f8..18a85d2796cf4 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -436,6 +436,7 @@ impl TryInto for &protobuf::arrow_type::ArrowTypeEnu } } +#[allow(clippy::from_over_into)] impl Into for protobuf::PrimitiveScalarType { fn into(self) -> arrow::datatypes::DataType { use arrow::datatypes::DataType; @@ -1170,6 +1171,7 @@ impl TryFrom for protobuf::FileType { } } +#[allow(clippy::from_over_into)] impl Into for protobuf::FileType { fn into(self) -> datafusion::sql::parser::FileType { use datafusion::sql::parser::FileType; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 222b76739feb1..560578df9f5b3 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -24,7 +24,7 @@ use std::{ convert::{TryFrom, TryInto}, }; -use crate::datasource::DFTableAdapter; +use crate::datasource::DfTableAdapter; use crate::serde::{protobuf, BallistaError}; use arrow::datatypes::{DataType, Schema}; @@ -679,7 +679,7 @@ impl TryInto for &LogicalPlan { // unwrap the DFTableAdapter to get to the real TableProvider let source = if let Some(adapter) = - source.as_any().downcast_ref::() + source.as_any().downcast_ref::() { match &adapter.logical_plan { LogicalPlan::TableScan { source, .. 
} => Ok(source.as_any()), @@ -1021,7 +1021,7 @@ impl TryInto for &Expr { let fun: protobuf::ScalarFunction = fun.try_into()?; let expr: Vec = args .iter() - .map(|e| Ok(e.try_into()?)) + .map(|e| e.try_into()) .collect::, BallistaError>>()?; Ok(protobuf::LogicalExprNode { expr_type: Some( @@ -1164,6 +1164,7 @@ impl TryInto for &Expr { } } +#[allow(clippy::from_over_into)] impl Into for &Schema { fn into(self) -> protobuf::Schema { protobuf::Schema { diff --git a/ballista/rust/core/src/serde/scheduler/from_proto.rs b/ballista/rust/core/src/serde/scheduler/from_proto.rs index fb1e4f812d0d4..4631b2e4d8638 100644 --- a/ballista/rust/core/src/serde/scheduler/from_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/from_proto.rs @@ -72,6 +72,7 @@ impl TryInto for protobuf::PartitionId { } } +#[allow(clippy::from_over_into)] impl Into for protobuf::PartitionStats { fn into(self) -> PartitionStats { PartitionStats::new( diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index 81d8722d7f466..bbbd48b74a1f3 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -75,6 +75,7 @@ pub struct ExecutorMeta { pub port: u16, } +#[allow(clippy::from_over_into)] impl Into for ExecutorMeta { fn into(self) -> protobuf::ExecutorMetadata { protobuf::ExecutorMetadata { diff --git a/ballista/rust/core/src/serde/scheduler/to_proto.rs b/ballista/rust/core/src/serde/scheduler/to_proto.rs index f581becdea176..40ca907a8a717 100644 --- a/ballista/rust/core/src/serde/scheduler/to_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/to_proto.rs @@ -55,6 +55,7 @@ impl TryInto for ExecutePartition { } } +#[allow(clippy::from_over_into)] impl Into for PartitionId { fn into(self) -> protobuf::PartitionId { protobuf::PartitionId { @@ -77,6 +78,7 @@ impl TryInto for PartitionLocation { } } +#[allow(clippy::from_over_into)] impl Into for PartitionStats { fn into(self) -> protobuf::PartitionStats { let none_value = -1_i64; diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 3df5f6ba16355..15bef69fa9e2e 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -19,14 +19,13 @@ name = "ballista-executor" description = "Ballista Distributed Compute - Executor" license = "Apache-2.0" -version = "0.4.2-SNAPSHOT" +version = "0.5.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] edition = "2018" [features] -default = ["snmalloc"] snmalloc = ["snmalloc-rs"] [dependencies] diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 6c7fc58aaecfb..342f215849cbf 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista-scheduler" description = "Ballista Distributed Compute - Scheduler" license = "Apache-2.0" -version = "0.4.2-SNAPSHOT" +version = "0.5.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index e9f668a7d5f84..e791fa8b54597 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -25,7 +25,7 @@ use std::time::Instant; use std::{collections::HashMap, future::Future}; use ballista_core::client::BallistaClient; -use ballista_core::datasource::DFTableAdapter; +use 
ballista_core::datasource::DfTableAdapter; use ballista_core::error::{BallistaError, Result}; use ballista_core::serde::scheduler::ExecutorMeta; use ballista_core::serde::scheduler::PartitionId; @@ -138,7 +138,7 @@ impl DistributedPlanner { stages.append(&mut child_stages); } - if let Some(adapter) = execution_plan.as_any().downcast_ref::() { + if let Some(adapter) = execution_plan.as_any().downcast_ref::() { // remove Repartition rule because that isn't supported yet let rules: Vec> = vec![ Arc::new(CoalesceBatches::new()), diff --git a/dev/docker/rust.dockerfile b/dev/docker/rust.dockerfile index 19dd4879eab6f..6505f3c1660ac 100644 --- a/dev/docker/rust.dockerfile +++ b/dev/docker/rust.dockerfile @@ -59,20 +59,18 @@ ARG RELEASE_FLAG=--release # force build.rs to run to generate configure_me code. ENV FORCE_REBUILD='true' RUN cargo build $RELEASE_FLAG -RUN cd ballista/rust && \ - cargo build $RELEASE_FLAG # put the executor on /executor (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/ballista-executor /executor; else mv /tmp/ballista/ballista/rust/target/release/ballista-executor /executor; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-executor /executor; else mv /tmp/ballista/target/release/ballista-executor /executor; fi # put the scheduler on /scheduler (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/ballista-scheduler /scheduler; else mv /tmp/ballista/ballista/rust/target/release/ballista-scheduler /scheduler; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/ballista-scheduler /scheduler; else mv /tmp/ballista/target/release/ballista-scheduler /scheduler; fi # put the tpch on /tpch (need to be copied from different places depending on FLAG) ENV RELEASE_FLAG=${RELEASE_FLAG} -RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/ballista/rust/target/debug/tpch /tpch; else mv /tmp/ballista/ballista/rust/target/release/tpch /tpch; fi +RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/tpch /tpch; else mv /tmp/ballista/target/release/tpch /tpch; fi # Copy the binary into a new container for a smaller docker image FROM ballistacompute/rust-base:0.4.0-20210213 From 713ba10ccdcafcec63c5c8f5de755e714d51a508 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 23 Apr 2021 08:18:51 -0600 Subject: [PATCH 029/329] Add GitHub templates for issues and PRs (#29) --- .github/ISSUE_TEMPLATE/bug_report.md | 20 ++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 21 +++++++++++++++++++++ .github/pull_request_template.md | 19 +++++++++++++++++++ dev/release/rat_exclude_files.txt | 3 ++- 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/pull_request_template.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000000..5600dab98b550 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,20 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. 
+ +**To Reproduce** +Steps to reproduce the behavior: + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000..d9883dd454b7d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,21 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem or challenge? Please describe what you are trying to do.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +(This section helps Arrow developers understand the context and *why* for this feature, in addition to the *what*) + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000000..5da0d08f9469a --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,19 @@ +# Which issue does this PR close? + +We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. + +Closes #. + + # Rationale for this change + Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. + Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. + +# What changes are included in this PR? + +There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. + +# Are there any user-facing changes? + +If there are user-facing changes then we may require documentation to be updated before approving the PR. + +If there are any breaking changes to public APIs, please add the `breaking change` label. 
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index ead9c8db16f4b..f9eca7a2ed458 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -7,7 +7,8 @@ *.csv *.json *.snap -.github/ISSUE_TEMPLATE/question.md +.github/ISSUE_TEMPLATE/*.md +.github/pull_request_template.md ci/etc/rprofile ci/etc/*.patch ci/vcpkg/*.patch From 32951c3c1c62e0ceef5aca200e81d2ea9ea124a7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 23 Apr 2021 13:34:40 -0600 Subject: [PATCH 030/329] Consolidate benchmarks (#34) --- .dockerignore | 2 + Cargo.toml | 1 - ballista/rust/benchmarks/tpch/Cargo.toml | 36 - ballista/rust/benchmarks/tpch/README.md | 104 -- ballista/rust/benchmarks/tpch/src/main.rs | 360 ------- .../tpch => benchmarks}/.dockerignore | 0 .../benchmarks/tpch => benchmarks}/.gitignore | 0 benchmarks/Cargo.toml | 1 + benchmarks/README.md | 106 +- .../tpch => benchmarks}/docker-compose.yaml | 8 +- .../tpch => benchmarks}/entrypoint.sh | 0 .../tpch => benchmarks}/queries/q1.sql | 0 .../tpch => benchmarks}/queries/q10.sql | 0 .../tpch => benchmarks}/queries/q11.sql | 0 .../tpch => benchmarks}/queries/q12.sql | 0 .../tpch => benchmarks}/queries/q13.sql | 0 .../tpch => benchmarks}/queries/q14.sql | 0 .../tpch => benchmarks}/queries/q16.sql | 0 .../tpch => benchmarks}/queries/q17.sql | 0 .../tpch => benchmarks}/queries/q18.sql | 0 .../tpch => benchmarks}/queries/q19.sql | 0 .../tpch => benchmarks}/queries/q2.sql | 0 .../tpch => benchmarks}/queries/q20.sql | 0 .../tpch => benchmarks}/queries/q21.sql | 0 .../tpch => benchmarks}/queries/q22.sql | 0 .../tpch => benchmarks}/queries/q3.sql | 0 .../tpch => benchmarks}/queries/q4.sql | 0 .../tpch => benchmarks}/queries/q5.sql | 0 .../tpch => benchmarks}/queries/q6.sql | 0 .../tpch => benchmarks}/queries/q7.sql | 0 .../tpch => benchmarks}/queries/q8.sql | 0 .../tpch => benchmarks}/queries/q9.sql | 0 .../benchmarks/tpch => benchmarks}/run.sh | 1 + benchmarks/src/bin/tpch.rs | 950 +++--------------- .../tpch => benchmarks}/tpch-gen.sh | 2 +- .../tpch => benchmarks}/tpchgen.dockerfile | 0 dev/build-rust-base.sh | 2 +- dev/build-rust.sh | 2 +- dev/docker/rust.dockerfile | 8 +- dev/integration-tests.sh | 4 +- dev/release/rat_exclude_files.txt | 2 +- 41 files changed, 223 insertions(+), 1366 deletions(-) delete mode 100644 ballista/rust/benchmarks/tpch/Cargo.toml delete mode 100644 ballista/rust/benchmarks/tpch/README.md delete mode 100644 ballista/rust/benchmarks/tpch/src/main.rs rename {ballista/rust/benchmarks/tpch => benchmarks}/.dockerignore (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/.gitignore (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/docker-compose.yaml (90%) rename {ballista/rust/benchmarks/tpch => benchmarks}/entrypoint.sh (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q1.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q10.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q11.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q12.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q13.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q14.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q16.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q17.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q18.sql (100%) rename {ballista/rust/benchmarks/tpch => 
benchmarks}/queries/q19.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q2.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q20.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q21.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q22.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q3.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q4.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q5.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q6.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q7.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q8.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/queries/q9.sql (100%) rename {ballista/rust/benchmarks/tpch => benchmarks}/run.sh (99%) rename {ballista/rust/benchmarks/tpch => benchmarks}/tpch-gen.sh (97%) rename {ballista/rust/benchmarks/tpch => benchmarks}/tpchgen.dockerfile (100%) diff --git a/.dockerignore b/.dockerignore index 9a64a123f7353..8cd6a89645c3b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -23,4 +23,6 @@ ci dev +testing +parquet-testing **/target/* diff --git a/Cargo.toml b/Cargo.toml index 0947beadac0d9..2f34babdb247b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ members = [ "datafusion", "datafusion-examples", "benchmarks", - "ballista/rust/benchmarks/tpch", "ballista/rust/client", "ballista/rust/core", "ballista/rust/executor", diff --git a/ballista/rust/benchmarks/tpch/Cargo.toml b/ballista/rust/benchmarks/tpch/Cargo.toml deleted file mode 100644 index 9311f23ad886b..0000000000000 --- a/ballista/rust/benchmarks/tpch/Cargo.toml +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -[package] -name = "tpch" -version = "0.5.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" -authors = ["Apache Arrow "] -license = "Apache-2.0" -edition = "2018" - -[dependencies] -ballista = { path="../../client" } -datafusion = { path = "../../../../datafusion" } - -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } - -env_logger = "0.8" -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } -structopt = "0.3" diff --git a/ballista/rust/benchmarks/tpch/README.md b/ballista/rust/benchmarks/tpch/README.md deleted file mode 100644 index 20c4fc71de35d..0000000000000 --- a/ballista/rust/benchmarks/tpch/README.md +++ /dev/null @@ -1,104 +0,0 @@ - - -# TPC-H Benchmarks - -TPC-H is an industry standard benchmark for testing databases and query engines. A command-line tool is available that -can generate the raw test data at any given scale factor (scale factor refers to the amount of data to be generated). - -## Generating Test Data - -TPC-H data can be generated using the `tpch-gen.sh` script, which creates a Docker image containing the TPC-DS data -generator. - -```bash -./tpch-gen.sh -``` - -Data will be generated into the `data` subdirectory and will not be checked in because this directory has been added -to the `.gitignore` file. - -## Running the Benchmarks - -To run the benchmarks it is necessary to have at least one Ballista scheduler and one Ballista executor running. - -To run the scheduler from source: - -```bash -cd $ARROW_HOME/ballista/rust/scheduler -RUST_LOG=info cargo run --release -``` - -By default the scheduler will bind to `0.0.0.0` and listen on port 50050. - -To run the executor from source: - -```bash -cd $ARROW_HOME/ballista/rust/executor -RUST_LOG=info cargo run --release -``` - -By default the executor will bind to `0.0.0.0` and listen on port 50051. - -You can add SIMD/snmalloc/LTO flags to improve speed (with longer build times): - -``` -RUST_LOG=info RUSTFLAGS='-C target-cpu=native -C lto -C codegen-units=1 -C embed-bitcode' cargo run --release --bin executor --features "simd snmalloc" --target x86_64-unknown-linux-gnu -``` - -To run the benchmarks: - -```bash -cd $ARROW_HOME/ballista/rust/benchmarks/tpch -cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl -``` - -## Running the Benchmarks on docker-compose - -To start a Rust scheduler and executor using Docker Compose: - -```bash -cd $BALLISTA_HOME -./dev/build-rust.sh -cd $BALLISTA_HOME/rust/benchmarks/tpch -docker-compose up -``` - -Then you can run the benchmark with: - -```bash -docker-compose run ballista-client cargo run benchmark --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl -``` - -## Expected output - -The result of query 1 should produce the following output when executed against the SF=1 dataset. 
- -``` -+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ -| l_returnflag | l_linestatus | sum_qty | sum_base_price | sum_disc_price | sum_charge | avg_qty | avg_price | avg_disc | count_order | -+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ -| A | F | 37734107 | 56586554400.73001 | 53758257134.870026 | 55909065222.82768 | 25.522005853257337 | 38273.12973462168 | 0.049985295838396455 | 1478493 | -| N | F | 991417 | 1487504710.3799996 | 1413082168.0541 | 1469649223.1943746 | 25.516471920522985 | 38284.467760848296 | 0.05009342667421622 | 38854 | -| N | O | 74476023 | 111701708529.50996 | 106118209986.10472 | 110367023144.56622 | 25.502229680934594 | 38249.1238377803 | 0.049996589476752576 | 2920373 | -| R | F | 37719753 | 56568041380.90001 | 53741292684.60399 | 55889619119.83194 | 25.50579361269077 | 38250.854626099666 | 0.05000940583012587 | 1478870 | -+--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ -Query 1 iteration 0 took 1956.1 ms -Query 1 avg time: 1956.11 ms -``` diff --git a/ballista/rust/benchmarks/tpch/src/main.rs b/ballista/rust/benchmarks/tpch/src/main.rs deleted file mode 100644 index 1ba46ea1826ad..0000000000000 --- a/ballista/rust/benchmarks/tpch/src/main.rs +++ /dev/null @@ -1,360 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Benchmark derived from TPC-H. This is not an official TPC-H benchmark. -//! -//! This is a modified version of the DataFusion version of these benchmarks. 
- -use std::collections::HashMap; -use std::fs; -use std::path::{Path, PathBuf}; -use std::time::Instant; - -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::util::pretty; -use ballista::prelude::*; -use datafusion::prelude::*; -use parquet::basic::Compression; -use parquet::file::properties::WriterProperties; -use structopt::StructOpt; - -#[derive(Debug, StructOpt)] -struct BenchmarkOpt { - /// Ballista executor host - #[structopt(long = "host")] - host: String, - - /// Ballista executor port - #[structopt(long = "port")] - port: u16, - - /// Query number - #[structopt(long)] - query: usize, - - /// Activate debug mode to see query results - #[structopt(long)] - debug: bool, - - /// Number of iterations of each test run - #[structopt(long = "iterations", default_value = "1")] - iterations: usize, - - /// Batch size when reading CSV or Parquet files - #[structopt(long = "batch-size", default_value = "32768")] - batch_size: usize, - - /// Path to data files - #[structopt(parse(from_os_str), required = true, long = "path")] - path: PathBuf, - - /// File format: `csv`, `tbl` or `parquet` - #[structopt(long = "format")] - file_format: String, -} - -#[derive(Debug, StructOpt)] -struct ConvertOpt { - /// Path to csv files - #[structopt(parse(from_os_str), required = true, short = "i", long = "input")] - input_path: PathBuf, - - /// Output path - #[structopt(parse(from_os_str), required = true, short = "o", long = "output")] - output_path: PathBuf, - - /// Output file format: `csv` or `parquet` - #[structopt(short = "f", long = "format")] - file_format: String, - - /// Compression to use when writing Parquet files - #[structopt(short = "c", long = "compression", default_value = "snappy")] - compression: String, - - /// Number of partitions to produce - #[structopt(short = "p", long = "partitions", default_value = "1")] - partitions: usize, - - /// Batch size when reading CSV or Parquet files - #[structopt(short = "s", long = "batch-size", default_value = "4096")] - batch_size: usize, -} - -#[derive(Debug, StructOpt)] -#[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")] -enum TpchOpt { - Benchmark(BenchmarkOpt), - Convert(ConvertOpt), -} - -const TABLES: &[&str] = &[ - "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region", -]; - -#[tokio::main] -async fn main() -> Result<()> { - env_logger::init(); - match TpchOpt::from_args() { - TpchOpt::Benchmark(opt) => benchmark(opt).await.map(|_| ()), - TpchOpt::Convert(opt) => convert_tbl(opt).await, - } -} - -async fn benchmark(opt: BenchmarkOpt) -> Result<()> { - println!("Running benchmarks with the following options: {:?}", opt); - - let mut settings = HashMap::new(); - settings.insert("batch.size".to_owned(), format!("{}", opt.batch_size)); - - let ctx = BallistaContext::remote(opt.host.as_str(), opt.port, settings); - - // register tables with Ballista context - let path = opt.path.to_str().unwrap(); - let file_format = opt.file_format.as_str(); - for table in TABLES { - match file_format { - // dbgen creates .tbl ('|' delimited) files without header - "tbl" => { - let path = format!("{}/{}.tbl", path, table); - let schema = get_schema(table); - let options = CsvReadOptions::new() - .schema(&schema) - .delimiter(b'|') - .has_header(false) - .file_extension(".tbl"); - ctx.register_csv(table, &path, options)?; - } - "csv" => { - let path = format!("{}/{}", path, table); - let schema = get_schema(table); - let options = CsvReadOptions::new().schema(&schema).has_header(true); - ctx.register_csv(table, 
&path, options)?; - } - "parquet" => { - let path = format!("{}/{}", path, table); - ctx.register_parquet(table, &path)?; - } - other => { - unimplemented!("Invalid file format '{}'", other); - } - } - } - - let mut millis = vec![]; - - // run benchmark - let sql = get_query_sql(opt.query)?; - println!("Running benchmark with query {}:\n {}", opt.query, sql); - for i in 0..opt.iterations { - let start = Instant::now(); - let df = ctx.sql(&sql)?; - let mut batches = vec![]; - let mut stream = df.collect().await?; - while let Some(result) = stream.next().await { - let batch = result?; - batches.push(batch); - } - let elapsed = start.elapsed().as_secs_f64() * 1000.0; - millis.push(elapsed as f64); - println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed); - if opt.debug { - pretty::print_batches(&batches)?; - } - } - - let avg = millis.iter().sum::() / millis.len() as f64; - println!("Query {} avg time: {:.2} ms", opt.query, avg); - - Ok(()) -} - -fn get_query_sql(query: usize) -> Result { - if query > 0 && query < 23 { - let filename = format!("queries/q{}.sql", query); - Ok(fs::read_to_string(&filename).expect("failed to read query")) - } else { - Err(BallistaError::General( - "invalid query. Expected value between 1 and 22".to_owned(), - )) - } -} - -async fn convert_tbl(opt: ConvertOpt) -> Result<()> { - let output_root_path = Path::new(&opt.output_path); - for table in TABLES { - let start = Instant::now(); - let schema = get_schema(table); - - let input_path = format!("{}/{}.tbl", opt.input_path.to_str().unwrap(), table); - let options = CsvReadOptions::new() - .schema(&schema) - .delimiter(b'|') - .file_extension(".tbl"); - - let config = ExecutionConfig::new().with_batch_size(opt.batch_size); - let mut ctx = ExecutionContext::with_config(config); - - // build plan to read the TBL file - let mut csv = ctx.read_csv(&input_path, options)?; - - // optionally, repartition the file - if opt.partitions > 1 { - csv = csv.repartition(Partitioning::RoundRobinBatch(opt.partitions))? - } - - // create the physical plan - let csv = csv.to_logical_plan(); - let csv = ctx.optimize(&csv)?; - let csv = ctx.create_physical_plan(&csv)?; - - let output_path = output_root_path.join(table); - let output_path = output_path.to_str().unwrap().to_owned(); - - println!( - "Converting '{}' to {} files in directory '{}'", - &input_path, &opt.file_format, &output_path - ); - match opt.file_format.as_str() { - "csv" => ctx.write_csv(csv, output_path).await?, - "parquet" => { - let compression = match opt.compression.as_str() { - "none" => Compression::UNCOMPRESSED, - "snappy" => Compression::SNAPPY, - "brotli" => Compression::BROTLI, - "gzip" => Compression::GZIP, - "lz4" => Compression::LZ4, - "lz0" => Compression::LZO, - "zstd" => Compression::ZSTD, - other => { - return Err(BallistaError::NotImplemented(format!( - "Invalid compression format: {}", - other - ))) - } - }; - let props = WriterProperties::builder() - .set_compression(compression) - .build(); - ctx.write_parquet(csv, output_path, Some(props)).await? 
- } - other => { - return Err(BallistaError::NotImplemented(format!( - "Invalid output format: {}", - other - ))) - } - } - println!("Conversion completed in {} ms", start.elapsed().as_millis()); - } - - Ok(()) -} - -fn get_schema(table: &str) -> Schema { - // note that the schema intentionally uses signed integers so that any generated Parquet - // files can also be used to benchmark tools that only support signed integers, such as - // Apache Spark - - match table { - "part" => Schema::new(vec![ - Field::new("p_partkey", DataType::Int32, false), - Field::new("p_name", DataType::Utf8, false), - Field::new("p_mfgr", DataType::Utf8, false), - Field::new("p_brand", DataType::Utf8, false), - Field::new("p_type", DataType::Utf8, false), - Field::new("p_size", DataType::Int32, false), - Field::new("p_container", DataType::Utf8, false), - Field::new("p_retailprice", DataType::Float64, false), - Field::new("p_comment", DataType::Utf8, false), - ]), - - "supplier" => Schema::new(vec![ - Field::new("s_suppkey", DataType::Int32, false), - Field::new("s_name", DataType::Utf8, false), - Field::new("s_address", DataType::Utf8, false), - Field::new("s_nationkey", DataType::Int32, false), - Field::new("s_phone", DataType::Utf8, false), - Field::new("s_acctbal", DataType::Float64, false), - Field::new("s_comment", DataType::Utf8, false), - ]), - - "partsupp" => Schema::new(vec![ - Field::new("ps_partkey", DataType::Int32, false), - Field::new("ps_suppkey", DataType::Int32, false), - Field::new("ps_availqty", DataType::Int32, false), - Field::new("ps_supplycost", DataType::Float64, false), - Field::new("ps_comment", DataType::Utf8, false), - ]), - - "customer" => Schema::new(vec![ - Field::new("c_custkey", DataType::Int32, false), - Field::new("c_name", DataType::Utf8, false), - Field::new("c_address", DataType::Utf8, false), - Field::new("c_nationkey", DataType::Int32, false), - Field::new("c_phone", DataType::Utf8, false), - Field::new("c_acctbal", DataType::Float64, false), - Field::new("c_mktsegment", DataType::Utf8, false), - Field::new("c_comment", DataType::Utf8, false), - ]), - - "orders" => Schema::new(vec![ - Field::new("o_orderkey", DataType::Int32, false), - Field::new("o_custkey", DataType::Int32, false), - Field::new("o_orderstatus", DataType::Utf8, false), - Field::new("o_totalprice", DataType::Float64, false), - Field::new("o_orderdate", DataType::Date32, false), - Field::new("o_orderpriority", DataType::Utf8, false), - Field::new("o_clerk", DataType::Utf8, false), - Field::new("o_shippriority", DataType::Int32, false), - Field::new("o_comment", DataType::Utf8, false), - ]), - - "lineitem" => Schema::new(vec![ - Field::new("l_orderkey", DataType::Int32, false), - Field::new("l_partkey", DataType::Int32, false), - Field::new("l_suppkey", DataType::Int32, false), - Field::new("l_linenumber", DataType::Int32, false), - Field::new("l_quantity", DataType::Float64, false), - Field::new("l_extendedprice", DataType::Float64, false), - Field::new("l_discount", DataType::Float64, false), - Field::new("l_tax", DataType::Float64, false), - Field::new("l_returnflag", DataType::Utf8, false), - Field::new("l_linestatus", DataType::Utf8, false), - Field::new("l_shipdate", DataType::Date32, false), - Field::new("l_commitdate", DataType::Date32, false), - Field::new("l_receiptdate", DataType::Date32, false), - Field::new("l_shipinstruct", DataType::Utf8, false), - Field::new("l_shipmode", DataType::Utf8, false), - Field::new("l_comment", DataType::Utf8, false), - ]), - - "nation" => Schema::new(vec![ - 
Field::new("n_nationkey", DataType::Int32, false), - Field::new("n_name", DataType::Utf8, false), - Field::new("n_regionkey", DataType::Int32, false), - Field::new("n_comment", DataType::Utf8, false), - ]), - - "region" => Schema::new(vec![ - Field::new("r_regionkey", DataType::Int32, false), - Field::new("r_name", DataType::Utf8, false), - Field::new("r_comment", DataType::Utf8, false), - ]), - - _ => unimplemented!(), - } -} diff --git a/ballista/rust/benchmarks/tpch/.dockerignore b/benchmarks/.dockerignore similarity index 100% rename from ballista/rust/benchmarks/tpch/.dockerignore rename to benchmarks/.dockerignore diff --git a/ballista/rust/benchmarks/tpch/.gitignore b/benchmarks/.gitignore similarity index 100% rename from ballista/rust/benchmarks/tpch/.gitignore rename to benchmarks/.gitignore diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6eb6ab9f89d6e..35622661eaaaf 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -34,6 +34,7 @@ snmalloc = ["snmalloc-rs"] arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../datafusion" } +ballista = { path = "../ballista/rust/client" } structopt = { version = "0.3", default-features = false } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread"] } futures = "0.3" diff --git a/benchmarks/README.md b/benchmarks/README.md index 7460477db4e9e..e003d9687c9c1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -17,53 +17,47 @@ under the License. --> -# Apache Arrow Rust Benchmarks +# DataFusion and Ballista Benchmarks This crate contains benchmarks based on popular public data sets and open source benchmark suites, making it easy to run real-world benchmarks to help with performance and scalability testing and for comparing performance with other Arrow implementations as well as other query engines. -Currently, only DataFusion benchmarks exist, but the plan is to add benchmarks for the arrow, flight, and parquet -crates as well. - ## Benchmark derived from TPC-H These benchmarks are derived from the [TPC-H][1] benchmark. -Data for this benchmark can be generated using the [tpch-dbgen][2] command-line tool. Run the following commands to -clone the repository and build the source code. +## Generating Test Data + +TPC-H data can be generated using the `tpch-gen.sh` script, which creates a Docker image containing the TPC-DS data +generator. ```bash -git clone git@github.com:databricks/tpch-dbgen.git -cd tpch-dbgen -make -export TPCH_DATA=$(pwd) +./tpch-gen.sh ``` -Data can now be generated with the following command. Note that `-s 1` means use Scale Factor 1 or ~1 GB of -data. This value can be increased to generate larger data sets. +Data will be generated into the `data` subdirectory and will not be checked in because this directory has been added +to the `.gitignore` file. 
-```bash -./dbgen -vf -s 1 -``` +## Running the DataFusion Benchmarks -The benchmark can then be run (assuming the data created from `dbgen` is in `/mnt/tpch-dbgen`) with a command such as: +The benchmark can then be run (assuming the data created from `dbgen` is in `./data`) with a command such as: ```bash -cargo run --release --bin tpch -- benchmark --iterations 3 --path /mnt/tpch-dbgen --format tbl --query 1 --batch-size 4096 +cargo run --release --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 ``` You can enable the features `simd` (to use SIMD instructions) and/or `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`: ``` -cargo run --release --features "simd mimalloc" --bin tpch -- benchmark --iterations 3 --path /mnt/tpch-dbgen --format tbl --query 1 --batch-size 4096 +cargo run --release --features "simd mimalloc" --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 ``` The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl` (generated by the `dbgen` utility) to CSV and Parquet. ```bash -cargo run --release --bin tpch -- convert --input /mnt/tpch-dbgen --output /mnt/tpch-parquet --format parquet +cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parquet --format parquet ``` This utility does not yet provide support for changing the number of partitions when performing the conversion. Another @@ -97,9 +91,78 @@ docker run -v /mnt:/mnt -it ballistacompute/spark-benchmarks:0.4.0-SNAPSHOT \ --partitions 64 ``` +## Running the Ballista Benchmarks + +To run the benchmarks it is necessary to have at least one Ballista scheduler and one Ballista executor running. + +To run the scheduler from source: + +```bash +cd $ARROW_HOME/ballista/rust/scheduler +RUST_LOG=info cargo run --release +``` + +By default the scheduler will bind to `0.0.0.0` and listen on port 50050. + +To run the executor from source: + +```bash +cd $ARROW_HOME/ballista/rust/executor +RUST_LOG=info cargo run --release +``` + +By default the executor will bind to `0.0.0.0` and listen on port 50051. + +You can add SIMD/snmalloc/LTO flags to improve speed (with longer build times): + +``` +RUST_LOG=info RUSTFLAGS='-C target-cpu=native -C lto -C codegen-units=1 -C embed-bitcode' cargo run --release --bin executor --features "simd snmalloc" --target x86_64-unknown-linux-gnu +``` + +To run the benchmarks: + +```bash +cd $ARROW_HOME/ballista/rust/benchmarks/tpch +cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl +``` + +## Running the Ballista Benchmarks on docker-compose + +To start a Rust scheduler and executor using Docker Compose: + +```bash +cd $BALLISTA_HOME +./dev/build-rust.sh +cd $BALLISTA_HOME/rust/benchmarks/tpch +docker-compose up +``` + +Then you can run the benchmark with: + +```bash +docker-compose run ballista-client cargo run benchmark --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl +``` + +## Expected output + +The result of query 1 should produce the following output when executed against the SF=1 dataset. 
+ +``` ++--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ +| l_returnflag | l_linestatus | sum_qty | sum_base_price | sum_disc_price | sum_charge | avg_qty | avg_price | avg_disc | count_order | ++--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ +| A | F | 37734107 | 56586554400.73001 | 53758257134.870026 | 55909065222.82768 | 25.522005853257337 | 38273.12973462168 | 0.049985295838396455 | 1478493 | +| N | F | 991417 | 1487504710.3799996 | 1413082168.0541 | 1469649223.1943746 | 25.516471920522985 | 38284.467760848296 | 0.05009342667421622 | 38854 | +| N | O | 74476023 | 111701708529.50996 | 106118209986.10472 | 110367023144.56622 | 25.502229680934594 | 38249.1238377803 | 0.049996589476752576 | 2920373 | +| R | F | 37719753 | 56568041380.90001 | 53741292684.60399 | 55889619119.83194 | 25.50579361269077 | 38250.854626099666 | 0.05000940583012587 | 1478870 | ++--------------+--------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------+ +Query 1 iteration 0 took 1956.1 ms +Query 1 avg time: 1956.11 ms +``` + ## NYC Taxi Benchmark -These benchmarks are based on the [New York Taxi and Limousine Commission][3] data set. +These benchmarks are based on the [New York Taxi and Limousine Commission][2] data set. ```bash cargo run --release --bin nyctaxi -- --iterations 3 --path /mnt/nyctaxi/csv --format csv --batch-size 4096 @@ -116,5 +179,4 @@ Query 'fare_amt_by_passenger' iteration 2 took 7969 ms ``` [1]: http://www.tpc.org/tpch/ -[2]: https://github.com/databricks/tpch-dbgen -[3]: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page +[2]: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page diff --git a/ballista/rust/benchmarks/tpch/docker-compose.yaml b/benchmarks/docker-compose.yaml similarity index 90% rename from ballista/rust/benchmarks/tpch/docker-compose.yaml rename to benchmarks/docker-compose.yaml index f872ce16e2d8b..6015dbac2cc25 100644 --- a/ballista/rust/benchmarks/tpch/docker-compose.yaml +++ b/benchmarks/docker-compose.yaml @@ -20,7 +20,7 @@ services: image: quay.io/coreos/etcd:v3.4.9 command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379" ballista-scheduler: - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --port 50050" environment: - RUST_LOG=ballista=debug @@ -29,7 +29,7 @@ services: depends_on: - etcd ballista-executor-1: - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT command: "/executor --bind-host 0.0.0.0 --port 50051 --external-host ballista-executor-1 --scheduler-host ballista-scheduler" environment: - RUST_LOG=info @@ -38,7 +38,7 @@ services: depends_on: - ballista-scheduler ballista-executor-2: - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT command: "/executor --bind-host 0.0.0.0 --port 50052 --external-host ballista-executor-2 --scheduler-host ballista-scheduler" environment: - RUST_LOG=info @@ -47,7 +47,7 @@ services: depends_on: - ballista-scheduler ballista-client: - image: 
ballistacompute/ballista-rust:0.4.2-SNAPSHOT + image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT command: "/bin/sh" # do nothing working_dir: /ballista/benchmarks/tpch environment: diff --git a/ballista/rust/benchmarks/tpch/entrypoint.sh b/benchmarks/entrypoint.sh similarity index 100% rename from ballista/rust/benchmarks/tpch/entrypoint.sh rename to benchmarks/entrypoint.sh diff --git a/ballista/rust/benchmarks/tpch/queries/q1.sql b/benchmarks/queries/q1.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q1.sql rename to benchmarks/queries/q1.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q10.sql b/benchmarks/queries/q10.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q10.sql rename to benchmarks/queries/q10.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q11.sql b/benchmarks/queries/q11.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q11.sql rename to benchmarks/queries/q11.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q12.sql b/benchmarks/queries/q12.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q12.sql rename to benchmarks/queries/q12.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q13.sql b/benchmarks/queries/q13.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q13.sql rename to benchmarks/queries/q13.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q14.sql b/benchmarks/queries/q14.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q14.sql rename to benchmarks/queries/q14.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q16.sql b/benchmarks/queries/q16.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q16.sql rename to benchmarks/queries/q16.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q17.sql b/benchmarks/queries/q17.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q17.sql rename to benchmarks/queries/q17.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q18.sql b/benchmarks/queries/q18.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q18.sql rename to benchmarks/queries/q18.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q19.sql b/benchmarks/queries/q19.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q19.sql rename to benchmarks/queries/q19.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q2.sql b/benchmarks/queries/q2.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q2.sql rename to benchmarks/queries/q2.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q20.sql b/benchmarks/queries/q20.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q20.sql rename to benchmarks/queries/q20.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q21.sql b/benchmarks/queries/q21.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q21.sql rename to benchmarks/queries/q21.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q22.sql b/benchmarks/queries/q22.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q22.sql rename to benchmarks/queries/q22.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q3.sql b/benchmarks/queries/q3.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q3.sql rename to benchmarks/queries/q3.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q4.sql b/benchmarks/queries/q4.sql similarity index 100% 
rename from ballista/rust/benchmarks/tpch/queries/q4.sql rename to benchmarks/queries/q4.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q5.sql b/benchmarks/queries/q5.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q5.sql rename to benchmarks/queries/q5.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q6.sql b/benchmarks/queries/q6.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q6.sql rename to benchmarks/queries/q6.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q7.sql b/benchmarks/queries/q7.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q7.sql rename to benchmarks/queries/q7.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q8.sql b/benchmarks/queries/q8.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q8.sql rename to benchmarks/queries/q8.sql diff --git a/ballista/rust/benchmarks/tpch/queries/q9.sql b/benchmarks/queries/q9.sql similarity index 100% rename from ballista/rust/benchmarks/tpch/queries/q9.sql rename to benchmarks/queries/q9.sql diff --git a/ballista/rust/benchmarks/tpch/run.sh b/benchmarks/run.sh similarity index 99% rename from ballista/rust/benchmarks/tpch/run.sh rename to benchmarks/run.sh index c8a36b6013cd7..fd97ff9a9a6a5 100755 --- a/ballista/rust/benchmarks/tpch/run.sh +++ b/benchmarks/run.sh @@ -19,6 +19,7 @@ set -e # This bash script is meant to be run inside the docker-compose environment. Check the README for instructions +cd / for query in 1 3 5 6 10 12 do /tpch benchmark --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index b203ceb3f741a..fd9f0525987d5 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -17,21 +17,26 @@ //! Benchmark derived from TPC-H. This is not an official TPC-H benchmark. 
-use std::time::Instant; use std::{ + collections::HashMap, + fs, + iter::Iterator, path::{Path, PathBuf}, sync::Arc, + time::Instant, }; +use futures::StreamExt; + use arrow::datatypes::{DataType, Field, Schema}; use arrow::util::pretty; +use ballista::context::BallistaContext; use datafusion::datasource::parquet::ParquetTable; use datafusion::datasource::{CsvFile, MemTable, TableProvider}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::collect; use datafusion::prelude::*; - use parquet::basic::Compression; use parquet::file::properties::WriterProperties; use structopt::StructOpt; @@ -44,7 +49,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -#[derive(Debug, StructOpt)] +#[derive(Debug, StructOpt, Clone)] struct BenchmarkOpt { /// Query number #[structopt(short, long)] @@ -81,6 +86,14 @@ struct BenchmarkOpt { /// Number of partitions to create when using MemTable as input #[structopt(short = "n", long = "partitions", default_value = "8")] partitions: usize, + + /// Ballista executor host + #[structopt(long = "host")] + host: Option, + + /// Ballista executor port + #[structopt(long = "port")] + port: Option, } #[derive(Debug, StructOpt)] @@ -125,12 +138,20 @@ const TABLES: &[&str] = &[ async fn main() -> Result<()> { env_logger::init(); match TpchOpt::from_args() { - TpchOpt::Benchmark(opt) => benchmark(opt).await.map(|_| ()), + TpchOpt::Benchmark(opt) => { + if opt.host.is_some() && opt.port.is_some() { + benchmark_ballista(opt).await.map(|_| ()) + } else { + benchmark_datafusion(opt).await.map(|_| ()) + } + } TpchOpt::Convert(opt) => convert_tbl(opt).await, } } -async fn benchmark(opt: BenchmarkOpt) -> Result> { +async fn benchmark_datafusion( + opt: BenchmarkOpt, +) -> Result> { println!("Running benchmarks with the following options: {:?}", opt); let config = ExecutionConfig::new() .with_concurrency(opt.concurrency) @@ -181,832 +202,97 @@ async fn benchmark(opt: BenchmarkOpt) -> Result Result { - match query { - // original - // 1 => ctx.create_logical_plan( - // "select - // l_returnflag, - // l_linestatus, - // sum(l_quantity) as sum_qty, - // sum(l_extendedprice) as sum_base_price, - // sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, - // sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, - // avg(l_quantity) as avg_qty, - // avg(l_extendedprice) as avg_price, - // avg(l_discount) as avg_disc, - // count(*) as count_order - // from - // lineitem - // where - // l_shipdate <= date '1998-12-01' - interval '90' day (3) - // group by - // l_returnflag, - // l_linestatus - // order by - // l_returnflag, - // l_linestatus;" - // ), - 1 => ctx.create_logical_plan( - "select - l_returnflag, - l_linestatus, - sum(l_quantity) as sum_qty, - sum(l_extendedprice) as sum_base_price, - sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, - sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, - avg(l_quantity) as avg_qty, - avg(l_extendedprice) as avg_price, - avg(l_discount) as avg_disc, - count(*) as count_order - from - lineitem - where - l_shipdate <= date '1998-09-02' - group by - l_returnflag, - l_linestatus - order by - l_returnflag, - l_linestatus;", - ), - - 2 => ctx.create_logical_plan( - "select - s_acctbal, - s_name, - n_name, - p_partkey, - p_mfgr, - s_address, - s_phone, - s_comment - from - part, - supplier, - partsupp, - nation, - region - where - p_partkey = 
ps_partkey - and s_suppkey = ps_suppkey - and p_size = 15 - and p_type like '%BRASS' - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'EUROPE' - and ps_supplycost = ( - select - min(ps_supplycost) - from - partsupp, - supplier, - nation, - region - where - p_partkey = ps_partkey - and s_suppkey = ps_suppkey - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'EUROPE' - ) - order by - s_acctbal desc, - n_name, - s_name, - p_partkey;", - ), - - 3 => ctx.create_logical_plan( - "select - l_orderkey, - sum(l_extendedprice * (1 - l_discount)) as revenue, - o_orderdate, - o_shippriority - from - customer, - orders, - lineitem - where - c_mktsegment = 'BUILDING' - and c_custkey = o_custkey - and l_orderkey = o_orderkey - and o_orderdate < date '1995-03-15' - and l_shipdate > date '1995-03-15' - group by - l_orderkey, - o_orderdate, - o_shippriority - order by - revenue desc, - o_orderdate;", - ), - - 4 => ctx.create_logical_plan( - "select - o_orderpriority, - count(*) as order_count - from - orders - where - o_orderdate >= '1993-07-01' - and o_orderdate < date '1993-07-01' + interval '3' month - and exists ( - select - * - from - lineitem - where - l_orderkey = o_orderkey - and l_commitdate < l_receiptdate - ) - group by - o_orderpriority - order by - o_orderpriority;", - ), - - // original - // 5 => ctx.create_logical_plan( - // "select - // n_name, - // sum(l_extendedprice * (1 - l_discount)) as revenue - // from - // customer, - // orders, - // lineitem, - // supplier, - // nation, - // region - // where - // c_custkey = o_custkey - // and l_orderkey = o_orderkey - // and l_suppkey = s_suppkey - // and c_nationkey = s_nationkey - // and s_nationkey = n_nationkey - // and n_regionkey = r_regionkey - // and r_name = 'ASIA' - // and o_orderdate >= date '1994-01-01' - // and o_orderdate < date '1994-01-01' + interval '1' year - // group by - // n_name - // order by - // revenue desc;" - // ), - 5 => ctx.create_logical_plan( - "select - n_name, - sum(l_extendedprice * (1 - l_discount)) as revenue - from - customer, - orders, - lineitem, - supplier, - nation, - region - where - c_custkey = o_custkey - and l_orderkey = o_orderkey - and l_suppkey = s_suppkey - and c_nationkey = s_nationkey - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'ASIA' - and o_orderdate >= date '1994-01-01' - and o_orderdate < date '1995-01-01' - group by - n_name - order by - revenue desc;", - ), - - // original - // 6 => ctx.create_logical_plan( - // "select - // sum(l_extendedprice * l_discount) as revenue - // from - // lineitem - // where - // l_shipdate >= date '1994-01-01' - // and l_shipdate < date '1994-01-01' + interval '1' year - // and l_discount between .06 - 0.01 and .06 + 0.01 - // and l_quantity < 24;" - // ), - 6 => ctx.create_logical_plan( - "select - sum(l_extendedprice * l_discount) as revenue - from - lineitem - where - l_shipdate >= date '1994-01-01' - and l_shipdate < date '1995-01-01' - and l_discount between .06 - 0.01 and .06 + 0.01 - and l_quantity < 24;", - ), - - 7 => ctx.create_logical_plan( - "select - supp_nation, - cust_nation, - l_year, - sum(volume) as revenue - from - ( - select - n1.n_name as supp_nation, - n2.n_name as cust_nation, - extract(year from l_shipdate) as l_year, - l_extendedprice * (1 - l_discount) as volume - from - supplier, - lineitem, - orders, - customer, - nation n1, - nation n2 - where - s_suppkey = l_suppkey - and o_orderkey = l_orderkey - and c_custkey = o_custkey - and s_nationkey 
= n1.n_nationkey - and c_nationkey = n2.n_nationkey - and ( - (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') - or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') - ) - and l_shipdate between date '1995-01-01' and date '1996-12-31' - ) as shipping - group by - supp_nation, - cust_nation, - l_year - order by - supp_nation, - cust_nation, - l_year;", - ), - - 8 => ctx.create_logical_plan( - "select - o_year, - sum(case - when nation = 'BRAZIL' then volume - else 0 - end) / sum(volume) as mkt_share - from - ( - select - extract(year from o_orderdate) as o_year, - l_extendedprice * (1 - l_discount) as volume, - n2.n_name as nation - from - part, - supplier, - lineitem, - orders, - customer, - nation n1, - nation n2, - region - where - p_partkey = l_partkey - and s_suppkey = l_suppkey - and l_orderkey = o_orderkey - and o_custkey = c_custkey - and c_nationkey = n1.n_nationkey - and n1.n_regionkey = r_regionkey - and r_name = 'AMERICA' - and s_nationkey = n2.n_nationkey - and o_orderdate between date '1995-01-01' and date '1996-12-31' - and p_type = 'ECONOMY ANODIZED STEEL' - ) as all_nations - group by - o_year - order by - o_year;", - ), - - 9 => ctx.create_logical_plan( - "select - nation, - o_year, - sum(amount) as sum_profit - from - ( - select - n_name as nation, - extract(year from o_orderdate) as o_year, - l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount - from - part, - supplier, - lineitem, - partsupp, - orders, - nation - where - s_suppkey = l_suppkey - and ps_suppkey = l_suppkey - and ps_partkey = l_partkey - and p_partkey = l_partkey - and o_orderkey = l_orderkey - and s_nationkey = n_nationkey - and p_name like '%green%' - ) as profit - group by - nation, - o_year - order by - nation, - o_year desc;", - ), - - // 10 => ctx.create_logical_plan( - // "select - // c_custkey, - // c_name, - // sum(l_extendedprice * (1 - l_discount)) as revenue, - // c_acctbal, - // n_name, - // c_address, - // c_phone, - // c_comment - // from - // customer, - // orders, - // lineitem, - // nation - // where - // c_custkey = o_custkey - // and l_orderkey = o_orderkey - // and o_orderdate >= date '1993-10-01' - // and o_orderdate < date '1993-10-01' + interval '3' month - // and l_returnflag = 'R' - // and c_nationkey = n_nationkey - // group by - // c_custkey, - // c_name, - // c_acctbal, - // c_phone, - // n_name, - // c_address, - // c_comment - // order by - // revenue desc;" - // ), - 10 => ctx.create_logical_plan( - "select - c_custkey, - c_name, - sum(l_extendedprice * (1 - l_discount)) as revenue, - c_acctbal, - n_name, - c_address, - c_phone, - c_comment - from - customer, - orders, - lineitem, - nation - where - c_custkey = o_custkey - and l_orderkey = o_orderkey - and o_orderdate >= date '1993-10-01' - and o_orderdate < date '1994-01-01' - and l_returnflag = 'R' - and c_nationkey = n_nationkey - group by - c_custkey, - c_name, - c_acctbal, - c_phone, - n_name, - c_address, - c_comment - order by - revenue desc;", - ), - - 11 => ctx.create_logical_plan( - "select - ps_partkey, - sum(ps_supplycost * ps_availqty) as value - from - partsupp, - supplier, - nation - where - ps_suppkey = s_suppkey - and s_nationkey = n_nationkey - and n_name = 'GERMANY' - group by - ps_partkey having - sum(ps_supplycost * ps_availqty) > ( - select - sum(ps_supplycost * ps_availqty) * 0.0001 - from - partsupp, - supplier, - nation - where - ps_suppkey = s_suppkey - and s_nationkey = n_nationkey - and n_name = 'GERMANY' - ) - order by - value desc;", - ), - - // original - // 12 => 
ctx.create_logical_plan( - // "select - // l_shipmode, - // sum(case - // when o_orderpriority = '1-URGENT' - // or o_orderpriority = '2-HIGH' - // then 1 - // else 0 - // end) as high_line_count, - // sum(case - // when o_orderpriority <> '1-URGENT' - // and o_orderpriority <> '2-HIGH' - // then 1 - // else 0 - // end) as low_line_count - // from - // orders, - // lineitem - // where - // o_orderkey = l_orderkey - // and l_shipmode in ('MAIL', 'SHIP') - // and l_commitdate < l_receiptdate - // and l_shipdate < l_commitdate - // and l_receiptdate >= date '1994-01-01' - // and l_receiptdate < date '1994-01-01' + interval '1' year - // group by - // l_shipmode - // order by - // l_shipmode;" - // ), - 12 => ctx.create_logical_plan( - "select - l_shipmode, - sum(case - when o_orderpriority = '1-URGENT' - or o_orderpriority = '2-HIGH' - then 1 - else 0 - end) as high_line_count, - sum(case - when o_orderpriority <> '1-URGENT' - and o_orderpriority <> '2-HIGH' - then 1 - else 0 - end) as low_line_count - from - lineitem - join - orders - on - l_orderkey = o_orderkey - where - l_shipmode in ('MAIL', 'SHIP') - and l_commitdate < l_receiptdate - and l_shipdate < l_commitdate - and l_receiptdate >= date '1994-01-01' - and l_receiptdate < date '1995-01-01' - group by - l_shipmode - order by - l_shipmode;", - ), - - 13 => ctx.create_logical_plan( - "select - c_count, - count(*) as custdist - from - ( - select - c_custkey, - count(o_orderkey) - from - customer left outer join orders on - c_custkey = o_custkey - and o_comment not like '%special%requests%' - group by - c_custkey - ) as c_orders (c_custkey, c_count) - group by - c_count - order by - custdist desc, - c_count desc;", - ), - - 14 => ctx.create_logical_plan( - "select - 100.00 * sum(case - when p_type like 'PROMO%' - then l_extendedprice * (1 - l_discount) - else 0 - end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue - from - lineitem, - part - where - l_partkey = p_partkey - and l_shipdate >= date '1995-09-01' - and l_shipdate < date '1995-10-01';", - ), - - 15 => ctx.create_logical_plan( - "create view revenue0 (supplier_no, total_revenue) as - select - l_suppkey, - sum(l_extendedprice * (1 - l_discount)) - from - lineitem - where - l_shipdate >= date '1996-01-01' - and l_shipdate < date '1996-01-01' + interval '3' month - group by - l_suppkey; - - select - s_suppkey, - s_name, - s_address, - s_phone, - total_revenue - from - supplier, - revenue0 - where - s_suppkey = supplier_no - and total_revenue = ( - select - max(total_revenue) - from - revenue0 - ) - order by - s_suppkey; - - drop view revenue0;", - ), - - 16 => ctx.create_logical_plan( - "select - p_brand, - p_type, - p_size, - count(distinct ps_suppkey) as supplier_cnt - from - partsupp, - part - where - p_partkey = ps_partkey - and p_brand <> 'Brand#45' - and p_type not like 'MEDIUM POLISHED%' - and p_size in (49, 14, 23, 45, 19, 3, 36, 9) - and ps_suppkey not in ( - select - s_suppkey - from - supplier - where - s_comment like '%Customer%Complaints%' - ) - group by - p_brand, - p_type, - p_size - order by - supplier_cnt desc, - p_brand, - p_type, - p_size;", - ), - - 17 => ctx.create_logical_plan( - "select - sum(l_extendedprice) / 7.0 as avg_yearly - from - lineitem, - part - where - p_partkey = l_partkey - and p_brand = 'Brand#23' - and p_container = 'MED BOX' - and l_quantity < ( - select - 0.2 * avg(l_quantity) - from - lineitem - where - l_partkey = p_partkey - );", - ), - - 18 => ctx.create_logical_plan( - "select - c_name, - c_custkey, - o_orderkey, - 
o_orderdate, - o_totalprice, - sum(l_quantity) - from - customer, - orders, - lineitem - where - o_orderkey in ( - select - l_orderkey - from - lineitem - group by - l_orderkey having - sum(l_quantity) > 300 - ) - and c_custkey = o_custkey - and o_orderkey = l_orderkey - group by - c_name, - c_custkey, - o_orderkey, - o_orderdate, - o_totalprice - order by - o_totalprice desc, - o_orderdate;", - ), - - 19 => ctx.create_logical_plan( - "select - sum(l_extendedprice* (1 - l_discount)) as revenue - from - lineitem, - part - where - ( - p_partkey = l_partkey - and p_brand = 'Brand#12' - and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') - and l_quantity >= 1 and l_quantity <= 1 + 10 - and p_size between 1 and 5 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - ) - or - ( - p_partkey = l_partkey - and p_brand = 'Brand#23' - and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') - and l_quantity >= 10 and l_quantity <= 10 + 10 - and p_size between 1 and 10 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - ) - or - ( - p_partkey = l_partkey - and p_brand = 'Brand#34' - and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') - and l_quantity >= 20 and l_quantity <= 20 + 10 - and p_size between 1 and 15 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - );", - ), - - 20 => ctx.create_logical_plan( - "select - s_name, - s_address - from - supplier, - nation - where - s_suppkey in ( - select - ps_suppkey - from - partsupp - where - ps_partkey in ( - select - p_partkey - from - part - where - p_name like 'forest%' - ) - and ps_availqty > ( - select - 0.5 * sum(l_quantity) - from - lineitem - where - l_partkey = ps_partkey - and l_suppkey = ps_suppkey - and l_shipdate >= date '1994-01-01' - and l_shipdate < 'date 1994-01-01' + interval '1' year - ) - ) - and s_nationkey = n_nationkey - and n_name = 'CANADA' - order by - s_name;", - ), - - 21 => ctx.create_logical_plan( - "select - s_name, - count(*) as numwait - from - supplier, - lineitem l1, - orders, - nation - where - s_suppkey = l1.l_suppkey - and o_orderkey = l1.l_orderkey - and o_orderstatus = 'F' - and l1.l_receiptdate > l1.l_commitdate - and exists ( - select - * - from - lineitem l2 - where - l2.l_orderkey = l1.l_orderkey - and l2.l_suppkey <> l1.l_suppkey - ) - and not exists ( - select - * - from - lineitem l3 - where - l3.l_orderkey = l1.l_orderkey - and l3.l_suppkey <> l1.l_suppkey - and l3.l_receiptdate > l3.l_commitdate - ) - and s_nationkey = n_nationkey - and n_name = 'SAUDI ARABIA' - group by - s_name - order by - numwait desc, - s_name;", - ), - - 22 => ctx.create_logical_plan( - "select - cntrycode, - count(*) as numcust, - sum(c_acctbal) as totacctbal - from - ( - select - substring(c_phone from 1 for 2) as cntrycode, - c_acctbal - from - customer - where - substring(c_phone from 1 for 2) in - ('13', '31', '23', '29', '30', '18', '17') - and c_acctbal > ( - select - avg(c_acctbal) - from - customer - where - c_acctbal > 0.00 - and substring(c_phone from 1 for 2) in - ('13', '31', '23', '29', '30', '18', '17') - ) - and not exists ( - select - * - from - orders - where - o_custkey = c_custkey - ) - ) as custsale - group by - cntrycode - order by - cntrycode;", - ), - - _ => unimplemented!("invalid query. 
Expected value between 1 and 22"), +async fn benchmark_ballista(opt: BenchmarkOpt) -> Result<()> { + println!("Running benchmarks with the following options: {:?}", opt); + + let mut settings = HashMap::new(); + settings.insert("batch.size".to_owned(), format!("{}", opt.batch_size)); + + let ctx = + BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap(), settings); + + // register tables with Ballista context + let path = opt.path.to_str().unwrap(); + let file_format = opt.file_format.as_str(); + for table in TABLES { + match file_format { + // dbgen creates .tbl ('|' delimited) files without header + "tbl" => { + let path = format!("{}/{}.tbl", path, table); + let schema = get_schema(table); + let options = CsvReadOptions::new() + .schema(&schema) + .delimiter(b'|') + .has_header(false) + .file_extension(".tbl"); + ctx.register_csv(table, &path, options) + .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; + } + "csv" => { + let path = format!("{}/{}", path, table); + let schema = get_schema(table); + let options = CsvReadOptions::new().schema(&schema).has_header(true); + ctx.register_csv(table, &path, options) + .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; + } + "parquet" => { + let path = format!("{}/{}", path, table); + ctx.register_parquet(table, &path) + .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; + } + other => { + unimplemented!("Invalid file format '{}'", other); + } + } } + + let mut millis = vec![]; + + // run benchmark + let sql = get_query_sql(opt.query)?; + println!("Running benchmark with query {}:\n {}", opt.query, sql); + for i in 0..opt.iterations { + let start = Instant::now(); + let df = ctx + .sql(&sql) + .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; + let mut batches = vec![]; + let mut stream = df + .collect() + .await + .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; + while let Some(result) = stream.next().await { + let batch = result?; + batches.push(batch); + } + let elapsed = start.elapsed().as_secs_f64() * 1000.0; + millis.push(elapsed as f64); + println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed); + if opt.debug { + pretty::print_batches(&batches)?; + } + } + + let avg = millis.iter().sum::() / millis.len() as f64; + println!("Query {} avg time: {:.2} ms", opt.query, avg); + + Ok(()) +} + +fn get_query_sql(query: usize) -> Result { + if query > 0 && query < 23 { + let filename = format!("queries/q{}.sql", query); + Ok(fs::read_to_string(&filename).expect("failed to read query")) + } else { + Err(DataFusionError::Plan( + "invalid query. Expected value between 1 and 22".to_owned(), + )) + } +} + +fn create_logical_plan(ctx: &mut ExecutionContext, query: usize) -> Result { + let sql = get_query_sql(query)?; + ctx.create_logical_plan(&sql) } async fn execute_query( @@ -1668,8 +954,10 @@ mod tests { file_format: "tbl".to_string(), mem_table: false, partitions: 16, + host: None, + port: None, }; - let actual = benchmark(opt).await?; + let actual = benchmark_datafusion(opt).await?; // assert schema equality without comparing nullable values assert_eq!( diff --git a/ballista/rust/benchmarks/tpch/tpch-gen.sh b/benchmarks/tpch-gen.sh similarity index 97% rename from ballista/rust/benchmarks/tpch/tpch-gen.sh rename to benchmarks/tpch-gen.sh index f5147f55f2f69..fef3480c612c4 100755 --- a/ballista/rust/benchmarks/tpch/tpch-gen.sh +++ b/benchmarks/tpch-gen.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
-BALLISTA_VERSION=0.4.2-SNAPSHOT +BALLISTA_VERSION=0.5.0-SNAPSHOT #set -e diff --git a/ballista/rust/benchmarks/tpch/tpchgen.dockerfile b/benchmarks/tpchgen.dockerfile similarity index 100% rename from ballista/rust/benchmarks/tpch/tpchgen.dockerfile rename to benchmarks/tpchgen.dockerfile diff --git a/dev/build-rust-base.sh b/dev/build-rust-base.sh index e424909fb6f10..1bedbd880b441 100755 --- a/dev/build-rust-base.sh +++ b/dev/build-rust-base.sh @@ -16,6 +16,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.4.2-SNAPSHOT +BALLISTA_VERSION=0.5.0-SNAPSHOT set -e docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f dev/docker/rust-base.dockerfile . diff --git a/dev/build-rust.sh b/dev/build-rust.sh index d31c5241c6f13..5777d1eb253bc 100755 --- a/dev/build-rust.sh +++ b/dev/build-rust.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.4.2-SNAPSHOT +BALLISTA_VERSION=0.5.0-SNAPSHOT set -e diff --git a/dev/docker/rust.dockerfile b/dev/docker/rust.dockerfile index 6505f3c1660ac..ba713b15e90c4 100644 --- a/dev/docker/rust.dockerfile +++ b/dev/docker/rust.dockerfile @@ -22,7 +22,7 @@ # as a mounted directory. ARG RELEASE_FLAG=--release -FROM ballistacompute/rust-base:0.4.2-SNAPSHOT AS base +FROM ballistacompute/rust-base:0.5.0-SNAPSHOT AS base WORKDIR /tmp/ballista RUN apt-get -y install cmake RUN cargo install cargo-chef @@ -73,7 +73,7 @@ ENV RELEASE_FLAG=${RELEASE_FLAG} RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/tpch /tpch; else mv /tmp/ballista/target/release/tpch /tpch; fi # Copy the binary into a new container for a smaller docker image -FROM ballistacompute/rust-base:0.4.0-20210213 +FROM ballistacompute/rust-base:0.5.0-SNAPSHOT COPY --from=builder /executor / @@ -81,6 +81,10 @@ COPY --from=builder /scheduler / COPY --from=builder /tpch / +ADD benchmarks/run.sh / +RUN mkdir /queries +COPY benchmarks/queries/ /queries/ + ENV RUST_LOG=info ENV RUST_BACKTRACE=full diff --git a/dev/integration-tests.sh b/dev/integration-tests.sh index 6ed764ecda8ad..06ab108c2931c 100755 --- a/dev/integration-tests.sh +++ b/dev/integration-tests.sh @@ -19,11 +19,11 @@ set -e ./dev/build-rust-base.sh ./dev/build-rust.sh -pushd ballista/rust/benchmarks/tpch +pushd benchmarks ./tpch-gen.sh docker-compose up -d -docker-compose run ballista-client ./run.sh +docker-compose run ballista-client /run.sh docker-compose down popd diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f9eca7a2ed458..cef0a91eb00b1 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -100,6 +100,6 @@ requirements.txt *.scss .gitattributes rust-toolchain -ballista/rust/benchmarks/tpch/queries/q*.sql +benchmarks/queries/q*.sql ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock From 245f0b8a68c5763a236aef3e727f0502188d0bfa Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 24 Apr 2021 12:20:16 +0200 Subject: [PATCH 031/329] support large-utf8 in groupby (#35) * support large-utf8 in groupby * add test --- datafusion/src/execution/context.rs | 51 +++++++++++++++++++ datafusion/src/physical_plan/group_scalar.rs | 9 +++- .../src/physical_plan/hash_aggregate.rs | 18 ++++++- datafusion/src/physical_plan/hash_join.rs | 3 ++ datafusion/src/physical_plan/type_coercion.rs | 2 +- 5 files changed, 79 insertions(+), 4 deletions(-) diff --git 
a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index c83ca4d8de5e3..c394d3895622a 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1646,6 +1646,57 @@ mod tests { Ok(()) } + #[tokio::test] + async fn group_by_largeutf8() { + { + let mut ctx = ExecutionContext::new(); + + // input data looks like: + // A, 1 + // B, 2 + // A, 2 + // A, 4 + // C, 1 + // A, 1 + + let str_array: LargeStringArray = vec!["A", "B", "A", "A", "C", "A"] + .into_iter() + .map(Some) + .collect(); + let str_array = Arc::new(str_array); + + let val_array: Int64Array = vec![1, 2, 2, 4, 1, 1].into(); + let val_array = Arc::new(val_array); + + let schema = Arc::new(Schema::new(vec![ + Field::new("str", str_array.data_type().clone(), false), + Field::new("val", val_array.data_type().clone(), false), + ])); + + let batch = + RecordBatch::try_new(schema.clone(), vec![str_array, val_array]).unwrap(); + + let provider = MemTable::try_new(schema.clone(), vec![vec![batch]]).unwrap(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + + let results = + plan_and_collect(&mut ctx, "SELECT str, count(val) FROM t GROUP BY str") + .await + .expect("ran plan correctly"); + + let expected = vec![ + "+-----+------------+", + "| str | COUNT(val) |", + "+-----+------------+", + "| A | 4 |", + "| B | 1 |", + "| C | 1 |", + "+-----+------------+", + ]; + assert_batches_sorted_eq!(expected, &results); + } + } + #[tokio::test] async fn group_by_dictionary() { async fn run_test_case() { diff --git a/datafusion/src/physical_plan/group_scalar.rs b/datafusion/src/physical_plan/group_scalar.rs index f4987ae3a7db1..943386d215c4f 100644 --- a/datafusion/src/physical_plan/group_scalar.rs +++ b/datafusion/src/physical_plan/group_scalar.rs @@ -37,6 +37,7 @@ pub(crate) enum GroupByScalar { Int32(i32), Int64(i64), Utf8(Box), + LargeUtf8(Box), Boolean(bool), TimeMillisecond(i64), TimeMicrosecond(i64), @@ -74,6 +75,9 @@ impl TryFrom<&ScalarValue> for GroupByScalar { GroupByScalar::TimeNanosecond(*v) } ScalarValue::Utf8(Some(v)) => GroupByScalar::Utf8(Box::new(v.clone())), + ScalarValue::LargeUtf8(Some(v)) => { + GroupByScalar::LargeUtf8(Box::new(v.clone())) + } ScalarValue::Float32(None) | ScalarValue::Float64(None) | ScalarValue::Boolean(None) @@ -116,6 +120,7 @@ impl From<&GroupByScalar> for ScalarValue { GroupByScalar::UInt32(v) => ScalarValue::UInt32(Some(*v)), GroupByScalar::UInt64(v) => ScalarValue::UInt64(Some(*v)), GroupByScalar::Utf8(v) => ScalarValue::Utf8(Some(v.to_string())), + GroupByScalar::LargeUtf8(v) => ScalarValue::LargeUtf8(Some(v.to_string())), GroupByScalar::TimeMillisecond(v) => { ScalarValue::TimestampMillisecond(Some(*v)) } @@ -191,14 +196,14 @@ mod tests { #[test] fn from_scalar_unsupported() { // Use any ScalarValue type not supported by GroupByScalar. 
- let scalar_value = ScalarValue::LargeUtf8(Some("1.1".to_string())); + let scalar_value = ScalarValue::Binary(Some(vec![1, 2])); let result = GroupByScalar::try_from(&scalar_value); match result { Err(DataFusionError::Internal(error_message)) => assert_eq!( error_message, String::from( - "Cannot convert a ScalarValue with associated DataType LargeUtf8" + "Cannot convert a ScalarValue with associated DataType Binary" ) ), _ => panic!("Unexpected result"), diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index fd20b5c65ef2a..fad4fa585034b 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -59,7 +59,8 @@ use ordered_float::OrderedFloat; use pin_project_lite::pin_project; use arrow::array::{ - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + LargeStringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, }; use async_trait::async_trait; @@ -540,6 +541,14 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( // store the string value vec.extend_from_slice(value.as_bytes()); } + DataType::LargeUtf8 => { + let array = col.as_any().downcast_ref::().unwrap(); + let value = array.value(row); + // store the size + vec.extend_from_slice(&value.len().to_le_bytes()); + // store the string value + vec.extend_from_slice(value.as_bytes()); + } DataType::Date32 => { let array = col.as_any().downcast_ref::().unwrap(); vec.extend_from_slice(&array.value(row).to_le_bytes()); @@ -953,6 +962,9 @@ fn create_batch_from_map( GroupByScalar::Utf8(str) => { Arc::new(StringArray::from(vec![&***str])) } + GroupByScalar::LargeUtf8(str) => { + Arc::new(LargeStringArray::from(vec![&***str])) + } GroupByScalar::Boolean(b) => Arc::new(BooleanArray::from(vec![*b])), GroupByScalar::TimeMillisecond(n) => { Arc::new(TimestampMillisecondArray::from(vec![*n])) @@ -1103,6 +1115,10 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) } + DataType::LargeUtf8 => { + let array = col.as_any().downcast_ref::().unwrap(); + Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) + } DataType::Boolean => { let array = col.as_any().downcast_ref::().unwrap(); Ok(GroupByScalar::Boolean(array.value(row))) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 401fe6580a927..eb2ec33d08699 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -831,6 +831,9 @@ pub fn create_hashes<'a>( DataType::Utf8 => { hash_array!(StringArray, col, str, hashes_buffer, random_state); } + DataType::LargeUtf8 => { + hash_array!(LargeStringArray, col, str, hashes_buffer, random_state); + } _ => { // This is internal because we should have caught this before. 
return Err(DataFusionError::Internal( diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index 24b51ba606956..d9f84e7cb8622 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -196,7 +196,7 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { | Float64 ), Timestamp(TimeUnit::Nanosecond, None) => matches!(type_from, Timestamp(_, None)), - Utf8 => true, + Utf8 | LargeUtf8 => true, _ => false, } } From 3cb83fe6e584a8b5590b88259b91b53b520aa653 Mon Sep 17 00:00:00 2001 From: sathis Date: Sat, 24 Apr 2021 20:56:24 +0530 Subject: [PATCH 032/329] [Ballista] Docker files for ui (#22) --- ballista/ui/scheduler/.dockerignore | 2 ++ dev/build-ui.sh | 24 ++++++++++++++++++ dev/docker/ui.scheduler.dockerfile | 38 +++++++++++++++++++++++++++++ dev/release/rat_exclude_files.txt | 1 + 4 files changed, 65 insertions(+) create mode 100644 ballista/ui/scheduler/.dockerignore create mode 100755 dev/build-ui.sh create mode 100644 dev/docker/ui.scheduler.dockerfile diff --git a/ballista/ui/scheduler/.dockerignore b/ballista/ui/scheduler/.dockerignore new file mode 100644 index 0000000000000..dd87e2d73f9fa --- /dev/null +++ b/ballista/ui/scheduler/.dockerignore @@ -0,0 +1,2 @@ +node_modules +build diff --git a/dev/build-ui.sh b/dev/build-ui.sh new file mode 100755 index 0000000000000..d39d610bd7ea2 --- /dev/null +++ b/dev/build-ui.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +BALLISTA_VERSION=0.4.2-SNAPSHOT + +set -e + +docker build -t ballistacompute/ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ui.scheduler.dockerfile ballista/ui/scheduler diff --git a/dev/docker/ui.scheduler.dockerfile b/dev/docker/ui.scheduler.dockerfile new file mode 100644 index 0000000000000..99892e95e7149 --- /dev/null +++ b/dev/docker/ui.scheduler.dockerfile @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Turn .dockerignore to .dockerallow by excluding everything and explicitly +# allowing specific files and directories. This enables us to quickly add +# dependency files to the docker content without scanning the whole directory. +# This setup requires to all of our docker containers have arrow's source +# as a mounted directory. + +FROM node:14.16.0-alpine as build +WORKDIR /app +ENV PATH /app/node_modules/.bin:$PATH + +COPY package.json ./ +COPY yarn.lock ./ +RUN yarn + +COPY . ./ +RUN yarn build + +FROM nginx:stable-alpine +COPY --from=build /app/build /usr/share/nginx/html +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index cef0a91eb00b1..b94c0ea1d61a6 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -1,5 +1,6 @@ *.npmrc *.gitignore +*.dockerignore .gitmodules *_generated.h *_generated.js From 9ba214a52ed78c57d3d6363c61a88893d41fe906 Mon Sep 17 00:00:00 2001 From: Ruan Pearce-Authers Date: Sat, 24 Apr 2021 17:32:59 +0100 Subject: [PATCH 033/329] Re-export Arrow and Parquet crates from DataFusion (#39) * Re-export Arrow and Parquet crates * Switch benchmarks crate to use DF-exported Arrow and Parquet deps * Switch datafusion-examples crate to use DF-exported Arrow dep --- benchmarks/Cargo.toml | 2 -- benchmarks/src/bin/nyctaxi.rs | 5 ++-- benchmarks/src/bin/tpch.rs | 25 ++++++++++--------- datafusion-examples/Cargo.toml | 1 - datafusion-examples/examples/csv_sql.rs | 4 +-- datafusion-examples/examples/dataframe.rs | 4 +-- .../examples/dataframe_in_memory.rs | 8 +++--- datafusion-examples/examples/flight_client.rs | 6 ++--- datafusion-examples/examples/flight_server.rs | 6 ++--- datafusion-examples/examples/parquet_sql.rs | 4 +-- datafusion-examples/examples/simple_udaf.rs | 4 +-- datafusion-examples/examples/simple_udf.rs | 4 +-- datafusion/src/lib.rs | 5 +++- 13 files changed, 40 insertions(+), 38 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 35622661eaaaf..25a385eaea0e6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -31,8 +31,6 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../datafusion" } ballista = { path = "../ballista/rust/client" } structopt = { version = "0.3", default-features = false } diff --git a/benchmarks/src/bin/nyctaxi.rs b/benchmarks/src/bin/nyctaxi.rs index 005efca94885c..b2a62a0d39f9b 100644 --- a/benchmarks/src/bin/nyctaxi.rs +++ b/benchmarks/src/bin/nyctaxi.rs @@ -22,8 +22,9 @@ use std::path::PathBuf; use std::process; use std::time::Instant; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::util::pretty; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::util::pretty; + use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index fd9f0525987d5..543e84f33097d 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -28,17 +28,21 @@ use std::{ use futures::StreamExt; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::util::pretty; use ballista::context::BallistaContext; + +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use 
datafusion::arrow::record_batch::RecordBatch; +use datafusion::arrow::util::pretty; + use datafusion::datasource::parquet::ParquetTable; use datafusion::datasource::{CsvFile, MemTable, TableProvider}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::collect; use datafusion::prelude::*; -use parquet::basic::Compression; -use parquet::file::properties::WriterProperties; + +use datafusion::parquet::basic::Compression; +use datafusion::parquet::file::properties::WriterProperties; use structopt::StructOpt; #[cfg(feature = "snmalloc")] @@ -149,9 +153,7 @@ async fn main() -> Result<()> { } } -async fn benchmark_datafusion( - opt: BenchmarkOpt, -) -> Result> { +async fn benchmark_datafusion(opt: BenchmarkOpt) -> Result> { println!("Running benchmarks with the following options: {:?}", opt); let config = ExecutionConfig::new() .with_concurrency(opt.concurrency) @@ -186,7 +188,7 @@ async fn benchmark_datafusion( let mut millis = vec![]; // run benchmark - let mut result: Vec = Vec::with_capacity(1); + let mut result: Vec = Vec::with_capacity(1); for i in 0..opt.iterations { let start = Instant::now(); let plan = create_logical_plan(&mut ctx, opt.query)?; @@ -299,7 +301,7 @@ async fn execute_query( ctx: &mut ExecutionContext, plan: &LogicalPlan, debug: bool, -) -> Result> { +) -> Result> { if debug { println!("Logical plan:\n{:?}", plan); } @@ -523,9 +525,8 @@ mod tests { use std::env; use std::sync::Arc; - use arrow::array::*; - use arrow::record_batch::RecordBatch; - use arrow::util::display::array_value_to_string; + use datafusion::arrow::array::*; + use datafusion::arrow::util::display::array_value_to_string; use datafusion::logical_plan::Expr; use datafusion::logical_plan::Expr::Cast; diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 929bdf208305f..0445f382a25e6 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,6 @@ publish = false [dev-dependencies] -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } datafusion = { path = "../datafusion" } prost = "0.7" diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index 63fd36d44ce4b..76c87960d71d3 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::util::pretty; +use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::prelude::*; @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::arrow::util::test_util::arrow_test_data(); // register csv file with the execution context ctx.register_csv( diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index cba4d87f1e0d2..dcf6bc32be6b2 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use arrow::util::pretty; +use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::prelude::*; @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::arrow::util::test_util::parquet_test_data(); let filename = &format!("{}/alltypes_plain.parquet", testdata); diff --git a/datafusion-examples/examples/dataframe_in_memory.rs b/datafusion-examples/examples/dataframe_in_memory.rs index de8552a3bba71..0c65a7477e97c 100644 --- a/datafusion-examples/examples/dataframe_in_memory.rs +++ b/datafusion-examples/examples/dataframe_in_memory.rs @@ -17,10 +17,10 @@ use std::sync::Arc; -use arrow::array::{Int32Array, StringArray}; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::record_batch::RecordBatch; -use arrow::util::pretty; +use datafusion::arrow::array::{Int32Array, StringArray}; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::arrow::util::pretty; use datafusion::datasource::MemTable; use datafusion::error::Result; diff --git a/datafusion-examples/examples/flight_client.rs b/datafusion-examples/examples/flight_client.rs index 2c2954d5a0298..53347826ff89c 100644 --- a/datafusion-examples/examples/flight_client.rs +++ b/datafusion-examples/examples/flight_client.rs @@ -18,8 +18,8 @@ use std::convert::TryFrom; use std::sync::Arc; -use arrow::datatypes::Schema; -use arrow::util::pretty; +use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::util::pretty; use arrow_flight::flight_descriptor; use arrow_flight::flight_service_client::FlightServiceClient; @@ -31,7 +31,7 @@ use arrow_flight::{FlightDescriptor, Ticket}; /// This example is run along-side the example `flight_server`. 
#[tokio::main] async fn main() -> Result<(), Box> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::arrow::util::test_util::parquet_test_data(); // Create Flight client let mut client = FlightServiceClient::connect("http://localhost:50051").await?; diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 79660dd1871cf..8496bcb18914f 100644 --- a/datafusion-examples/examples/flight_server.rs +++ b/datafusion-examples/examples/flight_server.rs @@ -66,7 +66,7 @@ impl FlightService for FlightServiceImpl { let table = ParquetTable::try_new(&request.path[0], num_cpus::get()).unwrap(); - let options = arrow::ipc::writer::IpcWriteOptions::default(); + let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); let schema_result = arrow_flight::utils::flight_schema_from_arrow_schema( table.schema().as_ref(), &options, @@ -87,7 +87,7 @@ impl FlightService for FlightServiceImpl { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::arrow::util::test_util::parquet_test_data(); // register parquet file with the execution context ctx.register_parquet( @@ -106,7 +106,7 @@ impl FlightService for FlightServiceImpl { } // add an initial FlightData message that sends schema - let options = arrow::ipc::writer::IpcWriteOptions::default(); + let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); let schema_flight_data = arrow_flight::utils::flight_data_from_arrow_schema( &df.schema().clone().into(), diff --git a/datafusion-examples/examples/parquet_sql.rs b/datafusion-examples/examples/parquet_sql.rs index 8043d3296c87b..f679b22ceb904 100644 --- a/datafusion-examples/examples/parquet_sql.rs +++ b/datafusion-examples/examples/parquet_sql.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::util::pretty; +use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::prelude::*; @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::arrow::util::test_util::parquet_test_data(); // register parquet file with the execution context ctx.register_parquet( diff --git a/datafusion-examples/examples/simple_udaf.rs b/datafusion-examples/examples/simple_udaf.rs index 8086dfc47de43..49d09ff431550 100644 --- a/datafusion-examples/examples/simple_udaf.rs +++ b/datafusion-examples/examples/simple_udaf.rs @@ -17,7 +17,7 @@ /// In this example we will declare a single-type, single return type UDAF that computes the geometric mean. /// The geometric mean is described here: https://en.wikipedia.org/wiki/Geometric_mean -use arrow::{ +use datafusion::arrow::{ array::Float32Array, array::Float64Array, datatypes::DataType, record_batch::RecordBatch, }; @@ -28,7 +28,7 @@ use std::sync::Arc; // create local execution context with an in-memory table fn create_context() -> Result { - use arrow::datatypes::{Field, Schema}; + use datafusion::arrow::datatypes::{Field, Schema}; use datafusion::datasource::MemTable; // define a schema. 
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); diff --git a/datafusion-examples/examples/simple_udf.rs b/datafusion-examples/examples/simple_udf.rs index bfef1089a634c..0ffec44a37202 100644 --- a/datafusion-examples/examples/simple_udf.rs +++ b/datafusion-examples/examples/simple_udf.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::{ +use datafusion::arrow::{ array::{ArrayRef, Float32Array, Float64Array}, datatypes::DataType, record_batch::RecordBatch, @@ -28,7 +28,7 @@ use std::sync::Arc; // create local execution context with an in-memory table fn create_context() -> Result { - use arrow::datatypes::{Field, Schema}; + use datafusion::arrow::datatypes::{Field, Schema}; use datafusion::datasource::MemTable; // define a schema. let schema = Arc::new(Schema::new(vec![ diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 44a8a686a496d..252d168114add 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -183,7 +183,6 @@ //! //! you can find examples of each of them in examples section. -extern crate arrow; extern crate sqlparser; pub mod catalog; @@ -200,6 +199,10 @@ pub mod scalar; pub mod sql; pub mod variable; +// re-export dependencies from arrow-rs to minimise version maintenance for crate users +pub use arrow; +pub use parquet; + #[cfg(test)] pub mod test; From fa999799b65fc22abfada36955b0b0cc3ebf4c4f Mon Sep 17 00:00:00 2001 From: sathis Date: Sun, 25 Apr 2021 10:35:52 +0530 Subject: [PATCH 034/329] Remove hard coded ballista versions. Fixes #32 (#49) Co-authored-by: Sathis Kumar --- benchmarks/tpch-gen.sh | 3 +-- dev/build-rust-base.sh | 3 ++- dev/build-rust.sh | 3 +-- dev/build-set-env.sh | 20 ++++++++++++++++++++ dev/build-ui.sh | 3 +-- 5 files changed, 25 insertions(+), 7 deletions(-) create mode 100755 dev/build-set-env.sh diff --git a/benchmarks/tpch-gen.sh b/benchmarks/tpch-gen.sh index fef3480c612c4..3cef3bdfdd87c 100755 --- a/benchmarks/tpch-gen.sh +++ b/benchmarks/tpch-gen.sh @@ -16,10 +16,9 @@ # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.5.0-SNAPSHOT - #set -e +. ./dev/build-set-env.sh docker build -t ballistacompute/ballista-tpchgen:$BALLISTA_VERSION -f tpchgen.dockerfile . # Generate data into the ./data directory if it does not already exist diff --git a/dev/build-rust-base.sh b/dev/build-rust-base.sh index 1bedbd880b441..f2a4cc32385bd 100755 --- a/dev/build-rust-base.sh +++ b/dev/build-rust-base.sh @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.5.0-SNAPSHOT set -e + +. ./dev/build-set-env.sh docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f dev/docker/rust-base.dockerfile . diff --git a/dev/build-rust.sh b/dev/build-rust.sh index 5777d1eb253bc..479cb2a135fbb 100755 --- a/dev/build-rust.sh +++ b/dev/build-rust.sh @@ -17,8 +17,7 @@ # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.5.0-SNAPSHOT - set -e +. ./dev/build-set-env.sh docker build -t ballistacompute/ballista-rust:$BALLISTA_VERSION -f dev/docker/rust.dockerfile . diff --git a/dev/build-set-env.sh b/dev/build-set-env.sh new file mode 100755 index 0000000000000..3eb29e7ce1443 --- /dev/null +++ b/dev/build-set-env.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +export BALLISTA_VERSION=$(awk -F'[ ="]+' '$1 == "version" { print $2 }' ballista/rust/core/Cargo.toml) diff --git a/dev/build-ui.sh b/dev/build-ui.sh index d39d610bd7ea2..bb5bff3d180a5 100755 --- a/dev/build-ui.sh +++ b/dev/build-ui.sh @@ -17,8 +17,7 @@ # specific language governing permissions and limitations # under the License. -BALLISTA_VERSION=0.4.2-SNAPSHOT - set -e +. ./dev/build-set-env.sh docker build -t ballistacompute/ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ui.scheduler.dockerfile ballista/ui/scheduler From 589f355c23157ead4e65a8f941437ea78b64ecb1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 25 Apr 2021 07:47:19 -0600 Subject: [PATCH 035/329] Remove Ballista DataFrame (#48) --- ballista/rust/client/src/context.rs | 217 +++++----------------------- benchmarks/src/bin/tpch.rs | 4 +- 2 files changed, 40 insertions(+), 181 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index a4cca7a0996cc..e26dcac256d28 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -29,16 +29,14 @@ use ballista_core::serde::protobuf::{ GetJobStatusResult, }; use ballista_core::{ - client::BallistaClient, - datasource::DfTableAdapter, - error::{BallistaError, Result}, - memory_stream::MemoryStream, + client::BallistaClient, datasource::DfTableAdapter, memory_stream::MemoryStream, utils::create_datafusion_context, }; use arrow::datatypes::Schema; use datafusion::catalog::TableReference; -use datafusion::logical_plan::{DFSchema, Expr, LogicalPlan, Partitioning}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::csv::CsvReadOptions; use datafusion::{dataframe::DataFrame, physical_plan::RecordBatchStream}; use log::{error, info}; @@ -88,7 +86,7 @@ impl BallistaContext { /// Create a DataFrame representing a Parquet table scan - pub fn read_parquet(&self, path: &str) -> Result { + pub fn read_parquet(&self, path: &str) -> Result> { // convert to absolute path because the executor likely has a different working directory let path = PathBuf::from(path); let path = fs::canonicalize(&path)?; @@ -96,7 +94,7 @@ impl BallistaContext { // use local DataFusion context for now but later this might call the scheduler let mut ctx = create_datafusion_context(); let df = ctx.read_parquet(path.to_str().unwrap())?; - Ok(BallistaDataFrame::from(self.state.clone(), df)) + Ok(df) } /// Create a DataFrame representing a CSV table scan @@ -105,7 +103,7 @@ impl BallistaContext { &self, path: &str, options: CsvReadOptions, - ) -> Result { + ) -> Result> { // convert to absolute path because the executor likely has a different working directory let path = PathBuf::from(path); let path = fs::canonicalize(&path)?; @@ -113,11 +111,11 @@ impl BallistaContext { // use local 
DataFusion context for now but later this might call the scheduler let mut ctx = create_datafusion_context(); let df = ctx.read_csv(path.to_str().unwrap(), options)?; - Ok(BallistaDataFrame::from(self.state.clone(), df)) + Ok(df) } /// Register a DataFrame as a table that can be referenced from a SQL query - pub fn register_table(&self, name: &str, table: &BallistaDataFrame) -> Result<()> { + pub fn register_table(&self, name: &str, table: &dyn DataFrame) -> Result<()> { let mut state = self.state.lock().unwrap(); state .tables @@ -132,16 +130,16 @@ impl BallistaContext { options: CsvReadOptions, ) -> Result<()> { let df = self.read_csv(path, options)?; - self.register_table(name, &df) + self.register_table(name, df.as_ref()) } pub fn register_parquet(&self, name: &str, path: &str) -> Result<()> { let df = self.read_parquet(path)?; - self.register_table(name, &df) + self.register_table(name, df.as_ref()) } /// Create a DataFrame from a SQL statement - pub fn sql(&self, sql: &str) -> Result { + pub fn sql(&self, sql: &str) -> Result> { // use local DataFusion context for now but later this might call the scheduler let mut ctx = create_datafusion_context(); // register tables @@ -154,27 +152,13 @@ impl BallistaContext { Arc::new(DfTableAdapter::new(plan, execution_plan)), )?; } - let df = ctx.sql(sql)?; - Ok(BallistaDataFrame::from(self.state.clone(), df)) + ctx.sql(sql) } -} - -/// The Ballista DataFrame is a wrapper around the DataFusion DataFrame and overrides the -/// `collect` method so that the query is executed against Ballista and not DataFusion. - -pub struct BallistaDataFrame { - /// Ballista context state - state: Arc>, - /// DataFusion DataFrame representing logical query plan - df: Arc, -} -impl BallistaDataFrame { - fn from(state: Arc>, df: Arc) -> Self { - Self { state, df } - } - - pub async fn collect(&self) -> Result>> { + pub async fn collect( + &self, + plan: &LogicalPlan, + ) -> Result>> { let scheduler_url = { let state = self.state.lock().unwrap(); @@ -183,16 +167,22 @@ impl BallistaDataFrame { info!("Connecting to Ballista scheduler at {}", scheduler_url); - let mut scheduler = SchedulerGrpcClient::connect(scheduler_url).await?; + let mut scheduler = SchedulerGrpcClient::connect(scheduler_url) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let plan = self.df.to_logical_plan(); let schema: Schema = plan.schema().as_ref().clone().into(); let job_id = scheduler .execute_query(ExecuteQueryParams { - query: Some(Query::LogicalPlan((&plan).try_into()?)), + query: Some(Query::LogicalPlan( + (plan) + .try_into() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?, + )), }) - .await? + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? .into_inner() .job_id; @@ -201,10 +191,11 @@ impl BallistaDataFrame { .get_job_status(GetJobStatusParams { job_id: job_id.clone(), }) - .await? + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? 
.into_inner(); let status = status.and_then(|s| s.status).ok_or_else(|| { - BallistaError::Internal("Received empty status message".to_owned()) + DataFusionError::Internal("Received empty status message".to_owned()) })?; let wait_future = tokio::time::sleep(Duration::from_millis(100)); match status { @@ -219,19 +210,19 @@ impl BallistaDataFrame { job_status::Status::Failed(err) => { let msg = format!("Job {} failed: {}", job_id, err.error); error!("{}", msg); - break Err(BallistaError::General(msg)); + break Err(DataFusionError::Execution(msg)); } job_status::Status::Completed(completed) => { // TODO: use streaming. Probably need to change the signature of fetch_partition to achieve that let mut result = vec![]; for location in completed.partition_location { let metadata = location.executor_meta.ok_or_else(|| { - BallistaError::Internal( + DataFusionError::Internal( "Received empty executor metadata".to_owned(), ) })?; let partition_id = location.partition_id.ok_or_else(|| { - BallistaError::Internal( + DataFusionError::Internal( "Received empty partition id".to_owned(), ) })?; @@ -239,14 +230,18 @@ impl BallistaDataFrame { metadata.host.as_str(), metadata.port as u16, ) - .await?; + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; let stream = ballista_client .fetch_partition( &partition_id.job_id, partition_id.stage_id as usize, partition_id.partition_id as usize, ) - .await?; + .await + .map_err(|e| { + DataFusionError::Execution(format!("{:?}", e)) + })?; result.append( &mut datafusion::physical_plan::common::collect(stream) .await?, @@ -261,140 +256,4 @@ impl BallistaDataFrame { }; } } - - pub fn select_columns(&self, columns: &[&str]) -> Result { - Ok(Self::from( - self.state.clone(), - self.df - .select_columns(columns) - .map_err(BallistaError::from)?, - )) - } - - pub fn select(&self, expr: Vec) -> Result { - Ok(Self::from( - self.state.clone(), - self.df.select(expr).map_err(BallistaError::from)?, - )) - } - - pub fn filter(&self, expr: Expr) -> Result { - Ok(Self::from( - self.state.clone(), - self.df.filter(expr).map_err(BallistaError::from)?, - )) - } - - pub fn aggregate( - &self, - group_expr: Vec, - aggr_expr: Vec, - ) -> Result { - Ok(Self::from( - self.state.clone(), - self.df - .aggregate(group_expr, aggr_expr) - .map_err(BallistaError::from)?, - )) - } - - pub fn limit(&self, n: usize) -> Result { - Ok(Self::from( - self.state.clone(), - self.df.limit(n).map_err(BallistaError::from)?, - )) - } - - pub fn sort(&self, expr: Vec) -> Result { - Ok(Self::from( - self.state.clone(), - self.df.sort(expr).map_err(BallistaError::from)?, - )) - } - - // TODO lifetime issue - // pub fn join(&self, right: Arc, join_type: JoinType, left_cols: &[&str], right_cols: &[&str]) -> - // Result { Ok(Self::from(self.state.clone(), self.df.join(right, join_type, &left_cols, - // &right_cols).map_err(BallistaError::from)?)) } - - pub fn repartition( - &self, - partitioning_scheme: Partitioning, - ) -> Result { - Ok(Self::from( - self.state.clone(), - self.df - .repartition(partitioning_scheme) - .map_err(BallistaError::from)?, - )) - } - - pub fn schema(&self) -> &DFSchema { - self.df.schema() - } - - pub fn to_logical_plan(&self) -> LogicalPlan { - self.df.to_logical_plan() - } - - pub fn explain(&self, verbose: bool) -> Result { - Ok(Self::from( - self.state.clone(), - self.df.explain(verbose).map_err(BallistaError::from)?, - )) - } } - -// #[async_trait] -// impl ExecutionContext for BallistaContext { -// async fn get_executor_ids(&self) -> Result> { -// match 
&self.config.discovery_mode { -// DiscoveryMode::Etcd => etcd_get_executors(&self.config.etcd_urls, "default").await, -// DiscoveryMode::Kubernetes => k8s_get_executors("default", "ballista").await, -// DiscoveryMode::Standalone => Err(ballista_error("Standalone mode not implemented yet")), -// } -// } -// -// async fn execute_task( -// &self, -// executor_meta: ExecutorMeta, -// task: ExecutionTask, -// ) -> Result { -// // TODO what is the point of returning this info since it is based on input arg? -// let shuffle_id = ShuffleId::new(task.job_uuid, task.stage_id, task.partition_id); -// -// let _ = execute_action( -// &executor_meta.host, -// executor_meta.port, -// &Action::Execute(task), -// ) -// .await?; -// -// Ok(shuffle_id) -// } -// -// async fn read_shuffle(&self, shuffle_id: &ShuffleId) -> Result> { -// match self.shuffle_locations.get(shuffle_id) { -// Some(executor_meta) => { -// let batches = execute_action( -// &executor_meta.host, -// executor_meta.port, -// &Action::FetchShuffle(*shuffle_id), -// ) -// .await?; -// Ok(batches -// .iter() -// .map(|b| ColumnarBatch::from_arrow(b)) -// .collect()) -// } -// _ => Err(ballista_error(&format!( -// "Failed to resolve executor UUID for shuffle ID {:?}", -// shuffle_id -// ))), -// } -// } -// -// fn config(&self) -> ExecutorConfig { -// self.config.clone() -// } -// } diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 543e84f33097d..deaca496651ba 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -259,8 +259,8 @@ async fn benchmark_ballista(opt: BenchmarkOpt) -> Result<()> { .sql(&sql) .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; let mut batches = vec![]; - let mut stream = df - .collect() + let mut stream = ctx + .collect(&df.to_logical_plan()) .await .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; while let Some(result) = stream.next().await { From 1f1130e5c51bba05bd55d0495bbe0d952841a1d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 25 Apr 2021 15:47:50 +0200 Subject: [PATCH 036/329] Use arrow eq kernels in CaseWhen (#52) --- .../src/physical_plan/expressions/case.rs | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index e8c500e5ed62b..723438df60f78 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -17,13 +17,13 @@ use std::{any::Any, sync::Arc}; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ColumnarValue, PhysicalExpr}; use arrow::array::{self, *}; +use arrow::compute::{eq, eq_utf8}; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; -use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{ColumnarValue, PhysicalExpr}; - /// The CASE expression is similar to a series of nested if/else and there are two forms that /// can be used. The first form consists of a series of boolean "when" expressions with /// corresponding "then" expressions, and an optional "else" expression. @@ -265,7 +265,7 @@ fn build_null_array(data_type: &DataType, num_rows: usize) -> Result { } macro_rules! array_equals { - ($TY:ty, $L:expr, $R:expr) => {{ + ($TY:ty, $L:expr, $R:expr, $eq_fn:expr) => {{ let when_value = $L .as_ref() .as_any() @@ -278,15 +278,7 @@ macro_rules! 
array_equals { .downcast_ref::<$TY>() .expect("array_equals downcast failed"); - let mut builder = BooleanBuilder::new(when_value.len()); - for row in 0..when_value.len() { - if when_value.is_valid(row) && base_value.is_valid(row) { - builder.append_value(when_value.value(row) == base_value.value(row))?; - } else { - builder.append_null()?; - } - } - Ok(builder.finish()) + $eq_fn(when_value, base_value).map_err(DataFusionError::from) }}; } @@ -296,17 +288,39 @@ fn array_equals( base_value: ArrayRef, ) -> Result { match data_type { - DataType::UInt8 => array_equals!(array::UInt8Array, when_value, base_value), - DataType::UInt16 => array_equals!(array::UInt16Array, when_value, base_value), - DataType::UInt32 => array_equals!(array::UInt32Array, when_value, base_value), - DataType::UInt64 => array_equals!(array::UInt64Array, when_value, base_value), - DataType::Int8 => array_equals!(array::Int8Array, when_value, base_value), - DataType::Int16 => array_equals!(array::Int16Array, when_value, base_value), - DataType::Int32 => array_equals!(array::Int32Array, when_value, base_value), - DataType::Int64 => array_equals!(array::Int64Array, when_value, base_value), - DataType::Float32 => array_equals!(array::Float32Array, when_value, base_value), - DataType::Float64 => array_equals!(array::Float64Array, when_value, base_value), - DataType::Utf8 => array_equals!(array::StringArray, when_value, base_value), + DataType::UInt8 => { + array_equals!(array::UInt8Array, when_value, base_value, eq) + } + DataType::UInt16 => { + array_equals!(array::UInt16Array, when_value, base_value, eq) + } + DataType::UInt32 => { + array_equals!(array::UInt32Array, when_value, base_value, eq) + } + DataType::UInt64 => { + array_equals!(array::UInt64Array, when_value, base_value, eq) + } + DataType::Int8 => { + array_equals!(array::Int8Array, when_value, base_value, eq) + } + DataType::Int16 => { + array_equals!(array::Int16Array, when_value, base_value, eq) + } + DataType::Int32 => { + array_equals!(array::Int32Array, when_value, base_value, eq) + } + DataType::Int64 => { + array_equals!(array::Int64Array, when_value, base_value, eq) + } + DataType::Float32 => { + array_equals!(array::Float32Array, when_value, base_value, eq) + } + DataType::Float64 => { + array_equals!(array::Float64Array, when_value, base_value, eq) + } + DataType::Utf8 => { + array_equals!(array::StringArray, when_value, base_value, eq_utf8) + } other => Err(DataFusionError::Execution(format!( "CASE does not support '{:?}'", other From 35bc3d7edb51bbc0e98e911d0b3a6fa1abf0d4cf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 25 Apr 2021 08:03:58 -0600 Subject: [PATCH 037/329] Remove empty rust dir (#61) --- rust/.gitignore | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 rust/.gitignore diff --git a/rust/.gitignore b/rust/.gitignore deleted file mode 100644 index 389f4ab254bc4..0000000000000 --- a/rust/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -Cargo.lock -target -rusty-tags.vi -.history -.flatbuffers/ From ddaea81f9f46e918b5ab4e6257f1963b2a8a0f15 Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Mon, 26 Apr 2021 01:02:05 +0200 Subject: [PATCH 038/329] Fix tpch-gen (#74) --- benchmarks/tpch-gen.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/tpch-gen.sh b/benchmarks/tpch-gen.sh index 3cef3bdfdd87c..676c0e7df52bf 100755 --- a/benchmarks/tpch-gen.sh +++ b/benchmarks/tpch-gen.sh @@ -18,7 +18,9 @@ #set -e +pushd .. . 
./dev/build-set-env.sh +popd docker build -t ballistacompute/ballista-tpchgen:$BALLISTA_VERSION -f tpchgen.dockerfile . # Generate data into the ./data directory if it does not already exist From 8380c5dc9d3d9e98e3c6ceeb61db1d0a78371fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 26 Apr 2021 23:34:26 +0200 Subject: [PATCH 039/329] Add query 19 to TPC-H regression tests (#59) --- benchmarks/src/bin/tpch.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index deaca496651ba..cee555fe675e0 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -681,6 +681,11 @@ mod tests { run_query(14).await } + #[tokio::test] + async fn run_q19() -> Result<()> { + run_query(19).await + } + /// Specialised String representation fn col_str(column: &ArrayRef, row_index: usize) -> String { if column.is_null(row_index) { From e973e30e0838486b248290ab9381b0df39d02f24 Mon Sep 17 00:00:00 2001 From: sathis Date: Tue, 27 Apr 2021 03:07:10 +0530 Subject: [PATCH 040/329] Deduplicate README.md (#79) * Deduplicate README.md * Remove CONTRIBUTING.md as it is no longer relevant Co-authored-by: Sathis Kumar --- CONTRIBUTING.md | 77 ----- datafusion/DEVELOPERS.md => DEVELOPERS.md | 28 +- README.md | 18 +- datafusion/Cargo.toml | 1 + datafusion/README.md | 358 ---------------------- datafusion/src/lib.rs | 2 +- 6 files changed, 26 insertions(+), 458 deletions(-) delete mode 100644 CONTRIBUTING.md rename datafusion/DEVELOPERS.md => DEVELOPERS.md (69%) delete mode 100644 datafusion/README.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 3e636d9cd2fe4..0000000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,77 +0,0 @@ - - -# How to contribute to Apache Arrow - -## Did you find a bug? - -The Arrow project uses JIRA as a bug tracker. To report a bug, you'll have -to first create an account on the -[Apache Foundation JIRA](https://issues.apache.org/jira/). The JIRA server -hosts bugs and issues for multiple Apache projects. The JIRA project name -for Arrow is "ARROW". - -To be assigned to an issue, ask an Arrow JIRA admin to go to -[Arrow Roles](https://issues.apache.org/jira/plugins/servlet/project-config/ARROW/roles), -click "Add users to a role," and add you to the "Contributor" role. Most -committers are authorized to do this; if you're a committer and aren't -able to load that project admin page, have someone else add you to the -necessary role. - -Before you create a new bug entry, we recommend you first -[search](https://issues.apache.org/jira/projects/ARROW/issues/ARROW-5140?filter=allopenissues) -among existing Arrow issues. - -When you create a new JIRA entry, please don't forget to fill the "Component" -field. Arrow has many subcomponents and this helps triaging and filtering -tremendously. Also, we conventionally prefix the issue title with the component -name in brackets, such as "[C++] Crash in Array::Frobnicate()", so as to make -lists more easy to navigate, and we'd be grateful if you did the same. - -## Did you write a patch that fixes a bug or brings an improvement? - -First create a JIRA entry as described above. Then, submit your changes -as a GitHub Pull Request. We'll ask you to prefix the pull request title -with the JIRA issue number and the component name in brackets. -(for example: "ARROW-2345: [C++] Fix crash in Array::Frobnicate()"). -Respecting this convention makes it easier for us to process the backlog -of submitted Pull Requests. 
- -### Minor Fixes - -Any functionality change should have a JIRA opened. For minor changes that -affect documentation, you do not need to open up a JIRA. Instead you can -prefix the title of your PR with "MINOR: " if meets the following guidelines: - -* Grammar, usage and spelling fixes that affect no more than 2 files -* Documentation updates affecting no more than 2 files and not more - than 500 words. - -## Do you want to propose a significant new feature or an important refactoring? - -We ask that all discussions about major changes in the codebase happen -publicly on the [arrow-dev mailing-list](https://mail-archives.apache.org/mod_mbox/arrow-dev/). - -## Do you have questions about the source code, the build procedure or the development process? - -You can also ask on the mailing-list, see above. - -## Further information - -Please read our [development documentation](https://arrow.apache.org/docs/developers/contributing.html). diff --git a/datafusion/DEVELOPERS.md b/DEVELOPERS.md similarity index 69% rename from datafusion/DEVELOPERS.md rename to DEVELOPERS.md index aa80cb71d3b9c..1dc9304651c81 100644 --- a/datafusion/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -35,21 +35,21 @@ DataFusion is written in Rust and it uses a standard rust toolkit: Below is a checklist of what you need to do to add a new scalar function to DataFusion: * Add the actual implementation of the function: - * [here](src/physical_plan/string_expressions.rs) for string functions - * [here](src/physical_plan/math_expressions.rs) for math functions - * [here](src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](src/physical_plan) for other functions -* In [src/physical_plan/functions](src/physical_plan/functions.rs), add: + * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + * create a new module [here](datafusion/src/physical_plan) for other functions +* In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: * a new variant to `BuiltinScalarFunction` * a new entry to `FromStr` with the name of the function as called by SQL * a new line in `return_type` with the expected return type of the function, given an incoming type * a new line in `signature` with the signature of the function (number and types of its arguments) * a new line in `create_physical_expr` mapping the built-in to the implementation * tests to the function. -* In [tests/sql.rs](tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. -* In [src/logical_plan/expr](src/logical_plan/expr.rs), add: +* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +* In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: * a new entry of the `unary_scalar_expr!` macro for the new function. -* In [src/logical_plan/mod](src/logical_plan/mod.rs), add: +* In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: * a new entry in the `pub use expr::{}` set. 
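For the final step in the checklist above — the end-to-end SQL test — a minimal sketch of what such a test can look like is shown below. The function name `my_sqrt`, the use of the `aggregate_test_100.csv` fixture, and the chosen column are illustrative assumptions, not part of this patch:

```rust
// Sketch of a tests/sql.rs-style test for a newly added scalar function.
// `my_sqrt` is a placeholder for the function being added; adjust the
// table, column, and expected output to the actual function.
#[tokio::test]
async fn my_sqrt_called_through_sql() -> datafusion::error::Result<()> {
    use datafusion::prelude::*;

    let mut ctx = ExecutionContext::new();

    // register a well-known fixture shipped with the Arrow test data
    let testdata = datafusion::arrow::util::test_util::arrow_test_data();
    ctx.register_csv(
        "aggregate_test_100",
        &format!("{}/csv/aggregate_test_100.csv", testdata),
        CsvReadOptions::new(),
    )?;

    // call the new built-in through SQL and collect the results
    let df = ctx.sql("SELECT my_sqrt(c12) FROM aggregate_test_100 LIMIT 3")?;
    let results = df.collect().await?;
    assert_eq!(results[0].num_columns(), 1);
    Ok(())
}
```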
## How to add a new aggregate function @@ -57,18 +57,18 @@ Below is a checklist of what you need to do to add a new scalar function to Data Below is a checklist of what you need to do to add a new aggregate function to DataFusion: * Add the actual implementation of an `Accumulator` and `AggregateExpr`: - * [here](src/physical_plan/string_expressions.rs) for string functions - * [here](src/physical_plan/math_expressions.rs) for math functions - * [here](src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](src/physical_plan) for other functions -* In [src/physical_plan/aggregates](src/physical_plan/aggregates.rs), add: + * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + * create a new module [here](datafusion/src/physical_plan) for other functions +* In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: * a new variant to `BuiltinAggregateFunction` * a new entry to `FromStr` with the name of the function as called by SQL * a new line in `return_type` with the expected return type of the function, given an incoming type * a new line in `signature` with the signature of the function (number and types of its arguments) * a new line in `create_aggregate_expr` mapping the built-in to the implementation * tests to the function. -* In [tests/sql.rs](tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. ## How to display plans graphically diff --git a/README.md b/README.md index 9e6b7a2a78b5b..f6ef7d176686e 100644 --- a/README.md +++ b/README.md @@ -97,8 +97,8 @@ async fn main() -> datafusion::error::Result<()> { let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(100)?; + .aggregate(vec![col("a")], vec![min(col("b"))])? + .limit(100)?; // execute and print results let results: Vec = df.collect().await?; @@ -141,11 +141,11 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] SQL Parser - [x] SQL Query Planner - [x] Query Optimizer - - [x] Constant folding - - [x] Join Reordering - - [x] Limit Pushdown - - [x] Projection push down - - [x] Predicate push down +- [x] Constant folding +- [x] Join Reordering +- [x] Limit Pushdown +- [x] Projection push down +- [x] Predicate push down - [x] Type coercion - [x] Parallel query execution @@ -213,7 +213,9 @@ DataFusion also includes a simple command-line interactive SQL utility. 
See the - [ ] MINUS - [x] Joins - [x] INNER JOIN - - [ ] CROSS JOIN + - [x] LEFT JOIN + - [x] RIGHT JOIN + - [x] CROSS JOIN - [ ] OUTER JOIN - [ ] Window diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index eaa7031794cf7..3a7e857fe551f 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -21,6 +21,7 @@ description = "DataFusion is an in-memory query engine that uses Apache Arrow as version = "4.0.0-SNAPSHOT" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" +readme = "../README.md" authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow", "query", "sql" ] diff --git a/datafusion/README.md b/datafusion/README.md deleted file mode 100644 index ff0b26d7bf031..0000000000000 --- a/datafusion/README.md +++ /dev/null @@ -1,358 +0,0 @@ - - -# DataFusion - - - -DataFusion is an extensible query execution framework, written in -Rust, that uses [Apache Arrow](https://arrow.apache.org) as its -in-memory format. - -DataFusion supports both an SQL and a DataFrame API for building -logical query plans as well as a query optimizer and execution engine -capable of parallel execution against partitioned data sources (CSV -and Parquet) using threads. - -## Use Cases - -DataFusion is used to create modern, fast and efficient data -pipelines, ETL processes, and database systems, which need the -performance of Rust and Apache Arrow and want to provide their users -the convenience of an SQL interface or a DataFrame API. - -## Why DataFusion? - -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. - -## Known Uses - -Here are some of the projects known to use DataFusion: - -* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform -* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube.js](https://github.com/cube-js/cube.js) -* [datafusion-python](https://pypi.org/project/datafusion) -* [delta-rs](https://github.com/delta-io/delta-rs) -* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database -* [ROAPI](https://github.com/roapi/roapi) - -(if you know of another project, please submit a PR to add a link!) 
- -## Example Usage - -Run a SQL query against data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // register the table - let mut ctx = ExecutionContext::new(); - ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; - - // create a plan to run a SQL query - let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; - - // execute and print results - let results: Vec = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Use the DataFrame API to process data stored in a CSV: - -```rust -use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; - -#[tokio::main] -async fn main() -> datafusion::error::Result<()> { - // create the dataframe - let mut ctx = ExecutionContext::new(); - let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; - - let df = df.filter(col("a").lt_eq(col("b")))? - .aggregate(vec![col("a")], vec![min(col("b"))])? - .limit(100)?; - - // execute and print results - let results: Vec = df.collect().await?; - print_batches(&results)?; - Ok(()) -} -``` - -Both of these examples will produce - -```text -+---+--------+ -| a | MIN(b) | -+---+--------+ -| 1 | 2 | -+---+--------+ -``` - - - -## Using DataFusion as a library - -DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). - -To get started, add the following to your `Cargo.toml` file: - -```toml -[dependencies] -datafusion = "4.0.0-SNAPSHOT" -``` - -## Using DataFusion as a binary - -DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information. 
- -# Status - -## General - -- [x] SQL Parser -- [x] SQL Query Planner -- [x] Query Optimizer - - [x] Constant folding - - [x] Join Reordering - - [x] Limit Pushdown - - [x] Projection push down - - [x] Predicate push down -- [x] Type coercion -- [x] Parallel query execution - -## SQL Support - -- [x] Projection -- [x] Filter (WHERE) -- [x] Filter post-aggregate (HAVING) -- [x] Limit -- [x] Aggregate -- [x] Common math functions -- [x] cast -- [x] try_cast -- Postgres compatible String functions - - [x] ascii - - [x] bit_length - - [x] btrim - - [x] char_length - - [x] character_length - - [x] chr - - [x] concat - - [x] concat_ws - - [x] initcap - - [x] left - - [x] length - - [x] lpad - - [x] ltrim - - [x] octet_length - - [x] regexp_replace - - [x] repeat - - [x] replace - - [x] reverse - - [x] right - - [x] rpad - - [x] rtrim - - [x] split_part - - [x] starts_with - - [x] strpos - - [x] substr - - [x] to_hex - - [x] translate - - [x] trim -- Miscellaneous/Boolean functions - - [x] nullif -- Common date/time functions - - [ ] Basic date functions - - [ ] Basic time functions - - [x] Basic timestamp functions -- nested functions - - [x] Array of columns -- [x] Schema Queries - - [x] SHOW TABLES - - [x] SHOW COLUMNS - - [x] information_schema.{tables, columns} - - [ ] information_schema other views -- [x] Sorting -- [ ] Nested types -- [ ] Lists -- [x] Subqueries -- [x] Common table expressions -- [ ] Set Operations - - [x] UNION ALL - - [ ] UNION - - [ ] INTERSECT - - [ ] MINUS -- [x] Joins - - [x] INNER JOIN - - [x] LEFT JOIN - - [x] RIGHT JOIN - - [x] CROSS JOIN - - [ ] OUTER JOIN -- [ ] Window - -## Data Sources - -- [x] CSV -- [x] Parquet primitive types -- [ ] Parquet nested types - - -## Extensibility - -DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: - -- [x] User Defined Functions (UDFs) -- [x] User Defined Aggregate Functions (UDAFs) -- [x] User Defined Table Source (`TableProvider`) for tables -- [x] User Defined `Optimizer` passes (plan rewrites) -- [x] User Defined `LogicalPlan` nodes -- [x] User Defined `ExecutionPlan` nodes - - -# Supported SQL - -This library currently supports many SQL constructs, including - -* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -* `SELECT ... FROM ...` together with any expression -* `ALIAS` to name an expression -* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -* `WHERE` to filter -* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` -* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - - -## Supported Functions - -DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. - -Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations. - -## Schema Metadata / Information Schema Support - -DataFusion supports the showing metadata about the tables available. This information can be accessed using the views of the ISO SQL `information_schema` schema or the DataFusion specific `SHOW TABLES` and `SHOW COLUMNS` commands. 
- -More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). - - -To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: - -```sql -> show tables; -+---------------+--------------------+------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | tables | VIEW | -+---------------+--------------------+------------+------------+ - -> select * from information_schema.tables; - -+---------------+--------------------+------------+--------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+------------+--------------+ -| datafusion | public | t | BASE TABLE | -| datafusion | information_schema | TABLES | SYSTEM TABLE | -+---------------+--------------------+------------+--------------+ -``` - -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: - -```sql -> show columns from t; -+---------------+--------------+------------+-------------+-----------+-------------+ -| table_catalog | table_schema | table_name | column_name | data_type | is_nullable | -+---------------+--------------+------------+-------------+-----------+-------------+ -| datafusion | public | t | a | Int32 | NO | -| datafusion | public | t | b | Utf8 | NO | -| datafusion | public | t | c | Float32 | NO | -+---------------+--------------+------------+-------------+-----------+-------------+ - -> select table_name, column_name, ordinal_position, is_nullable, data_type from information_schema.columns; -+------------+-------------+------------------+-------------+-----------+ -| table_name | column_name | ordinal_position | is_nullable | data_type | -+------------+-------------+------------------+-------------+-----------+ -| t | a | 0 | NO | Int32 | -| t | b | 1 | NO | Utf8 | -| t | c | 2 | NO | Float32 | -+------------+-------------+------------------+-------------+-----------+ -``` - - - -## Supported Data Types - -DataFusion uses Arrow, and thus the Arrow type system, for query -execution. The SQL types from -[sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) -are mapped to Arrow types according to the following table - - -| SQL Data Type | Arrow DataType | -| --------------- | -------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | *Not yet supported* | -| `CLOB` | *Not yet supported* | -| `BINARY` | *Not yet supported* | -| `VARBINARY` | *Not yet supported* | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float64` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Date64` | -| `INTERVAL` | *Not yet supported* | -| `REGCLASS` | *Not yet supported* | -| `TEXT` | *Not yet supported* | -| `BYTEA` | *Not yet supported* | -| `CUSTOM` | *Not yet supported* | -| `ARRAY` | *Not yet supported* | - - -# Architecture Overview - -There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. 
- -* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) -* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) - - -# Developer's guide - -Please see [Developers Guide](DEVELOPERS.md) for information about developing DataFusion. diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 252d168114add..e1d7368469b0b 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -211,4 +211,4 @@ pub mod test; extern crate lazy_static; #[cfg(doctest)] -doc_comment::doctest!("../README.md", readme_example_test); +doc_comment::doctest!("../../README.md", readme_example_test); From 3855473e0bc3b8713a1c1b1fe21efefb8ca32cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=80=9D=E7=BB=B4?= Date: Tue, 27 Apr 2021 05:39:01 +0800 Subject: [PATCH 041/329] ARROW-12306: Read CSV format text from stdin or memory (#54) --- datafusion/src/datasource/csv.rs | 152 +++++++++++++++--- datafusion/src/physical_plan/csv.rs | 231 ++++++++++++++++++++++++---- 2 files changed, 334 insertions(+), 49 deletions(-) diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 6f6c9abe07741..1bd1b4be823ee 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -35,8 +35,9 @@ use arrow::datatypes::SchemaRef; use std::any::Any; +use std::io::{Read, Seek}; use std::string::String; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use crate::datasource::datasource::Statistics; use crate::datasource::TableProvider; @@ -46,10 +47,17 @@ use crate::physical_plan::csv::CsvExec; pub use crate::physical_plan::csv::CsvReadOptions; use crate::physical_plan::{common, ExecutionPlan}; +enum Source { + /// Path to a single CSV file or a directory containing one of more CSV files + Path(String), + + /// Read CSV data from a reader + Reader(Mutex>>), +} + /// Represents a CSV file with a provided schema pub struct CsvFile { - /// Path to a single CSV file or a directory containing one of more CSV files - path: String, + source: Source, schema: SchemaRef, has_header: bool, delimiter: u8, @@ -77,7 +85,7 @@ impl CsvFile { }); Ok(Self { - path: String::from(path), + source: Source::Path(path.to_string()), schema, has_header: options.has_header, delimiter: options.delimiter, @@ -86,9 +94,64 @@ impl CsvFile { }) } + /// Attempt to initialize a `CsvFile` from a reader. The schema MUST be provided in options. + pub fn try_new_from_reader( + reader: R, + options: CsvReadOptions, + ) -> Result { + let schema = Arc::new(match options.schema { + Some(s) => s.clone(), + None => { + return Err(DataFusionError::Execution( + "Schema must be provided to CsvRead".to_string(), + )); + } + }); + + Ok(Self { + source: Source::Reader(Mutex::new(Some(Box::new(reader)))), + schema, + has_header: options.has_header, + delimiter: options.delimiter, + statistics: Statistics::default(), + file_extension: String::new(), + }) + } + + /// Attempt to initialize a `CsvRead` from a reader impls `Seek`. The schema can be inferred automatically. 
+ pub fn try_new_from_reader_infer_schema( + mut reader: R, + options: CsvReadOptions, + ) -> Result { + let schema = Arc::new(match options.schema { + Some(s) => s.clone(), + None => { + let (schema, _) = arrow::csv::reader::infer_file_schema( + &mut reader, + options.delimiter, + Some(options.schema_infer_max_records), + options.has_header, + )?; + schema + } + }); + + Ok(Self { + source: Source::Reader(Mutex::new(Some(Box::new(reader)))), + schema, + has_header: options.has_header, + delimiter: options.delimiter, + statistics: Statistics::default(), + file_extension: String::new(), + }) + } + /// Get the path for the CSV file(s) represented by this CsvFile instance pub fn path(&self) -> &str { - &self.path + match &self.source { + Source::Reader(_) => "", + Source::Path(path) => path, + } } /// Determine whether the CSV file(s) represented by this CsvFile instance have a header row @@ -123,22 +186,75 @@ impl TableProvider for CsvFile { _filters: &[Expr], limit: Option, ) -> Result> { - Ok(Arc::new(CsvExec::try_new( - &self.path, - CsvReadOptions::new() - .schema(&self.schema) - .has_header(self.has_header) - .delimiter(self.delimiter) - .file_extension(self.file_extension.as_str()), - projection.clone(), - limit - .map(|l| std::cmp::min(l, batch_size)) - .unwrap_or(batch_size), - limit, - )?)) + let opts = CsvReadOptions::new() + .schema(&self.schema) + .has_header(self.has_header) + .delimiter(self.delimiter) + .file_extension(self.file_extension.as_str()); + let batch_size = limit + .map(|l| std::cmp::min(l, batch_size)) + .unwrap_or(batch_size); + + let exec = match &self.source { + Source::Reader(maybe_reader) => { + if let Some(rdr) = maybe_reader.lock().unwrap().take() { + CsvExec::try_new_from_reader( + rdr, + opts, + projection.clone(), + batch_size, + limit, + )? + } else { + return Err(DataFusionError::Execution( + "You can only read once if the data comes from a reader" + .to_string(), + )); + } + } + Source::Path(p) => { + CsvExec::try_new(&p, opts, projection.clone(), batch_size, limit)? + } + }; + Ok(Arc::new(exec)) } fn statistics(&self) -> Statistics { self.statistics.clone() } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::prelude::*; + + #[tokio::test] + async fn csv_file_from_reader() -> Result<()> { + let testdata = arrow::util::test_util::arrow_test_data(); + let filename = "aggregate_test_100.csv"; + let path = format!("{}/csv/{}", testdata, filename); + let buf = std::fs::read(path).unwrap(); + let rdr = std::io::Cursor::new(buf); + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "aggregate_test", + Arc::new(CsvFile::try_new_from_reader_infer_schema( + rdr, + CsvReadOptions::new(), + )?), + )?; + let df = ctx.sql("select max(c2) from aggregate_test")?; + let batches = df.collect().await?; + assert_eq!( + batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 5 + ); + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 7ee5ae3fd90b0..b96a702f27325 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -17,12 +17,6 @@ //! 
Execution plan for reading CSV files -use std::any::Any; -use std::fs::File; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - use crate::error::{DataFusionError, Result}; use crate::physical_plan::ExecutionPlan; use crate::physical_plan::{common, Partitioning}; @@ -31,6 +25,13 @@ use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; use futures::Stream; +use std::any::Any; +use std::fs::File; +use std::io::Read; +use std::pin::Pin; +use std::sync::Arc; +use std::sync::Mutex; +use std::task::{Context, Poll}; use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; @@ -106,13 +107,69 @@ impl<'a> CsvReadOptions<'a> { } } +/// Source represents where the data comes from. +enum Source { + /// The data comes from partitioned files + PartitionedFiles { + /// Path to directory containing partitioned files with the same schema + path: String, + /// The individual files under path + filenames: Vec, + }, + + /// The data comes from anything impl Read trait + Reader(Mutex>>), +} + +impl std::fmt::Debug for Source { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Source::PartitionedFiles { path, filenames } => f + .debug_struct("PartitionedFiles") + .field("path", path) + .field("filenames", filenames) + .finish()?, + Source::Reader(_) => f.write_str("Reader")?, + }; + Ok(()) + } +} + +impl Clone for Source { + fn clone(&self) -> Self { + match self { + Source::PartitionedFiles { path, filenames } => Self::PartitionedFiles { + path: path.clone(), + filenames: filenames.clone(), + }, + Source::Reader(_) => Self::Reader(Mutex::new(None)), + } + } +} + +impl Source { + /// Path to directory containing partitioned files with the same schema + pub fn path(&self) -> &str { + match self { + Source::PartitionedFiles { path, .. } => path.as_str(), + Source::Reader(_) => "", + } + } + + /// The individual files under path + pub fn filenames(&self) -> &[String] { + match self { + Source::PartitionedFiles { filenames, .. } => filenames, + Source::Reader(_) => &[], + } + } +} + /// Execution plan for scanning a CSV file #[derive(Debug, Clone)] pub struct CsvExec { - /// Path to directory containing partitioned CSV files with the same schema - path: String, - /// The individual files under path - filenames: Vec, + /// Where the data comes from. + source: Source, /// Schema representing the CSV file schema: SchemaRef, /// Does the CSV file have a header? 
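Taken together with the `CsvFile` changes earlier in this patch, the new reader-based constructors let CSV data come from any `Read` implementation rather than a file path. A minimal usage sketch follows; the in-memory CSV bytes, table name, and query are illustrative assumptions, mirroring the test added in `datasource/csv.rs` above:

```rust
// Sketch: querying CSV data held in memory via the reader-based API added
// in this patch. Names and data are made up for illustration.
use std::sync::Arc;

use datafusion::datasource::CsvFile;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // CSV content that never touches the filesystem
    let bytes = b"a,b\n1,10\n2,20\n3,30\n".to_vec();
    let rdr = std::io::Cursor::new(bytes);

    // Cursor implements Read + Seek, so the schema can be inferred
    let table = CsvFile::try_new_from_reader_infer_schema(rdr, CsvReadOptions::new())?;

    let mut ctx = ExecutionContext::new();
    ctx.register_table("mem_csv", Arc::new(table))?;

    let df = ctx.sql("SELECT a, MAX(b) FROM mem_csv GROUP BY a")?;
    let results = df.collect().await?;
    datafusion::arrow::util::pretty::print_batches(&results)?;
    Ok(())
}
```

This is the path the patch title refers to: CSV text arriving from stdin or an in-memory buffer can be registered and queried without first being written to disk.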
@@ -163,8 +220,10 @@ impl CsvExec { }; Ok(Self { - path: path.to_string(), - filenames, + source: Source::PartitionedFiles { + path: path.to_string(), + filenames, + }, schema: Arc::new(schema), has_header: options.has_header, delimiter: Some(options.delimiter), @@ -175,15 +234,50 @@ impl CsvExec { limit, }) } + /// Create a new execution plan for reading from a reader + pub fn try_new_from_reader( + reader: impl Read + Send + Sync + 'static, + options: CsvReadOptions, + projection: Option>, + batch_size: usize, + limit: Option, + ) -> Result { + let schema = match options.schema { + Some(s) => s.clone(), + None => { + return Err(DataFusionError::Execution( + "The schema must be provided in options when reading from a reader" + .to_string(), + )); + } + }; + + let projected_schema = match &projection { + None => schema.clone(), + Some(p) => Schema::new(p.iter().map(|i| schema.field(*i).clone()).collect()), + }; + + Ok(Self { + source: Source::Reader(Mutex::new(Some(Box::new(reader)))), + schema: Arc::new(schema), + has_header: options.has_header, + delimiter: Some(options.delimiter), + file_extension: String::new(), + projection, + projected_schema: Arc::new(projected_schema), + batch_size, + limit, + }) + } /// Path to directory containing partitioned CSV files with the same schema pub fn path(&self) -> &str { - &self.path + self.source.path() } /// The individual files under path pub fn filenames(&self) -> &[String] { - &self.filenames + self.source.filenames() } /// Does the CSV file have a header? @@ -249,7 +343,10 @@ impl ExecutionPlan for CsvExec { /// Get the output partitioning of this plan fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.filenames.len()) + Partitioning::UnknownPartitioning(match &self.source { + Source::PartitionedFiles { filenames, .. } => filenames.len(), + Source::Reader(_) => 1, + }) } fn children(&self) -> Vec> { @@ -272,25 +369,51 @@ impl ExecutionPlan for CsvExec { } async fn execute(&self, partition: usize) -> Result { - Ok(Box::pin(CsvStream::try_new( - &self.filenames[partition], - self.schema.clone(), - self.has_header, - self.delimiter, - &self.projection, - self.batch_size, - self.limit, - )?)) + match &self.source { + Source::PartitionedFiles { filenames, .. 
} => { + Ok(Box::pin(CsvStream::try_new( + &filenames[partition], + self.schema.clone(), + self.has_header, + self.delimiter, + &self.projection, + self.batch_size, + self.limit, + )?)) + } + Source::Reader(rdr) => { + if partition != 0 { + Err(DataFusionError::Internal( + "Only partition 0 is valid when CSV comes from a reader" + .to_string(), + )) + } else if let Some(rdr) = rdr.lock().unwrap().take() { + Ok(Box::pin(CsvStream::try_new_from_reader( + rdr, + self.schema.clone(), + self.has_header, + self.delimiter, + &self.projection, + self.batch_size, + self.limit, + )?)) + } else { + Err(DataFusionError::Execution( + "Error reading CSV: Data can only be read a single time when the source is a reader" + .to_string(), + )) + } + } + } } } /// Iterator over batches -struct CsvStream { +struct CsvStream { /// Arrow CSV reader - reader: csv::Reader, + reader: csv::Reader, } - -impl CsvStream { +impl CsvStream { /// Create an iterator for a CSV file pub fn try_new( filename: &str, @@ -302,11 +425,27 @@ impl CsvStream { limit: Option, ) -> Result { let file = File::open(filename)?; + Self::try_new_from_reader( + file, schema, has_header, delimiter, projection, batch_size, limit, + ) + } +} +impl CsvStream { + /// Create an iterator for a reader + pub fn try_new_from_reader( + reader: R, + schema: SchemaRef, + has_header: bool, + delimiter: Option, + projection: &Option>, + batch_size: usize, + limit: Option, + ) -> Result> { let start_line = if has_header { 1 } else { 0 }; let bounds = limit.map(|x| (0, x + start_line)); let reader = csv::Reader::new( - file, + reader, schema, has_header, delimiter, @@ -319,7 +458,7 @@ impl CsvStream { } } -impl Stream for CsvStream { +impl Stream for CsvStream { type Item = ArrowResult; fn poll_next( @@ -330,7 +469,7 @@ impl Stream for CsvStream { } } -impl RecordBatchStream for CsvStream { +impl RecordBatchStream for CsvStream { /// Get the schema fn schema(&self) -> SchemaRef { self.reader.schema() @@ -398,4 +537,34 @@ mod tests { assert_eq!("c3", batch_schema.field(2).name()); Ok(()) } + + #[tokio::test] + async fn csv_exec_with_reader() -> Result<()> { + let schema = aggr_test_schema(); + let testdata = arrow::util::test_util::arrow_test_data(); + let filename = "aggregate_test_100.csv"; + let path = format!("{}/csv/{}", testdata, filename); + let buf = std::fs::read(path).unwrap(); + let rdr = std::io::Cursor::new(buf); + let csv = CsvExec::try_new_from_reader( + rdr, + CsvReadOptions::new().schema(&schema), + Some(vec![0, 2, 4]), + 1024, + None, + )?; + assert_eq!(13, csv.schema.fields().len()); + assert_eq!(3, csv.projected_schema.fields().len()); + assert_eq!(13, csv.file_schema().fields().len()); + assert_eq!(3, csv.schema().fields().len()); + let mut stream = csv.execute(0).await?; + let batch = stream.next().await.unwrap()?; + assert_eq!(3, batch.num_columns()); + let batch_schema = batch.schema(); + assert_eq!(3, batch_schema.fields().len()); + assert_eq!("c1", batch_schema.field(0).name()); + assert_eq!("c3", batch_schema.field(1).name()); + assert_eq!("c5", batch_schema.field(2).name()); + Ok(()) + } } From e86ad26a678a32e2743bc031ae5cb81e6931a231 Mon Sep 17 00:00:00 2001 From: "K.I. 
(Dennis) Jung" Date: Tue, 27 Apr 2021 13:33:55 +0900 Subject: [PATCH 042/329] Add option param for standalone mode (#42) * add option param for standalone mode * Run formatter --- ballista/rust/executor/executor_config_spec.toml | 7 ++++++- ballista/rust/executor/src/main.rs | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml index cb47ca064236d..f7c2f037a59f2 100644 --- a/ballista/rust/executor/executor_config_spec.toml +++ b/ballista/rust/executor/executor_config_spec.toml @@ -76,4 +76,9 @@ abbr = "c" name = "concurrent_tasks" type = "usize" default = "4" -doc = "Max concurrent tasks." \ No newline at end of file +doc = "Max concurrent tasks." + +[[param]] +name = "scheduler_data_path" +type = "String" +doc = "Path for standalone data" diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index 2718ea3542faf..e620fa450376b 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -107,8 +107,14 @@ async fn main() -> Result<()> { if opt.local { info!("Running in local mode. Scheduler will be run in-proc"); - let client = StandaloneClient::try_new_temporary() - .context("Could not create standalone config backend")?; + + let client = match opt.scheduler_data_path { + Some(v) => StandaloneClient::try_new(v) + .context("Could not create standalone config backend")?, + None => StandaloneClient::try_new_temporary() + .context("Could not create standalone config backend")?, + }; + let server = SchedulerGrpcServer::new(SchedulerServer::new(Arc::new(client), namespace)); let addr = format!("{}:{}", bind_host, scheduler_port); From af4c05d77ef25f6d5f02ebe266b3e5b2af634dc4 Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Wed, 28 Apr 2021 03:47:45 +0200 Subject: [PATCH 043/329] Remove namespace from executors (#75) --- ballista/rust/executor/executor_config_spec.toml | 7 ------- ballista/rust/executor/src/main.rs | 7 ++++--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml index f7c2f037a59f2..2a7c96bde3aff 100644 --- a/ballista/rust/executor/executor_config_spec.toml +++ b/ballista/rust/executor/executor_config_spec.toml @@ -24,13 +24,6 @@ conf_file_param = "config_file" name = "version" doc = "Print version of this executable" -[[param]] -abbr = "n" -name = "namespace" -type = "String" -doc = "Namespace for the ballista cluster that this executor will join. 
yippee" -default = "std::string::String::from(\"ballista\")" - [[param]] name = "scheduler_host" type = "String" diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index e620fa450376b..9c8d466add4f4 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -70,7 +70,6 @@ async fn main() -> Result<()> { std::process::exit(0); } - let namespace = opt.namespace; let external_host = opt.external_host; let bind_host = opt.bind_host; let port = opt.port; @@ -115,8 +114,10 @@ async fn main() -> Result<()> { .context("Could not create standalone config backend")?, }; - let server = - SchedulerGrpcServer::new(SchedulerServer::new(Arc::new(client), namespace)); + let server = SchedulerGrpcServer::new(SchedulerServer::new( + Arc::new(client), + "ballista".to_string(), + )); let addr = format!("{}:{}", bind_host, scheduler_port); let addr = addr .parse() From 14f1eebef068a9e65f556ed74d2b6d98376c97f4 Mon Sep 17 00:00:00 2001 From: Ruan Pearce-Authers Date: Wed, 28 Apr 2021 11:26:58 +0100 Subject: [PATCH 044/329] Allow table providers to indicate their type for catalog metadata (#205) --- datafusion/src/catalog/information_schema.rs | 45 ++++++------- datafusion/src/datasource/datasource.rs | 16 +++++ datafusion/src/datasource/mod.rs | 2 +- datafusion/src/execution/context.rs | 67 +++++++++++++++++++- 4 files changed, 105 insertions(+), 25 deletions(-) diff --git a/datafusion/src/catalog/information_schema.rs b/datafusion/src/catalog/information_schema.rs index 5a7b9d5b6448d..fd7fcb4b901a6 100644 --- a/datafusion/src/catalog/information_schema.rs +++ b/datafusion/src/catalog/information_schema.rs @@ -27,7 +27,7 @@ use arrow::{ record_batch::RecordBatch, }; -use crate::datasource::{MemTable, TableProvider}; +use crate::datasource::{MemTable, TableProvider, TableType}; use super::{ catalog::{CatalogList, CatalogProvider}, @@ -105,14 +105,25 @@ impl InformationSchemaProvider { if schema_name != INFORMATION_SCHEMA { let schema = catalog.schema(&schema_name).unwrap(); for table_name in schema.table_names() { - builder.add_base_table(&catalog_name, &schema_name, table_name) + let table = schema.table(&table_name).unwrap(); + builder.add_table( + &catalog_name, + &schema_name, + table_name, + table.table_type(), + ); } } } // Add a final list for the information schema tables themselves - builder.add_system_table(&catalog_name, INFORMATION_SCHEMA, TABLES); - builder.add_system_table(&catalog_name, INFORMATION_SCHEMA, COLUMNS); + builder.add_table(&catalog_name, INFORMATION_SCHEMA, TABLES, TableType::View); + builder.add_table( + &catalog_name, + INFORMATION_SCHEMA, + COLUMNS, + TableType::View, + ); } let mem_table: MemTable = builder.into(); @@ -198,11 +209,12 @@ impl InformationSchemaTablesBuilder { } } - fn add_base_table( + fn add_table( &mut self, catalog_name: impl AsRef, schema_name: impl AsRef, table_name: impl AsRef, + table_type: TableType, ) { // Note: append_value is actually infallable. self.catalog_names @@ -212,24 +224,13 @@ impl InformationSchemaTablesBuilder { .append_value(schema_name.as_ref()) .unwrap(); self.table_names.append_value(table_name.as_ref()).unwrap(); - self.table_types.append_value("BASE TABLE").unwrap(); - } - - fn add_system_table( - &mut self, - catalog_name: impl AsRef, - schema_name: impl AsRef, - table_name: impl AsRef, - ) { - // Note: append_value is actually infallable. 
- self.catalog_names - .append_value(catalog_name.as_ref()) + self.table_types + .append_value(match table_type { + TableType::Base => "BASE TABLE", + TableType::View => "VIEW", + TableType::Temporary => "LOCAL TEMPORARY", + }) .unwrap(); - self.schema_names - .append_value(schema_name.as_ref()) - .unwrap(); - self.table_names.append_value(table_name.as_ref()).unwrap(); - self.table_types.append_value("VIEW").unwrap(); } } diff --git a/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs index e2b07336486cb..0349a49e491ba 100644 --- a/datafusion/src/datasource/datasource.rs +++ b/datafusion/src/datasource/datasource.rs @@ -66,6 +66,17 @@ pub enum TableProviderFilterPushDown { Exact, } +/// Indicates the type of this table for metadata/catalog purposes. +#[derive(Debug, Clone, Copy)] +pub enum TableType { + /// An ordinary physical table. + Base, + /// A non-materialised table that itself uses a query internally to provide data. + View, + /// A transient table. + Temporary, +} + /// Source table pub trait TableProvider: Sync + Send { /// Returns the table provider as [`Any`](std::any::Any) so that it can be @@ -75,6 +86,11 @@ pub trait TableProvider: Sync + Send { /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef; + /// Get the type of this table for metadata/catalog purposes. + fn table_type(&self) -> TableType { + TableType::Base + } + /// Create an ExecutionPlan that will scan the table. fn scan( &self, diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index 099098dd6f665..ac2f3d2dee1ee 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -24,5 +24,5 @@ pub mod memory; pub mod parquet; pub use self::csv::{CsvFile, CsvReadOptions}; -pub use self::datasource::TableProvider; +pub use self::datasource::{TableProvider, TableType}; pub use self::memory::MemTable; diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index c394d3895622a..d25e7cc05e0df 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -840,10 +840,11 @@ mod tests { use crate::variable::VarType; use crate::{ assert_batches_eq, assert_batches_sorted_eq, - logical_plan::{col, create_udf, sum}, + logical_plan::{col, create_udf, sum, Expr}, }; use crate::{ - datasource::MemTable, logical_plan::create_udaf, + datasource::{MemTable, TableType}, + logical_plan::create_udaf, physical_plan::expressions::AvgAccumulator, }; use arrow::array::{ @@ -2631,6 +2632,68 @@ mod tests { assert_batches_sorted_eq!(expected, &result); } + #[tokio::test] + async fn information_schema_tables_table_types() { + struct TestTable(TableType); + + impl TableProvider for TestTable { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn table_type(&self) -> TableType { + self.0 + } + + fn schema(&self) -> SchemaRef { + unimplemented!() + } + + fn scan( + &self, + _: &Option>, + _: usize, + _: &[Expr], + _: Option, + ) -> Result> { + unimplemented!() + } + + fn statistics(&self) -> crate::datasource::datasource::Statistics { + unimplemented!() + } + } + + let mut ctx = ExecutionContext::with_config( + ExecutionConfig::new().with_information_schema(true), + ); + + ctx.register_table("physical", Arc::new(TestTable(TableType::Base))) + .unwrap(); + ctx.register_table("query", Arc::new(TestTable(TableType::View))) + .unwrap(); + ctx.register_table("temp", Arc::new(TestTable(TableType::Temporary))) + .unwrap(); + + let result = + plan_and_collect(&mut ctx, "SELECT 
* from information_schema.tables") + .await + .unwrap(); + + let expected = vec![ + "+---------------+--------------------+------------+-----------------+", + "| table_catalog | table_schema | table_name | table_type |", + "+---------------+--------------------+------------+-----------------+", + "| datafusion | information_schema | tables | VIEW |", + "| datafusion | information_schema | columns | VIEW |", + "| datafusion | public | physical | BASE TABLE |", + "| datafusion | public | query | VIEW |", + "| datafusion | public | temp | LOCAL TEMPORARY |", + "+---------------+--------------------+------------+-----------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + } + #[tokio::test] async fn information_schema_show_tables_no_information_schema() { let mut ctx = ExecutionContext::with_config(ExecutionConfig::new()); From 33715747db5a3cc48936c7b26859d4ad5809cde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 28 Apr 2021 12:28:16 +0200 Subject: [PATCH 045/329] [DataFusion] Optimize hash join inner workings, null handling fix (#24) * Speed ups in hash join * Fix test * Update hash * Update commit hash everywhere * Use primitive everywhere * Undo combine_hashes * Delete * Fixes * Avoid combine_hashes call for single columns * Update comment * Revert "Avoid combine_hashes call for single columns" This reverts commit f14f8380851d9774939a71e841d80ff5d4b0f148. * Fix null handling * Revert "Revert "Avoid combine_hashes call for single columns"" This reverts commit e1ac6a9e5ef64ab27496f13b2b37b63263407865. * Fix null handling * Unignore test * Use normal hasher for booleans Co-authored-by: Jorge Leitao * Add extra documentation to hash join hashmap structure * empty Co-authored-by: Jorge Leitao --- datafusion/src/physical_plan/hash_join.rs | 336 ++++++++++++++++------ datafusion/tests/sql.rs | 3 +- 2 files changed, 252 insertions(+), 87 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index eb2ec33d08699..2edd0c7ee5e3d 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -63,9 +63,18 @@ use crate::physical_plan::coalesce_batches::concat_batches; use log::debug; // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. +// +// Note that the `u64` keys are not stored in the hashmap (hence the `()` as key), but are only used +// to put the indices in a certain bucket. +// By allocating a `HashMap` with capacity for *at least* the number of rows for entries at the left side, +// we make sure that we don't have to re-hash the hashmap, which needs access to the key (the hash in this case) value. // E.g. 1 -> [3, 6, 8] indicates that the column values map to rows 3, 6 and 8 for hash value 1 // As the key is a hash value, we need to check possible hash collisions in the probe stage -type JoinHashMap = HashMap, IdHashBuilder>; +// During this stage it might be the case that a row is contained the same hashmap value, +// but the values don't match. 
Those are checked in the [equal_rows] macro +// TODO: speed up collission check and move away from using a hashbrown HashMap +// https://github.com/apache/arrow-datafusion/issues/50 +type JoinHashMap = HashMap<(), SmallVec<[u64; 1]>, IdHashBuilder>; type JoinLeftData = Arc<(JoinHashMap, RecordBatch)>; /// join execution plan executes partitions in parallel and combines them into a set of @@ -255,34 +264,33 @@ impl ExecutionPlan for HashJoinExec { // This operation performs 2 steps at once: // 1. creates a [JoinHashMap] of all batches from the stream // 2. stores the batches in a vector. - let initial = ( - JoinHashMap::with_hasher(IdHashBuilder {}), - Vec::new(), - 0, - Vec::new(), - ); - let (hashmap, batches, num_rows, _) = stream + let initial = (0, Vec::new()); + let (num_rows, batches) = stream .try_fold(initial, |mut acc, batch| async { - let hash = &mut acc.0; - let values = &mut acc.1; - let offset = acc.2; - acc.3.clear(); - acc.3.resize(batch.num_rows(), 0); - update_hash( - &on_left, - &batch, - hash, - offset, - &self.random_state, - &mut acc.3, - ) - .unwrap(); - acc.2 += batch.num_rows(); - values.push(batch); + acc.0 += batch.num_rows(); + acc.1.push(batch); Ok(acc) }) .await?; - + let mut hashmap = JoinHashMap::with_capacity_and_hasher( + num_rows, + IdHashBuilder {}, + ); + let mut hashes_buffer = Vec::new(); + let mut offset = 0; + for batch in batches.iter() { + hashes_buffer.clear(); + hashes_buffer.resize(batch.num_rows(), 0); + update_hash( + &on_left, + &batch, + &mut hashmap, + offset, + &self.random_state, + &mut hashes_buffer, + )?; + offset += batch.num_rows(); + } // Merge all batches into a single batch, so we // can directly index into the arrays let single_batch = @@ -311,34 +319,31 @@ impl ExecutionPlan for HashJoinExec { // This operation performs 2 steps at once: // 1. creates a [JoinHashMap] of all batches from the stream // 2. stores the batches in a vector. 
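// Aside (illustration only, not part of the patch): the bucketing described in the
// JoinHashMap comment above, shown with an ordinary map from hash value to row indices.
// Several left-side rows can share one hash value, and rows with equal hashes may still
// hold different column values (a collision), which is why the probe phase re-checks the
// actual values with `equal_rows`.
fn bucketing_example() {
    use std::collections::HashMap;
    let mut buckets: HashMap<u64, Vec<u64>> = HashMap::new();
    for (row, hash) in vec![(3u64, 1u64), (6, 1), (8, 1), (5, 2)] {
        buckets.entry(hash).or_default().push(row);
    }
    assert_eq!(buckets[&1], vec![3, 6, 8]); // hash value 1 maps to rows 3, 6 and 8
}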
- let initial = ( - JoinHashMap::with_hasher(IdHashBuilder {}), - Vec::new(), - 0, - Vec::new(), - ); - let (hashmap, batches, num_rows, _) = stream + let initial = (0, Vec::new()); + let (num_rows, batches) = stream .try_fold(initial, |mut acc, batch| async { - let hash = &mut acc.0; - let values = &mut acc.1; - let offset = acc.2; - acc.3.clear(); - acc.3.resize(batch.num_rows(), 0); - update_hash( - &on_left, - &batch, - hash, - offset, - &self.random_state, - &mut acc.3, - ) - .unwrap(); - acc.2 += batch.num_rows(); - values.push(batch); + acc.0 += batch.num_rows(); + acc.1.push(batch); Ok(acc) }) .await?; - + let mut hashmap = + JoinHashMap::with_capacity_and_hasher(num_rows, IdHashBuilder {}); + let mut hashes_buffer = Vec::new(); + let mut offset = 0; + for batch in batches.iter() { + hashes_buffer.clear(); + hashes_buffer.resize(batch.num_rows(), 0); + update_hash( + &on_left, + &batch, + &mut hashmap, + offset, + &self.random_state, + &mut hashes_buffer, + )?; + offset += batch.num_rows(); + } // Merge all batches into a single batch, so we // can directly index into the arrays let single_batch = @@ -399,15 +404,23 @@ fn update_hash( .map(|name| Ok(col(name).evaluate(batch)?.into_array(batch.num_rows()))) .collect::>>()?; - // update the hash map + // calculate the hash values let hash_values = create_hashes(&keys_values, &random_state, hashes_buffer)?; // insert hashes to key of the hashmap for (row, hash_value) in hash_values.iter().enumerate() { - hash.raw_entry_mut() - .from_key_hashed_nocheck(*hash_value, hash_value) - .and_modify(|_, v| v.push((row + offset) as u64)) - .or_insert_with(|| (*hash_value, smallvec![(row + offset) as u64])); + match hash.raw_entry_mut().from_hash(*hash_value, |_| true) { + hashbrown::hash_map::RawEntryMut::Occupied(mut entry) => { + entry.get_mut().push((row + offset) as u64); + } + hashbrown::hash_map::RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck( + *hash_value, + (), + smallvec![(row + offset) as u64], + ); + } + }; } Ok(()) } @@ -574,7 +587,9 @@ fn build_join_indexes( // For every item on the left and right we check if it matches // This possibly contains rows with hash collisions, // So we have to check here whether rows are equal or not - if let Some(indices) = left.get(hash_value) { + if let Some((_, indices)) = + left.raw_entry().from_hash(*hash_value, |_| true) + { for &i in indices { // Check hash collisions if equal_rows(i as usize, row, &left_join_values, &keys_values)? { @@ -611,7 +626,9 @@ fn build_join_indexes( // First visit all of the rows for (row, hash_value) in hash_values.iter().enumerate() { - if let Some(indices) = left.get(hash_value) { + if let Some((_, indices)) = + left.raw_entry().from_hash(*hash_value, |_| true) + { for &i in indices { // Collision check if equal_rows(i as usize, row, &left_join_values, &keys_values)? { @@ -638,8 +655,8 @@ fn build_join_indexes( let mut right_indices = UInt32Builder::new(0); for (row, hash_value) in hash_values.iter().enumerate() { - match left.get(hash_value) { - Some(indices) => { + match left.raw_entry().from_hash(*hash_value, |_| true) { + Some((_, indices)) => { for &i in indices { if equal_rows( i as usize, @@ -649,6 +666,9 @@ fn build_join_indexes( )? 
{ left_indices.append_value(i)?; right_indices.append_value(row as u32)?; + } else { + left_indices.append_null()?; + right_indices.append_value(row as u32)?; } } } @@ -697,6 +717,7 @@ impl BuildHasher for IdHashBuilder { } // Combines two hashes into one hash +#[inline] fn combine_hashes(l: u64, r: u64) -> u64 { let hash = (17 * 37u64).wrapping_add(l); hash.wrapping_mul(37).wrapping_add(r) @@ -708,7 +729,6 @@ macro_rules! equal_rows_elem { let right_array = $r.as_any().downcast_ref::<$array_type>().unwrap(); match (left_array.is_null($left), left_array.is_null($right)) { - (true, true) => true, (false, false) => left_array.value($left) == right_array.value($right), _ => false, } @@ -755,21 +775,75 @@ fn equal_rows( } macro_rules! hash_array { - ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident) => { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); if array.null_count() == 0 { - for (i, hash) in $hashes.iter_mut().enumerate() { - *hash = - combine_hashes($ty::get_hash(&array.value(i), $random_state), *hash); - } - } else { - for (i, hash) in $hashes.iter_mut().enumerate() { - if !array.is_null(i) { + if $multi_col { + for (i, hash) in $hashes.iter_mut().enumerate() { *hash = combine_hashes( $ty::get_hash(&array.value(i), $random_state), *hash, ); } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = $ty::get_hash(&array.value(i), $random_state); + } + } + } else { + if $multi_col { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = combine_hashes( + $ty::get_hash(&array.value(i), $random_state), + *hash, + ); + } + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = $ty::get_hash(&array.value(i), $random_state); + } + } + } + } + }; +} + +macro_rules! 
hash_array_primitive { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + if $multi_col { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = combine_hashes($ty::get_hash(value, $random_state), *hash); + } + } else { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = $ty::get_hash(value, $random_state) + } + } + } else { + if $multi_col { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = + combine_hashes($ty::get_hash(value, $random_state), *hash); + } + } + } else { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = $ty::get_hash(value, $random_state); + } + } } } }; @@ -781,58 +855,140 @@ pub fn create_hashes<'a>( random_state: &RandomState, hashes_buffer: &'a mut Vec, ) -> Result<&'a mut Vec> { + // combine hashes with `combine_hashes` if we have more than 1 column + let multi_col = arrays.len() > 1; + for col in arrays { match col.data_type() { DataType::UInt8 => { - hash_array!(UInt8Array, col, u8, hashes_buffer, random_state); + hash_array_primitive!( + UInt8Array, + col, + u8, + hashes_buffer, + random_state, + multi_col + ); } DataType::UInt16 => { - hash_array!(UInt16Array, col, u16, hashes_buffer, random_state); + hash_array_primitive!( + UInt16Array, + col, + u16, + hashes_buffer, + random_state, + multi_col + ); } DataType::UInt32 => { - hash_array!(UInt32Array, col, u32, hashes_buffer, random_state); + hash_array_primitive!( + UInt32Array, + col, + u32, + hashes_buffer, + random_state, + multi_col + ); } DataType::UInt64 => { - hash_array!(UInt64Array, col, u64, hashes_buffer, random_state); + hash_array_primitive!( + UInt64Array, + col, + u64, + hashes_buffer, + random_state, + multi_col + ); } DataType::Int8 => { - hash_array!(Int8Array, col, i8, hashes_buffer, random_state); + hash_array_primitive!( + Int8Array, + col, + i8, + hashes_buffer, + random_state, + multi_col + ); } DataType::Int16 => { - hash_array!(Int16Array, col, i16, hashes_buffer, random_state); + hash_array_primitive!( + Int16Array, + col, + i16, + hashes_buffer, + random_state, + multi_col + ); } DataType::Int32 => { - hash_array!(Int32Array, col, i32, hashes_buffer, random_state); + hash_array_primitive!( + Int32Array, + col, + i32, + hashes_buffer, + random_state, + multi_col + ); } DataType::Int64 => { - hash_array!(Int64Array, col, i64, hashes_buffer, random_state); + hash_array_primitive!( + Int64Array, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); } DataType::Timestamp(TimeUnit::Microsecond, None) => { - hash_array!( + hash_array_primitive!( TimestampMicrosecondArray, col, i64, hashes_buffer, - random_state + random_state, + multi_col ); } DataType::Timestamp(TimeUnit::Nanosecond, None) => { - hash_array!( + hash_array_primitive!( TimestampNanosecondArray, col, i64, hashes_buffer, - random_state + random_state, + multi_col ); } DataType::Boolean => { - hash_array!(BooleanArray, col, u8, hashes_buffer, random_state); + hash_array!( + BooleanArray, + col, + u8, + hashes_buffer, + random_state, + multi_col + ); } DataType::Utf8 => { - hash_array!(StringArray, col, str, hashes_buffer, random_state); + hash_array!( + StringArray, + col, + str, + hashes_buffer, + random_state, + multi_col + ); } DataType::LargeUtf8 => 
{ - hash_array!(LargeStringArray, col, str, hashes_buffer, random_state); + hash_array!( + LargeStringArray, + col, + str, + hashes_buffer, + random_state, + multi_col + ); } _ => { // This is internal because we should have caught this before. @@ -1218,7 +1374,7 @@ mod tests { #[test] fn join_with_hash_collision() -> Result<()> { - let mut hashmap_left = HashMap::with_hasher(IdHashBuilder {}); + let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {}); let left = build_table_i32( ("a", &vec![10, 20]), ("x", &vec![100, 200]), @@ -1231,8 +1387,18 @@ mod tests { create_hashes(&[left.columns()[0].clone()], &random_state, hashes_buff)?; // Create hash collisions - hashmap_left.insert(hashes[0], smallvec![0, 1]); - hashmap_left.insert(hashes[1], smallvec![0, 1]); + match hashmap_left.raw_entry_mut().from_hash(hashes[0], |_| true) { + hashbrown::hash_map::RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck(hashes[0], (), smallvec![0, 1]) + } + _ => unreachable!("Hash should not be vacant"), + }; + match hashmap_left.raw_entry_mut().from_hash(hashes[1], |_| true) { + hashbrown::hash_map::RawEntryMut::Vacant(entry) => { + entry.insert_hashed_nocheck(hashes[1], (), smallvec![0, 1]) + } + _ => unreachable!("Hash should not be vacant"), + }; let right = build_table_i32( ("a", &vec![10, 20]), diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 70baffc700ba2..79baeae35e961 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2669,12 +2669,11 @@ async fn inner_join_qualified_names() -> Result<()> { } #[tokio::test] -#[ignore = "https://issues.apache.org/jira/browse/ARROW-12266"] async fn inner_join_nulls() { let sql = "SELECT * FROM (SELECT null AS id1) t1 INNER JOIN (SELECT null AS id2) t2 ON id1 = id2"; - let expected: &[&[&str]] = &[&[]]; + let expected: &[&[&str]] = &[]; let mut ctx = create_join_context_qualified().unwrap(); let actual = execute(&mut ctx, sql).await; From f43dc444eb510f0ff170adecb5a23afb39c489a8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 28 Apr 2021 12:51:58 -0400 Subject: [PATCH 046/329] update arrow-rs deps to latest master (#216) * update arrow deps to latest * script to update arrow dependencies * fixup lint * flake8 --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 +- ballista/rust/executor/Cargo.toml | 4 +- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 +- dev/update_arrow_deps.py | 83 ++++++++++++++++++++++++++++++ 7 files changed, 92 insertions(+), 9 deletions(-) create mode 100755 dev/update_arrow_deps.py diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index d29f23ada888b..013adc0e48be6 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 0a600ea21cd60..b92225ff1fcea 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } -arrow-flight = { git = 
"https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 15bef69fa9e2e..6c9546e855cdf 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 342f215849cbf..1f488c09ca0fa 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 0445f382a25e6..f8f4f35dd54bd 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 3a7e857fe551f..d7540dbaca08b 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -51,8 +51,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "c3fe3bab9905739fdda75301dab07a18c91731bd", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014", features = ["arrow"] } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py new file mode 100755 index 0000000000000..44bdf4235d1c6 --- /dev/null +++ b/dev/update_arrow_deps.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script that updates the arrow dependencies in datafusion and ballista, locall +# +# installation: +# pip install tomlkit requests +# +# usage: +# python update_arrow_deps.py + +from pathlib import Path + +# use tomlkit as it preserves comments and other formatting +import tomlkit +import requests + + +# find latest arrow-rs sha +def get_arrow_sha(): + url = 'https://api.github.com/repos/apache/arrow-rs/branches/master' + response = requests.get(url) + return response.json()['commit']['sha'] + + +# Update all entries that look like +# { +# 'git': 'https://github.com/apache/arrow-rs', +# 'rev': 'c3fe3bab9905739fdda75301dab07a18c91731bd' +# } +# to point at a new SHA +def update_dependencies(dependencies, new_sha): + if dependencies is None: + return + for dep_name in dependencies: + dep = dependencies[dep_name] + if hasattr(dep, 'get'): + if dep.get('git') == 'https://github.com/apache/arrow-rs': + dep['rev'] = new_sha + + +def update_cargo_toml(cargo_toml, new_sha): + print('updating {}'.format(cargo_toml.absolute())) + with open(cargo_toml) as f: + data = f.read() + + doc = tomlkit.parse(data) + + update_dependencies(doc.get('dependencies'), new_sha) + update_dependencies(doc.get('dev-dependencies'), new_sha) + + with open(cargo_toml, 'w') as f: + f.write(tomlkit.dumps(doc)) + + +# Begin main script + +repo_root = Path(__file__).parent.parent.absolute() + + +new_sha = get_arrow_sha() + +print('Updating files in {} to use sha {}'.format(repo_root, new_sha)) + + +for cargo_toml in repo_root.rglob('Cargo.toml'): + update_cargo_toml(cargo_toml, new_sha) From aa033db1a302c3940bc1df3b7b5c23c842ee0b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 29 Apr 2021 16:31:33 +0200 Subject: [PATCH 047/329] Add rule to eliminate `LIMIT 0` and replace it with an `EmptyRelation` (#213) --- datafusion/src/execution/context.rs | 5 +- datafusion/src/optimizer/eliminate_limit.rs | 122 ++++++++++++++++++++ datafusion/src/optimizer/mod.rs | 1 + 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 datafusion/src/optimizer/eliminate_limit.rs diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index d25e7cc05e0df..dee253f44ac33 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -21,7 +21,9 @@ use crate::{ catalog::{CatalogList, MemoryCatalogList}, information_schema::CatalogWithInformationSchema, }, - optimizer::hash_build_probe_order::HashBuildProbeOrder, + optimizer::{ + eliminate_limit::EliminateLimit, hash_build_probe_order::HashBuildProbeOrder, + }, physical_optimizer::optimizer::PhysicalOptimizerRule, }; use log::debug; @@ -636,6 +638,7 @@ impl ExecutionConfig { batch_size: 8192, optimizers: vec![ Arc::new(ConstantFolding::new()), + Arc::new(EliminateLimit::new()), Arc::new(ProjectionPushDown::new()), Arc::new(FilterPushDown::new()), Arc::new(HashBuildProbeOrder::new()), diff --git 
a/datafusion/src/optimizer/eliminate_limit.rs b/datafusion/src/optimizer/eliminate_limit.rs new file mode 100644 index 0000000000000..87b33d6f5d5bc --- /dev/null +++ b/datafusion/src/optimizer/eliminate_limit.rs @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Optimizer rule to replace `LIMIT 0` on a plan with an empty relation. +//! This saves time in planning and executing the query. +use crate::error::Result; +use crate::logical_plan::LogicalPlan; +use crate::optimizer::optimizer::OptimizerRule; + +use super::utils; + +/// Optimization rule that replaces LIMIT 0 with an [LogicalPlan::EmptyRelation] +pub struct EliminateLimit; + +impl EliminateLimit { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for EliminateLimit { + fn optimize(&self, plan: &LogicalPlan) -> Result { + match plan { + LogicalPlan::Limit { n, input } if *n == 0 => { + Ok(LogicalPlan::EmptyRelation { + produce_one_row: false, + schema: input.schema().clone(), + }) + } + // Rest: recurse and find possible LIMIT 0 nodes + _ => { + let expr = plan.expressions(); + + // apply the optimization to all inputs of the plan + let inputs = plan.inputs(); + let new_inputs = inputs + .iter() + .map(|plan| self.optimize(plan)) + .collect::>>()?; + + utils::from_plan(plan, &expr, &new_inputs) + } + } + } + + fn name(&self) -> &str { + "eliminate_limit" + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::logical_plan::LogicalPlanBuilder; + use crate::logical_plan::{col, sum}; + use crate::test::*; + + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { + let rule = EliminateLimit::new(); + let optimized_plan = rule.optimize(plan).expect("failed to optimize plan"); + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + assert_eq!(plan.schema(), optimized_plan.schema()); + } + + #[test] + fn limit_0_root() { + let table_scan = test_table_scan().unwrap(); + let plan = LogicalPlanBuilder::from(&table_scan) + .aggregate(vec![col("a")], vec![sum(col("b"))]) + .unwrap() + .limit(0) + .unwrap() + .build() + .unwrap(); + + // No aggregate / scan / limit + let expected = "EmptyRelation"; + assert_optimized_plan_eq(&plan, expected); + } + + #[test] + fn limit_0_nested() { + let table_scan = test_table_scan().unwrap(); + let plan1 = LogicalPlanBuilder::from(&table_scan) + .aggregate(vec![col("a")], vec![sum(col("b"))]) + .unwrap() + .build() + .unwrap(); + let plan = LogicalPlanBuilder::from(&table_scan) + .aggregate(vec![col("a")], vec![sum(col("b"))]) + .unwrap() + .limit(0) + .unwrap() + .union(plan1) + .unwrap() + .build() + .unwrap(); + + // Left side is removed + let expected = "Union\ + \n EmptyRelation\ + \n Aggregate: groupBy=[[#a]], aggr=[[SUM(#b)]]\ + \n 
TableScan: test projection=None"; + assert_optimized_plan_eq(&plan, expected); + } +} diff --git a/datafusion/src/optimizer/mod.rs b/datafusion/src/optimizer/mod.rs index dc59b64ff4609..2fb8a3d629509 100644 --- a/datafusion/src/optimizer/mod.rs +++ b/datafusion/src/optimizer/mod.rs @@ -19,6 +19,7 @@ //! some simple rules to a logical plan, such as "Projection Push Down" and "Type Coercion". pub mod constant_folding; +pub mod eliminate_limit; pub mod filter_push_down; pub mod hash_build_probe_order; pub mod limit_push_down; From 88222b7dcf1ba8888a5befbba5cd175fdd90ea5d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 29 Apr 2021 15:07:31 -0400 Subject: [PATCH 048/329] Update arrow-rs deps (#224) --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 013adc0e48be6..d812b65335877 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index b92225ff1fcea..b1fab62bd88f3 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 6c9546e855cdf..2284d915321ba 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 1f488c09ca0fa..93a0730062c65 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" 
} datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index f8f4f35dd54bd..77b155323b4aa 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index d7540dbaca08b..5f743f6f559ef 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -51,8 +51,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "ed00e4d4a160cd5182bfafb81fee2240ec005014", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["arrow"] } sqlparser = "0.9.0" clap = "2.33" rustyline = {version = "7.0", optional = true} From 2423ff0dd1fe9c0932c1cb8d1776efa3acd69554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 30 Apr 2021 10:11:14 +0200 Subject: [PATCH 049/329] Fix Filter / where clause without column names is removed in optimization pass (#225) * Workaround where without columns * Add some docs * Remove print statement * Bring back removed comment --- datafusion/src/optimizer/filter_push_down.rs | 34 +++++++++++++++++--- datafusion/tests/sql.rs | 14 ++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 4622e9fc62dc1..356d497491a14 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -237,17 +237,30 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { let mut predicates = vec![]; split_members(predicate, &mut predicates); + // Predicates without referencing columns (WHERE FALSE, WHERE 1=1, etc.) + let mut no_col_predicates = vec![]; + predicates .into_iter() .try_for_each::<_, Result<()>>(|predicate| { let mut columns: HashSet = HashSet::new(); utils::expr_to_column_names(predicate, &mut columns)?; - // collect the predicate - state.filters.push((predicate.clone(), columns)); + if columns.is_empty() { + no_col_predicates.push(predicate) + } else { + // collect the predicate + state.filters.push((predicate.clone(), columns)); + } Ok(()) })?; - - optimize(input, state) + // Predicates without columns will not be pushed down. + // As those contain only literals, they could be optimized using constant folding + // and removal of WHERE TRUE / WHERE FALSE + if !no_col_predicates.is_empty() { + Ok(add_filter(optimize(input, state)?, &no_col_predicates)) + } else { + optimize(input, state) + } } LogicalPlan::Projection { input, @@ -482,6 +495,19 @@ mod tests { Ok(()) } + #[test] + fn filter_no_columns() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(&table_scan) + .filter(lit(0i64).eq(lit(1i64)))? 
+ .build()?; + let expected = "\ + Filter: Int64(0) Eq Int64(1)\ + \n TableScan: test projection=None"; + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + #[test] fn filter_jump_2_plans() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 79baeae35e961..716929405c3a1 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -474,6 +474,20 @@ async fn csv_query_group_by_and_having_and_where() -> Result<()> { Ok(()) } +#[tokio::test] +async fn all_where_empty() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_csv(&mut ctx)?; + let sql = "SELECT * + FROM aggregate_test_100 + WHERE 1=2"; + let mut actual = execute(&mut ctx, sql).await; + actual.sort(); + let expected: Vec> = vec![]; + assert_eq!(expected, actual); + Ok(()) +} + #[tokio::test] async fn csv_query_having_without_group_by() -> Result<()> { let mut ctx = ExecutionContext::new(); From 23d02bb3c642ed69e7b963ed74df9687b91af970 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 30 Apr 2021 10:53:45 -0400 Subject: [PATCH 050/329] Use standard make_null_array for CASE (#223) --- .../src/physical_plan/expressions/case.rs | 34 ++----------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index 723438df60f78..95ae5325af119 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -234,36 +234,6 @@ fn if_then_else( } } -macro_rules! make_null_array { - ($TY:ty, $N:expr) => {{ - let mut builder = <$TY>::new($N); - for _ in 0..$N { - builder.append_null()?; - } - Ok(Arc::new(builder.finish())) - }}; -} - -fn build_null_array(data_type: &DataType, num_rows: usize) -> Result { - match data_type { - DataType::UInt8 => make_null_array!(array::UInt8Builder, num_rows), - DataType::UInt16 => make_null_array!(array::UInt16Builder, num_rows), - DataType::UInt32 => make_null_array!(array::UInt32Builder, num_rows), - DataType::UInt64 => make_null_array!(array::UInt64Builder, num_rows), - DataType::Int8 => make_null_array!(array::Int8Builder, num_rows), - DataType::Int16 => make_null_array!(array::Int16Builder, num_rows), - DataType::Int32 => make_null_array!(array::Int32Builder, num_rows), - DataType::Int64 => make_null_array!(array::Int64Builder, num_rows), - DataType::Float32 => make_null_array!(array::Float32Builder, num_rows), - DataType::Float64 => make_null_array!(array::Float64Builder, num_rows), - DataType::Utf8 => make_null_array!(array::StringBuilder, num_rows), - other => Err(DataFusionError::Execution(format!( - "CASE does not support '{:?}'", - other - ))), - } -} - macro_rules! array_equals { ($TY:ty, $L:expr, $R:expr, $eq_fn:expr) => {{ let when_value = $L @@ -347,7 +317,7 @@ impl CaseExpr { let mut current_value: Option = if let Some(e) = &self.else_expr { Some(e.evaluate(batch)?.into_array(batch.num_rows())) } else { - Some(build_null_array(&return_type, batch.num_rows())?) + Some(new_null_array(&return_type, batch.num_rows())) }; // walk backwards through the when/then expressions @@ -388,7 +358,7 @@ impl CaseExpr { let mut current_value: Option = if let Some(e) = &self.else_expr { Some(e.evaluate(batch)?.into_array(batch.num_rows())) } else { - Some(build_null_array(&return_type, batch.num_rows())?) 
+ Some(new_null_array(&return_type, batch.num_rows())) }; // walk backwards through the when/then expressions From c945b03f3a459a5c15f481f9d52819df56e1090c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 1 May 2021 11:33:22 +0200 Subject: [PATCH 051/329] Make test join_with_hash_collision deterministic (#229) --- datafusion/src/physical_plan/hash_join.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 2edd0c7ee5e3d..8e6b0428c0417 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -1381,7 +1381,7 @@ mod tests { ("y", &vec![200, 300]), ); - let random_state = RandomState::new(); + let random_state = RandomState::with_seeds(0, 0, 0, 0); let hashes_buff = &mut vec![0; left.num_rows()]; let hashes = create_hashes(&[left.columns()[0].clone()], &random_state, hashes_buff)?; From 4f842910c8b3c403ac51ff7c5966f9eb6b2f6f81 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 2 May 2021 10:57:34 -0600 Subject: [PATCH 052/329] Update repository url (#233) * Update repo url in Cargo * Update homepage url * Update url in rustdoc * Update GitHub URL in Ballista UI --- ballista/rust/client/Cargo.toml | 4 ++-- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 4 ++-- ballista/ui/scheduler/src/components/Header.tsx | 4 ++-- benchmarks/Cargo.toml | 4 ++-- datafusion-examples/Cargo.toml | 4 ++-- datafusion/Cargo.toml | 4 ++-- datafusion/src/lib.rs | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index d812b65335877..283c2ebb44d71 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -20,8 +20,8 @@ name = "ballista" description = "Ballista Distributed Compute" license = "Apache-2.0" version = "0.5.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] edition = "2018" diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index b1fab62bd88f3..853aa7ae2f45f 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -20,8 +20,8 @@ name = "ballista-core" description = "Ballista Distributed Compute" license = "Apache-2.0" version = "0.5.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] edition = "2018" build = "build.rs" diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 2284d915321ba..cdc1b45382263 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -20,8 +20,8 @@ name = "ballista-executor" description = "Ballista Distributed Compute - Executor" license = "Apache-2.0" version = "0.5.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] edition = "2018" diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 
93a0730062c65..507dc5465006b 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -20,8 +20,8 @@ name = "ballista-scheduler" description = "Ballista Distributed Compute - Scheduler" license = "Apache-2.0" version = "0.5.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] edition = "2018" diff --git a/ballista/ui/scheduler/src/components/Header.tsx b/ballista/ui/scheduler/src/components/Header.tsx index c0ddd35c7264f..1a0b0f178bd6b 100644 --- a/ballista/ui/scheduler/src/components/Header.tsx +++ b/ballista/ui/scheduler/src/components/Header.tsx @@ -67,11 +67,11 @@ export const Header: React.FunctionComponent = ({schedulerState}) = diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 25a385eaea0e6..6a763420c7823 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -21,8 +21,8 @@ description = "Apache Arrow Benchmarks" version = "4.0.0-SNAPSHOT" edition = "2018" authors = ["Apache Arrow "] -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" license = "Apache-2.0" publish = false diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 77b155323b4aa..0ec30105a409d 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -19,8 +19,8 @@ name = "datafusion-examples" description = "DataFusion usage examples" version = "4.0.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] license = "Apache-2.0" keywords = [ "arrow", "query", "sql" ] diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 5f743f6f559ef..1c2c860c55d0f 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -19,8 +19,8 @@ name = "datafusion" description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" version = "4.0.0-SNAPSHOT" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" readme = "../README.md" authors = ["Apache Arrow "] license = "Apache-2.0" diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index e1d7368469b0b..b6f64feb70d2a 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -25,7 +25,7 @@ clippy::upper_case_acronyms )] -//! [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion) +//! [DataFusion](https://github.com/apache/arrow-datafusion) //! is an extensible query execution framework that uses //! [Apache Arrow](https://arrow.apache.org) as its in-memory format. //! 
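The determinism fix in `join_with_hash_collision` above works because ahash's `RandomState::with_seeds` builds states that hash identically within a process, unlike `RandomState::new()`, which draws fresh random keys on every run; a test that crafts specific hash collisions therefore always sees the same bucket layout. A minimal standalone sketch of that property (not taken from the patch):

use ahash::RandomState;
use std::hash::{BuildHasher, Hasher};

fn hash_u64(state: &RandomState, value: u64) -> u64 {
    let mut hasher = state.build_hasher();
    hasher.write_u64(value);
    hasher.finish()
}

fn main() {
    // Identical seeds yield identical hashes, so hash-dependent tests are repeatable.
    let a = RandomState::with_seeds(0, 0, 0, 0);
    let b = RandomState::with_seeds(0, 0, 0, 0);
    assert_eq!(hash_u64(&a, 42), hash_u64(&b, 42));
}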
From 5ecfb1b2a2750376c3721ea1754ef194469a161c Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Sun, 2 May 2021 19:35:20 +0200 Subject: [PATCH 053/329] [Ballista] Make external hostname in executor optional (#232) --- ballista/rust/core/proto/ballista.proto | 12 +- ballista/rust/core/src/client.rs | 1 - .../src/execution_plans/unresolved_shuffle.rs | 1 - .../rust/executor/executor_config_spec.toml | 3 +- ballista/rust/executor/src/execution_loop.rs | 5 +- ballista/rust/executor/src/flight_service.rs | 14 +- ballista/rust/executor/src/lib.rs | 31 --- ballista/rust/executor/src/main.rs | 50 +++-- ballista/rust/scheduler/src/lib.rs | 59 ++++-- ballista/rust/scheduler/src/main.rs | 13 +- ballista/rust/scheduler/src/planner.rs | 177 +----------------- benchmarks/docker-compose.yaml | 19 +- 12 files changed, 115 insertions(+), 270 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 5733921bc92fb..b6bc5d09c3925 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -514,6 +514,16 @@ message ExecutorMetadata { uint32 port = 3; } +message ExecutorRegistration { + string id = 1; + // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/danburkert/prost/issues/430) + // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) + oneof optional_host { + string host = 2; + } + uint32 port = 3; +} + message GetExecutorMetadataParams {} message GetExecutorMetadataResult { @@ -542,7 +552,7 @@ message TaskStatus { } message PollWorkParams { - ExecutorMetadata metadata = 1; + ExecutorRegistration metadata = 1; bool can_accept_task = 2; // All tasks must be reported until they reach the failed or completed state repeated TaskStatus task_status = 3; diff --git a/ballista/rust/core/src/client.rs b/ballista/rust/core/src/client.rs index f64f95f7cfe25..1d0fedca7b4ef 100644 --- a/ballista/rust/core/src/client.rs +++ b/ballista/rust/core/src/client.rs @@ -58,7 +58,6 @@ pub struct BallistaClient { impl BallistaClient { /// Create a new BallistaClient to connect to the executor listening on the specified /// host and port - pub async fn try_new(host: &str, port: u16) -> Result { let addr = format!("http://{}:{}", host, port); debug!("BallistaClient connecting to {}", addr); diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index a62a2513ff4a9..7d147d53537c4 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use std::{any::Any, pin::Pin}; -use crate::client::BallistaClient; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionLocation; diff --git a/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml index 2a7c96bde3aff..8d817fee9cc5c 100644 --- a/ballista/rust/executor/executor_config_spec.toml +++ b/ballista/rust/executor/executor_config_spec.toml @@ -49,8 +49,7 @@ doc = "Local IP address to bind to." [[param]] name = "external_host" type = "String" -default = "std::string::String::from(\"localhost\")" -doc = "Host name or IP address to register with scheduler so that other executors can connect to this executor." 
+doc = "Host name or IP address to register with scheduler so that other executors can connect to this executor. If none is provided, the scheduler will use the connecting IP address to communicate with the executor." [[param]] abbr = "p" diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index cf641ddcc5c5e..5574a14a0915a 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -24,7 +24,7 @@ use datafusion::physical_plan::ExecutionPlan; use log::{debug, error, info, warn}; use tonic::transport::Channel; -use ballista_core::serde::scheduler::ExecutorMeta; +use ballista_core::serde::protobuf::ExecutorRegistration; use ballista_core::{ client::BallistaClient, serde::protobuf::{ @@ -37,10 +37,9 @@ use protobuf::CompletedTask; pub async fn poll_loop( mut scheduler: SchedulerGrpcClient, executor_client: BallistaClient, - executor_meta: ExecutorMeta, + executor_meta: ExecutorRegistration, concurrent_tasks: usize, ) { - let executor_meta: protobuf::ExecutorMetadata = executor_meta.into(); let available_tasks_slots = Arc::new(AtomicUsize::new(concurrent_tasks)); let (task_status_sender, mut task_status_receiver) = std::sync::mpsc::channel::(); diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 8fff3dbcade77..115e1ab0d800e 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -23,7 +23,6 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Instant; -use crate::BallistaExecutor; use ballista_core::error::BallistaError; use ballista_core::serde::decode_protobuf; use ballista_core::serde::scheduler::{Action as BallistaAction, PartitionStats}; @@ -59,12 +58,12 @@ type FlightDataReceiver = Receiver>; /// Service implementing the Apache Arrow Flight Protocol #[derive(Clone)] pub struct BallistaFlightService { - executor: Arc, + work_dir: String, } impl BallistaFlightService { - pub fn new(executor: Arc) -> Self { - Self { executor } + pub fn new(work_dir: String) -> Self { + Self { work_dir } } } @@ -103,11 +102,10 @@ impl FlightService for BallistaFlightService { ); let mut tasks: Vec>> = vec![]; - for part in partition.partition_id.clone() { - let work_dir = self.executor.config.work_dir.clone(); + for &part in &partition.partition_id { + let mut path = PathBuf::from(&self.work_dir); let partition = partition.clone(); tasks.push(tokio::spawn(async move { - let mut path = PathBuf::from(&work_dir); path.push(partition.job_id); path.push(&format!("{}", partition.stage_id)); path.push(&format!("{}", part)); @@ -208,7 +206,7 @@ impl FlightService for BallistaFlightService { // fetch a partition that was previously executed by this executor info!("FetchPartition {:?}", partition_id); - let mut path = PathBuf::from(&self.executor.config.work_dir); + let mut path = PathBuf::from(&self.work_dir); path.push(&partition_id.job_id); path.push(&format!("{}", partition_id.stage_id)); path.push(&format!("{}", partition_id.partition_id)); diff --git a/ballista/rust/executor/src/lib.rs b/ballista/rust/executor/src/lib.rs index 3d7bbaca3f1f0..08646ebda6b7f 100644 --- a/ballista/rust/executor/src/lib.rs +++ b/ballista/rust/executor/src/lib.rs @@ -19,34 +19,3 @@ pub mod collect; pub mod flight_service; - -#[derive(Debug, Clone)] -pub struct ExecutorConfig { - pub(crate) host: String, - pub(crate) port: u16, - /// Directory for temporary files, such as IPC files - pub(crate) work_dir: String, - 
pub(crate) concurrent_tasks: usize, -} - -impl ExecutorConfig { - pub fn new(host: &str, port: u16, work_dir: &str, concurrent_tasks: usize) -> Self { - Self { - host: host.to_owned(), - port, - work_dir: work_dir.to_owned(), - concurrent_tasks, - } - } -} - -#[allow(dead_code)] -pub struct BallistaExecutor { - pub(crate) config: ExecutorConfig, -} - -impl BallistaExecutor { - pub fn new(config: ExecutorConfig) -> Self { - Self { config } - } -} diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index 9c8d466add4f4..ad7c001e654af 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -17,7 +17,10 @@ //! Ballista Rust executor binary. -use std::sync::Arc; +use std::{ + net::{IpAddr, Ipv4Addr}, + sync::Arc, +}; use anyhow::{Context, Result}; use arrow_flight::flight_service_server::FlightServiceServer; @@ -28,15 +31,17 @@ use tonic::transport::Server; use uuid::Uuid; use ballista_core::{ - client::BallistaClient, serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient, + client::BallistaClient, + serde::protobuf::{ + executor_registration, scheduler_grpc_client::SchedulerGrpcClient, + ExecutorRegistration, + }, }; use ballista_core::{ print_version, serde::protobuf::scheduler_grpc_server::SchedulerGrpcServer, - serde::scheduler::ExecutorMeta, BALLISTA_VERSION, -}; -use ballista_executor::{ - flight_service::BallistaFlightService, BallistaExecutor, ExecutorConfig, + BALLISTA_VERSION, }; +use ballista_executor::flight_service::BallistaFlightService; use ballista_scheduler::{state::StandaloneClient, SchedulerServer}; use config::prelude::*; @@ -80,7 +85,7 @@ async fn main() -> Result<()> { .with_context(|| format!("Could not parse address: {}", addr))?; let scheduler_host = if opt.local { - external_host.to_owned() + "localhost".to_string() } else { opt.scheduler_host }; @@ -94,14 +99,16 @@ async fn main() -> Result<()> { .into_string() .unwrap(), ); - let config = - ExecutorConfig::new(&external_host, port, &work_dir, opt.concurrent_tasks); - info!("Running with config: {:?}", config); + info!("Running with config:"); + info!("work_dir: {}", work_dir); + info!("concurrent_tasks: {}", opt.concurrent_tasks); - let executor_meta = ExecutorMeta { + let executor_meta = ExecutorRegistration { id: Uuid::new_v4().to_string(), // assign this executor a unique ID - host: external_host.clone(), - port, + optional_host: external_host + .clone() + .map(executor_registration::OptionalHost::Host), + port: port as u32, }; if opt.local { @@ -117,8 +124,9 @@ async fn main() -> Result<()> { let server = SchedulerGrpcServer::new(SchedulerServer::new( Arc::new(client), "ballista".to_string(), + IpAddr::V4(Ipv4Addr::LOCALHOST), )); - let addr = format!("{}:{}", bind_host, scheduler_port); + let addr = format!("localhost:{}", scheduler_port); let addr = addr .parse() .with_context(|| format!("Could not parse {}", addr))?; @@ -158,8 +166,7 @@ async fn main() -> Result<()> { let scheduler = SchedulerGrpcClient::connect(scheduler_url) .await .context("Could not connect to scheduler")?; - let executor = Arc::new(BallistaExecutor::new(config)); - let service = BallistaFlightService::new(executor); + let service = BallistaFlightService::new(work_dir); let server = FlightServiceServer::new(service); info!( @@ -167,7 +174,16 @@ async fn main() -> Result<()> { BALLISTA_VERSION, addr ); let server_future = tokio::spawn(Server::builder().add_service(server).serve(addr)); - let client = BallistaClient::try_new(&external_host, port).await?; + let 
client_host = external_host.as_deref().unwrap_or_else(|| { + if bind_host == "0.0.0.0" { + // If the executor is being bound to "0.0.0.0" (which means use all ips in all eth devices) + // then use "localhost" to connect to itself through the BallistaClient + "localhost" + } else { + &bind_host + } + }); + let client = BallistaClient::try_new(client_host, port).await?; tokio::spawn(execution_loop::poll_loop( scheduler, client, diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index a675153897be3..3dc8df29bd038 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -24,16 +24,16 @@ pub mod state; #[cfg(test)] pub mod test_utils; -use std::fmt; use std::{convert::TryInto, sync::Arc}; +use std::{fmt, net::IpAddr}; use ballista_core::serde::protobuf::{ - execute_query_params::Query, job_status, scheduler_grpc_server::SchedulerGrpc, - ExecuteQueryParams, ExecuteQueryResult, FailedJob, FilePartitionMetadata, FileType, - GetExecutorMetadataParams, GetExecutorMetadataResult, GetFileMetadataParams, - GetFileMetadataResult, GetJobStatusParams, GetJobStatusResult, JobStatus, - PartitionId, PollWorkParams, PollWorkResult, QueuedJob, RunningJob, TaskDefinition, - TaskStatus, + execute_query_params::Query, executor_registration::OptionalHost, job_status, + scheduler_grpc_server::SchedulerGrpc, ExecuteQueryParams, ExecuteQueryResult, + FailedJob, FilePartitionMetadata, FileType, GetExecutorMetadataParams, + GetExecutorMetadataResult, GetFileMetadataParams, GetFileMetadataResult, + GetJobStatusParams, GetJobStatusResult, JobStatus, PartitionId, PollWorkParams, + PollWorkResult, QueuedJob, RunningJob, TaskDefinition, TaskStatus, }; use ballista_core::serde::scheduler::ExecutorMeta; @@ -71,13 +71,18 @@ use std::time::{Instant, SystemTime, UNIX_EPOCH}; #[derive(Clone)] pub struct SchedulerServer { + caller_ip: IpAddr, state: Arc, start_time: u128, version: String, } impl SchedulerServer { - pub fn new(config: Arc, namespace: String) -> Self { + pub fn new( + config: Arc, + namespace: String, + caller_ip: IpAddr, + ) -> Self { const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); let state = Arc::new(SchedulerState::new(config, namespace)); let state_clone = state.clone(); @@ -86,6 +91,7 @@ impl SchedulerServer { tokio::spawn(async move { state_clone.synchronize_job_status_loop().await }); Self { + caller_ip, state, start_time: SystemTime::now() .duration_since(UNIX_EPOCH) @@ -131,7 +137,16 @@ impl SchedulerGrpc for SchedulerServer { } = request.into_inner() { debug!("Received poll_work request for {:?}", metadata); - let metadata: ExecutorMeta = metadata.into(); + let metadata: ExecutorMeta = ExecutorMeta { + id: metadata.id, + host: metadata + .optional_host + .map(|h| match h { + OptionalHost::Host(host) => host, + }) + .unwrap_or_else(|| self.caller_ip.to_string()), + port: metadata.port as u16, + }; let mut lock = self.state.lock().await.map_err(|e| { let msg = format!("Could not lock the state: {}", e); error!("{}", msg); @@ -359,12 +374,7 @@ impl SchedulerGrpc for SchedulerServer { job_id_spawn, e ); } - let mut planner = fail_job!(DistributedPlanner::try_new(executors) - .map_err(|e| { - let msg = format!("Could not create distributed planner: {}", e); - error!("{}", msg); - tonic::Status::internal(msg) - })); + let mut planner = DistributedPlanner::new(); let stages = fail_job!(planner .plan_query_stages(&job_id_spawn, plan) .map_err(|e| { @@ -433,12 +443,17 @@ impl SchedulerGrpc for SchedulerServer { 
#[cfg(all(test, feature = "sled"))] mod test { - use std::sync::Arc; + use std::{ + net::{IpAddr, Ipv4Addr}, + sync::Arc, + }; use tonic::Request; use ballista_core::error::BallistaError; - use ballista_core::serde::protobuf::{ExecutorMetadata, PollWorkParams}; + use ballista_core::serde::protobuf::{ + executor_registration::OptionalHost, ExecutorRegistration, PollWorkParams, + }; use super::{ state::{SchedulerState, StandaloneClient}, @@ -449,11 +464,15 @@ mod test { async fn test_poll_work() -> Result<(), BallistaError> { let state = Arc::new(StandaloneClient::try_new_temporary()?); let namespace = "default"; - let scheduler = SchedulerServer::new(state.clone(), namespace.to_owned()); + let scheduler = SchedulerServer::new( + state.clone(), + namespace.to_owned(), + IpAddr::V4(Ipv4Addr::LOCALHOST), + ); let state = SchedulerState::new(state, namespace.to_string()); - let exec_meta = ExecutorMetadata { + let exec_meta = ExecutorRegistration { id: "abc".to_owned(), - host: "".to_owned(), + optional_host: Some(OptionalHost::Host("".to_owned())), port: 0, }; let request: Request = Request::new(PollWorkParams { diff --git a/ballista/rust/scheduler/src/main.rs b/ballista/rust/scheduler/src/main.rs index 205023a4c34c6..713103fcf0439 100644 --- a/ballista/rust/scheduler/src/main.rs +++ b/ballista/rust/scheduler/src/main.rs @@ -19,7 +19,7 @@ use anyhow::{Context, Result}; use futures::future::{self, Either, TryFutureExt}; -use hyper::{service::make_service_fn, Server}; +use hyper::{server::conn::AddrStream, service::make_service_fn, Server}; use std::convert::Infallible; use std::{net::SocketAddr, sync::Arc}; use tonic::transport::Server as TonicServer; @@ -62,17 +62,20 @@ async fn start_server( BALLISTA_VERSION, addr ); - let scheduler_server = - SchedulerServer::new(config_backend.clone(), namespace.clone()); Ok(Server::bind(&addr) - .serve(make_service_fn(move |_| { + .serve(make_service_fn(move |request: &AddrStream| { + let scheduler_server = SchedulerServer::new( + config_backend.clone(), + namespace.clone(), + request.remote_addr().ip(), + ); let scheduler_grpc_server = SchedulerGrpcServer::new(scheduler_server.clone()); let mut tonic = TonicServer::builder() .add_service(scheduler_grpc_server) .into_service(); - let mut warp = warp::service(get_routes(scheduler_server.clone())); + let mut warp = warp::service(get_routes(scheduler_server)); future::ok::<_, Infallible>(tower::service_fn( move |req: hyper::Request| { diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index e791fa8b54597..20dd0d36d9ab9 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -19,17 +19,11 @@ //! //! 
This code is EXPERIMENTAL and still under development -use std::pin::Pin; +use std::collections::HashMap; use std::sync::Arc; -use std::time::Instant; -use std::{collections::HashMap, future::Future}; -use ballista_core::client::BallistaClient; use ballista_core::datasource::DfTableAdapter; use ballista_core::error::{BallistaError, Result}; -use ballista_core::serde::scheduler::ExecutorMeta; -use ballista_core::serde::scheduler::PartitionId; -use ballista_core::utils::format_plan; use ballista_core::{ execution_plans::{QueryStageExec, ShuffleReaderExec, UnresolvedShuffleExec}, serde::scheduler::PartitionLocation, @@ -42,59 +36,27 @@ use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::ExecutionPlan; -use log::{debug, info}; -use tokio::task::JoinHandle; +use log::info; -type SendableExecutionPlan = - Pin>> + Send>>; type PartialQueryStageResult = (Arc, Vec>); pub struct DistributedPlanner { - executors: Vec, next_stage_id: usize, } impl DistributedPlanner { - pub fn try_new(executors: Vec) -> Result { - if executors.is_empty() { - Err(BallistaError::General( - "DistributedPlanner requires at least one executor".to_owned(), - )) - } else { - Ok(Self { - executors, - next_stage_id: 0, - }) - } + pub fn new() -> Self { + Self { next_stage_id: 0 } } } -impl DistributedPlanner { - /// Execute a distributed query against a cluster, leaving the final results on the - /// executors. The [ExecutionPlan] returned by this method is guaranteed to be a - /// [ShuffleReaderExec] that can be used to fetch the final results from the executors - /// in parallel. - pub async fn execute_distributed_query( - &mut self, - job_id: String, - execution_plan: Arc, - ) -> Result> { - let now = Instant::now(); - let execution_plans = self.plan_query_stages(&job_id, execution_plan)?; - - info!( - "DistributedPlanner created {} execution plans in {} seconds:", - execution_plans.len(), - now.elapsed().as_secs() - ); - - for plan in &execution_plans { - info!("{}", format_plan(plan.as_ref(), 0)?); - } - - execute(execution_plans, self.executors.clone()).await +impl Default for DistributedPlanner { + fn default() -> Self { + Self::new() } +} +impl DistributedPlanner { /// Returns a vector of ExecutionPlans, where the root node is a [QueryStageExec]. /// Plans that depend on the input of other plans will have leaf nodes of type [UnresolvedShuffleExec]. /// A [QueryStageExec] is created whenever the partitioning changes. 
@@ -221,38 +183,6 @@ impl DistributedPlanner { } } -fn execute( - stages: Vec>, - executors: Vec, -) -> SendableExecutionPlan { - Box::pin(async move { - let mut partition_locations: HashMap> = - HashMap::new(); - let mut result_partition_locations = vec![]; - for stage in &stages { - debug!("execute() {}", &format!("{:?}", stage)[0..60]); - let stage = remove_unresolved_shuffles(stage.as_ref(), &partition_locations)?; - let stage = stage.as_any().downcast_ref::().unwrap(); - result_partition_locations = execute_query_stage( - &stage.job_id.clone(), - stage.stage_id, - stage.children()[0].clone(), - executors.clone(), - ) - .await?; - partition_locations - .insert(stage.stage_id, result_partition_locations.clone()); - } - - let shuffle_reader: Arc = - Arc::new(ShuffleReaderExec::try_new( - result_partition_locations, - stages.last().unwrap().schema(), - )?); - Ok(shuffle_reader) - }) -} - pub fn remove_unresolved_shuffles( stage: &dyn ExecutionPlan, partition_locations: &HashMap>, @@ -298,88 +228,6 @@ fn create_query_stage( Ok(Arc::new(QueryStageExec::try_new(job_id, stage_id, plan)?)) } -/// Execute a query stage by sending each partition to an executor -async fn execute_query_stage( - job_id: &str, - stage_id: usize, - plan: Arc, - executors: Vec, -) -> Result> { - info!( - "execute_query_stage() stage_id={}\n{}", - stage_id, - format_plan(plan.as_ref(), 0)? - ); - - let partition_count = plan.output_partitioning().partition_count(); - - let num_chunks = partition_count / executors.len(); - let num_chunks = num_chunks.max(1); - let partition_chunks: Vec> = (0..partition_count) - .collect::>() - .chunks(num_chunks) - .map(|r| r.to_vec()) - .collect(); - - info!( - "Executing query stage with {} chunks of partition ranges", - partition_chunks.len() - ); - - let mut executions: Vec>>> = - Vec::with_capacity(partition_count); - for i in 0..partition_chunks.len() { - let plan = plan.clone(); - let executor_meta = executors[i % executors.len()].clone(); - let partition_ids = partition_chunks[i].to_vec(); - let job_id = job_id.to_owned(); - executions.push(tokio::spawn(async move { - let mut client = - BallistaClient::try_new(&executor_meta.host, executor_meta.port).await?; - let stats = client - .execute_partition(job_id.clone(), stage_id, partition_ids.clone(), plan) - .await?; - - Ok(partition_ids - .iter() - .map(|part| PartitionLocation { - partition_id: PartitionId::new(&job_id, stage_id, *part), - executor_meta: executor_meta.clone(), - partition_stats: *stats[*part].statistics(), - }) - .collect()) - })); - } - - // wait for all partitions to complete - let results = futures::future::join_all(executions).await; - - // check for errors - let mut meta = Vec::with_capacity(partition_count); - for result in results { - match result { - Ok(partition_result) => { - let final_result = partition_result?; - debug!("Query stage partition result: {:?}", final_result); - meta.extend(final_result); - } - Err(e) => { - return Err(BallistaError::General(format!( - "Query stage {} failed: {:?}", - stage_id, e - ))) - } - } - } - - debug!( - "execute_query_stage() stage_id={} produced {:?}", - stage_id, meta - ); - - Ok(meta) -} - #[cfg(test)] mod test { use crate::planner::DistributedPlanner; @@ -387,7 +235,6 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::protobuf; - use ballista_core::serde::scheduler::ExecutorMeta; use ballista_core::utils::format_plan; use 
datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::merge::MergeExec; @@ -420,11 +267,7 @@ mod test { let plan = ctx.optimize(&plan)?; let plan = ctx.create_physical_plan(&plan)?; - let mut planner = DistributedPlanner::try_new(vec![ExecutorMeta { - id: "".to_string(), - host: "".to_string(), - port: 0, - }])?; + let mut planner = DistributedPlanner::new(); let job_uuid = Uuid::new_v4(); let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?; for stage in &stages { diff --git a/benchmarks/docker-compose.yaml b/benchmarks/docker-compose.yaml index 6015dbac2cc25..bbb31078cf0a5 100644 --- a/benchmarks/docker-compose.yaml +++ b/benchmarks/docker-compose.yaml @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -version: '2.0' +version: '2.2' services: etcd: image: quay.io/coreos/etcd:v3.4.9 @@ -28,18 +28,10 @@ services: - ./data:/data depends_on: - etcd - ballista-executor-1: + ballista-executor: image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT - command: "/executor --bind-host 0.0.0.0 --port 50051 --external-host ballista-executor-1 --scheduler-host ballista-scheduler" - environment: - - RUST_LOG=info - volumes: - - ./data:/data - depends_on: - - ballista-scheduler - ballista-executor-2: - image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT - command: "/executor --bind-host 0.0.0.0 --port 50052 --external-host ballista-executor-2 --scheduler-host ballista-scheduler" + command: "/executor --bind-host 0.0.0.0 --port 50051 --scheduler-host ballista-scheduler" + scale: 2 environment: - RUST_LOG=info volumes: @@ -57,6 +49,5 @@ services: - ../..:/ballista depends_on: - ballista-scheduler - - ballista-executor-1 - - ballista-executor-2 + - ballista-executor From 3072df65ecab8f29f18ffe7cce4b5633f34aebe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 2 May 2021 23:15:15 +0200 Subject: [PATCH 054/329] Fix Left join implementation is incorrect for 0 or multiple batches on the right side (#238) * Fix left join unmatched rows * Add test for multiple batches * Clippy * Import cleanup * Add test for empty right side * Add some comments * Fix comment * Fix comment * Link to GH issue * Use explicity pattern match --- datafusion/src/physical_plan/hash_join.rs | 191 ++++++++++++++++++--- datafusion/src/physical_plan/hash_utils.rs | 2 +- 2 files changed, 168 insertions(+), 25 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 8e6b0428c0417..3398494e3c46c 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -31,9 +31,9 @@ use arrow::{ datatypes::{TimeUnit, UInt32Type, UInt64Type}, }; use smallvec::{smallvec, SmallVec}; -use std::time::Instant; -use std::{any::Any, collections::HashSet}; +use std::{any::Any, usize}; use std::{hash::Hasher, sync::Arc}; +use std::{time::Instant, vec}; use async_trait::async_trait; use futures::{Stream, StreamExt, TryStreamExt}; @@ -370,6 +370,11 @@ impl ExecutionPlan for HashJoinExec { let on_right = self.on.iter().map(|on| on.1.clone()).collect::>(); let column_indices = self.column_indices_from_schema()?; + let num_rows = left_data.1.num_rows(); + let visited_left_side = match self.join_type { + JoinType::Left => vec![false; num_rows], + JoinType::Inner | JoinType::Right => vec![], + }; Ok(Box::pin(HashJoinStream { schema: self.schema.clone(), on_left, @@ -384,6 +389,8 @@ impl 
ExecutionPlan for HashJoinExec { num_output_rows: 0, join_time: 0, random_state: self.random_state.clone(), + visited_left_side: visited_left_side, + is_exhausted: false, })) } } @@ -453,6 +460,10 @@ struct HashJoinStream { join_time: usize, /// Random state used for hashing initialization random_state: RandomState, + /// Keeps track of the left side rows whether they are visited + visited_left_side: Vec, // TODO: use a more memory efficient data structure, https://github.com/apache/arrow-datafusion/issues/240 + /// There is nothing to process anymore and left side is processed in case of left join + is_exhausted: bool, } impl RecordBatchStream for HashJoinStream { @@ -473,7 +484,7 @@ fn build_batch_from_indices( left_indices: UInt64Array, right_indices: UInt32Array, column_indices: &[ColumnIndex], -) -> ArrowResult { +) -> ArrowResult<(RecordBatch, UInt64Array)> { // build the columns of the new [RecordBatch]: // 1. pick whether the column is from the left or right // 2. based on the pick, `take` items from the different RecordBatches @@ -489,7 +500,7 @@ fn build_batch_from_indices( }; columns.push(array); } - RecordBatch::try_new(Arc::new(schema.clone()), columns) + RecordBatch::try_new(Arc::new(schema.clone()), columns).map(|x| (x, left_indices)) } #[allow(clippy::too_many_arguments)] @@ -502,7 +513,7 @@ fn build_batch( schema: &Schema, column_indices: &[ColumnIndex], random_state: &RandomState, -) -> ArrowResult { +) -> ArrowResult<(RecordBatch, UInt64Array)> { let (left_indices, right_indices) = build_join_indexes( &left_data, &batch, @@ -617,13 +628,6 @@ fn build_join_indexes( let mut left_indices = UInt64Builder::new(0); let mut right_indices = UInt32Builder::new(0); - // Keep track of which item is visited in the build input - // TODO: this can be stored more efficiently with a marker - // https://issues.apache.org/jira/browse/ARROW-11116 - // TODO: Fix LEFT join with multiple right batches - // https://issues.apache.org/jira/browse/ARROW-10971 - let mut is_visited = HashSet::new(); - // First visit all of the rows for (row, hash_value) in hash_values.iter().enumerate() { if let Some((_, indices)) = @@ -634,20 +638,10 @@ fn build_join_indexes( if equal_rows(i as usize, row, &left_join_values, &keys_values)? 
{ left_indices.append_value(i)?; right_indices.append_value(row as u32)?; - is_visited.insert(i); } } }; } - // Add the remaining left rows to the result set with None on the right side - for (_, indices) in left { - for i in indices.iter() { - if !is_visited.contains(i) { - left_indices.append_slice(&indices)?; - right_indices.append_null()?; - } - } - } Ok((left_indices.finish(), right_indices.finish())) } JoinType::Right => { @@ -1001,6 +995,39 @@ pub fn create_hashes<'a>( Ok(hashes_buffer) } +// Produces a batch for left-side rows that are not marked as being visited during the whole join +fn produce_unmatched( + visited_left_side: &[bool], + schema: &SchemaRef, + column_indices: &[ColumnIndex], + left_data: &JoinLeftData, +) -> ArrowResult { + // Find indices which didn't match any right row (are false) + let unmatched_indices: Vec = visited_left_side + .iter() + .enumerate() + .filter(|&(_, &value)| !value) + .map(|(index, _)| index as u64) + .collect(); + + // generate batches by taking values from the left side and generating columns filled with null on the right side + let indices = UInt64Array::from_iter_values(unmatched_indices); + let num_rows = indices.len(); + let mut columns: Vec> = Vec::with_capacity(schema.fields().len()); + for (idx, column_index) in column_indices.iter().enumerate() { + let array = if column_index.is_left { + let array = left_data.1.column(column_index.index); + compute::take(array.as_ref(), &indices, None).unwrap() + } else { + let datatype = schema.field(idx).data_type(); + arrow::array::new_null_array(datatype, num_rows) + }; + + columns.push(array); + } + RecordBatch::try_new(schema.clone(), columns) +} + impl Stream for HashJoinStream { type Item = ArrowResult; @@ -1025,14 +1052,49 @@ impl Stream for HashJoinStream { ); self.num_input_batches += 1; self.num_input_rows += batch.num_rows(); - if let Ok(ref batch) = result { + if let Ok((ref batch, ref left_side)) = result { self.join_time += start.elapsed().as_millis() as usize; self.num_output_batches += 1; self.num_output_rows += batch.num_rows(); + + match self.join_type { + JoinType::Left => { + left_side.iter().flatten().for_each(|x| { + self.visited_left_side[x as usize] = true; + }); + } + JoinType::Inner | JoinType::Right => {} + } } - Some(result) + Some(result.map(|x| x.0)) } other => { + let start = Instant::now(); + // For the left join, produce rows for unmatched rows + match self.join_type { + JoinType::Left if !self.is_exhausted => { + let result = produce_unmatched( + &self.visited_left_side, + &self.schema, + &self.column_indices, + &self.left_data, + ); + if let Ok(ref batch) = result { + self.num_input_batches += 1; + self.num_input_rows += batch.num_rows(); + if let Ok(ref batch) = result { + self.join_time += + start.elapsed().as_millis() as usize; + self.num_output_batches += 1; + self.num_output_rows += batch.num_rows(); + } + } + self.is_exhausted = true; + return Some(result); + } + JoinType::Left | JoinType::Inner | JoinType::Right => {} + } + debug!( "Processed {} probe-side input batches containing {} rows and \ produced {} output batches containing {} rows in {} ms", @@ -1299,6 +1361,87 @@ mod tests { Ok(()) } + fn build_table_two_batches( + a: (&str, &Vec), + b: (&str, &Vec), + c: (&str, &Vec), + ) -> Arc { + let batch = build_table_i32(a, b, c); + let schema = batch.schema(); + Arc::new( + MemoryExec::try_new(&[vec![batch.clone(), batch]], schema, None).unwrap(), + ) + } + + #[tokio::test] + async fn join_left_multi_batch() { + let left = build_table( + ("a1", &vec![1, 
2, 3]), + ("b1", &vec![4, 5, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 9]), + ); + let right = build_table_two_batches( + ("a2", &vec![10, 20, 30]), + ("b1", &vec![4, 5, 6]), + ("c2", &vec![70, 80, 90]), + ); + let on = &[("b1", "b1")]; + + let join = join(left, right, on, &JoinType::Left).unwrap(); + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + + let stream = join.execute(0).await.unwrap(); + let batches = common::collect(stream).await.unwrap(); + + let expected = vec![ + "+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | c2 |", + "+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 70 |", + "| 1 | 4 | 7 | 10 | 70 |", + "| 2 | 5 | 8 | 20 | 80 |", + "| 2 | 5 | 8 | 20 | 80 |", + "| 3 | 7 | 9 | | |", + "+----+----+----+----+----+", + ]; + + assert_batches_sorted_eq!(expected, &batches); + } + + #[tokio::test] + async fn join_left_empty_right() { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), + ("c1", &vec![7, 8, 9]), + ); + let right = build_table_i32(("a2", &vec![]), ("b1", &vec![]), ("c2", &vec![])); + let on = &[("b1", "b1")]; + let schema = right.schema(); + let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); + let join = join(left, right, on, &JoinType::Left).unwrap(); + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + + let stream = join.execute(0).await.unwrap(); + let batches = common::collect(stream).await.unwrap(); + + let expected = vec![ + "+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | c2 |", + "+----+----+----+----+----+", + "| 1 | 4 | 7 | | |", + "| 2 | 5 | 8 | | |", + "| 3 | 7 | 9 | | |", + "+----+----+----+----+----+", + ]; + + assert_batches_sorted_eq!(expected, &batches); + } + #[tokio::test] async fn join_left_one() -> Result<()> { let left = build_table( diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index a38cc092123d4..54da1249e5c55 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -22,7 +22,7 @@ use arrow::datatypes::{Field, Schema}; use std::collections::HashSet; /// All valid types of joins. 
-#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum JoinType { /// Inner join Inner, From 47bd3faddb410c02333aaf2430c1de19ae4ff09f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 3 May 2021 15:59:40 +0200 Subject: [PATCH 055/329] Move datafusion-cli to new crate (#231) * Move datafusion-cli to own crate * Cargo.toml fixes, remove repl code * Remove bin option * License and doc updates * Fix link to CLI docs in readme * Use re-exported arrow * fmt * Spacing * Inherit datafusion default * Update datafusion/docs/cli.md Co-authored-by: Andy Grove * Update datafusion/docs/cli.md Co-authored-by: Andy Grove * Update docker setup Co-authored-by: Andy Grove --- .dockerignore | 30 ++--------------- Cargo.toml | 1 + README.md | 2 +- datafusion-cli/Cargo.toml | 33 +++++++++++++++++++ {datafusion => datafusion-cli}/Dockerfile | 13 +++++--- .../bin/repl.rs => datafusion-cli/src/main.rs | 23 ++++++------- datafusion/Cargo.toml | 9 +---- datafusion/docs/cli.md | 12 +++---- datafusion/src/bin/main.rs | 25 -------------- 9 files changed, 64 insertions(+), 84 deletions(-) create mode 100644 datafusion-cli/Cargo.toml rename {datafusion => datafusion-cli}/Dockerfile (83%) rename datafusion/src/bin/repl.rs => datafusion-cli/src/main.rs (88%) delete mode 100644 datafusion/src/bin/main.rs diff --git a/.dockerignore b/.dockerignore index 8cd6a89645c3b..533221e2b4a69 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,28 +1,2 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Turn .dockerignore to .dockerallow by excluding everything and explicitly -# allowing specific files and directories. This enables us to quickly add -# dependency files to the docker content without scanning the whole directory. -# This setup requires to all of our docker containers have arrow's source -# as a mounted directory. - -ci -dev -testing -parquet-testing -**/target/* +.git +**target diff --git a/Cargo.toml b/Cargo.toml index 2f34babdb247b..fa36a0c0fed7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ [workspace] members = [ "datafusion", + "datafusion-cli", "datafusion-examples", "benchmarks", "ballista/rust/client", diff --git a/README.md b/README.md index f6ef7d176686e..783bdd655cbb6 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ datafusion = "4.0.0-SNAPSHOT" ## Using DataFusion as a binary -DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](docs/cli.md) for more information. +DataFusion also includes a simple command-line interactive SQL utility. See the [CLI reference](datafusion/docs/cli.md) for more information. 
# Status diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml new file mode 100644 index 0000000000000..883d0f2f4c66b --- /dev/null +++ b/datafusion-cli/Cargo.toml @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-cli" +version = "4.0.0-SNAPSHOT" +authors = ["Apache Arrow "] +edition = "2018" +keywords = [ "arrow", "query", "sql", "cli", "repl" ] +license = "Apache-2.0" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" + + +[dependencies] +clap = "2.33" +rustyline = "8.0" +tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } +datafusion = { path = "../datafusion" } diff --git a/datafusion/Dockerfile b/datafusion-cli/Dockerfile similarity index 83% rename from datafusion/Dockerfile rename to datafusion-cli/Dockerfile index 97e82b4bbca20..32bd38b9419dc 100644 --- a/datafusion/Dockerfile +++ b/datafusion-cli/Dockerfile @@ -15,11 +15,14 @@ # specific language governing permissions and limitations # under the License. -FROM rustlang/rust:nightly +FROM rust:latest + + +COPY ./datafusion ./usr/src/datafusion +COPY ./datafusion-cli ./usr/src/datafusion-cli + +WORKDIR /usr/src/datafusion-cli +RUN cargo install --path . -COPY format /arrow/format/ -COPY rust /arrow/rust/ -WORKDIR /arrow/rust/datafusion -RUN cargo install --bin datafusion-cli --path . 
CMD ["datafusion-cli", "--data-path", "/data"] diff --git a/datafusion/src/bin/repl.rs b/datafusion-cli/src/main.rs similarity index 88% rename from datafusion/src/bin/repl.rs rename to datafusion-cli/src/main.rs index a6aec204c0d3b..dd7265e1f707e 100644 --- a/datafusion/src/bin/repl.rs +++ b/datafusion-cli/src/main.rs @@ -17,8 +17,8 @@ #![allow(bare_trait_objects)] -use arrow::util::pretty; use clap::{crate_version, App, Arg}; +use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use rustyline::Editor; @@ -44,7 +44,7 @@ pub async fn main() { ) .arg( Arg::with_name("batch-size") - .help("The batch size of each query, default value is 1048576") + .help("The batch size of each query, or use DataFusion default") .short("c") .long("batch-size") .takes_value(true), @@ -56,16 +56,17 @@ pub async fn main() { env::set_current_dir(&p).unwrap(); }; - let batch_size = matches + let mut execution_config = ExecutionConfig::new().with_information_schema(true); + + if let Some(batch_size) = matches .value_of("batch-size") - .map(|size| size.parse::().unwrap()) - .unwrap_or(1_048_576); - - let mut ctx = ExecutionContext::with_config( - ExecutionConfig::new() - .with_batch_size(batch_size) - .with_information_schema(true), - ); + .and_then(|size| size.parse::().ok()) + { + execution_config = execution_config.with_batch_size(batch_size); + }; + + let mut ctx = + ExecutionContext::with_config(execution_config.with_information_schema(true)); let mut rl = Editor::<()>::new(); rl.load_history(".history").ok(); diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 1c2c860c55d0f..e35b8feea7373 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -36,13 +36,8 @@ edition = "2018" name = "datafusion" path = "src/lib.rs" -[[bin]] -name = "datafusion-cli" -path = "src/bin/main.rs" - [features] -default = ["cli", "crypto_expressions", "regex_expressions", "unicode_expressions"] -cli = ["rustyline"] +default = ["crypto_expressions", "regex_expressions", "unicode_expressions"] simd = ["arrow/simd"] crypto_expressions = ["md-5", "sha2"] regex_expressions = ["regex", "lazy_static"] @@ -54,8 +49,6 @@ hashbrown = "0.11" arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["prettyprint"] } parquet = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["arrow"] } sqlparser = "0.9.0" -clap = "2.33" -rustyline = {version = "7.0", optional = true} paste = "^1.0" num_cpus = "1.13.0" chrono = "0.4" diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md index aeacdeee04a18..27605b2e98c0f 100644 --- a/datafusion/docs/cli.md +++ b/datafusion/docs/cli.md @@ -26,9 +26,9 @@ The DataFusion CLI is a command-line interactive SQL utility that allows queries Use the following commands to clone this repository and run the CLI. This will require the Rust toolchain to be installed. Rust can be installed from [https://rustup.rs/](https://rustup.rs/). ```sh -git clone https://github.com/apache/arrow -cd arrow/rust/datafusion -cargo run --bin datafusion-cli --release +git clone https://github.com/apache/arrow-datafusion +cd arrow-datafusion/datafusion-cli +cargo run --release ``` ## Run using Docker @@ -36,9 +36,9 @@ cargo run --bin datafusion-cli --release Use the following commands to clone this repository and build a Docker image containing the CLI tool. 
Note that there is `.dockerignore` file in the root of the repository that may need to be deleted in order for this to work. ```sh -git clone https://github.com/apache/arrow -cd arrow -docker build -f rust/datafusion/Dockerfile . --tag datafusion-cli +git clone https://github.com/apache/arrow-datafusion +cd arrow-datafusion +docker build -f datafusion-cli/Dockerfile . --tag datafusion-cli docker run -it -v $(your_data_location):/data datafusion-cli ``` diff --git a/datafusion/src/bin/main.rs b/datafusion/src/bin/main.rs deleted file mode 100644 index deb5b796b2d69..0000000000000 --- a/datafusion/src/bin/main.rs +++ /dev/null @@ -1,25 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Only bring in dependencies for the repl when the cli feature is enabled. -#[cfg(feature = "cli")] -mod repl; - -pub fn main() { - #[cfg(feature = "cli")] - repl::main() -} From b9b3d863d7765ceccf92155da7ec2e5f28d8feee Mon Sep 17 00:00:00 2001 From: Patrick More <34631716+pjmore@users.noreply.github.com> Date: Mon, 3 May 2021 07:33:12 -0700 Subject: [PATCH 056/329] Count distinct boolean (#230) * Added boolean support for count distinct. 
* Added boolean support for COUNT DISTINCT * Corrected macro call * Added test for boolean COUNT DISTINCT * ran cargo fmt * Corrected test assertion for boolean COUNT DISTINCT * Fixed clippy warnings * fix cargo fmt --- .../src/physical_plan/distinct_expressions.rs | 60 ++++++++++++++++++- datafusion/src/scalar.rs | 1 + 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 8534e9c8805cf..1c93b5a104d09 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -195,10 +195,9 @@ impl Accumulator for DistinctCountAccumulator { mod tests { use super::*; - use arrow::array::ArrayRef; use arrow::array::{ - Int16Array, Int32Array, Int64Array, Int8Array, ListArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + ArrayRef, BooleanArray, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::array::{Int32Builder, ListBuilder, UInt64Builder}; use arrow::datatypes::DataType; @@ -396,6 +395,61 @@ mod tests { test_count_distinct_update_batch_numeric!(UInt64Array, UInt64, u64) } + #[test] + fn count_distinct_update_batch_boolean() -> Result<()> { + let get_count = |data: BooleanArray| -> Result<(Vec>, u64)> { + let arrays = vec![Arc::new(data) as ArrayRef]; + let (states, result) = run_update_batch(&arrays)?; + let mut state_vec = state_to_vec!(&states[0], Boolean, bool).unwrap(); + state_vec.sort(); + let count = match result { + ScalarValue::UInt64(c) => c.ok_or_else(|| { + DataFusionError::Internal("Found None count".to_string()) + }), + scalar => Err(DataFusionError::Internal(format!( + "Found non Uint64 scalar value from count: {}", + scalar + ))), + }?; + Ok((state_vec, count)) + }; + + let zero_count_values = BooleanArray::from(Vec::::new()); + + let one_count_values = BooleanArray::from(vec![false, false]); + let one_count_values_with_null = + BooleanArray::from(vec![Some(true), Some(true), None, None]); + + let two_count_values = BooleanArray::from(vec![true, false, true, false, true]); + let two_count_values_with_null = BooleanArray::from(vec![ + Some(true), + Some(false), + None, + None, + Some(true), + Some(false), + ]); + + assert_eq!( + get_count(zero_count_values)?, + (Vec::>::new(), 0) + ); + assert_eq!(get_count(one_count_values)?, (vec![Some(false)], 1)); + assert_eq!( + get_count(one_count_values_with_null)?, + (vec![Some(true)], 1) + ); + assert_eq!( + get_count(two_count_values)?, + (vec![Some(false), Some(true)], 2) + ); + assert_eq!( + get_count(two_count_values_with_null)?, + (vec![Some(false), Some(true)], 2) + ); + Ok(()) + } + #[test] fn count_distinct_update_batch_all_nulls() -> Result<()> { let arrays = vec![Arc::new(Int32Array::from( diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 833f707e971ea..6f03194f45423 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -345,6 +345,7 @@ impl ScalarValue { ), }, ScalarValue::List(values, data_type) => Arc::new(match data_type { + DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size), DataType::Int8 => build_list!(Int8Builder, Int8, values, size), DataType::Int16 => build_list!(Int16Builder, Int16, values, size), DataType::Int32 => build_list!(Int32Builder, Int32, values, size), From 9c7cd9a3dc6cf613274918ad3734e7e0df43c31c Mon Sep 17 00:00:00 2001 From: "K.I. 
(Dennis) Jung" Date: Mon, 3 May 2021 23:34:10 +0900 Subject: [PATCH 057/329] Add install guide in README (#236) --- ballista/ui/scheduler/README.md | 60 ++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/ballista/ui/scheduler/README.md b/ballista/ui/scheduler/README.md index 1a196dab2fa16..90bc2bface4c6 100644 --- a/ballista/ui/scheduler/README.md +++ b/ballista/ui/scheduler/README.md @@ -19,27 +19,39 @@ # Ballista UI - -## Available Scripts - -In the project directory, you can run: - -### `yarn start` - -Runs the app in the development mode.\ -Open [http://localhost:3000](http://localhost:3000) to view it in the browser. - -The page will reload if you make edits.\ -You will also see any lint errors in the console. - -### `yarn test` - -Launches the test runner in the interactive watch mode.\ -See the section about [running tests](https://facebook.github.io/create-react-app/docs/running-tests) for more information. - -### `yarn build` - -Builds the app for production to the `build` folder.\ -It correctly bundles React in production mode and optimizes the build for the best performance. - -The build is minified and the filenames include the hashes. \ No newline at end of file +## Start project from source + +### Run scheduler/executor +First, run scheduler from project: +```shell +$ cd rust/scheduler +$ RUST_LOG=info cargo run --release +... + Finished release [optimized] target(s) in 11.92s + Running `/path-to-project/target/release/ballista-scheduler` +[2021-05-02T05:11:17Z INFO ballista_scheduler] Ballista v0.5.0-SNAPSHOT Scheduler listening on 0.0.0.0:50050 +[2021-05-02T05:14:10Z INFO ballista_scheduler] Received get_executors_metadata request +``` + +and run executor in new terminal: +```shell +$ cd rust/executor +$ RUST_LOG=info cargo run --release + Finished release [optimized] target(s) in 0.09s + Running `/path-to-project/target/release/ballista-executor` +[2021-05-02T05:11:30Z INFO ballista_executor] Running with config: ExecutorConfig { host: "localhost", port: 50051, work_dir: "/var/folders/y8/fc61kyjd4n53tn444n72rjrm0000gn/T/.tmpAZ0rn4", concurrent_tasks: 4 } +[2021-05-02T05:11:30Z INFO ballista_executor] Ballista v0.5.0-SNAPSHOT Rust Executor listening on 0.0.0.0:50051 +``` + +### Run Client project +```shell +$ cd ui/scheduler +$ yarn +yarn install v1.22.10 +[1/4] 🔍 Resolving packages... +... +$ yarn start +Starting the development server... 
+``` + +Now access to http://localhost:3000/ From d700cc11fdc4a371666cf7ecbbd5fb3cdea31769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 3 May 2021 16:40:39 +0200 Subject: [PATCH 058/329] Initial docs for SQL syntax (#242) * Initial docs for SQL syntax * Add license --- docs/user-guide/src/SUMMARY.md | 7 +- docs/user-guide/src/sql/ddl.md | 20 ++++ docs/user-guide/src/sql/introduction.md | 20 ++++ docs/user-guide/src/sql/select.md | 133 ++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 docs/user-guide/src/sql/ddl.md create mode 100644 docs/user-guide/src/sql/introduction.md create mode 100644 docs/user-guide/src/sql/select.md diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md index e2ddcb0a4e89c..0fced3bb3deab 100644 --- a/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/SUMMARY.md @@ -20,7 +20,12 @@ - [Introduction](introduction.md) - [Example Usage](example-usage.md) -- [Use as a Library](library.md) +- [Use as a Library](library.md) +- [SQL Reference](sql/introduction.md) + - [SELECT](sql/select.md) + - [DDL](sql/ddl.md) + - [CREATE EXTERNAL TABLE](sql/ddl.md) + - [Distributed](distributed/introduction.md) - [Create a Ballista Cluster](distributed/deployment.md) - [Docker](distributed/standalone.md) diff --git a/docs/user-guide/src/sql/ddl.md b/docs/user-guide/src/sql/ddl.md new file mode 100644 index 0000000000000..cb1665792d964 --- /dev/null +++ b/docs/user-guide/src/sql/ddl.md @@ -0,0 +1,20 @@ + + +# DDL diff --git a/docs/user-guide/src/sql/introduction.md b/docs/user-guide/src/sql/introduction.md new file mode 100644 index 0000000000000..89ed2777618d8 --- /dev/null +++ b/docs/user-guide/src/sql/introduction.md @@ -0,0 +1,20 @@ + + +# SQL Reference diff --git a/docs/user-guide/src/sql/select.md b/docs/user-guide/src/sql/select.md new file mode 100644 index 0000000000000..777b4ff61e5d0 --- /dev/null +++ b/docs/user-guide/src/sql/select.md @@ -0,0 +1,133 @@ + + +# SELECT syntax + +The queries in DataFusion scan data from tables and return 0 or more rows. +In this documentation we describe the SQL syntax in DataFusion. + +DataFusion supports the following syntax for queries: + + +[ [WITH](#with-clause) with_query [, ...] ]
+[SELECT](#select-clause) select_expr [, ...]
+[ [FROM](#from-clause) from_item [, ...] ]
+[ [WHERE](#where-clause) condition ]
+[ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
+[ [HAVING](#having-clause) condition]
+[ [UNION](#union-clause) [ ALL | select ]
+[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ] [, ...] ]
+[ [LIMIT](#limit-clause) count ]
+ +
+ +# WITH clause + +A with clause allows to give names for queries and reference them by name. + +```sql +WITH x AS (SELECT a, MAX(b) AS b FROM t GROUP BY a) +SELECT a, b FROM x; +``` + +# SELECT clause + + +Example: + +```sql +SELECT a, b, a + b FROM table +``` + + +# FROM clause + +Example: +```sql +SELECT t.a FROM table AS t +``` + + +# WHERE clause + +Example: + +```sql +SELECT a FROM table WHERE a > 10 +``` + +# GROUP BY clause + +Example: + +```sql +SELECT a, b, MAX(c) FROM table GROUP BY a, b +``` + + +# HAVING clause + +Example: + +```sql +SELECT a, b, MAX(c) FROM table GROUP BY a, b HAVING MAX(c) > 10 +``` + +# UNION clause + +Example: + +```sql +SELECT + a, + b, + c +FROM table1 +UNION ALL +SELECT + a, + b, + c +FROM table2 +``` + +# ORDER BY clause + +Orders the results by the referenced expression. By default it uses ascending order (`ASC`). +This order can be changed to descending by adding `DESC` after the order-by expressions. + +Examples: + +```sql +SELECT age, person FROM table ORDER BY age; +SELECT age, person FROM table ORDER BY age DESC; +SELECT age, person FROM table ORDER BY age, person DESC; +``` + + +# LIMIT clause + +Limits the number of rows to be a maximum of `count` rows. `count` should be a non-negative integer. + +Example: + +```sql +SELECT age, person FROM table +LIMIT 10 +``` \ No newline at end of file From e271e4d480b17ca36f39c165661c1dfc8022c63f Mon Sep 17 00:00:00 2001 From: Dmitry Patsura Date: Mon, 3 May 2021 22:35:59 +0300 Subject: [PATCH 059/329] misc(README): Replace Cube.js with Cube Store (#248) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 783bdd655cbb6..60492e7f93919 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Here are some of the projects known to use DataFusion: * [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform * [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube.js](https://github.com/cube-js/cube.js) +* [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) * [datafusion-python](https://pypi.org/project/datafusion) * [delta-rs](https://github.com/delta-io/delta-rs) * [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database From 46bde0bd148aacf1677a575cb9ddbc154b6c4fb3 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Tue, 4 May 2021 14:24:57 +0200 Subject: [PATCH 060/329] Add datafusion-python (#69) * Added Python project. * Update python/Cargo.toml Co-authored-by: Andy Grove * Update python/Cargo.toml Co-authored-by: Uwe L. Korn * Added license and black formatting. * License * Fixing build. * TesTestt * Bumped to latest DataFusion. * Bumped nightly. * Bumped pyarrow in tests. * Added some tests back. Co-authored-by: Andy Grove Co-authored-by: Uwe L. 
Korn --- .github/workflows/python_build.yml | 89 +++++++++ .github/workflows/python_test.yaml | 58 ++++++ Cargo.toml | 4 +- dev/release/rat_exclude_files.txt | 1 + python/.cargo/config | 22 +++ python/.dockerignore | 19 ++ python/.gitignore | 20 ++ python/Cargo.toml | 57 ++++++ python/README.md | 146 ++++++++++++++ python/pyproject.toml | 20 ++ python/rust-toolchain | 1 + python/src/context.rs | 115 +++++++++++ python/src/dataframe.rs | 161 ++++++++++++++++ python/src/errors.rs | 61 ++++++ python/src/expression.rs | 162 ++++++++++++++++ python/src/functions.rs | 165 ++++++++++++++++ python/src/lib.rs | 44 +++++ python/src/scalar.rs | 36 ++++ python/src/to_py.rs | 77 ++++++++ python/src/to_rust.rs | 111 +++++++++++ python/src/types.rs | 76 ++++++++ python/src/udaf.rs | 147 +++++++++++++++ python/src/udf.rs | 62 ++++++ python/tests/__init__.py | 16 ++ python/tests/generic.py | 75 ++++++++ python/tests/test_df.py | 115 +++++++++++ python/tests/test_sql.py | 294 +++++++++++++++++++++++++++++ python/tests/test_udaf.py | 91 +++++++++ 28 files changed, 2244 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/python_build.yml create mode 100644 .github/workflows/python_test.yaml create mode 100644 python/.cargo/config create mode 100644 python/.dockerignore create mode 100644 python/.gitignore create mode 100644 python/Cargo.toml create mode 100644 python/README.md create mode 100644 python/pyproject.toml create mode 100644 python/rust-toolchain create mode 100644 python/src/context.rs create mode 100644 python/src/dataframe.rs create mode 100644 python/src/errors.rs create mode 100644 python/src/expression.rs create mode 100644 python/src/functions.rs create mode 100644 python/src/lib.rs create mode 100644 python/src/scalar.rs create mode 100644 python/src/to_py.rs create mode 100644 python/src/to_rust.rs create mode 100644 python/src/types.rs create mode 100644 python/src/udaf.rs create mode 100644 python/src/udf.rs create mode 100644 python/tests/__init__.py create mode 100644 python/tests/generic.py create mode 100644 python/tests/test_df.py create mode 100644 python/tests/test_sql.py create mode 100644 python/tests/test_udaf.py diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml new file mode 100644 index 0000000000000..c86bb81581a71 --- /dev/null +++ b/.github/workflows/python_build.yml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Build +on: + push: + tags: + - v* + +jobs: + build-python-mac-win: + name: Mac/Win + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8] + os: [macos-latest, windows-latest] + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2021-01-06 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install maturin + + - name: Build Python package + run: cd python && maturin build --release --no-sdist --strip --interpreter python${{matrix.python_version}} + + - name: List wheels + if: matrix.os == 'windows-latest' + run: dir python/target\wheels\ + + - name: List wheels + if: matrix.os != 'windows-latest' + run: find ./python/target/wheels/ + + - name: Archive wheels + uses: actions/upload-artifact@v2 + with: + name: dist + path: python/target/wheels/* + + build-manylinux: + name: Manylinux + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build wheels + run: docker run --rm -v $(pwd):/io konstin2/maturin build --release --manylinux + - name: Archive wheels + uses: actions/upload-artifact@v2 + with: + name: dist + path: python/target/wheels/* + + release: + name: Publish in PyPI + needs: [build-manylinux, build-python-mac-win] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v2 + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml new file mode 100644 index 0000000000000..3b2111b59d49d --- /dev/null +++ b/.github/workflows/python_test.yaml @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Python test +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Rust toolchain + run: | + rustup toolchain install nightly-2021-01-06 + rustup default nightly-2021-01-06 + rustup component add rustfmt + - name: Cache Cargo + uses: actions/cache@v2 + with: + path: /home/runner/.cargo + key: cargo-maturin-cache- + - name: Cache Rust dependencies + uses: actions/cache@v2 + with: + path: /home/runner/target + key: target-maturin-cache- + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: Install Python dependencies + run: python -m pip install --upgrade pip setuptools wheel + - name: Run tests + run: | + cd python/ + export CARGO_HOME="/home/runner/.cargo" + export CARGO_TARGET_DIR="/home/runner/target" + + python -m venv venv + source venv/bin/activate + + pip install maturin==0.10.4 toml==0.10.1 pyarrow==4.0.0 + maturin develop + + python -m unittest discover tests diff --git a/Cargo.toml b/Cargo.toml index fa36a0c0fed7c..9795cb68b4456 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,4 +25,6 @@ members = [ "ballista/rust/core", "ballista/rust/executor", "ballista/rust/scheduler", -] \ No newline at end of file +] + +exclude = ["python"] diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index b94c0ea1d61a6..6126699bbc1fa 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -104,3 +104,4 @@ rust-toolchain benchmarks/queries/q*.sql ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock +python/rust-toolchain diff --git a/python/.cargo/config b/python/.cargo/config new file mode 100644 index 0000000000000..0b24f30cf908a --- /dev/null +++ b/python/.cargo/config @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] diff --git a/python/.dockerignore b/python/.dockerignore new file mode 100644 index 0000000000000..08c131c2e7d60 --- /dev/null +++ b/python/.dockerignore @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +target +venv diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000000000..48fe4dbe52dde --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +/target +Cargo.lock +venv diff --git a/python/Cargo.toml b/python/Cargo.toml new file mode 100644 index 0000000000000..070720554f0ed --- /dev/null +++ b/python/Cargo.toml @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion" +version = "0.2.1" +homepage = "https://github.com/apache/arrow" +repository = "https://github.com/apache/arrow" +authors = ["Apache Arrow "] +description = "Build and run queries against data" +readme = "README.md" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } +rand = "0.7" +pyo3 = { version = "0.12.1", features = ["extension-module"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" } + +[lib] +name = "datafusion" +crate-type = ["cdylib"] + +[package.metadata.maturin] +requires-dist = ["pyarrow>=1"] + +classifier = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "License :: OSI Approved", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python", + "Programming Language :: Rust", +] diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000000000..1859fca9811c0 --- /dev/null +++ b/python/README.md @@ -0,0 +1,146 @@ + + +## DataFusion in Python + +This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion). + +Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. + +It also allows you to use UDFs and UDAFs for complex operations. + +The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. + +Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which makes strong assumptions about thread safety and lack of memory leaks. + +Technically, zero-copy is achieved via the [c data interface](https://arrow.apache.org/docs/format/CDataInterface.html). + +## How to use it + +Simple usage: + +```python +import datafusion +import pyarrow + +# an alias +f = datafusion.functions + +# create a context +ctx = datafusion.ExecutionContext() + +# create a RecordBatch and a new DataFrame from it +batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], +) +df = ctx.create_dataframe([[batch]]) + +# create a new statement +df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), +) + +# execute and collect the first (and only) batch +result = df.collect()[0] + +assert result.column(0) == pyarrow.array([5, 7, 9]) +assert result.column(1) == pyarrow.array([-3, -3, -3]) +``` + +### UDFs + +```python +def is_null(array: pyarrow.Array) -> pyarrow.Array: + return array.is_null() + +udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) + +df = df.select(udf(f.col("a"))) +``` + +### UDAF + +```python +import pyarrow +import pyarrow.compute + + +class Accumulator: + """ + Interface of a user-defined accumulation. 
+ """ + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + +df = ... + +udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) + +df = df.aggregate( + [], + [udaf(f.col("a"))] +) +``` + +## How to install + +```bash +pip install datafusion +``` + +## How to develop + +This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). + +Bootstrap: + +```bash +# fetch this repo +git clone git@github.com:apache/arrow-datafusion.git + +cd arrow-datafusion/python + +# prepare development environment (used to build wheel / install in development) +python3 -m venv venv +pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0 +``` + +Whenever rust code changes (your changes or via git pull): + +```bash +venv/bin/maturin develop +venv/bin/python -m unittest discover tests +``` diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000000000..27480690e06cc --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[build-system] +requires = ["maturin"] +build-backend = "maturin" diff --git a/python/rust-toolchain b/python/rust-toolchain new file mode 100644 index 0000000000000..9d0cf79d367d6 --- /dev/null +++ b/python/rust-toolchain @@ -0,0 +1 @@ +nightly-2021-01-06 diff --git a/python/src/context.rs b/python/src/context.rs new file mode 100644 index 0000000000000..14ef0f7321f15 --- /dev/null +++ b/python/src/context.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{collections::HashSet, sync::Arc}; + +use rand::distributions::Alphanumeric; +use rand::Rng; + +use pyo3::prelude::*; + +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::datasource::MemTable; +use datafusion::execution::context::ExecutionContext as _ExecutionContext; + +use crate::dataframe; +use crate::errors; +use crate::functions; +use crate::to_rust; +use crate::types::PyDataType; + +/// `ExecutionContext` is able to plan and execute DataFusion plans. +/// It has a powerful optimizer, a physical planner for local execution, and a +/// multi-threaded execution engine to perform the execution. +#[pyclass(unsendable)] +pub(crate) struct ExecutionContext { + ctx: _ExecutionContext, +} + +#[pymethods] +impl ExecutionContext { + #[new] + fn new() -> Self { + ExecutionContext { + ctx: _ExecutionContext::new(), + } + } + + /// Returns a DataFrame whose plan corresponds to the SQL statement. + fn sql(&mut self, query: &str) -> PyResult { + let df = self + .ctx + .sql(query) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + Ok(dataframe::DataFrame::new( + self.ctx.state.clone(), + df.to_logical_plan(), + )) + } + + fn create_dataframe( + &mut self, + partitions: Vec>, + py: Python, + ) -> PyResult { + let partitions: Vec> = partitions + .iter() + .map(|batches| { + batches + .iter() + .map(|batch| to_rust::to_rust_batch(batch.as_ref(py))) + .collect() + }) + .collect::>()?; + + let table = + errors::wrap(MemTable::try_new(partitions[0][0].schema(), partitions))?; + + // generate a random (unique) name for this table + let name = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(10) + .collect::(); + + errors::wrap(self.ctx.register_table(&*name, Arc::new(table)))?; + Ok(dataframe::DataFrame::new( + self.ctx.state.clone(), + errors::wrap(self.ctx.table(&*name))?.to_logical_plan(), + )) + } + + fn register_parquet(&mut self, name: &str, path: &str) -> PyResult<()> { + errors::wrap(self.ctx.register_parquet(name, path))?; + Ok(()) + } + + fn register_udf( + &mut self, + name: &str, + func: PyObject, + args_types: Vec, + return_type: PyDataType, + ) { + let function = functions::create_udf(func, args_types, return_type, name); + + self.ctx.register_udf(function.function); + } + + fn tables(&self) -> HashSet { + self.ctx.tables().unwrap() + } +} diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs new file mode 100644 index 0000000000000..f90a7cf2f0dcf --- /dev/null +++ b/python/src/dataframe.rs @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::{Arc, Mutex}; + +use logical_plan::LogicalPlan; +use pyo3::{prelude::*, types::PyTuple}; +use tokio::runtime::Runtime; + +use datafusion::execution::context::ExecutionContext as _ExecutionContext; +use datafusion::logical_plan::{JoinType, LogicalPlanBuilder}; +use datafusion::physical_plan::collect; +use datafusion::{execution::context::ExecutionContextState, logical_plan}; + +use crate::{errors, to_py}; +use crate::{errors::DataFusionError, expression}; + +/// A DataFrame is a representation of a logical plan and an API to compose statements. +/// Use it to build a plan and `.collect()` to execute the plan and collect the result. +/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. +#[pyclass] +pub(crate) struct DataFrame { + ctx_state: Arc>, + plan: LogicalPlan, +} + +impl DataFrame { + /// creates a new DataFrame + pub fn new(ctx_state: Arc>, plan: LogicalPlan) -> Self { + Self { ctx_state, plan } + } +} + +#[pymethods] +impl DataFrame { + /// Select `expressions` from the existing DataFrame. + #[args(args = "*")] + fn select(&self, args: &PyTuple) -> PyResult { + let expressions = expression::from_tuple(args)?; + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = + errors::wrap(builder.project(expressions.into_iter().map(|e| e.expr)))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Filter according to the `predicate` expression + fn filter(&self, predicate: expression::Expression) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.filter(predicate.expr))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Aggregates using expressions + fn aggregate( + &self, + group_by: Vec, + aggs: Vec, + ) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.aggregate( + group_by.into_iter().map(|e| e.expr), + aggs.into_iter().map(|e| e.expr), + ))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Limits the plan to return at most `count` rows + fn limit(&self, count: usize) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.limit(count))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Executes the plan, returning a list of `RecordBatch`es. 
+ /// Unless some order is specified in the plan, there is no guarantee of the order of the result + fn collect(&self, py: Python) -> PyResult { + let ctx = _ExecutionContext::from(self.ctx_state.clone()); + let plan = ctx + .optimize(&self.plan) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + let plan = ctx + .create_physical_plan(&plan) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + + let rt = Runtime::new().unwrap(); + let batches = py.allow_threads(|| { + rt.block_on(async { + collect(plan) + .await + .map_err(|e| -> errors::DataFusionError { e.into() }) + }) + })?; + to_py::to_py(&batches) + } + + /// Returns the join of two DataFrames `on`. + fn join(&self, right: &DataFrame, on: Vec<&str>, how: &str) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + + let join_type = match how { + "inner" => JoinType::Inner, + "left" => JoinType::Left, + "right" => JoinType::Right, + how => { + return Err(DataFusionError::Common(format!( + "The join type {} does not exist or is not implemented", + how + )) + .into()) + } + }; + + let builder = errors::wrap(builder.join( + &right.plan, + join_type, + on.as_slice(), + on.as_slice(), + ))?; + + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } +} diff --git a/python/src/errors.rs b/python/src/errors.rs new file mode 100644 index 0000000000000..fbe98037a030f --- /dev/null +++ b/python/src/errors.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use core::fmt; + +use datafusion::arrow::error::ArrowError; +use datafusion::error::DataFusionError as InnerDataFusionError; +use pyo3::{exceptions, PyErr}; + +#[derive(Debug)] +pub enum DataFusionError { + ExecutionError(InnerDataFusionError), + ArrowError(ArrowError), + Common(String), +} + +impl fmt::Display for DataFusionError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {:?}", e), + DataFusionError::ArrowError(e) => write!(f, "Arrow error: {:?}", e), + DataFusionError::Common(e) => write!(f, "{}", e), + } + } +} + +impl From for PyErr { + fn from(err: DataFusionError) -> PyErr { + exceptions::PyException::new_err(err.to_string()) + } +} + +impl From for DataFusionError { + fn from(err: InnerDataFusionError) -> DataFusionError { + DataFusionError::ExecutionError(err) + } +} + +impl From for DataFusionError { + fn from(err: ArrowError) -> DataFusionError { + DataFusionError::ArrowError(err) + } +} + +pub(crate) fn wrap(a: Result) -> Result { + Ok(a?) 
+} diff --git a/python/src/expression.rs b/python/src/expression.rs new file mode 100644 index 0000000000000..78ca6d7e598ec --- /dev/null +++ b/python/src/expression.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::{ + basic::CompareOp, prelude::*, types::PyTuple, PyNumberProtocol, PyObjectProtocol, +}; + +use datafusion::logical_plan::Expr as _Expr; +use datafusion::physical_plan::udaf::AggregateUDF as _AggregateUDF; +use datafusion::physical_plan::udf::ScalarUDF as _ScalarUDF; + +/// An expression that can be used on a DataFrame +#[pyclass] +#[derive(Debug, Clone)] +pub(crate) struct Expression { + pub(crate) expr: _Expr, +} + +/// converts a tuple of expressions into a vector of Expressions +pub(crate) fn from_tuple(value: &PyTuple) -> PyResult> { + value + .iter() + .map(|e| e.extract::()) + .collect::>() +} + +#[pyproto] +impl PyNumberProtocol for Expression { + fn __add__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr + rhs.expr, + }) + } + + fn __sub__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr - rhs.expr, + }) + } + + fn __truediv__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr / rhs.expr, + }) + } + + fn __mul__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr * rhs.expr, + }) + } + + fn __and__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr.and(rhs.expr), + }) + } + + fn __or__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr.or(rhs.expr), + }) + } + + fn __invert__(&self) -> PyResult { + Ok(Expression { + expr: self.expr.clone().not(), + }) + } +} + +#[pyproto] +impl PyObjectProtocol for Expression { + fn __richcmp__(&self, other: Expression, op: CompareOp) -> Expression { + match op { + CompareOp::Lt => Expression { + expr: self.expr.clone().lt(other.expr), + }, + CompareOp::Le => Expression { + expr: self.expr.clone().lt_eq(other.expr), + }, + CompareOp::Eq => Expression { + expr: self.expr.clone().eq(other.expr), + }, + CompareOp::Ne => Expression { + expr: self.expr.clone().not_eq(other.expr), + }, + CompareOp::Gt => Expression { + expr: self.expr.clone().gt(other.expr), + }, + CompareOp::Ge => Expression { + expr: self.expr.clone().gt_eq(other.expr), + }, + } + } +} + +#[pymethods] +impl Expression { + /// assign a name to the expression + pub fn alias(&self, name: &str) -> PyResult { + Ok(Expression { + expr: self.expr.clone().alias(name), + }) + } +} + +/// Represents a ScalarUDF +#[pyclass] +#[derive(Debug, Clone)] +pub struct ScalarUDF { + pub(crate) function: _ScalarUDF, +} + +#[pymethods] +impl ScalarUDF { + /// creates a new expression with the call of the udf + #[call] + 
#[args(args = "*")] + fn __call__(&self, args: &PyTuple) -> PyResult { + let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); + + Ok(Expression { + expr: self.function.call(args), + }) + } +} + +/// Represents a AggregateUDF +#[pyclass] +#[derive(Debug, Clone)] +pub struct AggregateUDF { + pub(crate) function: _AggregateUDF, +} + +#[pymethods] +impl AggregateUDF { + /// creates a new expression with the call of the udf + #[call] + #[args(args = "*")] + fn __call__(&self, args: &PyTuple) -> PyResult { + let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); + + Ok(Expression { + expr: self.function.call(args), + }) + } +} diff --git a/python/src/functions.rs b/python/src/functions.rs new file mode 100644 index 0000000000000..68000cb1ecbf8 --- /dev/null +++ b/python/src/functions.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use pyo3::{prelude::*, wrap_pyfunction}; + +use datafusion::logical_plan; + +use crate::udaf; +use crate::udf; +use crate::{expression, types::PyDataType}; + +/// Expression representing a column on the existing plan. 
+#[pyfunction] +#[text_signature = "(name)"] +fn col(name: &str) -> expression::Expression { + expression::Expression { + expr: logical_plan::col(name), + } +} + +/// Expression representing a constant value +#[pyfunction] +#[text_signature = "(value)"] +fn lit(value: i32) -> expression::Expression { + expression::Expression { + expr: logical_plan::lit(value), + } +} + +#[pyfunction] +fn sum(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sum(value.expr), + } +} + +#[pyfunction] +fn avg(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::avg(value.expr), + } +} + +#[pyfunction] +fn min(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::min(value.expr), + } +} + +#[pyfunction] +fn max(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::max(value.expr), + } +} + +#[pyfunction] +fn count(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::count(value.expr), + } +} + +/* +#[pyfunction] +fn concat(value: Vec) -> expression::Expression { + expression::Expression { + expr: logical_plan::concat(value.into_iter().map(|e| e.expr)), + } +} + */ + +pub(crate) fn create_udf( + fun: PyObject, + input_types: Vec, + return_type: PyDataType, + name: &str, +) -> expression::ScalarUDF { + let input_types: Vec = + input_types.iter().map(|d| d.data_type.clone()).collect(); + let return_type = Arc::new(return_type.data_type); + + expression::ScalarUDF { + function: logical_plan::create_udf( + name, + input_types, + return_type, + udf::array_udf(fun), + ), + } +} + +/// Creates a new udf. +#[pyfunction] +fn udf( + fun: PyObject, + input_types: Vec, + return_type: PyDataType, + py: Python, +) -> PyResult { + let name = fun.getattr(py, "__qualname__")?.extract::(py)?; + + Ok(create_udf(fun, input_types, return_type, &name)) +} + +/// Creates a new udf. +#[pyfunction] +fn udaf( + accumulator: PyObject, + input_type: PyDataType, + return_type: PyDataType, + state_type: Vec, + py: Python, +) -> PyResult { + let name = accumulator + .getattr(py, "__qualname__")? 
+ .extract::(py)?; + + let input_type = input_type.data_type; + let return_type = Arc::new(return_type.data_type); + let state_type = Arc::new(state_type.into_iter().map(|t| t.data_type).collect()); + + Ok(expression::AggregateUDF { + function: logical_plan::create_udaf( + &name, + input_type, + return_type, + udaf::array_udaf(accumulator), + state_type, + ), + }) +} + +pub fn init(module: &PyModule) -> PyResult<()> { + module.add_function(wrap_pyfunction!(col, module)?)?; + module.add_function(wrap_pyfunction!(lit, module)?)?; + // see https://github.com/apache/arrow-datafusion/issues/226 + //module.add_function(wrap_pyfunction!(concat, module)?)?; + module.add_function(wrap_pyfunction!(udf, module)?)?; + module.add_function(wrap_pyfunction!(sum, module)?)?; + module.add_function(wrap_pyfunction!(count, module)?)?; + module.add_function(wrap_pyfunction!(min, module)?)?; + module.add_function(wrap_pyfunction!(max, module)?)?; + module.add_function(wrap_pyfunction!(avg, module)?)?; + module.add_function(wrap_pyfunction!(udaf, module)?)?; + Ok(()) +} diff --git a/python/src/lib.rs b/python/src/lib.rs new file mode 100644 index 0000000000000..aecfe9994cd1a --- /dev/null +++ b/python/src/lib.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; + +mod context; +mod dataframe; +mod errors; +mod expression; +mod functions; +mod scalar; +mod to_py; +mod to_rust; +mod types; +mod udaf; +mod udf; + +/// DataFusion. +#[pymodule] +fn datafusion(py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + let functions = PyModule::new(py, "functions")?; + functions::init(functions)?; + m.add_submodule(functions)?; + + Ok(()) +} diff --git a/python/src/scalar.rs b/python/src/scalar.rs new file mode 100644 index 0000000000000..0c562a9403616 --- /dev/null +++ b/python/src/scalar.rs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
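Together, `lib.rs` and `functions.rs` define the Python surface of the new package: the module exports `ExecutionContext`, `DataFrame` and `Expression`, plus a `functions` submodule containing `col`, `lit`, the aggregates, and the `udf`/`udaf` factories. A rough usage sketch of that surface (column names here are illustrative, not taken from the patch):

```python
import pyarrow
import datafusion

f = datafusion.functions
ctx = datafusion.ExecutionContext()

# build a DataFrame from an in-memory RecordBatch; "a" and "b" are made-up column names
batch = pyarrow.RecordBatch.from_arrays(
    [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
    names=["a", "b"],
)
df = ctx.create_dataframe([[batch]])

# expressions come from the functions submodule registered in lib.rs / functions.rs
df = df.aggregate([f.col("a")], [f.max(f.col("b")), f.count(f.col("b"))])
print(df.collect())
```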
+ +use pyo3::prelude::*; + +use datafusion::scalar::ScalarValue as _Scalar; + +use crate::to_rust::to_rust_scalar; + +/// An expression that can be used on a DataFrame +#[derive(Debug, Clone)] +pub(crate) struct Scalar { + pub(crate) scalar: _Scalar, +} + +impl<'source> FromPyObject<'source> for Scalar { + fn extract(ob: &'source PyAny) -> PyResult { + Ok(Self { + scalar: to_rust_scalar(ob)?, + }) + } +} diff --git a/python/src/to_py.rs b/python/src/to_py.rs new file mode 100644 index 0000000000000..deeb9719891a3 --- /dev/null +++ b/python/src/to_py.rs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; +use pyo3::{libc::uintptr_t, PyErr}; + +use std::convert::From; + +use datafusion::arrow::array::ArrayRef; +use datafusion::arrow::record_batch::RecordBatch; + +use crate::errors; + +pub fn to_py_array(array: &ArrayRef, py: Python) -> PyResult { + let (array_pointer, schema_pointer) = + array.to_raw().map_err(errors::DataFusionError::from)?; + + let pa = py.import("pyarrow")?; + + let array = pa.getattr("Array")?.call_method1( + "_import_from_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + Ok(array.to_object(py)) +} + +fn to_py_batch<'a>( + batch: &RecordBatch, + py: Python, + pyarrow: &'a PyModule, +) -> Result { + let mut py_arrays = vec![]; + let mut py_names = vec![]; + + let schema = batch.schema(); + for (array, field) in batch.columns().iter().zip(schema.fields().iter()) { + let array = to_py_array(array, py)?; + + py_arrays.push(array); + py_names.push(field.name()); + } + + let record = pyarrow + .getattr("RecordBatch")? + .call_method1("from_arrays", (py_arrays, py_names))?; + + Ok(PyObject::from(record)) +} + +/// Converts a &[RecordBatch] into a Vec represented in PyArrow +pub fn to_py(batches: &[RecordBatch]) -> PyResult { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + let pyarrow = PyModule::import(py, "pyarrow")?; + let builtins = PyModule::import(py, "builtins")?; + + let mut py_batches = vec![]; + for batch in batches { + py_batches.push(to_py_batch(batch, py, pyarrow)?); + } + let result = builtins.call1("list", (py_batches,))?; + Ok(PyObject::from(result)) +} diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs new file mode 100644 index 0000000000000..d8f2307a49823 --- /dev/null +++ b/python/src/to_rust.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::{ + array::{make_array_from_raw, ArrayRef}, + datatypes::Field, + datatypes::Schema, + ffi, + record_batch::RecordBatch, +}; +use datafusion::scalar::ScalarValue; +use pyo3::{libc::uintptr_t, prelude::*}; + +use crate::{errors, types::PyDataType}; + +/// converts a pyarrow Array into a Rust Array +pub fn to_rust(ob: &PyAny) -> PyResult { + // prepare a pointer to receive the Array struct + let (array_pointer, schema_pointer) = + ffi::ArrowArray::into_raw(unsafe { ffi::ArrowArray::empty() }); + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + ob.call_method1( + "_export_to_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + + let array = unsafe { make_array_from_raw(array_pointer, schema_pointer) } + .map_err(errors::DataFusionError::from)?; + Ok(array) +} + +pub fn to_rust_batch(batch: &PyAny) -> PyResult { + let schema = batch.getattr("schema")?; + let names = schema.getattr("names")?.extract::>()?; + + let fields = names + .iter() + .enumerate() + .map(|(i, name)| { + let field = schema.call_method1("field", (i,))?; + let nullable = field.getattr("nullable")?.extract::()?; + let py_data_type = field.getattr("type")?; + let data_type = py_data_type.extract::()?.data_type; + Ok(Field::new(name, data_type, nullable)) + }) + .collect::>()?; + + let schema = Arc::new(Schema::new(fields)); + + let arrays = (0..names.len()) + .map(|i| { + let array = batch.call_method1("column", (i,))?; + to_rust(array) + }) + .collect::>()?; + + let batch = + RecordBatch::try_new(schema, arrays).map_err(errors::DataFusionError::from)?; + Ok(batch) +} + +/// converts a pyarrow Scalar into a Rust Scalar +pub fn to_rust_scalar(ob: &PyAny) -> PyResult { + let t = ob + .getattr("__class__")? + .getattr("__name__")? 
+ .extract::<&str>()?; + + let p = ob.call_method0("as_py")?; + + Ok(match t { + "Int8Scalar" => ScalarValue::Int8(Some(p.extract::()?)), + "Int16Scalar" => ScalarValue::Int16(Some(p.extract::()?)), + "Int32Scalar" => ScalarValue::Int32(Some(p.extract::()?)), + "Int64Scalar" => ScalarValue::Int64(Some(p.extract::()?)), + "UInt8Scalar" => ScalarValue::UInt8(Some(p.extract::()?)), + "UInt16Scalar" => ScalarValue::UInt16(Some(p.extract::()?)), + "UInt32Scalar" => ScalarValue::UInt32(Some(p.extract::()?)), + "UInt64Scalar" => ScalarValue::UInt64(Some(p.extract::()?)), + "FloatScalar" => ScalarValue::Float32(Some(p.extract::()?)), + "DoubleScalar" => ScalarValue::Float64(Some(p.extract::()?)), + "BooleanScalar" => ScalarValue::Boolean(Some(p.extract::()?)), + "StringScalar" => ScalarValue::Utf8(Some(p.extract::()?)), + "LargeStringScalar" => ScalarValue::LargeUtf8(Some(p.extract::()?)), + other => { + return Err(errors::DataFusionError::Common(format!( + "Type \"{}\"not yet implemented", + other + )) + .into()) + } + }) +} diff --git a/python/src/types.rs b/python/src/types.rs new file mode 100644 index 0000000000000..ffa822e073a89 --- /dev/null +++ b/python/src/types.rs @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
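`to_rust.rs` is the Python-to-Rust half of the zero-copy bridge: arrays and record batches are imported through pyarrow's `_export_to_c`, and `to_rust_scalar` dispatches on the pyarrow scalar class name. The `types.rs` file below then resolves argument and return types from a pyarrow `DataType`'s numeric `id`. On the Python side these are all ordinary pyarrow objects; a small illustration with made-up values:

```python
import pyarrow

# a batch like this is what create_dataframe hands to to_rust_batch; each column
# crosses into Rust via the Arrow C data interface rather than being copied
batch = pyarrow.RecordBatch.from_arrays(
    [pyarrow.array([1.5, None, 3.0]), pyarrow.array([4, 5, 6])],
    names=["value", "count"],
)

# scalars like these are what to_rust_scalar receives, e.g. from a UDAF's evaluate();
# DoubleScalar, Int64Scalar and BooleanScalar all have entries in the match above
scalars = [pyarrow.scalar(1.5), pyarrow.scalar(6), pyarrow.scalar(True)]

# plain pyarrow types are what the udf/udaf factories take; types.rs maps their .id
types = [pyarrow.bool_(), pyarrow.int64(), pyarrow.float64()]
```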
+ +use datafusion::arrow::datatypes::DataType; +use pyo3::{FromPyObject, PyAny, PyResult}; + +use crate::errors; + +/// utility struct to convert PyObj to native DataType +#[derive(Debug, Clone)] +pub struct PyDataType { + pub data_type: DataType, +} + +impl<'source> FromPyObject<'source> for PyDataType { + fn extract(ob: &'source PyAny) -> PyResult { + let id = ob.getattr("id")?.extract::()?; + let data_type = data_type_id(&id)?; + Ok(PyDataType { data_type }) + } +} + +fn data_type_id(id: &i32) -> Result { + // see https://github.com/apache/arrow/blob/3694794bdfd0677b95b8c95681e392512f1c9237/python/pyarrow/includes/libarrow.pxd + // this is not ideal as it does not generalize for non-basic types + // Find a way to get a unique name from the pyarrow.DataType + Ok(match id { + 1 => DataType::Boolean, + 2 => DataType::UInt8, + 3 => DataType::Int8, + 4 => DataType::UInt16, + 5 => DataType::Int16, + 6 => DataType::UInt32, + 7 => DataType::Int32, + 8 => DataType::UInt64, + 9 => DataType::Int64, + + 10 => DataType::Float16, + 11 => DataType::Float32, + 12 => DataType::Float64, + + //13 => DataType::Decimal, + + // 14 => DataType::Date32(), + // 15 => DataType::Date64(), + // 16 => DataType::Timestamp(), + // 17 => DataType::Time32(), + // 18 => DataType::Time64(), + // 19 => DataType::Duration() + 20 => DataType::Binary, + 21 => DataType::Utf8, + 22 => DataType::LargeBinary, + 23 => DataType::LargeUtf8, + + other => { + return Err(errors::DataFusionError::Common(format!( + "The type {} is not valid", + other + ))) + } + }) +} diff --git a/python/src/udaf.rs b/python/src/udaf.rs new file mode 100644 index 0000000000000..3ce223df9a491 --- /dev/null +++ b/python/src/udaf.rs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use pyo3::{prelude::*, types::PyTuple}; + +use datafusion::arrow::array::ArrayRef; + +use datafusion::error::Result; +use datafusion::{ + error::DataFusionError as InnerDataFusionError, physical_plan::Accumulator, + scalar::ScalarValue, +}; + +use crate::scalar::Scalar; +use crate::to_py::to_py_array; +use crate::to_rust::to_rust_scalar; + +#[derive(Debug)] +struct PyAccumulator { + accum: PyObject, +} + +impl PyAccumulator { + fn new(accum: PyObject) -> Self { + Self { accum } + } +} + +impl Accumulator for PyAccumulator { + fn state(&self) -> Result> { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let state = self + .accum + .as_ref(py) + .call_method0("to_scalars") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))? 
+ .extract::>() + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(state.into_iter().map(|v| v.scalar).collect::>()) + } + + fn update(&mut self, _values: &[ScalarValue]) -> Result<()> { + // no need to implement as datafusion does not use it + todo!() + } + + fn merge(&mut self, _states: &[ScalarValue]) -> Result<()> { + // no need to implement as datafusion does not use it + todo!() + } + + fn evaluate(&self) -> Result { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let value = self + .accum + .as_ref(py) + .call_method0("evaluate") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + to_rust_scalar(value) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast args to Pyarrow array + // 2. call function + + // 1. + let py_args = values + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // update accumulator + self.accum + .as_ref(py) + .call_method1("update", py_args) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast states to Pyarrow array + // 2. merge + let state = &states[0]; + + let state = to_py_array(state, py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + // 2. + self.accum + .as_ref(py) + .call_method1("merge", (state,)) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + } +} + +pub fn array_udaf( + accumulator: PyObject, +) -> Arc Result> + Send + Sync> { + Arc::new(move || -> Result> { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let accumulator = accumulator + .call0(py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + Ok(Box::new(PyAccumulator::new(accumulator))) + }) +} diff --git a/python/src/udf.rs b/python/src/udf.rs new file mode 100644 index 0000000000000..7fee71008ef2f --- /dev/null +++ b/python/src/udf.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
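`udaf.rs` adapts a Python object to DataFusion's `Accumulator` trait: `state()` calls the object's `to_scalars`, `update_batch` calls `update` with a pyarrow array, `merge_batch` calls `merge`, and `evaluate` calls `evaluate`, converting the result back through `to_rust_scalar`. A minimal counting accumulator satisfying that protocol might look like this (a sketch in the spirit of the README example, not code from the patch):

```python
import pyarrow
import pyarrow.compute
import datafusion

f = datafusion.functions


class CountAccumulator:
    def __init__(self):
        self._count = 0

    def to_scalars(self):
        # called from PyAccumulator::state(); must return pyarrow scalars
        return [pyarrow.scalar(self._count)]

    def update(self, values: pyarrow.Array) -> None:
        # called from PyAccumulator::update_batch() with the argument column
        self._count += len(values) - values.null_count

    def merge(self, states: pyarrow.Array) -> None:
        # called from PyAccumulator::merge_batch() with other partitions' states
        self._count += pyarrow.compute.sum(states).as_py() or 0

    def evaluate(self):
        # called from PyAccumulator::evaluate(); converted via to_rust_scalar
        return pyarrow.scalar(self._count)


# registration mirrors functions.udaf(accumulator, input_type, return_type, state_type)
count_udaf = f.udaf(CountAccumulator, pyarrow.float64(), pyarrow.int64(), [pyarrow.int64()])
```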
+ +use pyo3::{prelude::*, types::PyTuple}; + +use datafusion::{arrow::array, physical_plan::functions::make_scalar_function}; + +use datafusion::error::DataFusionError; +use datafusion::physical_plan::functions::ScalarFunctionImplementation; + +use crate::to_py::to_py_array; +use crate::to_rust::to_rust; + +/// creates a DataFusion's UDF implementation from a python function that expects pyarrow arrays +/// This is more efficient as it performs a zero-copy of the contents. +pub fn array_udf(func: PyObject) -> ScalarFunctionImplementation { + make_scalar_function( + move |args: &[array::ArrayRef]| -> Result { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast args to Pyarrow arrays + // 2. call function + // 3. cast to arrow::array::Array + + // 1. + let py_args = args + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // 2. + let value = func.as_ref(py).call(py_args, None); + let value = match value { + Ok(n) => Ok(n), + Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), + }?; + + let array = to_rust(value).unwrap(); + Ok(array) + }, + ) +} diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/python/tests/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tests/generic.py b/python/tests/generic.py new file mode 100644 index 0000000000000..7362f0bb29569 --- /dev/null +++ b/python/tests/generic.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
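`udf.rs` is the scalar-function counterpart: the wrapped Python callable receives pyarrow arrays imported zero-copy and must return a pyarrow array of the declared return type. A sketch of both ways such a function can be used, through the expression API and registered for SQL (the function, table and column names here are illustrative):

```python
import pyarrow
import pyarrow.compute
import datafusion

f = datafusion.functions
ctx = datafusion.ExecutionContext()


# the callable works on whole pyarrow arrays, exactly what array_udf passes in
def double(array: pyarrow.Array) -> pyarrow.Array:
    return pyarrow.compute.add(array, array)


# expression API: functions.udf wraps the callable into a ScalarUDF
double_udf = f.udf(double, [pyarrow.int64()], pyarrow.int64())

batch = pyarrow.RecordBatch.from_arrays([pyarrow.array([1, 2, 3])], names=["a"])
df = ctx.create_dataframe([[batch]]).select(double_udf(f.col("a")))

# SQL path: ExecutionContext.register_udf makes it callable as double(a) in queries
ctx.register_udf("double", double, [pyarrow.int64()], pyarrow.int64())
```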
+ +import unittest +import tempfile +import datetime +import os.path +import shutil + +import numpy +import pyarrow +import datafusion + +# used to write parquet files +import pyarrow.parquet + + +def data(): + data = numpy.concatenate( + [numpy.random.normal(0, 0.01, size=50), numpy.random.normal(50, 0.01, size=50)] + ) + return pyarrow.array(data) + + +def data_with_nans(): + data = numpy.random.normal(0, 0.01, size=50) + mask = numpy.random.randint(0, 2, size=50) + data[mask == 0] = numpy.NaN + return data + + +def data_datetime(f): + data = [ + datetime.datetime.now(), + datetime.datetime.now() - datetime.timedelta(days=1), + datetime.datetime.now() + datetime.timedelta(days=1), + ] + return pyarrow.array( + data, type=pyarrow.timestamp(f), mask=numpy.array([False, True, False]) + ) + + +def data_timedelta(f): + data = [ + datetime.timedelta(days=100), + datetime.timedelta(days=1), + datetime.timedelta(seconds=1), + ] + return pyarrow.array( + data, type=pyarrow.duration(f), mask=numpy.array([False, True, False]) + ) + + +def data_binary_other(): + return numpy.array([1, 0, 0], dtype="u4") + + +def write_parquet(path, data): + table = pyarrow.Table.from_arrays([data], names=["a"]) + pyarrow.parquet.write_table(table, path) + return path diff --git a/python/tests/test_df.py b/python/tests/test_df.py new file mode 100644 index 0000000000000..520d4e6a54723 --- /dev/null +++ b/python/tests/test_df.py @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
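The helpers above generate pyarrow arrays and write single-column parquet files; the tests that follow combine them with `register_parquet` and `sql`. The round trip they rely on looks roughly like this (a condensed sketch of what `test_sql.py` below does, with a made-up path):

```python
import pyarrow
import pyarrow.parquet
import datafusion

ctx = datafusion.ExecutionContext()

# write a one-column table named "a", as generic.write_parquet does
table = pyarrow.Table.from_arrays([pyarrow.array([1, 1, 2, 3])], names=["a"])
pyarrow.parquet.write_table(table, "/tmp/a.parquet")  # hypothetical path

# register it and query it with SQL
ctx.register_parquet("t", "/tmp/a.parquet")
result = ctx.sql("SELECT COUNT(a) FROM t").collect()
```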
+ +import unittest + +import pyarrow +import datafusion +f = datafusion.functions + + +class TestCase(unittest.TestCase): + + def _prepare(self): + ctx = datafusion.ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) + + def test_select(self): + df = self._prepare() + + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([5, 7, 9])) + self.assertEqual(result.column(1), pyarrow.array([-3, -3, -3])) + + def test_filter(self): + df = self._prepare() + + df = df \ + .select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) \ + .filter(f.col("a") > f.lit(2)) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([9])) + self.assertEqual(result.column(1), pyarrow.array([-3])) + + def test_limit(self): + df = self._prepare() + + df = df.limit(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(len(result.column(0)), 1) + self.assertEqual(len(result.column(1)), 1) + + def test_udf(self): + df = self._prepare() + + # is_null is a pyarrow function over arrays + udf = f.udf(lambda x: x.is_null(), [pyarrow.int64()], pyarrow.bool_()) + + df = df.select(udf(f.col("a"))) + + self.assertEqual(df.collect()[0].column(0), pyarrow.array([False, False, False])) + + def test_join(self): + ctx = datafusion.ExecutionContext() + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2]), pyarrow.array([8, 10])], + names=["a", "c"], + ) + df1 = ctx.create_dataframe([[batch]]) + + df = df.join(df1, on="a", how="inner") + + # execute and collect the first (and only) batch + batch = df.collect()[0] + + if batch.column(0) == pyarrow.array([1, 2]): + self.assertEqual(batch.column(0), pyarrow.array([1, 2])) + self.assertEqual(batch.column(1), pyarrow.array([8, 10])) + self.assertEqual(batch.column(2), pyarrow.array([4, 5])) + else: + self.assertEqual(batch.column(0), pyarrow.array([2, 1])) + self.assertEqual(batch.column(1), pyarrow.array([10, 8])) + self.assertEqual(batch.column(2), pyarrow.array([5, 4])) diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py new file mode 100644 index 0000000000000..e9047ea6e70c3 --- /dev/null +++ b/python/tests/test_sql.py @@ -0,0 +1,294 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
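`test_join` above has to accept either row order because the join output order is not guaranteed. An alternative, shown here only as a sketch, is to normalize the batch before asserting, using nothing beyond `to_pylist()` and plain-Python sorting (column positions follow the test above: `a`, `c`, `b`):

    def sorted_rows(batch):
        # materialize each column as a Python list, then sort the row tuples by the join key "a"
        columns = [batch.column(i).to_pylist() for i in range(batch.num_columns)]
        return sorted(zip(*columns))

    # with the data used in test_join this yields [(1, 8, 4), (2, 10, 5)]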
+ +import unittest +import tempfile +import datetime +import os.path +import shutil + +import numpy +import pyarrow +import datafusion + +# used to write parquet files +import pyarrow.parquet + +from tests.generic import * + + +class TestCase(unittest.TestCase): + def setUp(self): + # Create a temporary directory + self.test_dir = tempfile.mkdtemp() + numpy.random.seed(1) + + def tearDown(self): + # Remove the directory after the test + shutil.rmtree(self.test_dir) + + def test_no_table(self): + with self.assertRaises(Exception): + datafusion.Context().sql("SELECT a FROM b").collect() + + def test_register(self): + ctx = datafusion.ExecutionContext() + + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) + + ctx.register_parquet("t", path) + + self.assertEqual(ctx.tables(), {"t"}) + + def test_execute(self): + data = [1, 1, 2, 2, 3, 11, 12] + + ctx = datafusion.ExecutionContext() + + # single column, "a" + path = write_parquet( + os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data) + ) + ctx.register_parquet("t", path) + + self.assertEqual(ctx.tables(), {"t"}) + + # count + result = ctx.sql("SELECT COUNT(a) FROM t").collect() + + expected = pyarrow.array([7], pyarrow.uint64()) + expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + self.assertEqual(expected, result) + + # where + expected = pyarrow.array([2], pyarrow.uint64()) + expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + self.assertEqual( + expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect() + ) + + # group by + result = ctx.sql( + "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)" + ).collect() + + result_keys = result[0].to_pydict()["CAST(a AS Int32)"] + result_values = result[0].to_pydict()["COUNT(a)"] + result_keys, result_values = ( + list(t) for t in zip(*sorted(zip(result_keys, result_values))) + ) + + self.assertEqual(result_keys, [1, 2, 3, 11, 12]) + self.assertEqual(result_values, [2, 2, 1, 1, 1]) + + # order by + result = ctx.sql( + "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2" + ).collect() + expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64()) + expected_cast = pyarrow.array([50, 50], pyarrow.int32()) + expected = [ + pyarrow.RecordBatch.from_arrays( + [expected_a, expected_cast], ["a", "CAST(a AS Int32)"] + ) + ] + numpy.testing.assert_equal(expected[0].column(1), expected[0].column(1)) + + def test_cast(self): + """ + Verify that we can cast + """ + ctx = datafusion.ExecutionContext() + + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) + ctx.register_parquet("t", path) + + valid_types = [ + "smallint", + "int", + "bigint", + "float(32)", + "float(64)", + "float", + ] + + select = ", ".join( + [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] + ) + + # can execute, which implies that we can cast + ctx.sql(f"SELECT {select} FROM t").collect() + + def _test_udf(self, udf, args, return_type, array, expected): + ctx = datafusion.ExecutionContext() + + # write to disk + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array) + ctx.register_parquet("t", path) + + ctx.register_udf("udf", udf, args, return_type) + + batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() + + result = batches[0].column(0) + + self.assertEqual(expected, result) + + def test_udf_identity(self): + self._test_udf( + lambda x: x, + [pyarrow.float64()], + pyarrow.float64(), + pyarrow.array([-1.2, None, 1.2]), + pyarrow.array([-1.2, None, 1.2]), + ) + + def test_udf(self): + 
self._test_udf(
+            lambda x: x.is_null(),
+            [pyarrow.float64()],
+            pyarrow.bool_(),
+            pyarrow.array([-1.2, None, 1.2]),
+            pyarrow.array([False, True, False]),
+        )
+
+
+class TestIO(unittest.TestCase):
+    def setUp(self):
+        # Create a temporary directory
+        self.test_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        # Remove the directory after the test
+        shutil.rmtree(self.test_dir)
+
+    def _test_data(self, data):
+        ctx = datafusion.ExecutionContext()
+
+        # write to disk
+        path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data)
+        ctx.register_parquet("t", path)
+
+        batches = ctx.sql("SELECT a AS tt FROM t").collect()
+
+        result = batches[0].column(0)
+
+        numpy.testing.assert_equal(data, result)
+
+    def test_nans(self):
+        self._test_data(data_with_nans())
+
+    def test_utf8(self):
+        array = pyarrow.array(
+            ["a", "b", "c"], pyarrow.utf8(), numpy.array([False, True, False])
+        )
+        self._test_data(array)
+
+    def test_large_utf8(self):
+        array = pyarrow.array(
+            ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False])
+        )
+        self._test_data(array)
+
+    # Error from Arrow
+    @unittest.expectedFailure
+    def test_datetime_s(self):
+        self._test_data(data_datetime("s"))
+
+    # C data interface missing
+    @unittest.expectedFailure
+    def test_datetime_ms(self):
+        self._test_data(data_datetime("ms"))
+
+    # C data interface missing
+    @unittest.expectedFailure
+    def test_datetime_us(self):
+        self._test_data(data_datetime("us"))
+
+    # Not writable to parquet
+    @unittest.expectedFailure
+    def test_datetime_ns(self):
+        self._test_data(data_datetime("ns"))
+
+    # Not writable to parquet
+    @unittest.expectedFailure
+    def test_timedelta_s(self):
+        self._test_data(data_timedelta("s"))
+
+    # Not writable to parquet
+    @unittest.expectedFailure
+    def test_timedelta_ms(self):
+        self._test_data(data_timedelta("ms"))
+
+    # Not writable to parquet
+    @unittest.expectedFailure
+    def test_timedelta_us(self):
+        self._test_data(data_timedelta("us"))
+
+    # Not writable to parquet
+    @unittest.expectedFailure
+    def test_timedelta_ns(self):
+        self._test_data(data_timedelta("ns"))
+
+    def test_date32(self):
+        array = pyarrow.array(
+            [
+                datetime.date(2000, 1, 1),
+                datetime.date(1980, 1, 1),
+                datetime.date(2030, 1, 1),
+            ],
+            pyarrow.date32(),
+            numpy.array([False, True, False]),
+        )
+        self._test_data(array)
+
+    def test_binary_variable(self):
+        array = pyarrow.array(
+            [b"1", b"2", b"3"], pyarrow.binary(), numpy.array([False, True, False])
+        )
+        self._test_data(array)
+
+    # C data interface missing
+    @unittest.expectedFailure
+    def test_binary_fixed(self):
+        array = pyarrow.array(
+            [b"1111", b"2222", b"3333"],
+            pyarrow.binary(4),
+            numpy.array([False, True, False]),
+        )
+        self._test_data(array)
+
+    def test_large_binary(self):
+        array = pyarrow.array(
+            [b"1111", b"2222", b"3333"],
+            pyarrow.large_binary(),
+            numpy.array([False, True, False]),
+        )
+        self._test_data(array)
+
+    def test_binary_other(self):
+        self._test_data(data_binary_other())
+
+    def test_bool(self):
+        array = pyarrow.array(
+            [False, True, True], None, numpy.array([False, True, False])
+        )
+        self._test_data(array)
+
+    def test_u32(self):
+        array = pyarrow.array([0, 1, 2], None, numpy.array([False, True, False]))
+        self._test_data(array)
diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py
new file mode 100644
index 0000000000000..ffd235e285f80
--- /dev/null
+++ b/python/tests/test_udaf.py
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest + +import pyarrow +import pyarrow.compute +import datafusion + +f = datafusion.functions + + +class Accumulator: + """ + Interface of a user-defined accumulation. + """ + + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar( + self._sum.as_py() + pyarrow.compute.sum(values).as_py() + ) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar( + self._sum.as_py() + pyarrow.compute.sum(states).as_py() + ) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + +class TestCase(unittest.TestCase): + def _prepare(self): + ctx = datafusion.ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) + + def test_aggregate(self): + df = self._prepare() + + udaf = f.udaf( + Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] + ) + + df = df.aggregate([], [udaf(f.col("a"))]) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([1.0 + 2.0 + 3.0])) + + def test_group_by(self): + df = self._prepare() + + udaf = f.udaf( + Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] + ) + + df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(1), pyarrow.array([1.0 + 2.0, 3.0])) From d0af907652aa8773d1de21dfd2f15bbcf6f50ce3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 4 May 2021 08:51:44 -0600 Subject: [PATCH 061/329] Revert "Add datafusion-python (#69)" (#257) This reverts commit 46bde0bd148aacf1677a575cb9ddbc154b6c4fb3. 
--- .github/workflows/python_build.yml | 89 --------- .github/workflows/python_test.yaml | 58 ------ Cargo.toml | 4 +- dev/release/rat_exclude_files.txt | 1 - python/.cargo/config | 22 --- python/.dockerignore | 19 -- python/.gitignore | 20 -- python/Cargo.toml | 57 ------ python/README.md | 146 -------------- python/pyproject.toml | 20 -- python/rust-toolchain | 1 - python/src/context.rs | 115 ----------- python/src/dataframe.rs | 161 ---------------- python/src/errors.rs | 61 ------ python/src/expression.rs | 162 ---------------- python/src/functions.rs | 165 ---------------- python/src/lib.rs | 44 ----- python/src/scalar.rs | 36 ---- python/src/to_py.rs | 77 -------- python/src/to_rust.rs | 111 ----------- python/src/types.rs | 76 -------- python/src/udaf.rs | 147 --------------- python/src/udf.rs | 62 ------ python/tests/__init__.py | 16 -- python/tests/generic.py | 75 -------- python/tests/test_df.py | 115 ----------- python/tests/test_sql.py | 294 ----------------------------- python/tests/test_udaf.py | 91 --------- 28 files changed, 1 insertion(+), 2244 deletions(-) delete mode 100644 .github/workflows/python_build.yml delete mode 100644 .github/workflows/python_test.yaml delete mode 100644 python/.cargo/config delete mode 100644 python/.dockerignore delete mode 100644 python/.gitignore delete mode 100644 python/Cargo.toml delete mode 100644 python/README.md delete mode 100644 python/pyproject.toml delete mode 100644 python/rust-toolchain delete mode 100644 python/src/context.rs delete mode 100644 python/src/dataframe.rs delete mode 100644 python/src/errors.rs delete mode 100644 python/src/expression.rs delete mode 100644 python/src/functions.rs delete mode 100644 python/src/lib.rs delete mode 100644 python/src/scalar.rs delete mode 100644 python/src/to_py.rs delete mode 100644 python/src/to_rust.rs delete mode 100644 python/src/types.rs delete mode 100644 python/src/udaf.rs delete mode 100644 python/src/udf.rs delete mode 100644 python/tests/__init__.py delete mode 100644 python/tests/generic.py delete mode 100644 python/tests/test_df.py delete mode 100644 python/tests/test_sql.py delete mode 100644 python/tests/test_udaf.py diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml deleted file mode 100644 index c86bb81581a71..0000000000000 --- a/.github/workflows/python_build.yml +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Build -on: - push: - tags: - - v* - -jobs: - build-python-mac-win: - name: Mac/Win - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: [3.6, 3.7, 3.8] - os: [macos-latest, windows-latest] - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly-2021-01-06 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install maturin - - - name: Build Python package - run: cd python && maturin build --release --no-sdist --strip --interpreter python${{matrix.python_version}} - - - name: List wheels - if: matrix.os == 'windows-latest' - run: dir python/target\wheels\ - - - name: List wheels - if: matrix.os != 'windows-latest' - run: find ./python/target/wheels/ - - - name: Archive wheels - uses: actions/upload-artifact@v2 - with: - name: dist - path: python/target/wheels/* - - build-manylinux: - name: Manylinux - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Build wheels - run: docker run --rm -v $(pwd):/io konstin2/maturin build --release --manylinux - - name: Archive wheels - uses: actions/upload-artifact@v2 - with: - name: dist - path: python/target/wheels/* - - release: - name: Publish in PyPI - needs: [build-manylinux, build-python-mac-win] - runs-on: ubuntu-latest - steps: - - uses: actions/download-artifact@v2 - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml deleted file mode 100644 index 3b2111b59d49d..0000000000000 --- a/.github/workflows/python_test.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Python test -on: [push, pull_request] - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Setup Rust toolchain - run: | - rustup toolchain install nightly-2021-01-06 - rustup default nightly-2021-01-06 - rustup component add rustfmt - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - key: cargo-maturin-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - key: target-maturin-cache- - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests - run: | - cd python/ - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - python -m venv venv - source venv/bin/activate - - pip install maturin==0.10.4 toml==0.10.1 pyarrow==4.0.0 - maturin develop - - python -m unittest discover tests diff --git a/Cargo.toml b/Cargo.toml index 9795cb68b4456..fa36a0c0fed7c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,4 @@ members = [ "ballista/rust/core", "ballista/rust/executor", "ballista/rust/scheduler", -] - -exclude = ["python"] +] \ No newline at end of file diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 6126699bbc1fa..b94c0ea1d61a6 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -104,4 +104,3 @@ rust-toolchain benchmarks/queries/q*.sql ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock -python/rust-toolchain diff --git a/python/.cargo/config b/python/.cargo/config deleted file mode 100644 index 0b24f30cf908a..0000000000000 --- a/python/.cargo/config +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] diff --git a/python/.dockerignore b/python/.dockerignore deleted file mode 100644 index 08c131c2e7d60..0000000000000 --- a/python/.dockerignore +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -target -venv diff --git a/python/.gitignore b/python/.gitignore deleted file mode 100644 index 48fe4dbe52dde..0000000000000 --- a/python/.gitignore +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -/target -Cargo.lock -venv diff --git a/python/Cargo.toml b/python/Cargo.toml deleted file mode 100644 index 070720554f0ed..0000000000000 --- a/python/Cargo.toml +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -[package] -name = "datafusion" -version = "0.2.1" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" -authors = ["Apache Arrow "] -description = "Build and run queries against data" -readme = "README.md" -license = "Apache-2.0" -edition = "2018" - -[dependencies] -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } -rand = "0.7" -pyo3 = { version = "0.12.1", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" } - -[lib] -name = "datafusion" -crate-type = ["cdylib"] - -[package.metadata.maturin] -requires-dist = ["pyarrow>=1"] - -classifier = [ - "Development Status :: 2 - Pre-Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "License :: OSI Approved", - "Operating System :: MacOS", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python", - "Programming Language :: Rust", -] diff --git a/python/README.md b/python/README.md deleted file mode 100644 index 1859fca9811c0..0000000000000 --- a/python/README.md +++ /dev/null @@ -1,146 +0,0 @@ - - -## DataFusion in Python - -This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion). - -Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. - -It also allows you to use UDFs and UDAFs for complex operations. - -The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. - -Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which makes strong assumptions about thread safety and lack of memory leaks. - -Technically, zero-copy is achieved via the [c data interface](https://arrow.apache.org/docs/format/CDataInterface.html). - -## How to use it - -Simple usage: - -```python -import datafusion -import pyarrow - -# an alias -f = datafusion.functions - -# create a context -ctx = datafusion.ExecutionContext() - -# create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], -) -df = ctx.create_dataframe([[batch]]) - -# create a new statement -df = df.select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), -) - -# execute and collect the first (and only) batch -result = df.collect()[0] - -assert result.column(0) == pyarrow.array([5, 7, 9]) -assert result.column(1) == pyarrow.array([-3, -3, -3]) -``` - -### UDFs - -```python -def is_null(array: pyarrow.Array) -> pyarrow.Array: - return array.is_null() - -udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) - -df = df.select(udf(f.col("a"))) -``` - -### UDAF - -```python -import pyarrow -import pyarrow.compute - - -class Accumulator: - """ - Interface of a user-defined accumulation. 
- """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) - - def to_scalars(self) -> [pyarrow.Scalar]: - return [self._sum] - - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) - - def merge(self, states: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) - - def evaluate(self) -> pyarrow.Scalar: - return self._sum - - -df = ... - -udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) - -df = df.aggregate( - [], - [udaf(f.col("a"))] -) -``` - -## How to install - -```bash -pip install datafusion -``` - -## How to develop - -This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). - -Bootstrap: - -```bash -# fetch this repo -git clone git@github.com:apache/arrow-datafusion.git - -cd arrow-datafusion/python - -# prepare development environment (used to build wheel / install in development) -python3 -m venv venv -pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0 -``` - -Whenever rust code changes (your changes or via git pull): - -```bash -venv/bin/maturin develop -venv/bin/python -m unittest discover tests -``` diff --git a/python/pyproject.toml b/python/pyproject.toml deleted file mode 100644 index 27480690e06cc..0000000000000 --- a/python/pyproject.toml +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[build-system] -requires = ["maturin"] -build-backend = "maturin" diff --git a/python/rust-toolchain b/python/rust-toolchain deleted file mode 100644 index 9d0cf79d367d6..0000000000000 --- a/python/rust-toolchain +++ /dev/null @@ -1 +0,0 @@ -nightly-2021-01-06 diff --git a/python/src/context.rs b/python/src/context.rs deleted file mode 100644 index 14ef0f7321f15..0000000000000 --- a/python/src/context.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::{collections::HashSet, sync::Arc}; - -use rand::distributions::Alphanumeric; -use rand::Rng; - -use pyo3::prelude::*; - -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::datasource::MemTable; -use datafusion::execution::context::ExecutionContext as _ExecutionContext; - -use crate::dataframe; -use crate::errors; -use crate::functions; -use crate::to_rust; -use crate::types::PyDataType; - -/// `ExecutionContext` is able to plan and execute DataFusion plans. -/// It has a powerful optimizer, a physical planner for local execution, and a -/// multi-threaded execution engine to perform the execution. -#[pyclass(unsendable)] -pub(crate) struct ExecutionContext { - ctx: _ExecutionContext, -} - -#[pymethods] -impl ExecutionContext { - #[new] - fn new() -> Self { - ExecutionContext { - ctx: _ExecutionContext::new(), - } - } - - /// Returns a DataFrame whose plan corresponds to the SQL statement. - fn sql(&mut self, query: &str) -> PyResult { - let df = self - .ctx - .sql(query) - .map_err(|e| -> errors::DataFusionError { e.into() })?; - Ok(dataframe::DataFrame::new( - self.ctx.state.clone(), - df.to_logical_plan(), - )) - } - - fn create_dataframe( - &mut self, - partitions: Vec>, - py: Python, - ) -> PyResult { - let partitions: Vec> = partitions - .iter() - .map(|batches| { - batches - .iter() - .map(|batch| to_rust::to_rust_batch(batch.as_ref(py))) - .collect() - }) - .collect::>()?; - - let table = - errors::wrap(MemTable::try_new(partitions[0][0].schema(), partitions))?; - - // generate a random (unique) name for this table - let name = rand::thread_rng() - .sample_iter(&Alphanumeric) - .take(10) - .collect::(); - - errors::wrap(self.ctx.register_table(&*name, Arc::new(table)))?; - Ok(dataframe::DataFrame::new( - self.ctx.state.clone(), - errors::wrap(self.ctx.table(&*name))?.to_logical_plan(), - )) - } - - fn register_parquet(&mut self, name: &str, path: &str) -> PyResult<()> { - errors::wrap(self.ctx.register_parquet(name, path))?; - Ok(()) - } - - fn register_udf( - &mut self, - name: &str, - func: PyObject, - args_types: Vec, - return_type: PyDataType, - ) { - let function = functions::create_udf(func, args_types, return_type, name); - - self.ctx.register_udf(function.function); - } - - fn tables(&self) -> HashSet { - self.ctx.tables().unwrap() - } -} diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs deleted file mode 100644 index f90a7cf2f0dcf..0000000000000 --- a/python/src/dataframe.rs +++ /dev/null @@ -1,161 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::{Arc, Mutex}; - -use logical_plan::LogicalPlan; -use pyo3::{prelude::*, types::PyTuple}; -use tokio::runtime::Runtime; - -use datafusion::execution::context::ExecutionContext as _ExecutionContext; -use datafusion::logical_plan::{JoinType, LogicalPlanBuilder}; -use datafusion::physical_plan::collect; -use datafusion::{execution::context::ExecutionContextState, logical_plan}; - -use crate::{errors, to_py}; -use crate::{errors::DataFusionError, expression}; - -/// A DataFrame is a representation of a logical plan and an API to compose statements. -/// Use it to build a plan and `.collect()` to execute the plan and collect the result. -/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. -#[pyclass] -pub(crate) struct DataFrame { - ctx_state: Arc>, - plan: LogicalPlan, -} - -impl DataFrame { - /// creates a new DataFrame - pub fn new(ctx_state: Arc>, plan: LogicalPlan) -> Self { - Self { ctx_state, plan } - } -} - -#[pymethods] -impl DataFrame { - /// Select `expressions` from the existing DataFrame. - #[args(args = "*")] - fn select(&self, args: &PyTuple) -> PyResult { - let expressions = expression::from_tuple(args)?; - let builder = LogicalPlanBuilder::from(&self.plan); - let builder = - errors::wrap(builder.project(expressions.into_iter().map(|e| e.expr)))?; - let plan = errors::wrap(builder.build())?; - - Ok(DataFrame { - ctx_state: self.ctx_state.clone(), - plan, - }) - } - - /// Filter according to the `predicate` expression - fn filter(&self, predicate: expression::Expression) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); - let builder = errors::wrap(builder.filter(predicate.expr))?; - let plan = errors::wrap(builder.build())?; - - Ok(DataFrame { - ctx_state: self.ctx_state.clone(), - plan, - }) - } - - /// Aggregates using expressions - fn aggregate( - &self, - group_by: Vec, - aggs: Vec, - ) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); - let builder = errors::wrap(builder.aggregate( - group_by.into_iter().map(|e| e.expr), - aggs.into_iter().map(|e| e.expr), - ))?; - let plan = errors::wrap(builder.build())?; - - Ok(DataFrame { - ctx_state: self.ctx_state.clone(), - plan, - }) - } - - /// Limits the plan to return at most `count` rows - fn limit(&self, count: usize) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); - let builder = errors::wrap(builder.limit(count))?; - let plan = errors::wrap(builder.build())?; - - Ok(DataFrame { - ctx_state: self.ctx_state.clone(), - plan, - }) - } - - /// Executes the plan, returning a list of `RecordBatch`es. 
- /// Unless some order is specified in the plan, there is no guarantee of the order of the result - fn collect(&self, py: Python) -> PyResult { - let ctx = _ExecutionContext::from(self.ctx_state.clone()); - let plan = ctx - .optimize(&self.plan) - .map_err(|e| -> errors::DataFusionError { e.into() })?; - let plan = ctx - .create_physical_plan(&plan) - .map_err(|e| -> errors::DataFusionError { e.into() })?; - - let rt = Runtime::new().unwrap(); - let batches = py.allow_threads(|| { - rt.block_on(async { - collect(plan) - .await - .map_err(|e| -> errors::DataFusionError { e.into() }) - }) - })?; - to_py::to_py(&batches) - } - - /// Returns the join of two DataFrames `on`. - fn join(&self, right: &DataFrame, on: Vec<&str>, how: &str) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); - - let join_type = match how { - "inner" => JoinType::Inner, - "left" => JoinType::Left, - "right" => JoinType::Right, - how => { - return Err(DataFusionError::Common(format!( - "The join type {} does not exist or is not implemented", - how - )) - .into()) - } - }; - - let builder = errors::wrap(builder.join( - &right.plan, - join_type, - on.as_slice(), - on.as_slice(), - ))?; - - let plan = errors::wrap(builder.build())?; - - Ok(DataFrame { - ctx_state: self.ctx_state.clone(), - plan, - }) - } -} diff --git a/python/src/errors.rs b/python/src/errors.rs deleted file mode 100644 index fbe98037a030f..0000000000000 --- a/python/src/errors.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use core::fmt; - -use datafusion::arrow::error::ArrowError; -use datafusion::error::DataFusionError as InnerDataFusionError; -use pyo3::{exceptions, PyErr}; - -#[derive(Debug)] -pub enum DataFusionError { - ExecutionError(InnerDataFusionError), - ArrowError(ArrowError), - Common(String), -} - -impl fmt::Display for DataFusionError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {:?}", e), - DataFusionError::ArrowError(e) => write!(f, "Arrow error: {:?}", e), - DataFusionError::Common(e) => write!(f, "{}", e), - } - } -} - -impl From for PyErr { - fn from(err: DataFusionError) -> PyErr { - exceptions::PyException::new_err(err.to_string()) - } -} - -impl From for DataFusionError { - fn from(err: InnerDataFusionError) -> DataFusionError { - DataFusionError::ExecutionError(err) - } -} - -impl From for DataFusionError { - fn from(err: ArrowError) -> DataFusionError { - DataFusionError::ArrowError(err) - } -} - -pub(crate) fn wrap(a: Result) -> Result { - Ok(a?) 
-} diff --git a/python/src/expression.rs b/python/src/expression.rs deleted file mode 100644 index 78ca6d7e598ec..0000000000000 --- a/python/src/expression.rs +++ /dev/null @@ -1,162 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use pyo3::{ - basic::CompareOp, prelude::*, types::PyTuple, PyNumberProtocol, PyObjectProtocol, -}; - -use datafusion::logical_plan::Expr as _Expr; -use datafusion::physical_plan::udaf::AggregateUDF as _AggregateUDF; -use datafusion::physical_plan::udf::ScalarUDF as _ScalarUDF; - -/// An expression that can be used on a DataFrame -#[pyclass] -#[derive(Debug, Clone)] -pub(crate) struct Expression { - pub(crate) expr: _Expr, -} - -/// converts a tuple of expressions into a vector of Expressions -pub(crate) fn from_tuple(value: &PyTuple) -> PyResult> { - value - .iter() - .map(|e| e.extract::()) - .collect::>() -} - -#[pyproto] -impl PyNumberProtocol for Expression { - fn __add__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr + rhs.expr, - }) - } - - fn __sub__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr - rhs.expr, - }) - } - - fn __truediv__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr / rhs.expr, - }) - } - - fn __mul__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr * rhs.expr, - }) - } - - fn __and__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr.and(rhs.expr), - }) - } - - fn __or__(lhs: Expression, rhs: Expression) -> PyResult { - Ok(Expression { - expr: lhs.expr.or(rhs.expr), - }) - } - - fn __invert__(&self) -> PyResult { - Ok(Expression { - expr: self.expr.clone().not(), - }) - } -} - -#[pyproto] -impl PyObjectProtocol for Expression { - fn __richcmp__(&self, other: Expression, op: CompareOp) -> Expression { - match op { - CompareOp::Lt => Expression { - expr: self.expr.clone().lt(other.expr), - }, - CompareOp::Le => Expression { - expr: self.expr.clone().lt_eq(other.expr), - }, - CompareOp::Eq => Expression { - expr: self.expr.clone().eq(other.expr), - }, - CompareOp::Ne => Expression { - expr: self.expr.clone().not_eq(other.expr), - }, - CompareOp::Gt => Expression { - expr: self.expr.clone().gt(other.expr), - }, - CompareOp::Ge => Expression { - expr: self.expr.clone().gt_eq(other.expr), - }, - } - } -} - -#[pymethods] -impl Expression { - /// assign a name to the expression - pub fn alias(&self, name: &str) -> PyResult { - Ok(Expression { - expr: self.expr.clone().alias(name), - }) - } -} - -/// Represents a ScalarUDF -#[pyclass] -#[derive(Debug, Clone)] -pub struct ScalarUDF { - pub(crate) function: _ScalarUDF, -} - -#[pymethods] -impl ScalarUDF { - /// creates a new expression with the call of the udf - #[call] 
- #[args(args = "*")] - fn __call__(&self, args: &PyTuple) -> PyResult { - let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); - - Ok(Expression { - expr: self.function.call(args), - }) - } -} - -/// Represents a AggregateUDF -#[pyclass] -#[derive(Debug, Clone)] -pub struct AggregateUDF { - pub(crate) function: _AggregateUDF, -} - -#[pymethods] -impl AggregateUDF { - /// creates a new expression with the call of the udf - #[call] - #[args(args = "*")] - fn __call__(&self, args: &PyTuple) -> PyResult { - let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); - - Ok(Expression { - expr: self.function.call(args), - }) - } -} diff --git a/python/src/functions.rs b/python/src/functions.rs deleted file mode 100644 index 68000cb1ecbf8..0000000000000 --- a/python/src/functions.rs +++ /dev/null @@ -1,165 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use datafusion::arrow::datatypes::DataType; -use pyo3::{prelude::*, wrap_pyfunction}; - -use datafusion::logical_plan; - -use crate::udaf; -use crate::udf; -use crate::{expression, types::PyDataType}; - -/// Expression representing a column on the existing plan. 
-#[pyfunction] -#[text_signature = "(name)"] -fn col(name: &str) -> expression::Expression { - expression::Expression { - expr: logical_plan::col(name), - } -} - -/// Expression representing a constant value -#[pyfunction] -#[text_signature = "(value)"] -fn lit(value: i32) -> expression::Expression { - expression::Expression { - expr: logical_plan::lit(value), - } -} - -#[pyfunction] -fn sum(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sum(value.expr), - } -} - -#[pyfunction] -fn avg(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::avg(value.expr), - } -} - -#[pyfunction] -fn min(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::min(value.expr), - } -} - -#[pyfunction] -fn max(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::max(value.expr), - } -} - -#[pyfunction] -fn count(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::count(value.expr), - } -} - -/* -#[pyfunction] -fn concat(value: Vec) -> expression::Expression { - expression::Expression { - expr: logical_plan::concat(value.into_iter().map(|e| e.expr)), - } -} - */ - -pub(crate) fn create_udf( - fun: PyObject, - input_types: Vec, - return_type: PyDataType, - name: &str, -) -> expression::ScalarUDF { - let input_types: Vec = - input_types.iter().map(|d| d.data_type.clone()).collect(); - let return_type = Arc::new(return_type.data_type); - - expression::ScalarUDF { - function: logical_plan::create_udf( - name, - input_types, - return_type, - udf::array_udf(fun), - ), - } -} - -/// Creates a new udf. -#[pyfunction] -fn udf( - fun: PyObject, - input_types: Vec, - return_type: PyDataType, - py: Python, -) -> PyResult { - let name = fun.getattr(py, "__qualname__")?.extract::(py)?; - - Ok(create_udf(fun, input_types, return_type, &name)) -} - -/// Creates a new udf. -#[pyfunction] -fn udaf( - accumulator: PyObject, - input_type: PyDataType, - return_type: PyDataType, - state_type: Vec, - py: Python, -) -> PyResult { - let name = accumulator - .getattr(py, "__qualname__")? 
- .extract::(py)?; - - let input_type = input_type.data_type; - let return_type = Arc::new(return_type.data_type); - let state_type = Arc::new(state_type.into_iter().map(|t| t.data_type).collect()); - - Ok(expression::AggregateUDF { - function: logical_plan::create_udaf( - &name, - input_type, - return_type, - udaf::array_udaf(accumulator), - state_type, - ), - }) -} - -pub fn init(module: &PyModule) -> PyResult<()> { - module.add_function(wrap_pyfunction!(col, module)?)?; - module.add_function(wrap_pyfunction!(lit, module)?)?; - // see https://github.com/apache/arrow-datafusion/issues/226 - //module.add_function(wrap_pyfunction!(concat, module)?)?; - module.add_function(wrap_pyfunction!(udf, module)?)?; - module.add_function(wrap_pyfunction!(sum, module)?)?; - module.add_function(wrap_pyfunction!(count, module)?)?; - module.add_function(wrap_pyfunction!(min, module)?)?; - module.add_function(wrap_pyfunction!(max, module)?)?; - module.add_function(wrap_pyfunction!(avg, module)?)?; - module.add_function(wrap_pyfunction!(udaf, module)?)?; - Ok(()) -} diff --git a/python/src/lib.rs b/python/src/lib.rs deleted file mode 100644 index aecfe9994cd1a..0000000000000 --- a/python/src/lib.rs +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use pyo3::prelude::*; - -mod context; -mod dataframe; -mod errors; -mod expression; -mod functions; -mod scalar; -mod to_py; -mod to_rust; -mod types; -mod udaf; -mod udf; - -/// DataFusion. -#[pymodule] -fn datafusion(py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - - let functions = PyModule::new(py, "functions")?; - functions::init(functions)?; - m.add_submodule(functions)?; - - Ok(()) -} diff --git a/python/src/scalar.rs b/python/src/scalar.rs deleted file mode 100644 index 0c562a9403616..0000000000000 --- a/python/src/scalar.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use pyo3::prelude::*; - -use datafusion::scalar::ScalarValue as _Scalar; - -use crate::to_rust::to_rust_scalar; - -/// An expression that can be used on a DataFrame -#[derive(Debug, Clone)] -pub(crate) struct Scalar { - pub(crate) scalar: _Scalar, -} - -impl<'source> FromPyObject<'source> for Scalar { - fn extract(ob: &'source PyAny) -> PyResult { - Ok(Self { - scalar: to_rust_scalar(ob)?, - }) - } -} diff --git a/python/src/to_py.rs b/python/src/to_py.rs deleted file mode 100644 index deeb9719891a3..0000000000000 --- a/python/src/to_py.rs +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use pyo3::prelude::*; -use pyo3::{libc::uintptr_t, PyErr}; - -use std::convert::From; - -use datafusion::arrow::array::ArrayRef; -use datafusion::arrow::record_batch::RecordBatch; - -use crate::errors; - -pub fn to_py_array(array: &ArrayRef, py: Python) -> PyResult { - let (array_pointer, schema_pointer) = - array.to_raw().map_err(errors::DataFusionError::from)?; - - let pa = py.import("pyarrow")?; - - let array = pa.getattr("Array")?.call_method1( - "_import_from_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - Ok(array.to_object(py)) -} - -fn to_py_batch<'a>( - batch: &RecordBatch, - py: Python, - pyarrow: &'a PyModule, -) -> Result { - let mut py_arrays = vec![]; - let mut py_names = vec![]; - - let schema = batch.schema(); - for (array, field) in batch.columns().iter().zip(schema.fields().iter()) { - let array = to_py_array(array, py)?; - - py_arrays.push(array); - py_names.push(field.name()); - } - - let record = pyarrow - .getattr("RecordBatch")? - .call_method1("from_arrays", (py_arrays, py_names))?; - - Ok(PyObject::from(record)) -} - -/// Converts a &[RecordBatch] into a Vec represented in PyArrow -pub fn to_py(batches: &[RecordBatch]) -> PyResult { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - let pyarrow = PyModule::import(py, "pyarrow")?; - let builtins = PyModule::import(py, "builtins")?; - - let mut py_batches = vec![]; - for batch in batches { - py_batches.push(to_py_batch(batch, py, pyarrow)?); - } - let result = builtins.call1("list", (py_batches,))?; - Ok(PyObject::from(result)) -} diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs deleted file mode 100644 index d8f2307a49823..0000000000000 --- a/python/src/to_rust.rs +++ /dev/null @@ -1,111 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use datafusion::arrow::{ - array::{make_array_from_raw, ArrayRef}, - datatypes::Field, - datatypes::Schema, - ffi, - record_batch::RecordBatch, -}; -use datafusion::scalar::ScalarValue; -use pyo3::{libc::uintptr_t, prelude::*}; - -use crate::{errors, types::PyDataType}; - -/// converts a pyarrow Array into a Rust Array -pub fn to_rust(ob: &PyAny) -> PyResult { - // prepare a pointer to receive the Array struct - let (array_pointer, schema_pointer) = - ffi::ArrowArray::into_raw(unsafe { ffi::ArrowArray::empty() }); - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds - ob.call_method1( - "_export_to_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - - let array = unsafe { make_array_from_raw(array_pointer, schema_pointer) } - .map_err(errors::DataFusionError::from)?; - Ok(array) -} - -pub fn to_rust_batch(batch: &PyAny) -> PyResult { - let schema = batch.getattr("schema")?; - let names = schema.getattr("names")?.extract::>()?; - - let fields = names - .iter() - .enumerate() - .map(|(i, name)| { - let field = schema.call_method1("field", (i,))?; - let nullable = field.getattr("nullable")?.extract::()?; - let py_data_type = field.getattr("type")?; - let data_type = py_data_type.extract::()?.data_type; - Ok(Field::new(name, data_type, nullable)) - }) - .collect::>()?; - - let schema = Arc::new(Schema::new(fields)); - - let arrays = (0..names.len()) - .map(|i| { - let array = batch.call_method1("column", (i,))?; - to_rust(array) - }) - .collect::>()?; - - let batch = - RecordBatch::try_new(schema, arrays).map_err(errors::DataFusionError::from)?; - Ok(batch) -} - -/// converts a pyarrow Scalar into a Rust Scalar -pub fn to_rust_scalar(ob: &PyAny) -> PyResult { - let t = ob - .getattr("__class__")? - .getattr("__name__")? 
-        .extract::<&str>()?;
-
-    let p = ob.call_method0("as_py")?;
-
-    Ok(match t {
-        "Int8Scalar" => ScalarValue::Int8(Some(p.extract::<i8>()?)),
-        "Int16Scalar" => ScalarValue::Int16(Some(p.extract::<i16>()?)),
-        "Int32Scalar" => ScalarValue::Int32(Some(p.extract::<i32>()?)),
-        "Int64Scalar" => ScalarValue::Int64(Some(p.extract::<i64>()?)),
-        "UInt8Scalar" => ScalarValue::UInt8(Some(p.extract::<u8>()?)),
-        "UInt16Scalar" => ScalarValue::UInt16(Some(p.extract::<u16>()?)),
-        "UInt32Scalar" => ScalarValue::UInt32(Some(p.extract::<u32>()?)),
-        "UInt64Scalar" => ScalarValue::UInt64(Some(p.extract::<u64>()?)),
-        "FloatScalar" => ScalarValue::Float32(Some(p.extract::<f32>()?)),
-        "DoubleScalar" => ScalarValue::Float64(Some(p.extract::<f64>()?)),
-        "BooleanScalar" => ScalarValue::Boolean(Some(p.extract::<bool>()?)),
-        "StringScalar" => ScalarValue::Utf8(Some(p.extract::<String>()?)),
-        "LargeStringScalar" => ScalarValue::LargeUtf8(Some(p.extract::<String>()?)),
-        other => {
-            return Err(errors::DataFusionError::Common(format!(
-                "Type \"{}\"not yet implemented",
-                other
-            ))
-            .into())
-        }
-    })
-}
diff --git a/python/src/types.rs b/python/src/types.rs
deleted file mode 100644
index ffa822e073a89..0000000000000
--- a/python/src/types.rs
+++ /dev/null
@@ -1,76 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
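For reference, the class names matched in `to_rust_scalar` above ("Int8Scalar", "DoubleScalar", "StringScalar", and so on) are the Python-side names of pyarrow's scalar wrapper classes. A minimal Python sketch of where those names come from, assuming only that pyarrow >= 1.0 is installed (the `pa` alias and sample values are illustrative):

```python
# Illustrative only: the strings to_rust_scalar() dispatches on are the
# __class__.__name__ values of pyarrow scalars.
import pyarrow as pa

samples = [
    pa.scalar(1, type=pa.int8()),       # Int8Scalar
    pa.scalar(1.5, type=pa.float64()),  # DoubleScalar
    pa.scalar(True),                    # BooleanScalar
    pa.scalar("a", type=pa.utf8()),     # StringScalar
]

for s in samples:
    # this is the name the Rust side reads via getattr("__class__") / getattr("__name__")
    print(type(s).__name__, s.as_py())
```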
- -use datafusion::arrow::datatypes::DataType; -use pyo3::{FromPyObject, PyAny, PyResult}; - -use crate::errors; - -/// utility struct to convert PyObj to native DataType -#[derive(Debug, Clone)] -pub struct PyDataType { - pub data_type: DataType, -} - -impl<'source> FromPyObject<'source> for PyDataType { - fn extract(ob: &'source PyAny) -> PyResult { - let id = ob.getattr("id")?.extract::()?; - let data_type = data_type_id(&id)?; - Ok(PyDataType { data_type }) - } -} - -fn data_type_id(id: &i32) -> Result { - // see https://github.com/apache/arrow/blob/3694794bdfd0677b95b8c95681e392512f1c9237/python/pyarrow/includes/libarrow.pxd - // this is not ideal as it does not generalize for non-basic types - // Find a way to get a unique name from the pyarrow.DataType - Ok(match id { - 1 => DataType::Boolean, - 2 => DataType::UInt8, - 3 => DataType::Int8, - 4 => DataType::UInt16, - 5 => DataType::Int16, - 6 => DataType::UInt32, - 7 => DataType::Int32, - 8 => DataType::UInt64, - 9 => DataType::Int64, - - 10 => DataType::Float16, - 11 => DataType::Float32, - 12 => DataType::Float64, - - //13 => DataType::Decimal, - - // 14 => DataType::Date32(), - // 15 => DataType::Date64(), - // 16 => DataType::Timestamp(), - // 17 => DataType::Time32(), - // 18 => DataType::Time64(), - // 19 => DataType::Duration() - 20 => DataType::Binary, - 21 => DataType::Utf8, - 22 => DataType::LargeBinary, - 23 => DataType::LargeUtf8, - - other => { - return Err(errors::DataFusionError::Common(format!( - "The type {} is not valid", - other - ))) - } - }) -} diff --git a/python/src/udaf.rs b/python/src/udaf.rs deleted file mode 100644 index 3ce223df9a491..0000000000000 --- a/python/src/udaf.rs +++ /dev/null @@ -1,147 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use pyo3::{prelude::*, types::PyTuple}; - -use datafusion::arrow::array::ArrayRef; - -use datafusion::error::Result; -use datafusion::{ - error::DataFusionError as InnerDataFusionError, physical_plan::Accumulator, - scalar::ScalarValue, -}; - -use crate::scalar::Scalar; -use crate::to_py::to_py_array; -use crate::to_rust::to_rust_scalar; - -#[derive(Debug)] -struct PyAccumulator { - accum: PyObject, -} - -impl PyAccumulator { - fn new(accum: PyObject) -> Self { - Self { accum } - } -} - -impl Accumulator for PyAccumulator { - fn state(&self) -> Result> { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let state = self - .accum - .as_ref(py) - .call_method0("to_scalars") - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))? 
- .extract::>() - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(state.into_iter().map(|v| v.scalar).collect::>()) - } - - fn update(&mut self, _values: &[ScalarValue]) -> Result<()> { - // no need to implement as datafusion does not use it - todo!() - } - - fn merge(&mut self, _states: &[ScalarValue]) -> Result<()> { - // no need to implement as datafusion does not use it - todo!() - } - - fn evaluate(&self) -> Result { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let value = self - .accum - .as_ref(py) - .call_method0("evaluate") - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - to_rust_scalar(value) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) - } - - fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast args to Pyarrow array - // 2. call function - - // 1. - let py_args = values - .iter() - .map(|arg| { - // remove unwrap - to_py_array(arg, py).unwrap() - }) - .collect::>(); - let py_args = PyTuple::new(py, py_args); - - // update accumulator - self.accum - .as_ref(py) - .call_method1("update", py_args) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(()) - } - - fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast states to Pyarrow array - // 2. merge - let state = &states[0]; - - let state = to_py_array(state, py) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - // 2. - self.accum - .as_ref(py) - .call_method1("merge", (state,)) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(()) - } -} - -pub fn array_udaf( - accumulator: PyObject, -) -> Arc Result> + Send + Sync> { - Arc::new(move || -> Result> { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let accumulator = accumulator - .call0(py) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - Ok(Box::new(PyAccumulator::new(accumulator))) - }) -} diff --git a/python/src/udf.rs b/python/src/udf.rs deleted file mode 100644 index 7fee71008ef2f..0000000000000 --- a/python/src/udf.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
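The `PyAccumulator` implementation above drives a plain Python object through four methods: `to_scalars`, `update`, `merge`, and `evaluate`, all exchanging pyarrow values. A minimal sketch of a Python accumulator with that shape, following the pattern used in the package README and tests (the summing logic and class name are illustrative):

```python
# Illustrative sketch of the Python-side contract assumed by PyAccumulator.
import pyarrow
import pyarrow.compute


class SumAccumulator:
    def __init__(self):
        self._sum = pyarrow.scalar(0.0)

    def to_scalars(self) -> list:
        # returned to Rust as the serialized accumulator state
        return [self._sum]

    def update(self, values: pyarrow.Array) -> None:
        self._sum = pyarrow.scalar(
            self._sum.as_py() + pyarrow.compute.sum(values).as_py()
        )

    def merge(self, states: pyarrow.Array) -> None:
        self._sum = pyarrow.scalar(
            self._sum.as_py() + pyarrow.compute.sum(states).as_py()
        )

    def evaluate(self) -> pyarrow.Scalar:
        return self._sum


# quick standalone check of the accumulator itself
acc = SumAccumulator()
acc.update(pyarrow.array([1.0, 2.0, 3.0]))
assert acc.evaluate().as_py() == 6.0
```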
- -use pyo3::{prelude::*, types::PyTuple}; - -use datafusion::{arrow::array, physical_plan::functions::make_scalar_function}; - -use datafusion::error::DataFusionError; -use datafusion::physical_plan::functions::ScalarFunctionImplementation; - -use crate::to_py::to_py_array; -use crate::to_rust::to_rust; - -/// creates a DataFusion's UDF implementation from a python function that expects pyarrow arrays -/// This is more efficient as it performs a zero-copy of the contents. -pub fn array_udf(func: PyObject) -> ScalarFunctionImplementation { - make_scalar_function( - move |args: &[array::ArrayRef]| -> Result { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast args to Pyarrow arrays - // 2. call function - // 3. cast to arrow::array::Array - - // 1. - let py_args = args - .iter() - .map(|arg| { - // remove unwrap - to_py_array(arg, py).unwrap() - }) - .collect::>(); - let py_args = PyTuple::new(py, py_args); - - // 2. - let value = func.as_ref(py).call(py_args, None); - let value = match value { - Ok(n) => Ok(n), - Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), - }?; - - let array = to_rust(value).unwrap(); - Ok(array) - }, - ) -} diff --git a/python/tests/__init__.py b/python/tests/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/python/tests/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/python/tests/generic.py b/python/tests/generic.py deleted file mode 100644 index 7362f0bb29569..0000000000000 --- a/python/tests/generic.py +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
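`array_udf` above hands whole pyarrow arrays to a Python callable and expects a pyarrow array back, so a scalar UDF is simply an array-in/array-out function. A short sketch mirroring the `is_null` example used in the package README and tests (the function body is the only assumption here):

```python
# Illustrative: the kind of function array_udf wraps — it receives pyarrow
# Arrays (zero-copy via the C data interface) and returns a pyarrow Array.
import pyarrow


def is_null(array: pyarrow.Array) -> pyarrow.Array:
    return array.is_null()


# standalone check of the function body
result = is_null(pyarrow.array([1, None, 3]))
assert result.to_pylist() == [False, True, False]
```

In the bindings it is then registered with something like `f.udf(is_null, [pyarrow.int64()], pyarrow.bool_())`, as the accompanying DataFrame and SQL tests show.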
- -import unittest -import tempfile -import datetime -import os.path -import shutil - -import numpy -import pyarrow -import datafusion - -# used to write parquet files -import pyarrow.parquet - - -def data(): - data = numpy.concatenate( - [numpy.random.normal(0, 0.01, size=50), numpy.random.normal(50, 0.01, size=50)] - ) - return pyarrow.array(data) - - -def data_with_nans(): - data = numpy.random.normal(0, 0.01, size=50) - mask = numpy.random.randint(0, 2, size=50) - data[mask == 0] = numpy.NaN - return data - - -def data_datetime(f): - data = [ - datetime.datetime.now(), - datetime.datetime.now() - datetime.timedelta(days=1), - datetime.datetime.now() + datetime.timedelta(days=1), - ] - return pyarrow.array( - data, type=pyarrow.timestamp(f), mask=numpy.array([False, True, False]) - ) - - -def data_timedelta(f): - data = [ - datetime.timedelta(days=100), - datetime.timedelta(days=1), - datetime.timedelta(seconds=1), - ] - return pyarrow.array( - data, type=pyarrow.duration(f), mask=numpy.array([False, True, False]) - ) - - -def data_binary_other(): - return numpy.array([1, 0, 0], dtype="u4") - - -def write_parquet(path, data): - table = pyarrow.Table.from_arrays([data], names=["a"]) - pyarrow.parquet.write_table(table, path) - return path diff --git a/python/tests/test_df.py b/python/tests/test_df.py deleted file mode 100644 index 520d4e6a54723..0000000000000 --- a/python/tests/test_df.py +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
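The helpers above build pyarrow arrays and write single-column parquet files for the SQL tests. A small sketch of the same flow end to end — write a parquet file, register it, query it — using only calls that appear in these tests (the temporary path and the table name `t` are illustrative):

```python
# Illustrative: mirrors generic.write_parquet plus registration and a query.
import os.path
import tempfile

import pyarrow
import pyarrow.parquet
import datafusion

tmp_dir = tempfile.mkdtemp()
path = os.path.join(tmp_dir, "a.parquet")

# one column named "a", as in write_parquet()
table = pyarrow.Table.from_arrays([pyarrow.array([1.0, 2.0, 3.0])], names=["a"])
pyarrow.parquet.write_table(table, path)

ctx = datafusion.ExecutionContext()
ctx.register_parquet("t", path)
print(ctx.sql("SELECT COUNT(a) FROM t").collect()[0])
```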
- -import unittest - -import pyarrow -import datafusion -f = datafusion.functions - - -class TestCase(unittest.TestCase): - - def _prepare(self): - ctx = datafusion.ExecutionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - return ctx.create_dataframe([[batch]]) - - def test_select(self): - df = self._prepare() - - df = df.select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(result.column(0), pyarrow.array([5, 7, 9])) - self.assertEqual(result.column(1), pyarrow.array([-3, -3, -3])) - - def test_filter(self): - df = self._prepare() - - df = df \ - .select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), - ) \ - .filter(f.col("a") > f.lit(2)) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(result.column(0), pyarrow.array([9])) - self.assertEqual(result.column(1), pyarrow.array([-3])) - - def test_limit(self): - df = self._prepare() - - df = df.limit(1) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(len(result.column(0)), 1) - self.assertEqual(len(result.column(1)), 1) - - def test_udf(self): - df = self._prepare() - - # is_null is a pyarrow function over arrays - udf = f.udf(lambda x: x.is_null(), [pyarrow.int64()], pyarrow.bool_()) - - df = df.select(udf(f.col("a"))) - - self.assertEqual(df.collect()[0].column(0), pyarrow.array([False, False, False])) - - def test_join(self): - ctx = datafusion.ExecutionContext() - - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) - - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2]), pyarrow.array([8, 10])], - names=["a", "c"], - ) - df1 = ctx.create_dataframe([[batch]]) - - df = df.join(df1, on="a", how="inner") - - # execute and collect the first (and only) batch - batch = df.collect()[0] - - if batch.column(0) == pyarrow.array([1, 2]): - self.assertEqual(batch.column(0), pyarrow.array([1, 2])) - self.assertEqual(batch.column(1), pyarrow.array([8, 10])) - self.assertEqual(batch.column(2), pyarrow.array([4, 5])) - else: - self.assertEqual(batch.column(0), pyarrow.array([2, 1])) - self.assertEqual(batch.column(1), pyarrow.array([10, 8])) - self.assertEqual(batch.column(2), pyarrow.array([5, 4])) diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py deleted file mode 100644 index e9047ea6e70c3..0000000000000 --- a/python/tests/test_sql.py +++ /dev/null @@ -1,294 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
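One detail worth noting from the tests above: `create_dataframe` takes a list of partitions, and each partition is itself a list of record batches (the tests pass `[[batch]]`, a single one-batch partition). A sketch making that nesting explicit, with two single-batch partitions (values are illustrative):

```python
# Illustrative: create_dataframe(partitions) where partitions = [[batch, ...], ...]
import pyarrow
import datafusion

ctx = datafusion.ExecutionContext()

batch1 = pyarrow.RecordBatch.from_arrays([pyarrow.array([1, 2])], names=["a"])
batch2 = pyarrow.RecordBatch.from_arrays([pyarrow.array([3, 4])], names=["a"])

# two partitions, one batch each
df = ctx.create_dataframe([[batch1], [batch2]])

assert sum(b.num_rows for b in df.collect()) == 4
```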
- -import unittest -import tempfile -import datetime -import os.path -import shutil - -import numpy -import pyarrow -import datafusion - -# used to write parquet files -import pyarrow.parquet - -from tests.generic import * - - -class TestCase(unittest.TestCase): - def setUp(self): - # Create a temporary directory - self.test_dir = tempfile.mkdtemp() - numpy.random.seed(1) - - def tearDown(self): - # Remove the directory after the test - shutil.rmtree(self.test_dir) - - def test_no_table(self): - with self.assertRaises(Exception): - datafusion.Context().sql("SELECT a FROM b").collect() - - def test_register(self): - ctx = datafusion.ExecutionContext() - - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) - - ctx.register_parquet("t", path) - - self.assertEqual(ctx.tables(), {"t"}) - - def test_execute(self): - data = [1, 1, 2, 2, 3, 11, 12] - - ctx = datafusion.ExecutionContext() - - # single column, "a" - path = write_parquet( - os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data) - ) - ctx.register_parquet("t", path) - - self.assertEqual(ctx.tables(), {"t"}) - - # count - result = ctx.sql("SELECT COUNT(a) FROM t").collect() - - expected = pyarrow.array([7], pyarrow.uint64()) - expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] - self.assertEqual(expected, result) - - # where - expected = pyarrow.array([2], pyarrow.uint64()) - expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] - self.assertEqual( - expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect() - ) - - # group by - result = ctx.sql( - "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)" - ).collect() - - result_keys = result[0].to_pydict()["CAST(a AS Int32)"] - result_values = result[0].to_pydict()["COUNT(a)"] - result_keys, result_values = ( - list(t) for t in zip(*sorted(zip(result_keys, result_values))) - ) - - self.assertEqual(result_keys, [1, 2, 3, 11, 12]) - self.assertEqual(result_values, [2, 2, 1, 1, 1]) - - # order by - result = ctx.sql( - "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2" - ).collect() - expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64()) - expected_cast = pyarrow.array([50, 50], pyarrow.int32()) - expected = [ - pyarrow.RecordBatch.from_arrays( - [expected_a, expected_cast], ["a", "CAST(a AS Int32)"] - ) - ] - numpy.testing.assert_equal(expected[0].column(1), expected[0].column(1)) - - def test_cast(self): - """ - Verify that we can cast - """ - ctx = datafusion.ExecutionContext() - - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) - ctx.register_parquet("t", path) - - valid_types = [ - "smallint", - "int", - "bigint", - "float(32)", - "float(64)", - "float", - ] - - select = ", ".join( - [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] - ) - - # can execute, which implies that we can cast - ctx.sql(f"SELECT {select} FROM t").collect() - - def _test_udf(self, udf, args, return_type, array, expected): - ctx = datafusion.ExecutionContext() - - # write to disk - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array) - ctx.register_parquet("t", path) - - ctx.register_udf("udf", udf, args, return_type) - - batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() - - result = batches[0].column(0) - - self.assertEqual(expected, result) - - def test_udf_identity(self): - self._test_udf( - lambda x: x, - [pyarrow.float64()], - pyarrow.float64(), - pyarrow.array([-1.2, None, 1.2]), - pyarrow.array([-1.2, None, 1.2]), - ) - - def test_udf(self): - 
self._test_udf( - lambda x: x.is_null(), - [pyarrow.float64()], - pyarrow.bool_(), - pyarrow.array([-1.2, None, 1.2]), - pyarrow.array([False, True, False]), - ) - - -class TestIO(unittest.TestCase): - def setUp(self): - # Create a temporary directory - self.test_dir = tempfile.mkdtemp() - - def tearDown(self): - # Remove the directory after the test - shutil.rmtree(self.test_dir) - - def _test_data(self, data): - ctx = datafusion.ExecutionContext() - - # write to disk - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data) - ctx.register_parquet("t", path) - - batches = ctx.sql("SELECT a AS tt FROM t").collect() - - result = batches[0].column(0) - - numpy.testing.assert_equal(data, result) - - def test_nans(self): - self._test_data(data_with_nans()) - - def test_utf8(self): - array = pyarrow.array( - ["a", "b", "c"], pyarrow.utf8(), numpy.array([False, True, False]) - ) - self._test_data(array) - - def test_large_utf8(self): - array = pyarrow.array( - ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False]) - ) - self._test_data(array) - - # Error from Arrow - @unittest.expectedFailure - def test_datetime_s(self): - self._test_data(data_datetime("s")) - - # C data interface missing - @unittest.expectedFailure - def test_datetime_ms(self): - self._test_data(data_datetime("ms")) - - # C data interface missing - @unittest.expectedFailure - def test_datetime_us(self): - self._test_data(data_datetime("us")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_datetime_ns(self): - self._test_data(data_datetime("ns")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_s(self): - self._test_data(data_timedelta("s")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_ms(self): - self._test_data(data_timedelta("ms")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_us(self): - self._test_data(data_timedelta("us")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_ns(self): - self._test_data(data_timedelta("ns")) - - def test_date32(self): - array = pyarrow.array( - [ - datetime.date(2000, 1, 1), - datetime.date(1980, 1, 1), - datetime.date(2030, 1, 1), - ], - pyarrow.date32(), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_binary_variable(self): - array = pyarrow.array( - [b"1", b"2", b"3"], pyarrow.binary(), numpy.array([False, True, False]) - ) - self._test_data(array) - - # C data interface missing - @unittest.expectedFailure - def test_binary_fixed(self): - array = pyarrow.array( - [b"1111", b"2222", b"3333"], - pyarrow.binary(4), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_large_binary(self): - array = pyarrow.array( - [b"1111", b"2222", b"3333"], - pyarrow.large_binary(), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_binary_other(self): - self._test_data(data_binary_other()) - - def test_bool(self): - array = pyarrow.array( - [False, True, True], None, numpy.array([False, True, False]) - ) - self._test_data(array) - - def test_u32(self): - array = pyarrow.array([0, 1, 2], None, numpy.array([False, True, False])) - self._test_data(array) diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py deleted file mode 100644 index ffd235e285f80..0000000000000 --- a/python/tests/test_udaf.py +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import unittest - -import pyarrow -import pyarrow.compute -import datafusion - -f = datafusion.functions - - -class Accumulator: - """ - Interface of a user-defined accumulation. - """ - - def __init__(self): - self._sum = pyarrow.scalar(0.0) - - def to_scalars(self) -> [pyarrow.Scalar]: - return [self._sum] - - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(values).as_py() - ) - - def merge(self, states: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(states).as_py() - ) - - def evaluate(self) -> pyarrow.Scalar: - return self._sum - - -class TestCase(unittest.TestCase): - def _prepare(self): - ctx = datafusion.ExecutionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 4, 6])], - names=["a", "b"], - ) - return ctx.create_dataframe([[batch]]) - - def test_aggregate(self): - df = self._prepare() - - udaf = f.udaf( - Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] - ) - - df = df.aggregate([], [udaf(f.col("a"))]) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(result.column(0), pyarrow.array([1.0 + 2.0 + 3.0])) - - def test_group_by(self): - df = self._prepare() - - udaf = f.udaf( - Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] - ) - - df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(result.column(1), pyarrow.array([1.0 + 2.0, 3.0])) From 5f6024d570f4d2d5fd4c2187ff7dcb0cd389ac4b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 4 May 2021 12:23:09 -0400 Subject: [PATCH 062/329] fix clippy (#259) --- datafusion/src/physical_plan/hash_join.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 3398494e3c46c..60d65b2361601 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -389,7 +389,7 @@ impl ExecutionPlan for HashJoinExec { num_output_rows: 0, join_time: 0, random_state: self.random_state.clone(), - visited_left_side: visited_left_side, + visited_left_side, is_exhausted: false, })) } From cb0a4a926e2b5c14c54a2ccc2296ff92cb64b8d4 Mon Sep 17 00:00:00 2001 From: Patrick More <34631716+pjmore@users.noreply.github.com> Date: Tue, 4 May 2021 13:39:09 -0700 Subject: [PATCH 063/329] Count distinct floats (#252) * Added test for boolean COUNT DISTINCT * ran cargo fmt * Fixed clippy 
warnings * Added COUNT DISTINCT support for floating point numbers --- .../src/physical_plan/distinct_expressions.rs | 85 ++++++++++++++++++- datafusion/src/scalar.rs | 2 + 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 1c93b5a104d09..a4dd25d0157b3 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -196,8 +196,9 @@ mod tests { use super::*; use arrow::array::{ - ArrayRef, BooleanArray, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, ListArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; use arrow::array::{Int32Builder, ListBuilder, UInt64Builder}; use arrow::datatypes::DataType; @@ -355,6 +356,76 @@ mod tests { }}; } + //Used trait to create associated constant for f32 and f64 + trait SubNormal: 'static { + const SUBNORMAL: Self; + } + + impl SubNormal for f64 { + const SUBNORMAL: Self = 1.0e-308_f64; + } + + impl SubNormal for f32 { + const SUBNORMAL: Self = 1.0e-38_f32; + } + + macro_rules! test_count_distinct_update_batch_floating_point { + ($ARRAY_TYPE:ident, $DATA_TYPE:ident, $PRIM_TYPE:ty) => {{ + use ordered_float::OrderedFloat; + let values: Vec> = vec![ + Some(<$PRIM_TYPE>::INFINITY), + Some(<$PRIM_TYPE>::NAN), + Some(1.0), + Some(<$PRIM_TYPE as SubNormal>::SUBNORMAL), + Some(1.0), + Some(<$PRIM_TYPE>::INFINITY), + None, + Some(3.0), + Some(-4.5), + Some(2.0), + None, + Some(2.0), + Some(3.0), + Some(<$PRIM_TYPE>::NEG_INFINITY), + Some(1.0), + Some(<$PRIM_TYPE>::NAN), + Some(<$PRIM_TYPE>::NEG_INFINITY), + ]; + + let arrays = vec![Arc::new($ARRAY_TYPE::from(values)) as ArrayRef]; + + let (states, result) = run_update_batch(&arrays)?; + + let mut state_vec = + state_to_vec!(&states[0], $DATA_TYPE, $PRIM_TYPE).unwrap(); + state_vec.sort_by(|a, b| match (a, b) { + (Some(lhs), Some(rhs)) => { + OrderedFloat::from(*lhs).cmp(&OrderedFloat::from(*rhs)) + } + _ => a.partial_cmp(b).unwrap(), + }); + + let nan_idx = state_vec.len() - 1; + assert_eq!(states.len(), 1); + assert_eq!( + &state_vec[..nan_idx], + vec![ + Some(<$PRIM_TYPE>::NEG_INFINITY), + Some(-4.5), + Some(<$PRIM_TYPE as SubNormal>::SUBNORMAL), + Some(1.0), + Some(2.0), + Some(3.0), + Some(<$PRIM_TYPE>::INFINITY) + ] + ); + assert!(state_vec[nan_idx].unwrap_or_default().is_nan()); + assert_eq!(result, ScalarValue::UInt64(Some(8))); + + Ok(()) + }}; + } + #[test] fn count_distinct_update_batch_i8() -> Result<()> { test_count_distinct_update_batch_numeric!(Int8Array, Int8, i8) @@ -395,6 +466,16 @@ mod tests { test_count_distinct_update_batch_numeric!(UInt64Array, UInt64, u64) } + #[test] + fn count_distinct_update_batch_f32() -> Result<()> { + test_count_distinct_update_batch_floating_point!(Float32Array, Float32, f32) + } + + #[test] + fn count_distinct_update_batch_f64() -> Result<()> { + test_count_distinct_update_batch_floating_point!(Float64Array, Float64, f64) + } + #[test] fn count_distinct_update_batch_boolean() -> Result<()> { let get_count = |data: BooleanArray| -> Result<(Vec>, u64)> { diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 6f03194f45423..dd3fb58757bed 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -355,6 +355,8 @@ impl ScalarValue { DataType::UInt32 => build_list!(UInt32Builder, 
UInt32, values, size), DataType::UInt64 => build_list!(UInt64Builder, UInt64, values, size), DataType::Utf8 => build_list!(StringBuilder, Utf8, values, size), + DataType::Float32 => build_list!(Float32Builder, Float32, values, size), + DataType::Float64 => build_list!(Float64Builder, Float64, values, size), DataType::LargeUtf8 => { build_list!(LargeStringBuilder, LargeUtf8, values, size) } From f7bd7b9f0e5a28b1fada2dbb60151b5af9b16545 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 5 May 2021 06:15:38 +0800 Subject: [PATCH 064/329] Refactor datafusion/src/physical_plan/common.rs build_file_list to take less param and reuse code (#253) * refactor common build_file_list * fixing clippy --- datafusion/src/datasource/csv.rs | 3 +-- datafusion/src/physical_plan/common.rs | 17 ++++++++++++++--- datafusion/src/physical_plan/csv.rs | 3 +-- datafusion/src/physical_plan/parquet.rs | 3 +-- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 1bd1b4be823ee..33cbeb12ca6bd 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -71,8 +71,7 @@ impl CsvFile { let schema = Arc::new(match options.schema { Some(s) => s.clone(), None => { - let mut filenames: Vec = vec![]; - common::build_file_list(path, &mut filenames, options.file_extension)?; + let filenames = common::build_file_list(path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index 9de7ee2a32dd8..f1ed3742340b0 100644 --- a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -78,8 +78,19 @@ pub async fn collect(stream: SendableRecordBatchStream) -> Result, ext: &str) -> Result<()> { +/// Recursively builds a list of files in a directory with a given extension +pub fn build_file_list(dir: &str, ext: &str) -> Result> { + let mut filenames: Vec = Vec::new(); + build_file_list_recurse(dir, &mut filenames, ext)?; + Ok(filenames) +} + +/// Recursively build a list of files in a directory with a given extension with an accumulator list +fn build_file_list_recurse( + dir: &str, + filenames: &mut Vec, + ext: &str, +) -> Result<()> { let metadata = metadata(dir)?; if metadata.is_file() { if dir.ends_with(ext) { @@ -91,7 +102,7 @@ pub fn build_file_list(dir: &str, filenames: &mut Vec, ext: &str) -> Res let path = entry.path(); if let Some(path_name) = path.to_str() { if path.is_dir() { - build_file_list(path_name, filenames, ext)?; + build_file_list_recurse(path_name, filenames, ext)?; } else if path_name.ends_with(ext) { filenames.push(path_name.to_string()); } diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index b96a702f27325..9ab817799954f 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -199,8 +199,7 @@ impl CsvExec { ) -> Result { let file_extension = String::from(options.file_extension); - let mut filenames: Vec = vec![]; - common::build_file_list(path, &mut filenames, file_extension.as_str())?; + let filenames = common::build_file_list(path, file_extension.as_str())?; if filenames.is_empty() { return Err(DataFusionError::Execution(format!( "No files found at {path} with file extension {file_extension}", diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 
d41d6968fee0d..09dd48df3ed5c 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -118,8 +118,7 @@ impl ParquetExec { ) -> Result { // build a list of filenames from the specified path, which could be a single file or // a directory containing one or more parquet files - let mut filenames: Vec = vec![]; - common::build_file_list(path, &mut filenames, ".parquet")?; + let filenames = common::build_file_list(path, ".parquet")?; if filenames.is_empty() { Err(DataFusionError::Plan(format!( "No Parquet files found at path {}", From 9e84f15191860befab07d4f257ccb0be75f1b206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 5 May 2021 12:25:46 +0200 Subject: [PATCH 065/329] Implement select distinct (#262) * Implement select distinct * Simplify implementation * Move out of project fn * Fix call * Fix docs * Add to docs * Add note of ALL support * Add test for SELECT ALL * Clippy * Update datafusion/tests/sql.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/src/sql/planner.rs | 33 +++++++------- datafusion/tests/sql.rs | 75 +++++++++++++++++++++++++++++++ docs/user-guide/src/sql/select.md | 8 +++- 3 files changed, 100 insertions(+), 16 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a40d0becdcb4b..48900f56aad5e 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -17,9 +17,9 @@ //! SQL Query Planner (produces logical plan from SQL AST) -use std::convert::TryInto; use std::str::FromStr; use std::sync::Arc; +use std::{convert::TryInto, vec}; use crate::catalog::TableReference; use crate::datasource::TableProvider; @@ -621,7 +621,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { plan }; - self.project(&plan, select_exprs_post_aggr, false) + let plan = if select.distinct { + return LogicalPlanBuilder::from(&plan) + .aggregate(select_exprs_post_aggr, vec![])? + .build(); + } else { + plan + }; + + self.project(&plan, select_exprs_post_aggr) } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. @@ -645,24 +653,19 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in a projection /// - /// If the `force` argument is `false`, the projection is applied only when - /// necessary, i.e., when the input fields are different than the + /// The projection is applied only when necessary, + /// i.e., when the input fields are different than the /// projection. Note that if the input fields are the same, but out of /// order, the projection will be applied. - fn project( - &self, - input: &LogicalPlan, - expr: Vec, - force: bool, - ) -> Result { + fn project(&self, input: &LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(&input.schema(), &expr)?; + let plan = LogicalPlanBuilder::from(input).project(expr)?.build()?; - let project = force - || match input { - LogicalPlan::TableScan { .. } => true, - _ => plan.schema().fields() != input.schema().fields(), - }; + let project = match input { + LogicalPlan::TableScan { .. 
} => true, + _ => plan.schema().fields() != input.schema().fields(), + }; if project { Ok(plan) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 716929405c3a1..bf28525ad437f 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -372,6 +372,81 @@ async fn csv_query_group_by_float32() -> Result<()> { Ok(()) } +#[tokio::test] +async fn select_all() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx)?; + + let sql = "SELECT c1 FROM aggregate_simple order by c1"; + let actual_no_all = execute(&mut ctx, sql).await; + + let sql_all = "SELECT ALL c1 FROM aggregate_simple order by c1"; + let actual_all = execute(&mut ctx, sql_all).await; + + assert_eq!(actual_no_all, actual_all); + + Ok(()) +} + +#[tokio::test] +async fn select_distinct() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx)?; + + let sql = "SELECT DISTINCT * FROM aggregate_simple"; + let mut actual = execute(&mut ctx, sql).await; + actual.sort(); + + let mut dedup = actual.clone(); + dedup.dedup(); + + assert_eq!(actual, dedup); + + Ok(()) +} + +#[tokio::test] +async fn select_distinct_simple() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx)?; + + let sql = "SELECT DISTINCT c1 FROM aggregate_simple order by c1"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![ + vec!["0.00001"], + vec!["0.00002"], + vec!["0.00003"], + vec!["0.00004"], + vec!["0.00005"], + ]; + assert_eq!(actual, expected); + + let sql = "SELECT DISTINCT c1, c2 FROM aggregate_simple order by c1"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![ + vec!["0.00001", "0.000000000001"], + vec!["0.00002", "0.000000000002"], + vec!["0.00003", "0.000000000003"], + vec!["0.00004", "0.000000000004"], + vec!["0.00005", "0.000000000005"], + ]; + assert_eq!(actual, expected); + + let sql = "SELECT distinct c3 FROM aggregate_simple order by c3"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["false"], vec!["true"]]; + assert_eq!(actual, expected); + + let sql = "SELECT distinct c1+c2 as a FROM aggregate_simple"; + let actual = execute(&mut ctx, sql).await; + + assert_eq!(actual.len(), 5); + Ok(()) +} + #[tokio::test] async fn csv_query_group_by_float64() -> Result<()> { let mut ctx = ExecutionContext::new(); diff --git a/docs/user-guide/src/sql/select.md b/docs/user-guide/src/sql/select.md index 777b4ff61e5d0..78d0cb58531d4 100644 --- a/docs/user-guide/src/sql/select.md +++ b/docs/user-guide/src/sql/select.md @@ -26,7 +26,7 @@ DataFusion supports the following syntax for queries: [ [WITH](#with-clause) with_query [, ...] ]
-[SELECT](#select-clause) select_expr [, ...]
+[SELECT](#select-clause) [ ALL | DISTINCT ] select_expr [, ...]
[ [FROM](#from-clause) from_item [, ...] ]
[ [WHERE](#where-clause) condition ]
[ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
@@ -55,6 +55,12 @@ Example: SELECT a, b, a + b FROM table ``` +The `DISTINCT` quantifier can be added to make the query return all distinct rows. +By default `ALL` will be used, which returns all the rows. + +```sql +SELECT DISTINCT person, age FROM employees +``` # FROM clause From a9eac577b1715fbec5fa885f59fb1253f38f4be0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 5 May 2021 13:47:11 +0200 Subject: [PATCH 066/329] Enable redundant_field_names clippy lint (#261) --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ac2003608bb3f..f76873ef77aed 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -214,7 +214,7 @@ jobs: run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" - cargo clippy --all-targets --workspace -- -D warnings -A clippy::redundant_field_names + cargo clippy --all-targets --workspace -- -D warnings miri-checks: name: MIRI From 3be087a78846beffdbc4a9f80c73938fa18d24a7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 5 May 2021 07:39:28 -0600 Subject: [PATCH 067/329] Revert "Revert "Add datafusion-python (#69)" (#257)" (#270) This reverts commit d0af907652aa8773d1de21dfd2f15bbcf6f50ce3. --- .github/workflows/python_build.yml | 89 +++++++++ .github/workflows/python_test.yaml | 58 ++++++ Cargo.toml | 4 +- dev/release/rat_exclude_files.txt | 1 + python/.cargo/config | 22 +++ python/.dockerignore | 19 ++ python/.gitignore | 20 ++ python/Cargo.toml | 57 ++++++ python/README.md | 146 ++++++++++++++ python/pyproject.toml | 20 ++ python/rust-toolchain | 1 + python/src/context.rs | 115 +++++++++++ python/src/dataframe.rs | 161 ++++++++++++++++ python/src/errors.rs | 61 ++++++ python/src/expression.rs | 162 ++++++++++++++++ python/src/functions.rs | 165 ++++++++++++++++ python/src/lib.rs | 44 +++++ python/src/scalar.rs | 36 ++++ python/src/to_py.rs | 77 ++++++++ python/src/to_rust.rs | 111 +++++++++++ python/src/types.rs | 76 ++++++++ python/src/udaf.rs | 147 +++++++++++++++ python/src/udf.rs | 62 ++++++ python/tests/__init__.py | 16 ++ python/tests/generic.py | 75 ++++++++ python/tests/test_df.py | 115 +++++++++++ python/tests/test_sql.py | 294 +++++++++++++++++++++++++++++ python/tests/test_udaf.py | 91 +++++++++ 28 files changed, 2244 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/python_build.yml create mode 100644 .github/workflows/python_test.yaml create mode 100644 python/.cargo/config create mode 100644 python/.dockerignore create mode 100644 python/.gitignore create mode 100644 python/Cargo.toml create mode 100644 python/README.md create mode 100644 python/pyproject.toml create mode 100644 python/rust-toolchain create mode 100644 python/src/context.rs create mode 100644 python/src/dataframe.rs create mode 100644 python/src/errors.rs create mode 100644 python/src/expression.rs create mode 100644 python/src/functions.rs create mode 100644 python/src/lib.rs create mode 100644 python/src/scalar.rs create mode 100644 python/src/to_py.rs create mode 100644 python/src/to_rust.rs create mode 100644 python/src/types.rs create mode 100644 python/src/udaf.rs create mode 100644 python/src/udf.rs create mode 100644 python/tests/__init__.py create mode 100644 python/tests/generic.py create mode 100644 python/tests/test_df.py create mode 100644 python/tests/test_sql.py create mode 100644 python/tests/test_udaf.py diff --git a/.github/workflows/python_build.yml 
b/.github/workflows/python_build.yml new file mode 100644 index 0000000000000..c86bb81581a71 --- /dev/null +++ b/.github/workflows/python_build.yml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Build +on: + push: + tags: + - v* + +jobs: + build-python-mac-win: + name: Mac/Win + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8] + os: [macos-latest, windows-latest] + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2021-01-06 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install maturin + + - name: Build Python package + run: cd python && maturin build --release --no-sdist --strip --interpreter python${{matrix.python_version}} + + - name: List wheels + if: matrix.os == 'windows-latest' + run: dir python/target\wheels\ + + - name: List wheels + if: matrix.os != 'windows-latest' + run: find ./python/target/wheels/ + + - name: Archive wheels + uses: actions/upload-artifact@v2 + with: + name: dist + path: python/target/wheels/* + + build-manylinux: + name: Manylinux + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build wheels + run: docker run --rm -v $(pwd):/io konstin2/maturin build --release --manylinux + - name: Archive wheels + uses: actions/upload-artifact@v2 + with: + name: dist + path: python/target/wheels/* + + release: + name: Publish in PyPI + needs: [build-manylinux, build-python-mac-win] + runs-on: ubuntu-latest + steps: + - uses: actions/download-artifact@v2 + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@master + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml new file mode 100644 index 0000000000000..3b2111b59d49d --- /dev/null +++ b/.github/workflows/python_test.yaml @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Python test +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Rust toolchain + run: | + rustup toolchain install nightly-2021-01-06 + rustup default nightly-2021-01-06 + rustup component add rustfmt + - name: Cache Cargo + uses: actions/cache@v2 + with: + path: /home/runner/.cargo + key: cargo-maturin-cache- + - name: Cache Rust dependencies + uses: actions/cache@v2 + with: + path: /home/runner/target + key: target-maturin-cache- + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: Install Python dependencies + run: python -m pip install --upgrade pip setuptools wheel + - name: Run tests + run: | + cd python/ + export CARGO_HOME="/home/runner/.cargo" + export CARGO_TARGET_DIR="/home/runner/target" + + python -m venv venv + source venv/bin/activate + + pip install maturin==0.10.4 toml==0.10.1 pyarrow==4.0.0 + maturin develop + + python -m unittest discover tests diff --git a/Cargo.toml b/Cargo.toml index fa36a0c0fed7c..9795cb68b4456 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,4 +25,6 @@ members = [ "ballista/rust/core", "ballista/rust/executor", "ballista/rust/scheduler", -] \ No newline at end of file +] + +exclude = ["python"] diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index b94c0ea1d61a6..6126699bbc1fa 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -104,3 +104,4 @@ rust-toolchain benchmarks/queries/q*.sql ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock +python/rust-toolchain diff --git a/python/.cargo/config b/python/.cargo/config new file mode 100644 index 0000000000000..0b24f30cf908a --- /dev/null +++ b/python/.cargo/config @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] diff --git a/python/.dockerignore b/python/.dockerignore new file mode 100644 index 0000000000000..08c131c2e7d60 --- /dev/null +++ b/python/.dockerignore @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +target +venv diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000000000..48fe4dbe52dde --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +/target +Cargo.lock +venv diff --git a/python/Cargo.toml b/python/Cargo.toml new file mode 100644 index 0000000000000..070720554f0ed --- /dev/null +++ b/python/Cargo.toml @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion" +version = "0.2.1" +homepage = "https://github.com/apache/arrow" +repository = "https://github.com/apache/arrow" +authors = ["Apache Arrow "] +description = "Build and run queries against data" +readme = "README.md" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } +rand = "0.7" +pyo3 = { version = "0.12.1", features = ["extension-module"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" } + +[lib] +name = "datafusion" +crate-type = ["cdylib"] + +[package.metadata.maturin] +requires-dist = ["pyarrow>=1"] + +classifier = [ + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "License :: OSI Approved", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python", + "Programming Language :: Rust", +] diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000000000..1859fca9811c0 --- /dev/null +++ b/python/README.md @@ -0,0 +1,146 @@ + + +## DataFusion in Python + +This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion). + +Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. + +It also allows you to use UDFs and UDAFs for complex operations. + +The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. + +Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which makes strong assumptions about thread safety and lack of memory leaks. + +Technically, zero-copy is achieved via the [c data interface](https://arrow.apache.org/docs/format/CDataInterface.html). + +## How to use it + +Simple usage: + +```python +import datafusion +import pyarrow + +# an alias +f = datafusion.functions + +# create a context +ctx = datafusion.ExecutionContext() + +# create a RecordBatch and a new DataFrame from it +batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], +) +df = ctx.create_dataframe([[batch]]) + +# create a new statement +df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), +) + +# execute and collect the first (and only) batch +result = df.collect()[0] + +assert result.column(0) == pyarrow.array([5, 7, 9]) +assert result.column(1) == pyarrow.array([-3, -3, -3]) +``` + +### UDFs + +```python +def is_null(array: pyarrow.Array) -> pyarrow.Array: + return array.is_null() + +udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) + +df = df.select(udf(f.col("a"))) +``` + +### UDAF + +```python +import pyarrow +import pyarrow.compute + + +class Accumulator: + """ + Interface of a user-defined accumulation. 
+ """ + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + +df = ... + +udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) + +df = df.aggregate( + [], + [udaf(f.col("a"))] +) +``` + +## How to install + +```bash +pip install datafusion +``` + +## How to develop + +This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). + +Bootstrap: + +```bash +# fetch this repo +git clone git@github.com:apache/arrow-datafusion.git + +cd arrow-datafusion/python + +# prepare development environment (used to build wheel / install in development) +python3 -m venv venv +pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0 +``` + +Whenever rust code changes (your changes or via git pull): + +```bash +venv/bin/maturin develop +venv/bin/python -m unittest discover tests +``` diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000000000..27480690e06cc --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[build-system] +requires = ["maturin"] +build-backend = "maturin" diff --git a/python/rust-toolchain b/python/rust-toolchain new file mode 100644 index 0000000000000..9d0cf79d367d6 --- /dev/null +++ b/python/rust-toolchain @@ -0,0 +1 @@ +nightly-2021-01-06 diff --git a/python/src/context.rs b/python/src/context.rs new file mode 100644 index 0000000000000..14ef0f7321f15 --- /dev/null +++ b/python/src/context.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{collections::HashSet, sync::Arc}; + +use rand::distributions::Alphanumeric; +use rand::Rng; + +use pyo3::prelude::*; + +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::datasource::MemTable; +use datafusion::execution::context::ExecutionContext as _ExecutionContext; + +use crate::dataframe; +use crate::errors; +use crate::functions; +use crate::to_rust; +use crate::types::PyDataType; + +/// `ExecutionContext` is able to plan and execute DataFusion plans. +/// It has a powerful optimizer, a physical planner for local execution, and a +/// multi-threaded execution engine to perform the execution. +#[pyclass(unsendable)] +pub(crate) struct ExecutionContext { + ctx: _ExecutionContext, +} + +#[pymethods] +impl ExecutionContext { + #[new] + fn new() -> Self { + ExecutionContext { + ctx: _ExecutionContext::new(), + } + } + + /// Returns a DataFrame whose plan corresponds to the SQL statement. + fn sql(&mut self, query: &str) -> PyResult { + let df = self + .ctx + .sql(query) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + Ok(dataframe::DataFrame::new( + self.ctx.state.clone(), + df.to_logical_plan(), + )) + } + + fn create_dataframe( + &mut self, + partitions: Vec>, + py: Python, + ) -> PyResult { + let partitions: Vec> = partitions + .iter() + .map(|batches| { + batches + .iter() + .map(|batch| to_rust::to_rust_batch(batch.as_ref(py))) + .collect() + }) + .collect::>()?; + + let table = + errors::wrap(MemTable::try_new(partitions[0][0].schema(), partitions))?; + + // generate a random (unique) name for this table + let name = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(10) + .collect::(); + + errors::wrap(self.ctx.register_table(&*name, Arc::new(table)))?; + Ok(dataframe::DataFrame::new( + self.ctx.state.clone(), + errors::wrap(self.ctx.table(&*name))?.to_logical_plan(), + )) + } + + fn register_parquet(&mut self, name: &str, path: &str) -> PyResult<()> { + errors::wrap(self.ctx.register_parquet(name, path))?; + Ok(()) + } + + fn register_udf( + &mut self, + name: &str, + func: PyObject, + args_types: Vec, + return_type: PyDataType, + ) { + let function = functions::create_udf(func, args_types, return_type, name); + + self.ctx.register_udf(function.function); + } + + fn tables(&self) -> HashSet { + self.ctx.tables().unwrap() + } +} diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs new file mode 100644 index 0000000000000..f90a7cf2f0dcf --- /dev/null +++ b/python/src/dataframe.rs @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::{Arc, Mutex}; + +use logical_plan::LogicalPlan; +use pyo3::{prelude::*, types::PyTuple}; +use tokio::runtime::Runtime; + +use datafusion::execution::context::ExecutionContext as _ExecutionContext; +use datafusion::logical_plan::{JoinType, LogicalPlanBuilder}; +use datafusion::physical_plan::collect; +use datafusion::{execution::context::ExecutionContextState, logical_plan}; + +use crate::{errors, to_py}; +use crate::{errors::DataFusionError, expression}; + +/// A DataFrame is a representation of a logical plan and an API to compose statements. +/// Use it to build a plan and `.collect()` to execute the plan and collect the result. +/// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. +#[pyclass] +pub(crate) struct DataFrame { + ctx_state: Arc>, + plan: LogicalPlan, +} + +impl DataFrame { + /// creates a new DataFrame + pub fn new(ctx_state: Arc>, plan: LogicalPlan) -> Self { + Self { ctx_state, plan } + } +} + +#[pymethods] +impl DataFrame { + /// Select `expressions` from the existing DataFrame. + #[args(args = "*")] + fn select(&self, args: &PyTuple) -> PyResult { + let expressions = expression::from_tuple(args)?; + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = + errors::wrap(builder.project(expressions.into_iter().map(|e| e.expr)))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Filter according to the `predicate` expression + fn filter(&self, predicate: expression::Expression) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.filter(predicate.expr))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Aggregates using expressions + fn aggregate( + &self, + group_by: Vec, + aggs: Vec, + ) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.aggregate( + group_by.into_iter().map(|e| e.expr), + aggs.into_iter().map(|e| e.expr), + ))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Limits the plan to return at most `count` rows + fn limit(&self, count: usize) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.limit(count))?; + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + + /// Executes the plan, returning a list of `RecordBatch`es. 
+ /// Unless some order is specified in the plan, there is no guarantee of the order of the result + fn collect(&self, py: Python) -> PyResult { + let ctx = _ExecutionContext::from(self.ctx_state.clone()); + let plan = ctx + .optimize(&self.plan) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + let plan = ctx + .create_physical_plan(&plan) + .map_err(|e| -> errors::DataFusionError { e.into() })?; + + let rt = Runtime::new().unwrap(); + let batches = py.allow_threads(|| { + rt.block_on(async { + collect(plan) + .await + .map_err(|e| -> errors::DataFusionError { e.into() }) + }) + })?; + to_py::to_py(&batches) + } + + /// Returns the join of two DataFrames `on`. + fn join(&self, right: &DataFrame, on: Vec<&str>, how: &str) -> PyResult { + let builder = LogicalPlanBuilder::from(&self.plan); + + let join_type = match how { + "inner" => JoinType::Inner, + "left" => JoinType::Left, + "right" => JoinType::Right, + how => { + return Err(DataFusionError::Common(format!( + "The join type {} does not exist or is not implemented", + how + )) + .into()) + } + }; + + let builder = errors::wrap(builder.join( + &right.plan, + join_type, + on.as_slice(), + on.as_slice(), + ))?; + + let plan = errors::wrap(builder.build())?; + + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } +} diff --git a/python/src/errors.rs b/python/src/errors.rs new file mode 100644 index 0000000000000..fbe98037a030f --- /dev/null +++ b/python/src/errors.rs @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use core::fmt; + +use datafusion::arrow::error::ArrowError; +use datafusion::error::DataFusionError as InnerDataFusionError; +use pyo3::{exceptions, PyErr}; + +#[derive(Debug)] +pub enum DataFusionError { + ExecutionError(InnerDataFusionError), + ArrowError(ArrowError), + Common(String), +} + +impl fmt::Display for DataFusionError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {:?}", e), + DataFusionError::ArrowError(e) => write!(f, "Arrow error: {:?}", e), + DataFusionError::Common(e) => write!(f, "{}", e), + } + } +} + +impl From for PyErr { + fn from(err: DataFusionError) -> PyErr { + exceptions::PyException::new_err(err.to_string()) + } +} + +impl From for DataFusionError { + fn from(err: InnerDataFusionError) -> DataFusionError { + DataFusionError::ExecutionError(err) + } +} + +impl From for DataFusionError { + fn from(err: ArrowError) -> DataFusionError { + DataFusionError::ArrowError(err) + } +} + +pub(crate) fn wrap(a: Result) -> Result { + Ok(a?) 
+} diff --git a/python/src/expression.rs b/python/src/expression.rs new file mode 100644 index 0000000000000..78ca6d7e598ec --- /dev/null +++ b/python/src/expression.rs @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::{ + basic::CompareOp, prelude::*, types::PyTuple, PyNumberProtocol, PyObjectProtocol, +}; + +use datafusion::logical_plan::Expr as _Expr; +use datafusion::physical_plan::udaf::AggregateUDF as _AggregateUDF; +use datafusion::physical_plan::udf::ScalarUDF as _ScalarUDF; + +/// An expression that can be used on a DataFrame +#[pyclass] +#[derive(Debug, Clone)] +pub(crate) struct Expression { + pub(crate) expr: _Expr, +} + +/// converts a tuple of expressions into a vector of Expressions +pub(crate) fn from_tuple(value: &PyTuple) -> PyResult> { + value + .iter() + .map(|e| e.extract::()) + .collect::>() +} + +#[pyproto] +impl PyNumberProtocol for Expression { + fn __add__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr + rhs.expr, + }) + } + + fn __sub__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr - rhs.expr, + }) + } + + fn __truediv__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr / rhs.expr, + }) + } + + fn __mul__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr * rhs.expr, + }) + } + + fn __and__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr.and(rhs.expr), + }) + } + + fn __or__(lhs: Expression, rhs: Expression) -> PyResult { + Ok(Expression { + expr: lhs.expr.or(rhs.expr), + }) + } + + fn __invert__(&self) -> PyResult { + Ok(Expression { + expr: self.expr.clone().not(), + }) + } +} + +#[pyproto] +impl PyObjectProtocol for Expression { + fn __richcmp__(&self, other: Expression, op: CompareOp) -> Expression { + match op { + CompareOp::Lt => Expression { + expr: self.expr.clone().lt(other.expr), + }, + CompareOp::Le => Expression { + expr: self.expr.clone().lt_eq(other.expr), + }, + CompareOp::Eq => Expression { + expr: self.expr.clone().eq(other.expr), + }, + CompareOp::Ne => Expression { + expr: self.expr.clone().not_eq(other.expr), + }, + CompareOp::Gt => Expression { + expr: self.expr.clone().gt(other.expr), + }, + CompareOp::Ge => Expression { + expr: self.expr.clone().gt_eq(other.expr), + }, + } + } +} + +#[pymethods] +impl Expression { + /// assign a name to the expression + pub fn alias(&self, name: &str) -> PyResult { + Ok(Expression { + expr: self.expr.clone().alias(name), + }) + } +} + +/// Represents a ScalarUDF +#[pyclass] +#[derive(Debug, Clone)] +pub struct ScalarUDF { + pub(crate) function: _ScalarUDF, +} + +#[pymethods] +impl ScalarUDF { + /// creates a new expression with the call of the udf + #[call] + 
#[args(args = "*")] + fn __call__(&self, args: &PyTuple) -> PyResult { + let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); + + Ok(Expression { + expr: self.function.call(args), + }) + } +} + +/// Represents a AggregateUDF +#[pyclass] +#[derive(Debug, Clone)] +pub struct AggregateUDF { + pub(crate) function: _AggregateUDF, +} + +#[pymethods] +impl AggregateUDF { + /// creates a new expression with the call of the udf + #[call] + #[args(args = "*")] + fn __call__(&self, args: &PyTuple) -> PyResult { + let args = from_tuple(args)?.iter().map(|e| e.expr.clone()).collect(); + + Ok(Expression { + expr: self.function.call(args), + }) + } +} diff --git a/python/src/functions.rs b/python/src/functions.rs new file mode 100644 index 0000000000000..68000cb1ecbf8 --- /dev/null +++ b/python/src/functions.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::datatypes::DataType; +use pyo3::{prelude::*, wrap_pyfunction}; + +use datafusion::logical_plan; + +use crate::udaf; +use crate::udf; +use crate::{expression, types::PyDataType}; + +/// Expression representing a column on the existing plan. 
+#[pyfunction] +#[text_signature = "(name)"] +fn col(name: &str) -> expression::Expression { + expression::Expression { + expr: logical_plan::col(name), + } +} + +/// Expression representing a constant value +#[pyfunction] +#[text_signature = "(value)"] +fn lit(value: i32) -> expression::Expression { + expression::Expression { + expr: logical_plan::lit(value), + } +} + +#[pyfunction] +fn sum(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sum(value.expr), + } +} + +#[pyfunction] +fn avg(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::avg(value.expr), + } +} + +#[pyfunction] +fn min(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::min(value.expr), + } +} + +#[pyfunction] +fn max(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::max(value.expr), + } +} + +#[pyfunction] +fn count(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::count(value.expr), + } +} + +/* +#[pyfunction] +fn concat(value: Vec) -> expression::Expression { + expression::Expression { + expr: logical_plan::concat(value.into_iter().map(|e| e.expr)), + } +} + */ + +pub(crate) fn create_udf( + fun: PyObject, + input_types: Vec, + return_type: PyDataType, + name: &str, +) -> expression::ScalarUDF { + let input_types: Vec = + input_types.iter().map(|d| d.data_type.clone()).collect(); + let return_type = Arc::new(return_type.data_type); + + expression::ScalarUDF { + function: logical_plan::create_udf( + name, + input_types, + return_type, + udf::array_udf(fun), + ), + } +} + +/// Creates a new udf. +#[pyfunction] +fn udf( + fun: PyObject, + input_types: Vec, + return_type: PyDataType, + py: Python, +) -> PyResult { + let name = fun.getattr(py, "__qualname__")?.extract::(py)?; + + Ok(create_udf(fun, input_types, return_type, &name)) +} + +/// Creates a new udf. +#[pyfunction] +fn udaf( + accumulator: PyObject, + input_type: PyDataType, + return_type: PyDataType, + state_type: Vec, + py: Python, +) -> PyResult { + let name = accumulator + .getattr(py, "__qualname__")? 
+ .extract::(py)?; + + let input_type = input_type.data_type; + let return_type = Arc::new(return_type.data_type); + let state_type = Arc::new(state_type.into_iter().map(|t| t.data_type).collect()); + + Ok(expression::AggregateUDF { + function: logical_plan::create_udaf( + &name, + input_type, + return_type, + udaf::array_udaf(accumulator), + state_type, + ), + }) +} + +pub fn init(module: &PyModule) -> PyResult<()> { + module.add_function(wrap_pyfunction!(col, module)?)?; + module.add_function(wrap_pyfunction!(lit, module)?)?; + // see https://github.com/apache/arrow-datafusion/issues/226 + //module.add_function(wrap_pyfunction!(concat, module)?)?; + module.add_function(wrap_pyfunction!(udf, module)?)?; + module.add_function(wrap_pyfunction!(sum, module)?)?; + module.add_function(wrap_pyfunction!(count, module)?)?; + module.add_function(wrap_pyfunction!(min, module)?)?; + module.add_function(wrap_pyfunction!(max, module)?)?; + module.add_function(wrap_pyfunction!(avg, module)?)?; + module.add_function(wrap_pyfunction!(udaf, module)?)?; + Ok(()) +} diff --git a/python/src/lib.rs b/python/src/lib.rs new file mode 100644 index 0000000000000..aecfe9994cd1a --- /dev/null +++ b/python/src/lib.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; + +mod context; +mod dataframe; +mod errors; +mod expression; +mod functions; +mod scalar; +mod to_py; +mod to_rust; +mod types; +mod udaf; +mod udf; + +/// DataFusion. +#[pymodule] +fn datafusion(py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + let functions = PyModule::new(py, "functions")?; + functions::init(functions)?; + m.add_submodule(functions)?; + + Ok(()) +} diff --git a/python/src/scalar.rs b/python/src/scalar.rs new file mode 100644 index 0000000000000..0c562a9403616 --- /dev/null +++ b/python/src/scalar.rs @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
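The `#[pymodule]` registration in `lib.rs` above is what defines the Python-facing surface: `ExecutionContext`, `DataFrame`, and `Expression` are added as classes on the top-level `datafusion` module, while `col`, `lit`, `udf`, `udaf`, and the aggregate helpers are exposed through the nested `functions` submodule. A minimal sketch of that surface from the Python side, assuming the extension is built and importable as `datafusion` (as configured in `python/Cargo.toml` above):

```python
import datafusion

# classes registered with add_class in lib.rs
ctx = datafusion.ExecutionContext()

# submodule registered with add_submodule; col/lit/udf/udaf/sum/... live here
f = datafusion.functions

# building an expression only constructs a logical-plan node; nothing executes yet
expr = (f.col("a") + f.lit(1)).alias("a_plus_1")
print(type(expr))
```

Only the functions wired up in `init` are importable; the `concat` wrapper stays commented out pending the issue referenced there.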
+ +use pyo3::prelude::*; + +use datafusion::scalar::ScalarValue as _Scalar; + +use crate::to_rust::to_rust_scalar; + +/// An expression that can be used on a DataFrame +#[derive(Debug, Clone)] +pub(crate) struct Scalar { + pub(crate) scalar: _Scalar, +} + +impl<'source> FromPyObject<'source> for Scalar { + fn extract(ob: &'source PyAny) -> PyResult { + Ok(Self { + scalar: to_rust_scalar(ob)?, + }) + } +} diff --git a/python/src/to_py.rs b/python/src/to_py.rs new file mode 100644 index 0000000000000..deeb9719891a3 --- /dev/null +++ b/python/src/to_py.rs @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use pyo3::prelude::*; +use pyo3::{libc::uintptr_t, PyErr}; + +use std::convert::From; + +use datafusion::arrow::array::ArrayRef; +use datafusion::arrow::record_batch::RecordBatch; + +use crate::errors; + +pub fn to_py_array(array: &ArrayRef, py: Python) -> PyResult { + let (array_pointer, schema_pointer) = + array.to_raw().map_err(errors::DataFusionError::from)?; + + let pa = py.import("pyarrow")?; + + let array = pa.getattr("Array")?.call_method1( + "_import_from_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + Ok(array.to_object(py)) +} + +fn to_py_batch<'a>( + batch: &RecordBatch, + py: Python, + pyarrow: &'a PyModule, +) -> Result { + let mut py_arrays = vec![]; + let mut py_names = vec![]; + + let schema = batch.schema(); + for (array, field) in batch.columns().iter().zip(schema.fields().iter()) { + let array = to_py_array(array, py)?; + + py_arrays.push(array); + py_names.push(field.name()); + } + + let record = pyarrow + .getattr("RecordBatch")? + .call_method1("from_arrays", (py_arrays, py_names))?; + + Ok(PyObject::from(record)) +} + +/// Converts a &[RecordBatch] into a Vec represented in PyArrow +pub fn to_py(batches: &[RecordBatch]) -> PyResult { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + let pyarrow = PyModule::import(py, "pyarrow")?; + let builtins = PyModule::import(py, "builtins")?; + + let mut py_batches = vec![]; + for batch in batches { + py_batches.push(to_py_batch(batch, py, pyarrow)?); + } + let result = builtins.call1("list", (py_batches,))?; + Ok(PyObject::from(result)) +} diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs new file mode 100644 index 0000000000000..d8f2307a49823 --- /dev/null +++ b/python/src/to_rust.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::{ + array::{make_array_from_raw, ArrayRef}, + datatypes::Field, + datatypes::Schema, + ffi, + record_batch::RecordBatch, +}; +use datafusion::scalar::ScalarValue; +use pyo3::{libc::uintptr_t, prelude::*}; + +use crate::{errors, types::PyDataType}; + +/// converts a pyarrow Array into a Rust Array +pub fn to_rust(ob: &PyAny) -> PyResult { + // prepare a pointer to receive the Array struct + let (array_pointer, schema_pointer) = + ffi::ArrowArray::into_raw(unsafe { ffi::ArrowArray::empty() }); + + // make the conversion through PyArrow's private API + // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds + ob.call_method1( + "_export_to_c", + (array_pointer as uintptr_t, schema_pointer as uintptr_t), + )?; + + let array = unsafe { make_array_from_raw(array_pointer, schema_pointer) } + .map_err(errors::DataFusionError::from)?; + Ok(array) +} + +pub fn to_rust_batch(batch: &PyAny) -> PyResult { + let schema = batch.getattr("schema")?; + let names = schema.getattr("names")?.extract::>()?; + + let fields = names + .iter() + .enumerate() + .map(|(i, name)| { + let field = schema.call_method1("field", (i,))?; + let nullable = field.getattr("nullable")?.extract::()?; + let py_data_type = field.getattr("type")?; + let data_type = py_data_type.extract::()?.data_type; + Ok(Field::new(name, data_type, nullable)) + }) + .collect::>()?; + + let schema = Arc::new(Schema::new(fields)); + + let arrays = (0..names.len()) + .map(|i| { + let array = batch.call_method1("column", (i,))?; + to_rust(array) + }) + .collect::>()?; + + let batch = + RecordBatch::try_new(schema, arrays).map_err(errors::DataFusionError::from)?; + Ok(batch) +} + +/// converts a pyarrow Scalar into a Rust Scalar +pub fn to_rust_scalar(ob: &PyAny) -> PyResult { + let t = ob + .getattr("__class__")? + .getattr("__name__")? 
+        .extract::<&str>()?;
+
+    let p = ob.call_method0("as_py")?;
+
+    Ok(match t {
+        "Int8Scalar" => ScalarValue::Int8(Some(p.extract::<i8>()?)),
+        "Int16Scalar" => ScalarValue::Int16(Some(p.extract::<i16>()?)),
+        "Int32Scalar" => ScalarValue::Int32(Some(p.extract::<i32>()?)),
+        "Int64Scalar" => ScalarValue::Int64(Some(p.extract::<i64>()?)),
+        "UInt8Scalar" => ScalarValue::UInt8(Some(p.extract::<u8>()?)),
+        "UInt16Scalar" => ScalarValue::UInt16(Some(p.extract::<u16>()?)),
+        "UInt32Scalar" => ScalarValue::UInt32(Some(p.extract::<u32>()?)),
+        "UInt64Scalar" => ScalarValue::UInt64(Some(p.extract::<u64>()?)),
+        "FloatScalar" => ScalarValue::Float32(Some(p.extract::<f32>()?)),
+        "DoubleScalar" => ScalarValue::Float64(Some(p.extract::<f64>()?)),
+        "BooleanScalar" => ScalarValue::Boolean(Some(p.extract::<bool>()?)),
+        "StringScalar" => ScalarValue::Utf8(Some(p.extract::<String>()?)),
+        "LargeStringScalar" => ScalarValue::LargeUtf8(Some(p.extract::<String>()?)),
+        other => {
+            return Err(errors::DataFusionError::Common(format!(
+                "Type \"{}\" not yet implemented",
+                other
+            ))
+            .into())
+        }
+    })
+}
diff --git a/python/src/types.rs b/python/src/types.rs
new file mode 100644
index 0000000000000..ffa822e073a89
--- /dev/null
+++ b/python/src/types.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
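`to_rust_scalar` above dispatches on the Python class name of a pyarrow scalar (via `__class__.__name__`) and pulls the value out with `as_py()`; anything it does not recognize falls through to the "not yet implemented" error. A small sketch of the objects it expects on the Python side; the class names shown (`Int64Scalar`, `DoubleScalar`, `StringScalar`, ...) are those used by the pyarrow releases this patch targets and may differ in other versions:

```python
import pyarrow

# the Rust side matches on type(value).__name__ and converts value.as_py()
for value in [pyarrow.scalar(1), pyarrow.scalar(1.5), pyarrow.scalar("x"), pyarrow.scalar(True)]:
    print(type(value).__name__, "->", value.as_py())
```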
+ +use datafusion::arrow::datatypes::DataType; +use pyo3::{FromPyObject, PyAny, PyResult}; + +use crate::errors; + +/// utility struct to convert PyObj to native DataType +#[derive(Debug, Clone)] +pub struct PyDataType { + pub data_type: DataType, +} + +impl<'source> FromPyObject<'source> for PyDataType { + fn extract(ob: &'source PyAny) -> PyResult { + let id = ob.getattr("id")?.extract::()?; + let data_type = data_type_id(&id)?; + Ok(PyDataType { data_type }) + } +} + +fn data_type_id(id: &i32) -> Result { + // see https://github.com/apache/arrow/blob/3694794bdfd0677b95b8c95681e392512f1c9237/python/pyarrow/includes/libarrow.pxd + // this is not ideal as it does not generalize for non-basic types + // Find a way to get a unique name from the pyarrow.DataType + Ok(match id { + 1 => DataType::Boolean, + 2 => DataType::UInt8, + 3 => DataType::Int8, + 4 => DataType::UInt16, + 5 => DataType::Int16, + 6 => DataType::UInt32, + 7 => DataType::Int32, + 8 => DataType::UInt64, + 9 => DataType::Int64, + + 10 => DataType::Float16, + 11 => DataType::Float32, + 12 => DataType::Float64, + + //13 => DataType::Decimal, + + // 14 => DataType::Date32(), + // 15 => DataType::Date64(), + // 16 => DataType::Timestamp(), + // 17 => DataType::Time32(), + // 18 => DataType::Time64(), + // 19 => DataType::Duration() + 20 => DataType::Binary, + 21 => DataType::Utf8, + 22 => DataType::LargeBinary, + 23 => DataType::LargeUtf8, + + other => { + return Err(errors::DataFusionError::Common(format!( + "The type {} is not valid", + other + ))) + } + }) +} diff --git a/python/src/udaf.rs b/python/src/udaf.rs new file mode 100644 index 0000000000000..3ce223df9a491 --- /dev/null +++ b/python/src/udaf.rs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use pyo3::{prelude::*, types::PyTuple}; + +use datafusion::arrow::array::ArrayRef; + +use datafusion::error::Result; +use datafusion::{ + error::DataFusionError as InnerDataFusionError, physical_plan::Accumulator, + scalar::ScalarValue, +}; + +use crate::scalar::Scalar; +use crate::to_py::to_py_array; +use crate::to_rust::to_rust_scalar; + +#[derive(Debug)] +struct PyAccumulator { + accum: PyObject, +} + +impl PyAccumulator { + fn new(accum: PyObject) -> Self { + Self { accum } + } +} + +impl Accumulator for PyAccumulator { + fn state(&self) -> Result> { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let state = self + .accum + .as_ref(py) + .call_method0("to_scalars") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))? 
+ .extract::>() + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(state.into_iter().map(|v| v.scalar).collect::>()) + } + + fn update(&mut self, _values: &[ScalarValue]) -> Result<()> { + // no need to implement as datafusion does not use it + todo!() + } + + fn merge(&mut self, _states: &[ScalarValue]) -> Result<()> { + // no need to implement as datafusion does not use it + todo!() + } + + fn evaluate(&self) -> Result { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let value = self + .accum + .as_ref(py) + .call_method0("evaluate") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + to_rust_scalar(value) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) + } + + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast args to Pyarrow array + // 2. call function + + // 1. + let py_args = values + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // update accumulator + self.accum + .as_ref(py) + .call_method1("update", py_args) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast states to Pyarrow array + // 2. merge + let state = &states[0]; + + let state = to_py_array(state, py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + // 2. + self.accum + .as_ref(py) + .call_method1("merge", (state,)) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + } +} + +pub fn array_udaf( + accumulator: PyObject, +) -> Arc Result> + Send + Sync> { + Arc::new(move || -> Result> { + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + let accumulator = accumulator + .call0(py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + Ok(Box::new(PyAccumulator::new(accumulator))) + }) +} diff --git a/python/src/udf.rs b/python/src/udf.rs new file mode 100644 index 0000000000000..7fee71008ef2f --- /dev/null +++ b/python/src/udf.rs @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
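`PyAccumulator` above adapts a Python object to DataFusion's `Accumulator` trait: `update_batch` forwards each batch of input values to the Python `update` method, `merge_batch` feeds partial states from other accumulators to `merge`, `state` round-trips the accumulator through `to_scalars`, and `evaluate` produces the final scalar. A rough Python-only sketch of that call sequence for a two-partition sum; the exact driving order is decided by DataFusion's physical plan, so this illustrates the protocol rather than a guaranteed schedule:

```python
import pyarrow
import pyarrow.compute


class Sum:
    """Same protocol as the Accumulator passed to f.udaf elsewhere in this patch."""

    def __init__(self):
        self._sum = pyarrow.scalar(0.0)

    def to_scalars(self):      # driven by PyAccumulator::state
        return [self._sum]

    def update(self, values):  # driven by PyAccumulator::update_batch
        self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py())

    def merge(self, states):   # driven by PyAccumulator::merge_batch
        self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py())

    def evaluate(self):        # driven by PyAccumulator::evaluate
        return self._sum


# one accumulator per partition, then a final merge of their states
a, b = Sum(), Sum()
a.update(pyarrow.array([1.0, 2.0]))
b.update(pyarrow.array([3.0, 4.0]))

final = Sum()
final.merge(pyarrow.array([s.as_py() for acc in (a, b) for s in acc.to_scalars()]))
print(final.evaluate().as_py())  # 10.0
```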
+ +use pyo3::{prelude::*, types::PyTuple}; + +use datafusion::{arrow::array, physical_plan::functions::make_scalar_function}; + +use datafusion::error::DataFusionError; +use datafusion::physical_plan::functions::ScalarFunctionImplementation; + +use crate::to_py::to_py_array; +use crate::to_rust::to_rust; + +/// creates a DataFusion's UDF implementation from a python function that expects pyarrow arrays +/// This is more efficient as it performs a zero-copy of the contents. +pub fn array_udf(func: PyObject) -> ScalarFunctionImplementation { + make_scalar_function( + move |args: &[array::ArrayRef]| -> Result { + // get GIL + let gil = pyo3::Python::acquire_gil(); + let py = gil.python(); + + // 1. cast args to Pyarrow arrays + // 2. call function + // 3. cast to arrow::array::Array + + // 1. + let py_args = args + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // 2. + let value = func.as_ref(py).call(py_args, None); + let value = match value { + Ok(n) => Ok(n), + Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), + }?; + + let array = to_rust(value).unwrap(); + Ok(array) + }, + ) +} diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/python/tests/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tests/generic.py b/python/tests/generic.py new file mode 100644 index 0000000000000..7362f0bb29569 --- /dev/null +++ b/python/tests/generic.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
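`array_udf` above hands every argument to the wrapped Python callable as a pyarrow `Array` (moved across via the C data interface, so without copying) and expects a pyarrow `Array` of the same length back. A minimal sketch of a callable that satisfies this contract, assuming the `f.udf` helper registered earlier in this patch; a realistic UDF would normally use `pyarrow.compute` kernels rather than a Python loop:

```python
import pyarrow
import datafusion

f = datafusion.functions


def is_positive(array: pyarrow.Array) -> pyarrow.Array:
    # receives a whole column chunk and must return an array of the same length
    return pyarrow.array([v is not None and v > 0 for v in array.to_pylist()])


# the declared argument and return types form the DataFusion-side signature
udf = f.udf(is_positive, [pyarrow.int64()], pyarrow.bool_())
# applied as df.select(udf(f.col("a"))), as in the README and tests of this patch
```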
+ +import unittest +import tempfile +import datetime +import os.path +import shutil + +import numpy +import pyarrow +import datafusion + +# used to write parquet files +import pyarrow.parquet + + +def data(): + data = numpy.concatenate( + [numpy.random.normal(0, 0.01, size=50), numpy.random.normal(50, 0.01, size=50)] + ) + return pyarrow.array(data) + + +def data_with_nans(): + data = numpy.random.normal(0, 0.01, size=50) + mask = numpy.random.randint(0, 2, size=50) + data[mask == 0] = numpy.NaN + return data + + +def data_datetime(f): + data = [ + datetime.datetime.now(), + datetime.datetime.now() - datetime.timedelta(days=1), + datetime.datetime.now() + datetime.timedelta(days=1), + ] + return pyarrow.array( + data, type=pyarrow.timestamp(f), mask=numpy.array([False, True, False]) + ) + + +def data_timedelta(f): + data = [ + datetime.timedelta(days=100), + datetime.timedelta(days=1), + datetime.timedelta(seconds=1), + ] + return pyarrow.array( + data, type=pyarrow.duration(f), mask=numpy.array([False, True, False]) + ) + + +def data_binary_other(): + return numpy.array([1, 0, 0], dtype="u4") + + +def write_parquet(path, data): + table = pyarrow.Table.from_arrays([data], names=["a"]) + pyarrow.parquet.write_table(table, path) + return path diff --git a/python/tests/test_df.py b/python/tests/test_df.py new file mode 100644 index 0000000000000..520d4e6a54723 --- /dev/null +++ b/python/tests/test_df.py @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
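The helpers above (`data`, `data_datetime`, `data_timedelta`, `write_parquet`, ...) produce single-column pyarrow data and write it to parquet so the SQL tests can read it back through `register_parquet`. A condensed sketch of that round trip, assuming the `ExecutionContext` API from `context.rs` above:

```python
import os
import tempfile

import pyarrow
import pyarrow.parquet
import datafusion

path = os.path.join(tempfile.mkdtemp(), "a.parquet")

# same shape as write_parquet: a single column named "a"
table = pyarrow.Table.from_arrays([pyarrow.array([1.0, 2.0, 3.0])], names=["a"])
pyarrow.parquet.write_table(table, path)

ctx = datafusion.ExecutionContext()
ctx.register_parquet("t", path)
print(ctx.sql("SELECT COUNT(a) FROM t").collect()[0].column(0))  # [3]
```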
+ +import unittest + +import pyarrow +import datafusion +f = datafusion.functions + + +class TestCase(unittest.TestCase): + + def _prepare(self): + ctx = datafusion.ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) + + def test_select(self): + df = self._prepare() + + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([5, 7, 9])) + self.assertEqual(result.column(1), pyarrow.array([-3, -3, -3])) + + def test_filter(self): + df = self._prepare() + + df = df \ + .select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) \ + .filter(f.col("a") > f.lit(2)) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([9])) + self.assertEqual(result.column(1), pyarrow.array([-3])) + + def test_limit(self): + df = self._prepare() + + df = df.limit(1) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(len(result.column(0)), 1) + self.assertEqual(len(result.column(1)), 1) + + def test_udf(self): + df = self._prepare() + + # is_null is a pyarrow function over arrays + udf = f.udf(lambda x: x.is_null(), [pyarrow.int64()], pyarrow.bool_()) + + df = df.select(udf(f.col("a"))) + + self.assertEqual(df.collect()[0].column(0), pyarrow.array([False, False, False])) + + def test_join(self): + ctx = datafusion.ExecutionContext() + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) + + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2]), pyarrow.array([8, 10])], + names=["a", "c"], + ) + df1 = ctx.create_dataframe([[batch]]) + + df = df.join(df1, on="a", how="inner") + + # execute and collect the first (and only) batch + batch = df.collect()[0] + + if batch.column(0) == pyarrow.array([1, 2]): + self.assertEqual(batch.column(0), pyarrow.array([1, 2])) + self.assertEqual(batch.column(1), pyarrow.array([8, 10])) + self.assertEqual(batch.column(2), pyarrow.array([4, 5])) + else: + self.assertEqual(batch.column(0), pyarrow.array([2, 1])) + self.assertEqual(batch.column(1), pyarrow.array([10, 8])) + self.assertEqual(batch.column(2), pyarrow.array([5, 4])) diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py new file mode 100644 index 0000000000000..e9047ea6e70c3 --- /dev/null +++ b/python/tests/test_sql.py @@ -0,0 +1,294 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
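One detail worth calling out from the tests above: `create_dataframe` takes a list of partitions, and each partition is itself a list of record batches, which is why every call passes `[[batch]]`. A small sketch with two partitions, under the same assumption that all batches share one schema:

```python
import pyarrow
import datafusion

ctx = datafusion.ExecutionContext()

batch1 = pyarrow.RecordBatch.from_arrays([pyarrow.array([1, 2])], names=["a"])
batch2 = pyarrow.RecordBatch.from_arrays([pyarrow.array([3, 4])], names=["a"])

# outer list: partitions; inner lists: the batches inside each partition
df = ctx.create_dataframe([[batch1], [batch2]])

# collect() returns a list of record batches, typically one per partition here
print(sum(batch.num_rows for batch in df.collect()))  # 4
```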
+ +import unittest +import tempfile +import datetime +import os.path +import shutil + +import numpy +import pyarrow +import datafusion + +# used to write parquet files +import pyarrow.parquet + +from tests.generic import * + + +class TestCase(unittest.TestCase): + def setUp(self): + # Create a temporary directory + self.test_dir = tempfile.mkdtemp() + numpy.random.seed(1) + + def tearDown(self): + # Remove the directory after the test + shutil.rmtree(self.test_dir) + + def test_no_table(self): + with self.assertRaises(Exception): + datafusion.Context().sql("SELECT a FROM b").collect() + + def test_register(self): + ctx = datafusion.ExecutionContext() + + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) + + ctx.register_parquet("t", path) + + self.assertEqual(ctx.tables(), {"t"}) + + def test_execute(self): + data = [1, 1, 2, 2, 3, 11, 12] + + ctx = datafusion.ExecutionContext() + + # single column, "a" + path = write_parquet( + os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data) + ) + ctx.register_parquet("t", path) + + self.assertEqual(ctx.tables(), {"t"}) + + # count + result = ctx.sql("SELECT COUNT(a) FROM t").collect() + + expected = pyarrow.array([7], pyarrow.uint64()) + expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + self.assertEqual(expected, result) + + # where + expected = pyarrow.array([2], pyarrow.uint64()) + expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + self.assertEqual( + expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect() + ) + + # group by + result = ctx.sql( + "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)" + ).collect() + + result_keys = result[0].to_pydict()["CAST(a AS Int32)"] + result_values = result[0].to_pydict()["COUNT(a)"] + result_keys, result_values = ( + list(t) for t in zip(*sorted(zip(result_keys, result_values))) + ) + + self.assertEqual(result_keys, [1, 2, 3, 11, 12]) + self.assertEqual(result_values, [2, 2, 1, 1, 1]) + + # order by + result = ctx.sql( + "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2" + ).collect() + expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64()) + expected_cast = pyarrow.array([50, 50], pyarrow.int32()) + expected = [ + pyarrow.RecordBatch.from_arrays( + [expected_a, expected_cast], ["a", "CAST(a AS Int32)"] + ) + ] + numpy.testing.assert_equal(expected[0].column(1), expected[0].column(1)) + + def test_cast(self): + """ + Verify that we can cast + """ + ctx = datafusion.ExecutionContext() + + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) + ctx.register_parquet("t", path) + + valid_types = [ + "smallint", + "int", + "bigint", + "float(32)", + "float(64)", + "float", + ] + + select = ", ".join( + [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] + ) + + # can execute, which implies that we can cast + ctx.sql(f"SELECT {select} FROM t").collect() + + def _test_udf(self, udf, args, return_type, array, expected): + ctx = datafusion.ExecutionContext() + + # write to disk + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array) + ctx.register_parquet("t", path) + + ctx.register_udf("udf", udf, args, return_type) + + batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() + + result = batches[0].column(0) + + self.assertEqual(expected, result) + + def test_udf_identity(self): + self._test_udf( + lambda x: x, + [pyarrow.float64()], + pyarrow.float64(), + pyarrow.array([-1.2, None, 1.2]), + pyarrow.array([-1.2, None, 1.2]), + ) + + def test_udf(self): + 
self._test_udf( + lambda x: x.is_null(), + [pyarrow.float64()], + pyarrow.bool_(), + pyarrow.array([-1.2, None, 1.2]), + pyarrow.array([False, True, False]), + ) + + +class TestIO(unittest.TestCase): + def setUp(self): + # Create a temporary directory + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + # Remove the directory after the test + shutil.rmtree(self.test_dir) + + def _test_data(self, data): + ctx = datafusion.ExecutionContext() + + # write to disk + path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data) + ctx.register_parquet("t", path) + + batches = ctx.sql("SELECT a AS tt FROM t").collect() + + result = batches[0].column(0) + + numpy.testing.assert_equal(data, result) + + def test_nans(self): + self._test_data(data_with_nans()) + + def test_utf8(self): + array = pyarrow.array( + ["a", "b", "c"], pyarrow.utf8(), numpy.array([False, True, False]) + ) + self._test_data(array) + + def test_large_utf8(self): + array = pyarrow.array( + ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False]) + ) + self._test_data(array) + + # Error from Arrow + @unittest.expectedFailure + def test_datetime_s(self): + self._test_data(data_datetime("s")) + + # C data interface missing + @unittest.expectedFailure + def test_datetime_ms(self): + self._test_data(data_datetime("ms")) + + # C data interface missing + @unittest.expectedFailure + def test_datetime_us(self): + self._test_data(data_datetime("us")) + + # Not writtable to parquet + @unittest.expectedFailure + def test_datetime_ns(self): + self._test_data(data_datetime("ns")) + + # Not writtable to parquet + @unittest.expectedFailure + def test_timedelta_s(self): + self._test_data(data_timedelta("s")) + + # Not writtable to parquet + @unittest.expectedFailure + def test_timedelta_ms(self): + self._test_data(data_timedelta("ms")) + + # Not writtable to parquet + @unittest.expectedFailure + def test_timedelta_us(self): + self._test_data(data_timedelta("us")) + + # Not writtable to parquet + @unittest.expectedFailure + def test_timedelta_ns(self): + self._test_data(data_timedelta("ns")) + + def test_date32(self): + array = pyarrow.array( + [ + datetime.date(2000, 1, 1), + datetime.date(1980, 1, 1), + datetime.date(2030, 1, 1), + ], + pyarrow.date32(), + numpy.array([False, True, False]), + ) + self._test_data(array) + + def test_binary_variable(self): + array = pyarrow.array( + [b"1", b"2", b"3"], pyarrow.binary(), numpy.array([False, True, False]) + ) + self._test_data(array) + + # C data interface missing + @unittest.expectedFailure + def test_binary_fixed(self): + array = pyarrow.array( + [b"1111", b"2222", b"3333"], + pyarrow.binary(4), + numpy.array([False, True, False]), + ) + self._test_data(array) + + def test_large_binary(self): + array = pyarrow.array( + [b"1111", b"2222", b"3333"], + pyarrow.large_binary(), + numpy.array([False, True, False]), + ) + self._test_data(array) + + def test_binary_other(self): + self._test_data(data_binary_other()) + + def test_bool(self): + array = pyarrow.array( + [False, True, True], None, numpy.array([False, True, False]) + ) + self._test_data(array) + + def test_u32(self): + array = pyarrow.array([0, 1, 2], None, numpy.array([False, True, False])) + self._test_data(array) diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py new file mode 100644 index 0000000000000..ffd235e285f80 --- /dev/null +++ b/python/tests/test_udaf.py @@ -0,0 +1,91 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest + +import pyarrow +import pyarrow.compute +import datafusion + +f = datafusion.functions + + +class Accumulator: + """ + Interface of a user-defined accumulation. + """ + + def __init__(self): + self._sum = pyarrow.scalar(0.0) + + def to_scalars(self) -> [pyarrow.Scalar]: + return [self._sum] + + def update(self, values: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar( + self._sum.as_py() + pyarrow.compute.sum(values).as_py() + ) + + def merge(self, states: pyarrow.Array) -> None: + # not nice since pyarrow scalars can't be summed yet. This breaks on `None` + self._sum = pyarrow.scalar( + self._sum.as_py() + pyarrow.compute.sum(states).as_py() + ) + + def evaluate(self) -> pyarrow.Scalar: + return self._sum + + +class TestCase(unittest.TestCase): + def _prepare(self): + ctx = datafusion.ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, 2, 3]), pyarrow.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) + + def test_aggregate(self): + df = self._prepare() + + udaf = f.udaf( + Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] + ) + + df = df.aggregate([], [udaf(f.col("a"))]) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(0), pyarrow.array([1.0 + 2.0 + 3.0])) + + def test_group_by(self): + df = self._prepare() + + udaf = f.udaf( + Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] + ) + + df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + self.assertEqual(result.column(1), pyarrow.array([1.0 + 2.0, 3.0])) From ea451c9b809aed832ac41947424a595856d87af5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 May 2021 13:41:09 -0400 Subject: [PATCH 068/329] Update arrow deps (#269) --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 283c2ebb44d71..b077af69b954c 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml 
b/ballista/rust/core/Cargo.toml index 853aa7ae2f45f..dd3cbe65226cb 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index cdc1b45382263..5b9dc003013b8 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 507dc5465006b..7315533426529 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 0ec30105a409d..f3fe39f5b6fb6 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index e35b8feea7373..b42bef682b0d0 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "d008f31b107c1030a1f5144c164e8ca8bf543576", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" From 1c40738232f2babf297dfdf6a563f03cf91e3572 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: 
Thu, 6 May 2021 18:09:00 +0800 Subject: [PATCH 069/329] Use multi-stage build dockerfile in datafusion-cli and reduce image size from 2.16GB to 89.9MB (#266) * optimize datafusion-cli docker image size * update workspace def * get rid of alpine * use debian:buster-slim --- Cargo.toml | 2 +- datafusion-cli/Dockerfile | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9795cb68b4456..351523d74c36a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ members = [ "datafusion", "datafusion-cli", "datafusion-examples", - "benchmarks", + "benchmarks", "ballista/rust/client", "ballista/rust/core", "ballista/rust/executor", diff --git a/datafusion-cli/Dockerfile b/datafusion-cli/Dockerfile index 32bd38b9419dc..2a8ab16a2554b 100644 --- a/datafusion-cli/Dockerfile +++ b/datafusion-cli/Dockerfile @@ -15,14 +15,20 @@ # specific language governing permissions and limitations # under the License. -FROM rust:latest +FROM rust:latest as builder +COPY ./datafusion /usr/src/datafusion -COPY ./datafusion ./usr/src/datafusion -COPY ./datafusion-cli ./usr/src/datafusion-cli +COPY ./datafusion-cli /usr/src/datafusion-cli WORKDIR /usr/src/datafusion-cli -RUN cargo install --path . +RUN cargo build --release -CMD ["datafusion-cli", "--data-path", "/data"] +FROM debian:buster-slim + +COPY --from=builder /usr/src/datafusion-cli/target/release/datafusion-cli /usr/local/bin + +ENTRYPOINT ["datafusion-cli"] + +CMD ["--data-path", "/data"] From 24e248ded052fc682b88f5ce33b1af35622d2040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 6 May 2021 17:51:21 +0200 Subject: [PATCH 070/329] Fix wrong projection 'optimization' (#268) * Fix wrong projection 'optimization' * Fmt * test * Update some more * Update tests * Fix test --- datafusion/src/execution/dataframe_impl.rs | 36 +++--- datafusion/src/sql/planner.rs | 130 +++++++++++---------- datafusion/tests/sql.rs | 34 ++++-- 3 files changed, 112 insertions(+), 88 deletions(-) diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 2a0c39aa48ebd..fdc75f92f2e75 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -177,9 +177,11 @@ impl DataFrame for DataFrameImpl { #[cfg(test)] mod tests { + use std::vec; + use super::*; - use crate::execution::context::ExecutionContext; use crate::logical_plan::*; + use crate::{assert_batches_sorted_eq, execution::context::ExecutionContext}; use crate::{datasource::csv::CsvReadOptions, physical_plan::ColumnarValue}; use crate::{physical_plan::functions::ScalarFunctionImplementation, test}; use arrow::datatypes::DataType; @@ -216,8 +218,8 @@ mod tests { Ok(()) } - #[test] - fn aggregate() -> Result<()> { + #[tokio::test] + async fn aggregate() -> Result<()> { // build plan using DataFrame API let df = test_table()?; let group_expr = vec![col("c1")]; @@ -230,18 +232,22 @@ mod tests { count_distinct(col("c12")), ]; - let df = df.aggregate(group_expr, aggr_expr)?; - - let plan = df.to_logical_plan(); - - // build same plan using SQL API - let sql = "SELECT c1, MIN(c12), MAX(c12), AVG(c12), SUM(c12), COUNT(c12), COUNT(DISTINCT c12) \ - FROM aggregate_test_100 \ - GROUP BY c1"; - let sql_plan = create_plan(sql)?; - - // the two plans should be identical - assert_same_plan(&plan, &sql_plan); + let df: Vec = df.aggregate(group_expr, aggr_expr)?.collect().await?; + + assert_batches_sorted_eq!( + vec![ + 
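// The expected batch below is the DataFrame aggregate over aggregate_test_100: one row per
// distinct value of the group-by column c1; assert_batches_sorted_eq compares after sorting,
// so row order in the collected result does not matter.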
"+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+", + "| c1 | MIN(c12) | MAX(c12) | AVG(c12) | SUM(c12) | COUNT(c12) | COUNT(DISTINCT c12) |", + "+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+", + "| a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 |", + "| b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 |", + "| c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 |", + "| d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 |", + "| e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 |", + "+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+", + ], + &df + ); Ok(()) } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 48900f56aad5e..ed7dd377c835c 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -652,26 +652,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Wrap a plan in a projection - /// - /// The projection is applied only when necessary, - /// i.e., when the input fields are different than the - /// projection. Note that if the input fields are the same, but out of - /// order, the projection will be applied. fn project(&self, input: &LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(&input.schema(), &expr)?; - let plan = LogicalPlanBuilder::from(input).project(expr)?.build()?; - - let project = match input { - LogicalPlan::TableScan { .. 
} => true, - _ => plan.schema().fields() != input.schema().fields(), - }; - - if project { - Ok(plan) - } else { - Ok(input.clone()) - } + LogicalPlanBuilder::from(input).project(expr)?.build() } fn aggregate( @@ -1737,10 +1721,11 @@ mod tests { ) WHERE fn1 = 'X' AND age < 30"; - let expected = "Filter: #fn1 Eq Utf8(\"X\") And #age Lt Int64(30)\ - \n Projection: #first_name AS fn1, #age\ - \n Filter: #age Gt Int64(20)\ - \n TableScan: person projection=None"; + let expected = "Projection: #fn1, #age\ + \n Filter: #fn1 Eq Utf8(\"X\") And #age Lt Int64(30)\ + \n Projection: #first_name AS fn1, #age\ + \n Filter: #age Gt Int64(20)\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1797,9 +1782,10 @@ mod tests { let sql = "SELECT MAX(age) FROM person HAVING MAX(age) < 30"; - let expected = "Filter: #MAX(age) Lt Int64(30)\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(#age)]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #MAX(age)\ + \n Filter: #MAX(age) Lt Int64(30)\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#age)]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1857,9 +1843,10 @@ mod tests { FROM person GROUP BY first_name HAVING first_name = 'M'"; - let expected = "Filter: #first_name Eq Utf8(\"M\")\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #first_name, #MAX(age)\ + \n Filter: #first_name Eq Utf8(\"M\")\ + \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1870,10 +1857,11 @@ mod tests { WHERE id > 5 GROUP BY first_name HAVING MAX(age) < 100"; - let expected = "Filter: #MAX(age) Lt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n Filter: #id Gt Int64(5)\ - \n TableScan: person projection=None"; + let expected = "Projection: #first_name, #MAX(age)\ + \n Filter: #MAX(age) Lt Int64(100)\ + \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + \n Filter: #id Gt Int64(5)\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1885,10 +1873,11 @@ mod tests { WHERE id > 5 AND age > 18 GROUP BY first_name HAVING MAX(age) < 100"; - let expected = "Filter: #MAX(age) Lt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n Filter: #id Gt Int64(5) And #age Gt Int64(18)\ - \n TableScan: person projection=None"; + let expected = "Projection: #first_name, #MAX(age)\ + \n Filter: #MAX(age) Lt Int64(100)\ + \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + \n Filter: #id Gt Int64(5) And #age Gt Int64(18)\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1925,9 +1914,10 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100"; - let expected = "Filter: #MAX(age) Gt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #first_name, #MAX(age)\ + \n Filter: #MAX(age) Gt Int64(100)\ + \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1950,9 +1940,10 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100 AND MAX(age) < 200"; - let expected = "Filter: #MAX(age) Gt Int64(100) And #MAX(age) Lt Int64(200)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #first_name, #MAX(age)\ + \n 
Filter: #MAX(age) Gt Int64(100) And #MAX(age) Lt Int64(200)\ + \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2045,14 +2036,16 @@ mod tests { fn select_wildcard_with_groupby() { quick_test( "SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date", - "Aggregate: groupBy=[[#id, #first_name, #last_name, #age, #state, #salary, #birth_date]], aggr=[[]]\ - \n TableScan: person projection=None", + "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date\ + \n Aggregate: groupBy=[[#id, #first_name, #last_name, #age, #state, #salary, #birth_date]], aggr=[[]]\ + \n TableScan: person projection=None", ); quick_test( "SELECT * FROM (SELECT first_name, last_name FROM person) GROUP BY first_name, last_name", - "Aggregate: groupBy=[[#first_name, #last_name]], aggr=[[]]\ - \n Projection: #first_name, #last_name\ - \n TableScan: person projection=None", + "Projection: #first_name, #last_name\ + \n Aggregate: groupBy=[[#first_name, #last_name]], aggr=[[]]\ + \n Projection: #first_name, #last_name\ + \n TableScan: person projection=None", ); } @@ -2060,8 +2053,9 @@ mod tests { fn select_simple_aggregate() { quick_test( "SELECT MIN(age) FROM person", - "Aggregate: groupBy=[[]], aggr=[[MIN(#age)]]\ - \n TableScan: person projection=None", + "Projection: #MIN(age)\ + \n Aggregate: groupBy=[[]], aggr=[[MIN(#age)]]\ + \n TableScan: person projection=None", ); } @@ -2069,8 +2063,9 @@ mod tests { fn test_sum_aggregate() { quick_test( "SELECT SUM(age) from person", - "Aggregate: groupBy=[[]], aggr=[[SUM(#age)]]\ - \n TableScan: person projection=None", + "Projection: #SUM(age)\ + \n Aggregate: groupBy=[[]], aggr=[[SUM(#age)]]\ + \n TableScan: person projection=None", ); } @@ -2131,8 +2126,9 @@ mod tests { fn select_simple_aggregate_with_groupby() { quick_test( "SELECT state, MIN(age), MAX(age) FROM person GROUP BY state", - "Aggregate: groupBy=[[#state]], aggr=[[MIN(#age), MAX(#age)]]\ - \n TableScan: person projection=None", + "Projection: #state, #MIN(age), #MAX(age)\ + \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age), MAX(#age)]]\ + \n TableScan: person projection=None", ); } @@ -2269,8 +2265,9 @@ mod tests { ) { quick_test( "SELECT age + 1, MIN(first_name) FROM person GROUP BY age + 1", - "Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ - \n TableScan: person projection=None", + "Projection: #age Plus Int64(1), #MIN(first_name)\ + \n Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ + \n TableScan: person projection=None", ); quick_test( "SELECT MIN(first_name), age + 1 FROM person GROUP BY age + 1", @@ -2349,8 +2346,9 @@ mod tests { fn select_aggregate_with_non_column_inner_expression_with_groupby() { quick_test( "SELECT state, MIN(age + 1) FROM person GROUP BY state", - "Aggregate: groupBy=[[#state]], aggr=[[MIN(#age Plus Int64(1))]]\ - \n TableScan: person projection=None", + "Projection: #state, #MIN(age Plus Int64(1))\ + \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age Plus Int64(1))]]\ + \n TableScan: person projection=None", ); } @@ -2366,16 +2364,18 @@ mod tests { #[test] fn select_count_one() { let sql = "SELECT COUNT(1) FROM person"; - let expected = "Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } #[test] 
fn select_count_column() { let sql = "SELECT COUNT(id) FROM person"; - let expected = "Aggregate: groupBy=[[]], aggr=[[COUNT(#id)]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #COUNT(id)\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(#id)]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2461,8 +2461,9 @@ mod tests { #[test] fn select_group_by() { let sql = "SELECT state FROM person GROUP BY state"; - let expected = "Aggregate: groupBy=[[#state]], aggr=[[]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #state\ + \n Aggregate: groupBy=[[#state]], aggr=[[]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2480,8 +2481,9 @@ mod tests { #[test] fn select_group_by_count_star() { let sql = "SELECT state, COUNT(*) FROM person GROUP BY state"; - let expected = "Aggregate: groupBy=[[#state]], aggr=[[COUNT(UInt8(1))]]\ - \n TableScan: person projection=None"; + let expected = "Projection: #state, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#state]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: person projection=None"; quick_test(sql, expected); } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index bf28525ad437f..5c90f8ac162b3 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -81,15 +81,18 @@ async fn nyc() -> Result<()> { let optimized_plan = ctx.optimize(&logical_plan)?; match &optimized_plan { - LogicalPlan::Aggregate { input, .. } => match input.as_ref() { - LogicalPlan::TableScan { - ref projected_schema, - .. - } => { - assert_eq!(2, projected_schema.fields().len()); - assert_eq!(projected_schema.field(0).name(), "passenger_count"); - assert_eq!(projected_schema.field(1).name(), "fare_amount"); - } + LogicalPlan::Projection { input, .. } => match input.as_ref() { + LogicalPlan::Aggregate { input, .. } => match input.as_ref() { + LogicalPlan::TableScan { + ref projected_schema, + .. 
+ } => { + assert_eq!(2, projected_schema.fields().len()); + assert_eq!(projected_schema.field(0).name(), "passenger_count"); + assert_eq!(projected_schema.field(1).name(), "fare_amount"); + } + _ => unreachable!(), + }, _ => unreachable!(), }, _ => unreachable!(false), @@ -447,6 +450,19 @@ async fn select_distinct_simple() -> Result<()> { Ok(()) } +#[tokio::test] +async fn projection_same_fields() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + let sql = "select (1+1) as a from (select 1 as a);"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["2"]]; + assert_eq!(actual, expected); + + Ok(()) +} + #[tokio::test] async fn csv_query_group_by_float64() -> Result<()> { let mut ctx = ExecutionContext::new(); From f7a7505e2ee52546fcb0ad6e5bf266fb6f0da033 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 7 May 2021 02:06:03 +0800 Subject: [PATCH 071/329] fix 265, log should be log10, and add ln (#271) --- ballista/rust/core/proto/ballista.proto | 23 ++++++++++--------- .../core/src/serde/logical_plan/from_proto.rs | 5 ++-- .../core/src/serde/logical_plan/to_proto.rs | 1 + datafusion/src/logical_plan/expr.rs | 2 +- datafusion/src/physical_plan/functions.rs | 9 ++++++-- .../src/physical_plan/math_expressions.rs | 2 +- 6 files changed, 25 insertions(+), 17 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index b6bc5d09c3925..381221f38d270 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -33,7 +33,7 @@ message LogicalExprNode { oneof ExprType { // column references string column_name = 1; - + // alias AliasNode alias = 2; @@ -42,15 +42,15 @@ message LogicalExprNode { // binary expressions BinaryExprNode binary_expr = 4; - + // aggregate expressions AggregateExprNode aggregate_expr = 5; - + // null checks IsNull is_null_expr = 6; IsNotNull is_not_null_expr = 7; Not not_expr = 8; - + BetweenNode between = 9; CaseNode case_ = 10; CastNode cast = 11; @@ -130,6 +130,7 @@ enum ScalarFunction { SHA256 = 30; SHA384 = 31; SHA512 = 32; + LN = 33; } message ScalarFunctionNode { @@ -361,7 +362,7 @@ message CsvScanExecNode { bool has_header = 5; uint32 batch_size = 6; string delimiter = 7; - + // partition filenames repeated string filename = 8; } @@ -466,7 +467,7 @@ message Action { // Fetch a partition from an executor PartitionId fetch_partition = 3; } - + // configuration settings repeated KeyValuePair settings = 100; } @@ -742,10 +743,10 @@ message ScalarValue{ } } -// Contains all valid datafusion scalar type except for +// Contains all valid datafusion scalar type except for // List enum PrimitiveScalarType{ - + BOOL = 0; // arrow::Type::BOOL UINT8 = 1; // arrow::Type::UINT8 INT8 = 2; // arrow::Type::INT8 @@ -777,7 +778,7 @@ message ScalarListType{ PrimitiveScalarType deepest_type = 2; } -// Broke out into multiple message types so that type +// Broke out into multiple message types so that type // metadata did not need to be in separate message //All types that are of the empty message types contain no additional metadata // about the type @@ -794,7 +795,7 @@ message ArrowType{ EmptyMessage UINT64 =9; EmptyMessage INT64 =10 ; EmptyMessage FLOAT16 =11 ; - EmptyMessage FLOAT32 =12 ; + EmptyMessage FLOAT32 =12 ; EmptyMessage FLOAT64 =13 ; EmptyMessage UTF8 =14 ; EmptyMessage LARGE_UTF8 = 32; @@ -824,7 +825,7 @@ message ArrowType{ //Useful for representing an empty enum variant in rust // E.G. 
enum example{One, Two(i32)} -// maps to +// maps to // message example{ // oneof{ // EmptyMessage One = 1; diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 18a85d2796cf4..ab7c55f4500d6 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -28,8 +28,8 @@ use crate::{convert_box_required, convert_required}; use arrow::datatypes::{DataType, Field, Schema}; use datafusion::logical_plan::{ - abs, acos, asin, atan, ceil, cos, exp, floor, log10, log2, round, signum, sin, sqrt, - tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, + abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, + sqrt, tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; @@ -1013,6 +1013,7 @@ impl TryInto for &protobuf::LogicalExprNode { protobuf::ScalarFunction::Log2 => { Ok(log2((&expr.expr[0]).try_into()?)) } + protobuf::ScalarFunction::Ln => Ok(ln((&expr.expr[0]).try_into()?)), protobuf::ScalarFunction::Log10 => { Ok(log10((&expr.expr[0]).try_into()?)) } diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 560578df9f5b3..de4b86e38413b 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -1200,6 +1200,7 @@ impl TryInto for &BuiltinScalarFunction { BuiltinScalarFunction::Atan => Ok(protobuf::ScalarFunction::Atan), BuiltinScalarFunction::Exp => Ok(protobuf::ScalarFunction::Exp), BuiltinScalarFunction::Log => Ok(protobuf::ScalarFunction::Log), + BuiltinScalarFunction::Ln => Ok(protobuf::ScalarFunction::Ln), BuiltinScalarFunction::Log10 => Ok(protobuf::ScalarFunction::Log10), BuiltinScalarFunction::Floor => Ok(protobuf::ScalarFunction::Floor), BuiltinScalarFunction::Ceil => Ok(protobuf::ScalarFunction::Ceil), diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index fa9b9e0a2490f..3365bf2603234 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1086,9 +1086,9 @@ unary_scalar_expr!(Trunc, trunc); unary_scalar_expr!(Abs, abs); unary_scalar_expr!(Signum, signum); unary_scalar_expr!(Exp, exp); -unary_scalar_expr!(Log, ln); unary_scalar_expr!(Log2, log2); unary_scalar_expr!(Log10, log10); +unary_scalar_expr!(Ln, ln); // string functions unary_scalar_expr!(Ascii, ascii); diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 56365fec1dc87..960d7c5d8e0d7 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -102,7 +102,9 @@ pub enum BuiltinScalarFunction { Exp, /// floor Floor, - /// log, also known as ln + /// ln, Natural logarithm + Ln, + /// log, same as log10 Log, /// log10 Log10, @@ -222,6 +224,7 @@ impl FromStr for BuiltinScalarFunction { "cos" => BuiltinScalarFunction::Cos, "exp" => BuiltinScalarFunction::Exp, "floor" => BuiltinScalarFunction::Floor, + "ln" => BuiltinScalarFunction::Ln, "log" => BuiltinScalarFunction::Log, "log10" => BuiltinScalarFunction::Log10, "log2" => BuiltinScalarFunction::Log2, @@ -633,6 +636,7 @@ pub fn return_type( | BuiltinScalarFunction::Exp | BuiltinScalarFunction::Floor | BuiltinScalarFunction::Log + | 
BuiltinScalarFunction::Ln | BuiltinScalarFunction::Log10 | BuiltinScalarFunction::Log2 | BuiltinScalarFunction::Round @@ -721,7 +725,8 @@ pub fn create_physical_expr( BuiltinScalarFunction::Cos => math_expressions::cos, BuiltinScalarFunction::Exp => math_expressions::exp, BuiltinScalarFunction::Floor => math_expressions::floor, - BuiltinScalarFunction::Log => math_expressions::ln, + BuiltinScalarFunction::Log => math_expressions::log10, + BuiltinScalarFunction::Ln => math_expressions::ln, BuiltinScalarFunction::Log10 => math_expressions::log10, BuiltinScalarFunction::Log2 => math_expressions::log2, BuiltinScalarFunction::Round => math_expressions::round, diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index 382a15f8ccf6e..72b4f102f6627 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -113,6 +113,6 @@ math_unary_function!("trunc", trunc); math_unary_function!("abs", abs); math_unary_function!("signum", signum); math_unary_function!("exp", exp); -math_unary_function!("log", ln); +math_unary_function!("ln", ln); math_unary_function!("log2", log2); math_unary_function!("log10", log10); From 27be271d1aae7cedd8d0e809a79cb16f41506245 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 6 May 2021 14:14:00 -0400 Subject: [PATCH 072/329] Implement count distinct for dictionary arrays (#256) * Implement count distinct for dictionary arrays * cleanup * cleanup * fix: clippy --- datafusion/src/execution/context.rs | 19 ++++++++ .../src/physical_plan/distinct_expressions.rs | 37 ++++++++++---- datafusion/src/physical_plan/mod.rs | 1 + datafusion/src/scalar.rs | 48 +++++++++++++++++-- 4 files changed, 92 insertions(+), 13 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index dee253f44ac33..b53f7c15e3aac 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1766,6 +1766,25 @@ mod tests { "+-----+-------------+", ]; assert_batches_sorted_eq!(expected, &results); + + // Now, use dict as an aggregate + let results = plan_and_collect( + &mut ctx, + "SELECT val, count(distinct dict) FROM t GROUP BY val", + ) + .await + .expect("ran plan correctly"); + + let expected = vec![ + "+-----+----------------------+", + "| val | COUNT(DISTINCT dict) |", + "+-----+----------------------+", + "| 1 | 2 |", + "| 2 | 2 |", + "| 4 | 1 |", + "+-----+----------------------+", + ]; + assert_batches_sorted_eq!(expected, &results); } run_test_case::().await; diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index a4dd25d0157b3..ffc138ad4574b 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -47,8 +47,8 @@ pub struct DistinctCount { name: String, /// The DataType for the final count data_type: DataType, - /// The DataType for each input argument - input_data_types: Vec, + /// The DataType used to hold the state for each input + state_data_types: Vec, /// The input arguments exprs: Vec>, } @@ -61,8 +61,10 @@ impl DistinctCount { name: String, data_type: DataType, ) -> Self { + let state_data_types = input_data_types.into_iter().map(state_type).collect(); + Self { - input_data_types, + state_data_types, exprs, name, data_type, @@ -70,6 +72,15 @@ impl DistinctCount { } } +/// return the type to use to accumulate state for the specified input type +fn 
state_type(data_type: DataType) -> DataType { + match data_type { + // when aggregating dictionary values, use the underlying value type + DataType::Dictionary(_key_type, value_type) => *value_type, + t => t, + } +} + impl AggregateExpr for DistinctCount { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { @@ -82,12 +93,16 @@ impl AggregateExpr for DistinctCount { fn state_fields(&self) -> Result> { Ok(self - .input_data_types + .state_data_types .iter() - .map(|data_type| { + .map(|state_data_type| { Field::new( &format_state_name(&self.name, "count distinct"), - DataType::List(Box::new(Field::new("item", data_type.clone(), true))), + DataType::List(Box::new(Field::new( + "item", + state_data_type.clone(), + true, + ))), false, ) }) @@ -101,7 +116,7 @@ impl AggregateExpr for DistinctCount { fn create_accumulator(&self) -> Result> { Ok(Box::new(DistinctCountAccumulator { values: HashSet::default(), - data_types: self.input_data_types.clone(), + state_data_types: self.state_data_types.clone(), count_data_type: self.data_type.clone(), })) } @@ -110,7 +125,7 @@ impl AggregateExpr for DistinctCount { #[derive(Debug)] struct DistinctCountAccumulator { values: HashSet, - data_types: Vec, + state_data_types: Vec, count_data_type: DataType, } @@ -156,9 +171,11 @@ impl Accumulator for DistinctCountAccumulator { fn state(&self) -> Result> { let mut cols_out = self - .data_types + .state_data_types .iter() - .map(|data_type| ScalarValue::List(Some(Vec::new()), data_type.clone())) + .map(|state_data_type| { + ScalarValue::List(Some(Vec::new()), state_data_type.clone()) + }) .collect::>(); let mut cols_vec = cols_out diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 11f0946c91ff6..a8f6f0c35f00e 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -274,6 +274,7 @@ pub trait AggregateExpr: Send + Sync + Debug { /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be /// downcast to a specific implementation. fn as_any(&self) -> &dyn Any; + /// the field of the final result of this aggregation. fn field(&self) -> Result; diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index dd3fb58757bed..7b76b7041ef2a 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -19,10 +19,13 @@ use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; -use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; +use arrow::datatypes::{ArrowDictionaryKeyType, DataType, Field, IntervalUnit, TimeUnit}; use arrow::{ array::*, - datatypes::{ArrowNativeType, Float32Type, TimestampNanosecondType}, + datatypes::{ + ArrowNativeType, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampNanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + }, }; use arrow::{ array::{ @@ -446,14 +449,53 @@ impl ScalarValue { DataType::Timestamp(TimeUnit::Nanosecond, _) => { typed_cast!(array, index, TimestampNanosecondArray, TimestampNanosecond) } + DataType::Dictionary(index_type, _) => match **index_type { + DataType::Int8 => Self::try_from_dict_array::(array, index)?, + DataType::Int16 => Self::try_from_dict_array::(array, index)?, + DataType::Int32 => Self::try_from_dict_array::(array, index)?, + DataType::Int64 => Self::try_from_dict_array::(array, index)?, + DataType::UInt8 => Self::try_from_dict_array::(array, index)?, + DataType::UInt16 => { + Self::try_from_dict_array::(array, index)? 
+ } + DataType::UInt32 => { + Self::try_from_dict_array::(array, index)? + } + DataType::UInt64 => { + Self::try_from_dict_array::(array, index)? + } + _ => { + return Err(DataFusionError::Internal(format!( + "Index type not supported while creating scalar from dictionary: {}", + array.data_type(), + ))) + } + }, other => { return Err(DataFusionError::NotImplemented(format!( - "Can't create a scalar of array of type \"{:?}\"", + "Can't create a scalar from array of type \"{:?}\"", other ))) } }) } + + fn try_from_dict_array( + array: &ArrayRef, + index: usize, + ) -> Result { + let dict_array = array.as_any().downcast_ref::>().unwrap(); + + // look up the index in the values dictionary + let keys_col = dict_array.keys_array(); + let values_index = keys_col.value(index).to_usize().ok_or_else(|| { + DataFusionError::Internal(format!( + "Can not convert index to usize in dictionary of type creating group by value {:?}", + keys_col.data_type() + )) + })?; + Self::try_from_array(&dict_array.values(), values_index) + } } impl From for ScalarValue { From 9dface6c72fb812c7480571b1934288c80580a09 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 7 May 2021 02:15:43 +0800 Subject: [PATCH 073/329] docs - fix the ballista link (#274) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60492e7f93919..2f6640f28f865 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ the convenience of an SQL interface or a DataFrame API. Here are some of the projects known to use DataFusion: -* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform +* [Ballista](ballista) Distributed Compute Platform * [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) * [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) * [datafusion-python](https://pypi.org/project/datafusion) From b8805d4f44d4da8f16069d93ab342dc6f082ca07 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 7 May 2021 12:14:45 +0800 Subject: [PATCH 074/329] fix clippy warning (#286) --- datafusion/src/physical_plan/distinct_expressions.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index ffc138ad4574b..927f16fe3d216 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -64,10 +64,10 @@ impl DistinctCount { let state_data_types = input_data_types.into_iter().map(state_type).collect(); Self { - state_data_types, - exprs, name, data_type, + state_data_types, + exprs, } } } From a947f11a8c0558c7301931a6c35a07de396d2463 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 8 May 2021 15:07:19 +0800 Subject: [PATCH 075/329] add param validation for datafusion-cli (#284) * add param validation for datafusion-cli * Update datafusion-cli/src/main.rs Co-authored-by: Andrew Lamb * Update datafusion-cli/src/main.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion-cli/src/main.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index dd7265e1f707e..6a7693a2bd82f 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -40,6 +40,7 @@ pub async fn main() { .help("Path to your data, default to current directory") .short("p") .long("data-path") + .validator(is_valid_data_dir) .takes_value(true), ) .arg( @@ -47,6 +48,7 @@ pub async fn main() { .help("The 
batch size of each query, or use DataFusion default") .short("c") .long("batch-size") + .validator(is_valid_batch_size) .takes_value(true), ) .get_matches(); @@ -100,6 +102,21 @@ pub async fn main() { rl.save_history(".history").ok(); } +fn is_valid_data_dir(dir: String) -> std::result::Result<(), String> { + if Path::new(&dir).is_dir() { + Ok(()) + } else { + Err(format!("Invalid data directory '{}'", dir)) + } +} + +fn is_valid_batch_size(size: String) -> std::result::Result<(), String> { + match size.parse::() { + Ok(size) if size > 0 => Ok(()), + _ => Err(format!("Invalid batch size '{}'", size)), + } +} + fn is_exit_command(line: &str) -> bool { let line = line.trim_end().to_lowercase(); line == "quit" || line == "exit" From d0a4552dcf07316acaa8ade7feabe5c9165f3a48 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 8 May 2021 17:51:37 +0800 Subject: [PATCH 076/329] allow datafusion-cli to take a file param (#285) * add param validation for datafusion-cli * avoid error surpressing * deref * use unwrap or else * reduce diff --- datafusion-cli/src/main.rs | 74 +++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 6a7693a2bd82f..2e8fe111b2fc2 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -16,13 +16,15 @@ // under the License. #![allow(bare_trait_objects)] - use clap::{crate_version, App, Arg}; use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use rustyline::Editor; use std::env; +use std::fs::File; +use std::io::prelude::*; +use std::io::BufReader; use std::path::Path; use std::time::Instant; @@ -51,6 +53,14 @@ pub async fn main() { .validator(is_valid_batch_size) .takes_value(true), ) + .arg( + Arg::with_name("file") + .help("execute commands from file, then exit") + .short("f") + .long("file") + .validator(is_valid_file) + .takes_value(true), + ) .get_matches(); if let Some(path) = matches.value_of("data-path") { @@ -67,16 +77,62 @@ pub async fn main() { execution_config = execution_config.with_batch_size(batch_size); }; - let mut ctx = - ExecutionContext::with_config(execution_config.with_information_schema(true)); + if let Some(file_path) = matches.value_of("file") { + let file = File::open(file_path) + .unwrap_or_else(|err| panic!("cannot open file '{}': {}", file_path, err)); + let mut reader = BufReader::new(file); + exec_from_lines(&mut reader, execution_config).await; + } else { + exec_from_repl(execution_config).await; + } +} + +async fn exec_from_lines( + reader: &mut BufReader, + execution_config: ExecutionConfig, +) { + let mut ctx = ExecutionContext::with_config(execution_config); + let mut query = "".to_owned(); + + for line in reader.lines() { + match line { + Ok(line) => { + let line = line.trim_end(); + query.push_str(line); + if line.ends_with(';') { + match exec_and_print(&mut ctx, query).await { + Ok(_) => {} + Err(err) => println!("{:?}", err), + } + query = "".to_owned(); + } else { + query.push(' '); + } + } + _ => { + break; + } + } + } + + // run the left over query if the last statement doesn't contain ‘;’ + if !query.is_empty() { + match exec_and_print(&mut ctx, query).await { + Ok(_) => {} + Err(err) => println!("{:?}", err), + } + } +} + +async fn exec_from_repl(execution_config: ExecutionConfig) { + let mut ctx = ExecutionContext::with_config(execution_config); let mut rl = Editor::<()>::new(); rl.load_history(".history").ok(); let 
mut query = "".to_owned(); loop { - let readline = rl.readline("> "); - match readline { + match rl.readline("> ") { Ok(ref line) if is_exit_command(line) && query.is_empty() => { break; } @@ -102,6 +158,14 @@ pub async fn main() { rl.save_history(".history").ok(); } +fn is_valid_file(dir: String) -> std::result::Result<(), String> { + if Path::new(&dir).is_file() { + Ok(()) + } else { + Err(format!("Invalid file '{}'", dir)) + } +} + fn is_valid_data_dir(dir: String) -> std::result::Result<(), String> { if Path::new(&dir).is_dir() { Ok(()) From 204d4f588a5820f25fc5c6d6599d368c1ee04c3e Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 9 May 2021 18:59:19 +0800 Subject: [PATCH 077/329] Add print format param and support for csv print format to datafusion cli (#289) * add csv mode to datafusion cli * add license * fix per comments * update help --- datafusion-cli/Cargo.toml | 1 + datafusion-cli/src/format.rs | 17 ++++++ datafusion-cli/src/format/print_format.rs | 64 +++++++++++++++++++++++ datafusion-cli/src/main.rs | 50 ++++++++++++++---- 4 files changed, 122 insertions(+), 10 deletions(-) create mode 100644 datafusion-cli/src/format.rs create mode 100644 datafusion-cli/src/format/print_format.rs diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 883d0f2f4c66b..2cde4da16ca1c 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -31,3 +31,4 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } diff --git a/datafusion-cli/src/format.rs b/datafusion-cli/src/format.rs new file mode 100644 index 0000000000000..c5da78f17951c --- /dev/null +++ b/datafusion-cli/src/format.rs @@ -0,0 +1,17 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +pub mod print_format; diff --git a/datafusion-cli/src/format/print_format.rs b/datafusion-cli/src/format/print_format.rs new file mode 100644 index 0000000000000..921e29f7fae79 --- /dev/null +++ b/datafusion-cli/src/format/print_format.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Print format variants +use arrow::csv::writer::WriterBuilder; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::arrow::util::pretty; +use datafusion::error::{DataFusionError, Result}; +use std::str::FromStr; + +/// Allow records to be printed in different formats +#[derive(Debug, Clone)] +pub enum PrintFormat { + Csv, + Table, +} + +impl FromStr for PrintFormat { + type Err = (); + fn from_str(s: &str) -> std::result::Result { + match s { + "csv" => Ok(PrintFormat::Csv), + "table" => Ok(PrintFormat::Table), + _ => Err(()), + } + } +} + +impl PrintFormat { + /// print the batches to stdout using the specified format + pub fn print_batches(&self, batches: &[RecordBatch]) -> Result<()> { + match self { + PrintFormat::Csv => { + let mut bytes = vec![]; + { + let builder = WriterBuilder::new().has_headers(true); + let mut writer = builder.build(&mut bytes); + for batch in batches { + writer.write(batch)?; + } + } + let csv = String::from_utf8(bytes) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + println!("{}", csv); + } + PrintFormat::Table => pretty::print_batches(batches)?, + } + Ok(()) + } +} diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 2e8fe111b2fc2..05c6766cee724 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -16,10 +16,13 @@ // under the License. 
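// A minimal usage sketch for the flags wired up below, assuming the installed binary is
// named `datafusion-cli` and that ./data points at your data directory:
//
//   datafusion-cli --data-path ./data --format csv -f queries.sql
//   datafusion-cli --format table
//
// The `--format` value is parsed by `PrintFormat::from_str` ("csv" or "table", defaulting
// to "table"), and each result batch is printed via `PrintFormat::print_batches`.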
#![allow(bare_trait_objects)] + +mod format; + use clap::{crate_version, App, Arg}; -use datafusion::arrow::util::pretty; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; +use format::print_format::PrintFormat; use rustyline::Editor; use std::env; use std::fs::File; @@ -55,12 +58,20 @@ pub async fn main() { ) .arg( Arg::with_name("file") - .help("execute commands from file, then exit") + .help("Execute commands from file, then exit") .short("f") .long("file") .validator(is_valid_file) .takes_value(true), ) + .arg( + Arg::with_name("format") + .help("Output format (possible values: table, csv)") + .long("format") + .default_value("table") + .validator(is_valid_format) + .takes_value(true), + ) .get_matches(); if let Some(path) = matches.value_of("data-path") { @@ -77,19 +88,26 @@ pub async fn main() { execution_config = execution_config.with_batch_size(batch_size); }; + let print_format = matches + .value_of("format") + .expect("No format is specified") + .parse::() + .expect("Invalid format"); + if let Some(file_path) = matches.value_of("file") { let file = File::open(file_path) .unwrap_or_else(|err| panic!("cannot open file '{}': {}", file_path, err)); let mut reader = BufReader::new(file); - exec_from_lines(&mut reader, execution_config).await; + exec_from_lines(&mut reader, execution_config, print_format).await; } else { - exec_from_repl(execution_config).await; + exec_from_repl(execution_config, print_format).await; } } async fn exec_from_lines( reader: &mut BufReader, execution_config: ExecutionConfig, + print_format: PrintFormat, ) { let mut ctx = ExecutionContext::with_config(execution_config); let mut query = "".to_owned(); @@ -100,7 +118,7 @@ async fn exec_from_lines( let line = line.trim_end(); query.push_str(line); if line.ends_with(';') { - match exec_and_print(&mut ctx, query).await { + match exec_and_print(&mut ctx, print_format.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -117,14 +135,14 @@ async fn exec_from_lines( // run the left over query if the last statement doesn't contain ‘;’ if !query.is_empty() { - match exec_and_print(&mut ctx, query).await { + match exec_and_print(&mut ctx, print_format, query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } } } -async fn exec_from_repl(execution_config: ExecutionConfig) { +async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFormat) { let mut ctx = ExecutionContext::with_config(execution_config); let mut rl = Editor::<()>::new(); @@ -139,7 +157,7 @@ async fn exec_from_repl(execution_config: ExecutionConfig) { Ok(ref line) if line.trim_end().ends_with(';') => { query.push_str(line.trim_end()); rl.add_history_entry(query.clone()); - match exec_and_print(&mut ctx, query).await { + match exec_and_print(&mut ctx, print_format.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -158,6 +176,14 @@ async fn exec_from_repl(execution_config: ExecutionConfig) { rl.save_history(".history").ok(); } +fn is_valid_format(format: String) -> std::result::Result<(), String> { + match format.to_lowercase().as_str() { + "csv" => Ok(()), + "table" => Ok(()), + _ => Err(format!("Format '{}' not supported", format)), + } +} + fn is_valid_file(dir: String) -> std::result::Result<(), String> { if Path::new(&dir).is_file() { Ok(()) @@ -186,7 +212,11 @@ fn is_exit_command(line: &str) -> bool { line == "quit" || line == "exit" } -async fn exec_and_print(ctx: &mut ExecutionContext, sql: String) -> Result<()> { +async 
fn exec_and_print( + ctx: &mut ExecutionContext, + print_format: PrintFormat, + sql: String, +) -> Result<()> { let now = Instant::now(); let df = ctx.sql(&sql)?; @@ -200,7 +230,7 @@ async fn exec_and_print(ctx: &mut ExecutionContext, sql: String) -> Result<()> { return Ok(()); } - pretty::print_batches(&results)?; + print_format.print_batches(&results)?; let row_count: usize = results.iter().map(|b| b.num_rows()).sum(); From cbda0a718cf0ef47414283d83b9b37bd49a11006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 9 May 2021 20:21:07 +0200 Subject: [PATCH 078/329] Return error on OVER clause (#301) --- datafusion/src/sql/planner.rs | 37 ++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index ed7dd377c835c..d5aae24e7971b 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1088,11 +1088,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()? }; - return Ok(Expr::AggregateFunction { - fun, - distinct: function.distinct, - args, - }); + return match &function.over { + Some(window) => Err(DataFusionError::NotImplemented(format!( + "Unsupported OVER clause ({})", + window + ))), + _ => Ok(Expr::AggregateFunction { + fun, + distinct: function.distinct, + args, + }), + }; }; // finally, user-defined functions (UDF) and UDAF @@ -2631,6 +2637,27 @@ mod tests { ); } + #[test] + fn over_not_supported() { + let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "NotImplemented(\"Unsupported OVER clause ()\")", + format!("{:?}", err) + ); + } + + #[test] + fn over_partition_by_not_supported() { + let sql = + "SELECT order_id, MAX(delivered) OVER (PARTITION BY order_id) from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "NotImplemented(\"Unsupported OVER clause (PARTITION BY order_id)\")", + format!("{:?}", err) + ); + } + #[test] fn only_union_all_supported() { let sql = "SELECT order_id from orders EXCEPT SELECT order_id FROM orders"; From a00f410e0c95dc8785715d4d543d8f390185bff2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 10 May 2021 02:21:31 +0800 Subject: [PATCH 079/329] allow datafusion cli to take -- comments (#296) * allow datafusion cli to take -- comments * append \n not _ --- datafusion-cli/src/main.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 05c6766cee724..f15c1d8e4a21c 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -114,6 +114,9 @@ async fn exec_from_lines( for line in reader.lines() { match line { + Ok(line) if line.starts_with("--") => { + continue; + } Ok(line) => { let line = line.trim_end(); query.push_str(line); @@ -124,7 +127,7 @@ async fn exec_from_lines( } query = "".to_owned(); } else { - query.push(' '); + query.push('\n'); } } _ => { @@ -154,6 +157,9 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFo Ok(ref line) if is_exit_command(line) && query.is_empty() => { break; } + Ok(ref line) if line.starts_with("--") => { + continue; + } Ok(ref line) if line.trim_end().ends_with(';') => { query.push_str(line.trim_end()); rl.add_history_entry(query.clone()); @@ -165,7 +171,7 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFo } Ok(ref line) => { query.push_str(line); - 
query.push(' '); + query.push('\n'); } Err(_) => { break; From 03de07efc8129029aae5e90616b53fbbf8e52c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 10 May 2021 12:21:35 +0200 Subject: [PATCH 080/329] Support Full Join (#291) * Support Full Join * Docs * fmt * Update rev * Update ballista * Revert python changes * Update ballista/rust/core/src/serde/physical_plan/to_proto.rs Co-authored-by: Andrew Lamb * Make tests a bit easier to follow * Rename to full * Fix comment Co-authored-by: Andrew Lamb --- README.md | 2 +- ballista/rust/core/proto/ballista.proto | 1 + .../core/src/serde/logical_plan/from_proto.rs | 1 + .../core/src/serde/logical_plan/to_proto.rs | 1 + .../src/serde/physical_plan/from_proto.rs | 1 + .../core/src/serde/physical_plan/to_proto.rs | 1 + datafusion/src/logical_plan/builder.rs | 2 +- datafusion/src/logical_plan/plan.rs | 8 +- .../src/optimizer/hash_build_probe_order.rs | 1 + datafusion/src/physical_plan/hash_join.rs | 124 +++++++++++++++++- datafusion/src/physical_plan/hash_utils.rs | 10 +- datafusion/src/physical_plan/planner.rs | 1 + datafusion/src/sql/planner.rs | 3 + datafusion/tests/sql.rs | 21 +++ 14 files changed, 162 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 2f6640f28f865..bc153368e1bf9 100644 --- a/README.md +++ b/README.md @@ -215,8 +215,8 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] INNER JOIN - [x] LEFT JOIN - [x] RIGHT JOIN + - [x] FULL JOIN - [x] CROSS JOIN - - [ ] OUTER JOIN - [ ] Window ## Data Sources diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 381221f38d270..07419d09b7a91 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -292,6 +292,7 @@ enum JoinType { INNER = 0; LEFT = 1; RIGHT = 2; + FULL = 3; } message JoinNode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index ab7c55f4500d6..6987035394c6d 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -235,6 +235,7 @@ impl TryInto for &protobuf::LogicalPlanNode { protobuf::JoinType::Inner => JoinType::Inner, protobuf::JoinType::Left => JoinType::Left, protobuf::JoinType::Right => JoinType::Right, + protobuf::JoinType::Full => JoinType::Full, }; LogicalPlanBuilder::from(&convert_box_required!(join.left)?) 
.join( diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index de4b86e38413b..01b669d264461 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -808,6 +808,7 @@ impl TryInto for &LogicalPlan { JoinType::Inner => protobuf::JoinType::Inner, JoinType::Left => protobuf::JoinType::Left, JoinType::Right => protobuf::JoinType::Right, + JoinType::Full => protobuf::JoinType::Full, }; let left_join_column = on.iter().map(|on| on.0.to_owned()).collect(); let right_join_column = on.iter().map(|on| on.1.to_owned()).collect(); diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index be0777dbb9a8f..6a33c6a43f628 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -297,6 +297,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { protobuf::JoinType::Inner => JoinType::Inner, protobuf::JoinType::Left => JoinType::Left, protobuf::JoinType::Right => JoinType::Right, + protobuf::JoinType::Full => JoinType::Full, }; Ok(Arc::new(HashJoinExec::try_new( left, diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 5352c1f777530..8a5fd71083f75 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -132,6 +132,7 @@ impl TryInto for Arc { JoinType::Inner => protobuf::JoinType::Inner, JoinType::Left => protobuf::JoinType::Left, JoinType::Right => protobuf::JoinType::Right, + JoinType::Full => protobuf::JoinType::Full, }; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index b6017b743ed70..2e69814d2634e 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -347,7 +347,7 @@ fn build_join_schema( join_type: &JoinType, ) -> Result { let fields: Vec = match join_type { - JoinType::Inner | JoinType::Left => { + JoinType::Inner | JoinType::Left | JoinType::Full => { // remove right-side join keys if they have the same names as the left-side let duplicate_keys = &on .iter() diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 606ef1e222755..13509d13eb159 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -39,12 +39,14 @@ use crate::logical_plan::dfschema::DFSchemaRef; /// Join type #[derive(Debug, Clone, Copy)] pub enum JoinType { - /// Inner join + /// Inner Join Inner, - /// Left join + /// Left Join Left, - /// Right join + /// Right Join Right, + /// Full Join + Full, } /// A LogicalPlan represents the different types of relational diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 086e2f03196bd..b27171f058ca9 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -203,6 +203,7 @@ impl HashBuildProbeOrder { fn swap_join_type(join_type: JoinType) -> JoinType { match join_type { JoinType::Inner => JoinType::Inner, + JoinType::Full => JoinType::Full, JoinType::Left => JoinType::Right, JoinType::Right => JoinType::Left, } diff --git 
a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 60d65b2361601..2682623d374a6 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -180,7 +180,7 @@ impl HashJoinExec { /// Calculates column indices and left/right placement on input / output schemas and jointype fn column_indices_from_schema(&self) -> ArrowResult> { let (primary_is_left, primary_schema, secondary_schema) = match self.join_type { - JoinType::Inner | JoinType::Left => { + JoinType::Inner | JoinType::Left | JoinType::Full => { (true, self.left.schema(), self.right.schema()) } JoinType::Right => (false, self.right.schema(), self.left.schema()), @@ -372,7 +372,7 @@ impl ExecutionPlan for HashJoinExec { let column_indices = self.column_indices_from_schema()?; let num_rows = left_data.1.num_rows(); let visited_left_side = match self.join_type { - JoinType::Left => vec![false; num_rows], + JoinType::Left | JoinType::Full => vec![false; num_rows], JoinType::Inner | JoinType::Right => vec![], }; Ok(Box::pin(HashJoinStream { @@ -644,7 +644,7 @@ fn build_join_indexes( } Ok((left_indices.finish(), right_indices.finish())) } - JoinType::Right => { + JoinType::Right | JoinType::Full => { let mut left_indices = UInt64Builder::new(0); let mut right_indices = UInt32Builder::new(0); @@ -1058,7 +1058,7 @@ impl Stream for HashJoinStream { self.num_output_rows += batch.num_rows(); match self.join_type { - JoinType::Left => { + JoinType::Left | JoinType::Full => { left_side.iter().flatten().for_each(|x| { self.visited_left_side[x as usize] = true; }); @@ -1072,7 +1072,7 @@ impl Stream for HashJoinStream { let start = Instant::now(); // For the left join, produce rows for unmatched rows match self.join_type { - JoinType::Left if !self.is_exhausted => { + JoinType::Left | JoinType::Full if !self.is_exhausted => { let result = produce_unmatched( &self.visited_left_side, &self.schema, @@ -1092,7 +1092,10 @@ impl Stream for HashJoinStream { self.is_exhausted = true; return Some(result); } - JoinType::Left | JoinType::Inner | JoinType::Right => {} + JoinType::Left + | JoinType::Full + | JoinType::Inner + | JoinType::Right => {} } debug!( @@ -1410,6 +1413,46 @@ mod tests { assert_batches_sorted_eq!(expected, &batches); } + #[tokio::test] + async fn join_full_multi_batch() { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 9]), + ); + // create two identical batches for the right side + let right = build_table_two_batches( + ("a2", &vec![10, 20, 30]), + ("b2", &vec![4, 5, 6]), + ("c2", &vec![70, 80, 90]), + ); + let on = &[("b1", "b2")]; + + let join = join(left, right, on, &JoinType::Full).unwrap(); + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); + + let stream = join.execute(0).await.unwrap(); + let batches = common::collect(stream).await.unwrap(); + + let expected = vec![ + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b2 | c2 |", + "+----+----+----+----+----+----+", + "| | | | 30 | 6 | 90 |", + "| | | | 30 | 6 | 90 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 7 | 9 | | | |", + "+----+----+----+----+----+----+", + ]; + + assert_batches_sorted_eq!(expected, &batches); + } + #[tokio::test] async fn join_left_empty_right() { let left = build_table( @@ -1442,6 +1485,38 @@ mod tests { 
assert_batches_sorted_eq!(expected, &batches); } + #[tokio::test] + async fn join_full_empty_right() { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), + ("c1", &vec![7, 8, 9]), + ); + let right = build_table_i32(("a2", &vec![]), ("b2", &vec![]), ("c2", &vec![])); + let on = &[("b1", "b2")]; + let schema = right.schema(); + let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); + let join = join(left, right, on, &JoinType::Full).unwrap(); + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); + + let stream = join.execute(0).await.unwrap(); + let batches = common::collect(stream).await.unwrap(); + + let expected = vec![ + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b2 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | | | |", + "| 2 | 5 | 8 | | | |", + "| 3 | 7 | 9 | | | |", + "+----+----+----+----+----+----+", + ]; + + assert_batches_sorted_eq!(expected, &batches); + } + #[tokio::test] async fn join_left_one() -> Result<()> { let left = build_table( @@ -1515,6 +1590,43 @@ mod tests { Ok(()) } + #[tokio::test] + async fn join_full_one() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 9]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30]), + ("b2", &vec![4, 5, 6]), + ("c2", &vec![70, 80, 90]), + ); + let on = &[("b1", "b2")]; + + let join = join(left, right, on, &JoinType::Full)?; + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); + + let stream = join.execute(0).await?; + let batches = common::collect(stream).await?; + + let expected = vec![ + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b2 | c2 |", + "+----+----+----+----+----+----+", + "| | | | 30 | 6 | 90 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 7 | 9 | | | |", + "+----+----+----+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + + Ok(()) + } + #[test] fn join_with_hash_collision() -> Result<()> { let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {}); diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 54da1249e5c55..7e030af3a124c 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -24,12 +24,14 @@ use std::collections::HashSet; /// All valid types of joins. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum JoinType { - /// Inner join + /// Inner Join Inner, - /// Left + /// Left Join Left, - /// Right + /// Right Join Right, + /// Full Join + Full, } /// The on clause of the join, as vector of (left, right) columns. 
@@ -92,7 +94,7 @@ pub fn build_join_schema( join_type: &JoinType, ) -> Schema { let fields: Vec = match join_type { - JoinType::Inner | JoinType::Left => { + JoinType::Inner | JoinType::Left | JoinType::Full => { // remove right-side join keys if they have the same names as the left-side let duplicate_keys = &on .iter() diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index ae6ad5075d877..acbb863c604b7 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -298,6 +298,7 @@ impl DefaultPhysicalPlanner { JoinType::Inner => hash_utils::JoinType::Inner, JoinType::Left => hash_utils::JoinType::Left, JoinType::Right => hash_utils::JoinType::Right, + JoinType::Full => hash_utils::JoinType::Full, }; if ctx_state.config.concurrency > 1 && ctx_state.config.repartition_joins { diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index d5aae24e7971b..036c66da16cc5 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -355,6 +355,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { JoinOperator::Inner(constraint) => { self.parse_join(left, &right, constraint, JoinType::Inner) } + JoinOperator::FullOuter(constraint) => { + self.parse_join(left, &right, constraint, JoinType::Full) + } JoinOperator::CrossJoin => self.parse_cross_join(left, &right), other => Err(DataFusionError::NotImplemented(format!( "Unsupported JOIN operator {:?}", diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 5c90f8ac162b3..45a5694fc8cc1 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1329,6 +1329,27 @@ async fn right_join() -> Result<()> { Ok(()) } +#[tokio::test] +async fn full_join() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["NULL", "NULL", "w"], + vec!["11", "a", "z"], + vec!["22", "b", "y"], + vec!["33", "c", "NULL"], + vec!["44", "d", "x"], + ]; + assert_eq!(expected, actual); + + let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + + Ok(()) +} + #[tokio::test] async fn left_join_using() -> Result<()> { let mut ctx = create_join_context("id", "id")?; From 11634f391f39c68c7cabbc4c3b33e424475b211d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 10 May 2021 13:55:27 +0200 Subject: [PATCH 081/329] Use unary function (#309) --- .../src/physical_plan/math_expressions.rs | 32 +++++-------------- datafusion/tests/sql.rs | 2 +- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index 72b4f102f6627..0e0bed2deac2f 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -17,37 +17,21 @@ //! Math expressions -use arrow::array::{make_array, Array, ArrayData, Float32Array, Float64Array}; -use arrow::buffer::Buffer; -use arrow::datatypes::{DataType, ToByteSlice}; - use super::{ColumnarValue, ScalarValue}; use crate::error::{DataFusionError, Result}; - -macro_rules! 
compute_op! {
-    ($ARRAY:expr, $FUNC:ident, $TYPE:ident) => {{
-        let len = $ARRAY.len();
-        let result = (0..len)
-            .map(|i| $ARRAY.value(i).$FUNC() as f64)
-            .collect::>();
-        let data = ArrayData::new(
-            DataType::Float64,
-            len,
-            Some($ARRAY.null_count()),
-            $ARRAY.data().null_buffer().cloned(),
-            0,
-            vec![Buffer::from(result.to_byte_slice())],
-            vec![],
-        );
-        Ok(make_array(data))
-    }};
-}
 
 macro_rules! downcast_compute_op {
     ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident) => {{
         let n = $ARRAY.as_any().downcast_ref::<$TYPE>();
         match n {
-            Some(array) => compute_op!(array, $FUNC, $TYPE),
+            Some(array) => {
+                let res: $TYPE =
+                    arrow::compute::kernels::arity::unary(array, |x| x.$FUNC());
+                Ok(Arc::new(res))
+            }
             _ => Err(DataFusionError::Internal(format!(
                 "Invalid data type for {}",
                 $NAME
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 45a5694fc8cc1..3194ac7bf794d 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -631,7 +631,7 @@ async fn sqrt_f32_vs_f64() -> Result<()> {
     // sqrt(f32)'s plan passes
     let sql = "SELECT avg(sqrt(c11)) FROM aggregate_test_100";
     let actual = execute(&mut ctx, sql).await;
-    let expected = vec![vec!["0.6584408485889435"]];
+    let expected = vec![vec!["0.6584407806396484"]];
     assert_eq!(actual, expected);
 
     let sql = "SELECT avg(sqrt(CAST(c11 AS double))) FROM aggregate_test_100";

From dcf6c110e7309e867e4b2fc1ac75cf9313095667 Mon Sep 17 00:00:00 2001
From: Jiayu Liu
Date: Tue, 11 May 2021 05:49:33 +0800
Subject: [PATCH 082/329] Add print format param with support for tsv print format to datafusion cli (#292)

* add csv mode to datafusion cli
* adding tsv format
* use Self whereas possible
* prune import
---
 datafusion-cli/src/format/print_format.rs | 41 +++++++++++++----------
 datafusion-cli/src/main.rs                | 10 +++---
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/datafusion-cli/src/format/print_format.rs b/datafusion-cli/src/format/print_format.rs
index 921e29f7fae79..a9fc56b90524b 100644
--- a/datafusion-cli/src/format/print_format.rs
+++ b/datafusion-cli/src/format/print_format.rs
@@ -26,38 +26,45 @@ use std::str::FromStr;
 #[derive(Debug, Clone)]
 pub enum PrintFormat {
     Csv,
+    Tsv,
     Table,
 }
 
 impl FromStr for PrintFormat {
     type Err = ();
-    fn from_str(s: &str) -> std::result::Result {
+    fn from_str(s: &str) -> std::result::Result {
         match s {
-            "csv" => Ok(PrintFormat::Csv),
-            "table" => Ok(PrintFormat::Table),
+            "csv" => Ok(Self::Csv),
+            "tsv" => Ok(Self::Tsv),
+            "table" => Ok(Self::Table),
             _ => Err(()),
         }
     }
 }
 
+fn print_batches_with_sep(batches: &[RecordBatch], delimiter: u8) -> Result {
+    let mut bytes = vec![];
+    {
+        let builder = WriterBuilder::new()
+            .has_headers(true)
+            .with_delimiter(delimiter);
+        let mut writer = builder.build(&mut bytes);
+        for batch in batches {
+            writer.write(batch)?;
+        }
+    }
+    let formatted = String::from_utf8(bytes)
+        .map_err(|e| DataFusionError::Execution(e.to_string()))?;
+    Ok(formatted)
+}
+
 impl PrintFormat {
     /// print the batches to stdout using the specified format
     pub fn print_batches(&self, batches: &[RecordBatch]) -> Result<()> {
         match self {
-            PrintFormat::Csv => {
-                let mut bytes = vec![];
-                {
-                    let builder = WriterBuilder::new().has_headers(true);
-                    let mut writer = builder.build(&mut bytes);
-                    for batch in batches {
-                        writer.write(batch)?;
-                    }
-                }
-                let csv = String::from_utf8(bytes)
-                    .map_err(|e| DataFusionError::Execution(e.to_string()))?;
-                println!("{}", csv);
-            }
- PrintFormat::Table => pretty::print_batches(batches)?, + Self::Csv => println!("{}", print_batches_with_sep(batches, b',')?), + Self::Tsv => println!("{}", print_batches_with_sep(batches, b'\t')?), + Self::Table => pretty::print_batches(batches)?, } Ok(()) } diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index f15c1d8e4a21c..52d3ccc6123b0 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -66,7 +66,7 @@ pub async fn main() { ) .arg( Arg::with_name("format") - .help("Output format (possible values: table, csv)") + .help("Output format (possible values: table, csv, tsv)") .long("format") .default_value("table") .validator(is_valid_format) @@ -183,10 +183,10 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFo } fn is_valid_format(format: String) -> std::result::Result<(), String> { - match format.to_lowercase().as_str() { - "csv" => Ok(()), - "table" => Ok(()), - _ => Err(format!("Format '{}' not supported", format)), + if format.parse::().is_ok() { + Ok(()) + } else { + Err(format!("Format '{}' not supported", format)) } } From ee69ac85785e940cf48f3780460c6f103ce0cf4a Mon Sep 17 00:00:00 2001 From: Parth Sarthy Date: Mon, 10 May 2021 23:09:44 +0100 Subject: [PATCH 083/329] add projects (Squirtle and Tensorbase) to list in readme (#312) Co-authored-by: Parth Sarthy --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index bc153368e1bf9..2d4a1c69c2d2a 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ Here are some of the projects known to use DataFusion: * [delta-rs](https://github.com/delta-io/delta-rs) * [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database * [ROAPI](https://github.com/roapi/roapi) +* [Tensorbase](https://github.com/tensorbase/tensorbase) +* [Squirtle](https://github.com/DSLAM-UMD/Squirtle) (if you know of another project, please submit a PR to add a link!) From 7133c330d1aabef750d9c42e0dab7e3a260ed69d Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 11 May 2021 18:52:53 +0800 Subject: [PATCH 084/329] Add json print format mode to datafusion cli (#295) * add csv mode to datafusion cli * add license * fix per comments * update help * adding tsv format * use Self whereas possible * add json support * adding unit test * remove redundant clone * add csv mode to datafusion cli * adding tsv format * use Self whereas possible * prune import --- datafusion-cli/src/format/print_format.rs | 99 ++++++++++++++++++++++- datafusion-cli/src/main.rs | 2 +- 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/src/format/print_format.rs b/datafusion-cli/src/format/print_format.rs index a9fc56b90524b..85caaa3c52767 100644 --- a/datafusion-cli/src/format/print_format.rs +++ b/datafusion-cli/src/format/print_format.rs @@ -17,17 +17,19 @@ //! 
Print format variants use arrow::csv::writer::WriterBuilder; +use arrow::json::ArrayWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::util::pretty; use datafusion::error::{DataFusionError, Result}; use std::str::FromStr; /// Allow records to be printed in different formats -#[derive(Debug, Clone)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum PrintFormat { Csv, Tsv, Table, + Json, } impl FromStr for PrintFormat { @@ -37,11 +39,24 @@ impl FromStr for PrintFormat { "csv" => Ok(Self::Csv), "tsv" => Ok(Self::Tsv), "table" => Ok(Self::Table), + "json" => Ok(Self::Json), _ => Err(()), } } } +fn print_batches_to_json(batches: &[RecordBatch]) -> Result { + let mut bytes = vec![]; + { + let mut writer = ArrayWriter::new(&mut bytes); + writer.write_batches(batches)?; + writer.finish()?; + } + let formatted = String::from_utf8(bytes) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + Ok(formatted) +} + fn print_batches_with_sep(batches: &[RecordBatch], delimiter: u8) -> Result { let mut bytes = vec![]; { @@ -65,7 +80,89 @@ impl PrintFormat { Self::Csv => println!("{}", print_batches_with_sep(batches, b',')?), Self::Tsv => println!("{}", print_batches_with_sep(batches, b'\t')?), Self::Table => pretty::print_batches(batches)?, + Self::Json => println!("{}", print_batches_to_json(batches)?), } Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use std::sync::Arc; + + #[test] + fn test_from_str() { + let format = "csv".parse::().unwrap(); + assert_eq!(PrintFormat::Csv, format); + + let format = "tsv".parse::().unwrap(); + assert_eq!(PrintFormat::Tsv, format); + + let format = "json".parse::().unwrap(); + assert_eq!(PrintFormat::Json, format); + + let format = "table".parse::().unwrap(); + assert_eq!(PrintFormat::Table, format); + } + + #[test] + fn test_from_str_failure() { + assert_eq!(true, "pretty".parse::().is_err()); + } + + #[test] + fn test_print_batches_with_sep() { + let batches = vec![]; + assert_eq!("", print_batches_with_sep(&batches, b',').unwrap()); + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![4, 5, 6])), + Arc::new(Int32Array::from(vec![7, 8, 9])), + ], + ) + .unwrap(); + + let batches = vec![batch]; + let r = print_batches_with_sep(&batches, b',').unwrap(); + assert_eq!("a,b,c\n1,4,7\n2,5,8\n3,6,9\n", r); + } + + #[test] + fn test_print_batches_to_json_empty() { + let batches = vec![]; + let r = print_batches_to_json(&batches).unwrap(); + assert_eq!("", r); + + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(Int32Array::from(vec![4, 5, 6])), + Arc::new(Int32Array::from(vec![7, 8, 9])), + ], + ) + .unwrap(); + + let batches = vec![batch]; + let r = print_batches_to_json(&batches).unwrap(); + assert_eq!("[{\"a\":1,\"b\":4,\"c\":7},{\"a\":2,\"b\":5,\"c\":8},{\"a\":3,\"b\":6,\"c\":9}]", r); + } +} diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 52d3ccc6123b0..2360d4642484c 100644 --- a/datafusion-cli/src/main.rs +++ 
b/datafusion-cli/src/main.rs @@ -66,7 +66,7 @@ pub async fn main() { ) .arg( Arg::with_name("format") - .help("Output format (possible values: table, csv, tsv)") + .help("Output format (possible values: table, csv, tsv, json)") .long("format") .default_value("table") .validator(is_valid_format) From 63eaeeeb7106977a7fce14eb586b5889150ede0b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 11 May 2021 06:53:20 -0400 Subject: [PATCH 085/329] Update PR template by commenting out instructions (#315) * Update PR template by commenting out instructions Inspired by @nevi-me 's PR https://github.com/apache/arrow-rs/pull/278 Some contributors don't remove the guidelines when creating PRs, so it might be more convenient if we hide them behind comments. The comments are still visible when editing, but are not displayed when the markdown is rendered * Apply suggestions from code review Co-authored-by: Wakahisa Co-authored-by: Wakahisa --- .github/pull_request_template.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 5da0d08f9469a..e32246061f813 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,19 +1,27 @@ # Which issue does this PR close? + Closes #. # Rationale for this change + # What changes are included in this PR? - + # Are there any user-facing changes? - + + From a1c6898f559020a4a56e301714cd16fd8229ca09 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 11 May 2021 11:32:49 -0400 Subject: [PATCH 086/329] Update arrow-rs deps (#317) --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index b077af69b954c..8b5d7af3f2a24 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index dd3cbe65226cb..d98cc7e49e83b 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 5b9dc003013b8..a5e40341981d8 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = 
"https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 7315533426529..43dc4285add5a 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 2cde4da16ca1c..2551b775adfb2 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -31,4 +31,4 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index f3fe39f5b6fb6..8d8f20eb97931 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index b42bef682b0d0..610ffdaa4886d 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "508f25c10032857da34ea88cc8166f0741616a32", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" From 3f269b9d8f68910f20f38ec664cb407e89781c74 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 12 May 2021 04:24:59 -0600 Subject: [PATCH 087/329] Fix integration tests by adding datafusion-cli module to docker image (#322) --- dev/docker/rust.dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/docker/rust.dockerfile b/dev/docker/rust.dockerfile index ba713b15e90c4..045300126359b 100644 --- a/dev/docker/rust.dockerfile +++ b/dev/docker/rust.dockerfile @@ -35,6 +35,7 @@ RUN mkdir /tmp/ballista/datafusion-examples ADD Cargo.toml . 
COPY benchmarks ./benchmarks/ COPY datafusion ./datafusion/ +COPY datafusion-cli ./datafusion-cli/ COPY datafusion-examples ./datafusion-examples/ COPY ballista ./ballista/ RUN cargo chef prepare --recipe-path recipe.json @@ -47,11 +48,13 @@ FROM base as builder RUN mkdir /tmp/ballista/ballista RUN mkdir /tmp/ballista/benchmarks RUN mkdir /tmp/ballista/datafusion +RUN mkdir /tmp/ballista/datafusion-cli RUN mkdir /tmp/ballista/datafusion-examples ADD Cargo.toml . COPY benchmarks ./benchmarks/ COPY datafusion ./datafusion/ COPY ballista ./ballista/ +COPY datafusion-cli ./datafusion-cli/ COPY datafusion-examples ./datafusion-examples/ COPY --from=cacher /tmp/ballista/target target ARG RELEASE_FLAG=--release From cdc8b2d8252043c730237d8d33495e39165f5995 Mon Sep 17 00:00:00 2001 From: Charlie Evans Date: Wed, 12 May 2021 19:48:23 +0200 Subject: [PATCH 088/329] Support COUNT(DISTINCT timestamps) (#319) * Pattern match on DataType::Timestamp and build the list * Use the build_values_list in build_list * add count_distinct_timestamps test to sql.rs --- datafusion/src/scalar.rs | 89 ++++++++++++++++++++++++++++++++-------- datafusion/tests/sql.rs | 13 ++++++ 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 7b76b7041ef2a..e59d21e7fcef0 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -119,26 +119,76 @@ macro_rules! build_list { ) } Some(values) => { - let mut builder = ListBuilder::new($VALUE_BUILDER_TY::new(values.len())); - - for _ in 0..$SIZE { - for scalar_value in values { - match scalar_value { - ScalarValue::$SCALAR_TY(Some(v)) => { - builder.values().append_value(v.clone()).unwrap() - } - ScalarValue::$SCALAR_TY(None) => { - builder.values().append_null().unwrap(); - } - _ => panic!("Incompatible ScalarValue for list"), - }; - } - builder.append(true).unwrap(); - } + build_values_list!($VALUE_BUILDER_TY, $SCALAR_TY, values, $SIZE) + } + } + }}; +} - builder.finish() +macro_rules! build_timestamp_list { + ($TIME_UNIT:expr, $TIME_ZONE:expr, $VALUES:expr, $SIZE:expr) => {{ + match $VALUES { + // the return on the macro is necessary, to short-circuit and return ArrayRef + None => { + return new_null_array( + &DataType::List(Box::new(Field::new( + "item", + DataType::Timestamp($TIME_UNIT, $TIME_ZONE), + true, + ))), + $SIZE, + ) } + Some(values) => match $TIME_UNIT { + TimeUnit::Second => build_values_list!( + TimestampSecondBuilder, + TimestampSecond, + values, + $SIZE + ), + TimeUnit::Microsecond => build_values_list!( + TimestampMillisecondBuilder, + TimestampMillisecond, + values, + $SIZE + ), + TimeUnit::Millisecond => build_values_list!( + TimestampMicrosecondBuilder, + TimestampMicrosecond, + values, + $SIZE + ), + TimeUnit::Nanosecond => build_values_list!( + TimestampNanosecondBuilder, + TimestampNanosecond, + values, + $SIZE + ), + }, + } + }}; +} + +macro_rules! 
build_values_list { + ($VALUE_BUILDER_TY:ident, $SCALAR_TY:ident, $VALUES:expr, $SIZE:expr) => {{ + let mut builder = ListBuilder::new($VALUE_BUILDER_TY::new($VALUES.len())); + + for _ in 0..$SIZE { + for scalar_value in $VALUES { + match scalar_value { + ScalarValue::$SCALAR_TY(Some(v)) => { + builder.values().append_value(v.clone()).unwrap() + } + ScalarValue::$SCALAR_TY(None) => { + builder.values().append_null().unwrap(); + } + _ => panic!("Incompatible ScalarValue for list"), + }; + } + builder.append(true).unwrap(); } + + builder.finish() }}; } @@ -360,10 +410,13 @@ impl ScalarValue { DataType::Utf8 => build_list!(StringBuilder, Utf8, values, size), DataType::Float32 => build_list!(Float32Builder, Float32, values, size), DataType::Float64 => build_list!(Float64Builder, Float64, values, size), + DataType::Timestamp(unit, tz) => { + build_timestamp_list!(unit.clone(), tz.clone(), values, size) + } DataType::LargeUtf8 => { build_list!(LargeStringBuilder, LargeUtf8, values, size) } - _ => panic!("Unexpected DataType for list"), + dt => panic!("Unexpected DataType for list {:?}", dt), }), ScalarValue::Date32(e) => match e { Some(value) => Arc::new(Date32Array::from_value(*value, size)), diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 3194ac7bf794d..4b53e2f2e38c1 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1959,6 +1959,19 @@ async fn to_timestamp() -> Result<()> { Ok(()) } +#[tokio::test] +async fn count_distinct_timestamps() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table("ts_data", make_timestamp_nano_table()?)?; + + let sql = "SELECT COUNT(DISTINCT(ts)) FROM ts_data"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["3"]]; + assert_eq!(expected, actual); + Ok(()) +} + #[tokio::test] async fn query_is_null() -> Result<()> { let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Float64, true)])); From 3e7fed8d058f5df0520664d74832ed1935ee0e2b Mon Sep 17 00:00:00 2001 From: Parth Sarthy Date: Wed, 12 May 2021 18:49:33 +0100 Subject: [PATCH 089/329] add new logo svg with white background (#313) Co-authored-by: Parth Sarthy --- README.md | 2 +- .../images/DataFusion-Logo-Background-White.png | Bin 0 -> 12401 bytes .../images/DataFusion-Logo-Background-White.svg | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 datafusion/docs/images/DataFusion-Logo-Background-White.png create mode 100644 datafusion/docs/images/DataFusion-Logo-Background-White.svg diff --git a/README.md b/README.md index 2d4a1c69c2d2a..ded264a003f43 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ # DataFusion - + DataFusion is an extensible query execution framework, written in Rust, that uses [Apache Arrow](https://arrow.apache.org) as its diff --git a/datafusion/docs/images/DataFusion-Logo-Background-White.png b/datafusion/docs/images/DataFusion-Logo-Background-White.png new file mode 100644 index 0000000000000000000000000000000000000000..023c2373fc4949037ca1e11bfb72e2e469a0bda7 GIT binary patch literal 12401 zcmX9_1vp;s8{USQHZvU`j&3$RZG4(B&2%>#Gfa0Kd^*R}Fx|1~nC_nL&Z+ zujfGBv{^-#oEVzuM>Z3=G;n2^y&z?e`!yY(m(6^^Xa?+BI z?jOD~oAP2GJ>sB)ONyzv&FrP2&8d&xbt(|=tFt9AOh!|W$bi{ga^IOLGj^KO zDlTTOcW-yAyxLC4hAPYlQ{AeYJ+Jq@c8Y60{z-mqeZBrC#hXyv1|DQPoGmqbcV5%- zvLfx%g8P*hh4oCOrQqk&&6wEmh{fN7&UHtfq|SRgJWi|KXs@!~s_o`U@?+;3K9%zQ+x#XMe%5@w&kYbGu{WKnh8UB( z42do0Wq5HZMTmU_O^UUv+6fK5Ja^S8Hwj7PG!7(sZ56jUUX)O~o^0)3g~@!jBrbFN z@9YH%3W^V3&&7fV`>SdRY<|f*`#Pt!_VhO%U-%ckE2OlCzGT+5#}U5#VDZJi`N~Xp 
z_k$eBJJrBdXwXkvF}$fKyTvVsVljBRGZum+=gf)dX#4sYX>}@9l4BXgsH5G*T@z9X zqznn8dnFgMT!f4b0gB=0rgY$cHuJ!Hjv0DIQ^m*)0x>^T(U_Gf*Tw-E`f8i$Y)poV zTtCM+(&*xvmH<=i%2Ldb}pgX?%1iQD`1{9XG(FUdy%u6?-&h zSPH)Xo!3a!dp59<)mlFwVV&n<=zisG&% z=^{OsKYe=t3F)A)*m9*|$|AB5r;8=i(5?lXkvMey!rNgOY{LBP!-9HMB+RT zQ%e(alv_CO*^JF~esWJ3?^LcgMN-<|m||RR^IMTBDN)3`8GzBhL`W+9bcq2b13c0f zDdQJctX^vJgiZJ1$5scn)@vIPRnf8b z2^I1+qEc@bs;6A29%`(X6!^9LDSd#A&O~on{f-N@mzqgS8)Rq}EyyjQf#ld{^gd7N=eojTy+n&uBE-mpZ zihHe}lY~l7~*<&G^p# z!s3|zq(l;pRgQ53qMtA3oR#`{IpxI3H@4z|t*E?$M_hR#Gw&FT7kSF;&n0q!iQT=7 zgt!-|t)H2enV;rn8AmkV^Eu{A#l4Ia z&^%91U^q&e(3hjf;xyCeNRk1ESt!dRfyYPj%q6wIUH@fhQkndgYJ?C{&Qj~ipa^^X z5ZaCKaW~EK`K&!57Rf8+2_ykqmKK^fDY0*y-#3vxsF>< ze)1tGLnm!iQ&xr|x^hs-^eQ?Yux#OXyX>*)l%N*iY1a%1q5UQW5p?!SW4^ zbD`^Kj0R+t{dtz-&7Iw)LwsFFP3E%{v}q1zDL*I9R*8?9)(zN-=N&HxOW=TgH}>yy z6t9@b;UfcjeBdf1mrMRxp()^}Ns`W`Xpfslx<=A39?SU09x1@g%!lbY);uY96{PCu z`C3b9f89;YQt^e?Fxq+W4R7I4VFKYkb7|wTx%dywbLooqw2M`*>4q}$K6)RR(#wZ| zL44@?mwzFGW{wdX1avzOP@*|0L7$ z)|9E?sh0uQFHoAAg}8I|<##T0-k-L#M2=YWotW*#CRr~Kmg=yAv~4`?9U)B;anDRP z(=|l_JAI4(n`zj1W@w+J@A0e9)3yUnI%3pkez+5*2JC%~iC9vFKDE=Z3<6f|V5ky} zCrrmFD%(Wyx!GuGaag6F)Ug~83DcQ z4g&HW4ptOuvoMIRs6w1`8=3vtVxST0_@2U0@jMXtwiG3D7hMH)Nu0(o!_f+JCHIq) z#bBbL-vP-=*|K9|0vcviD_v0u?C1kgrg(b&aKhaJ;4tLguQN59Ox#TO#?0cAKPHB^4#oDmp(~+M z8Y3FFq$!oGez1U;+g}LQyn=?)(F2*cF;j_@+>E@ml4@^W+0ALray}mju6aa3-G=U4 zd+m#YQ5g|%_}sMvYKkyN@S~CX^vqi7MgQ0cv8K0+$NV1U_mGn$^2b%JvKlw(ikp6zPAqlPa`Jd6e;;=_U>GJsD zQh5=6*vgLvU3*Ul1Rg{V&s)1gOt988^?ep2u(kMQ18$TVCLbr0*(j(7k7BsR%z0kS zKb+Dx91>B@#Qug5&6xA{Kn17a^=U2;_<4=Lapwc z#7!f(oPcyH-V6Q*t%fDGQH;zStw5i+`RGC9^BZp&WcSBs$zFp%8zaf^vV+II*85a>EH>pGsIwp*O-$zr3tx11U{0V zn(sbIW59Bn;n9%9qn;-Ld)@K0Gt$^%9i!YOFB+Be&wfCCur8+#o}n2XbjwD#+$U|N zb2zZ-7>fzFqvej}x{Y+Xt;;gK4hFlC9JrQyDRj9qa?v5K%y+X8>CXwtIO8JD%DO1z zt|hyn+G&zPi*jw(==(D(Ko{#l{16Sy`x@=4`R)pyw{2mIqnfl>j~Bk=HW7nQ9R6+l z?@yk%Ou$B#keB14Em+K3Onjy&jz%S;(Y@0(Th*|1Fn`FGi+y=eF3dc>_ODs^%)D*x zvceI|jjTk8*g-l;$nB8mVf}`Pk%uV2Bon<|!C-G(x;&_5A;J4ED900pJ?oA6hq_~E z6t<%NTY#8%DTGAT&?A)}p&b13L+vAw6TQDm<#+B6K!d^UI8&zk5%qtlJNFR6D?D;V mju(Mgq+Yy}fadHy(vAE|h(j5=67)X=kKj^@k|p9FeE$b|>@&>( literal 0 HcmV?d00001 diff --git a/datafusion/docs/images/DataFusion-Logo-Background-White.svg b/datafusion/docs/images/DataFusion-Logo-Background-White.svg new file mode 100644 index 0000000000000..026c70855a71f --- /dev/null +++ b/datafusion/docs/images/DataFusion-Logo-Background-White.svg @@ -0,0 +1 @@ +DataFUSION-Logo-Dark \ No newline at end of file From 69e64c4b5ae279b5a615318c5e38e565d29a472e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 12 May 2021 15:18:44 -0600 Subject: [PATCH 090/329] Remove references to pre-built Docker images in ballistacompute repo (#326) --- benchmarks/docker-compose.yaml | 6 ++--- benchmarks/tpch-gen.sh | 4 ++-- ...build-rust.sh => build-ballista-docker.sh} | 3 ++- dev/build-rust-base.sh | 22 ------------------- dev/build-ui.sh | 2 +- ...se.dockerfile => ballista-base.dockerfile} | 0 ...rfile => ballista-scheduler-ui.dockerfile} | 0 .../{rust.dockerfile => ballista.dockerfile} | 4 ++-- dev/integration-tests.sh | 3 +-- 9 files changed, 11 insertions(+), 33 deletions(-) rename dev/{build-rust.sh => build-ballista-docker.sh} (83%) delete mode 100755 dev/build-rust-base.sh rename dev/docker/{rust-base.dockerfile => ballista-base.dockerfile} (100%) rename dev/docker/{ui.scheduler.dockerfile => ballista-scheduler-ui.dockerfile} (100%) rename dev/docker/{rust.dockerfile => ballista.dockerfile} (97%) diff --git a/benchmarks/docker-compose.yaml b/benchmarks/docker-compose.yaml index 
bbb31078cf0a5..c13e9eb48c884 100644 --- a/benchmarks/docker-compose.yaml +++ b/benchmarks/docker-compose.yaml @@ -20,7 +20,7 @@ services: image: quay.io/coreos/etcd:v3.4.9 command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379" ballista-scheduler: - image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT + image: ballista:0.5.0-SNAPSHOT command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --port 50050" environment: - RUST_LOG=ballista=debug @@ -29,7 +29,7 @@ services: depends_on: - etcd ballista-executor: - image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT + image: ballista:0.5.0-SNAPSHOT command: "/executor --bind-host 0.0.0.0 --port 50051 --scheduler-host ballista-scheduler" scale: 2 environment: @@ -39,7 +39,7 @@ services: depends_on: - ballista-scheduler ballista-client: - image: ballistacompute/ballista-rust:0.5.0-SNAPSHOT + image: ballista:0.5.0-SNAPSHOT command: "/bin/sh" # do nothing working_dir: /ballista/benchmarks/tpch environment: diff --git a/benchmarks/tpch-gen.sh b/benchmarks/tpch-gen.sh index 676c0e7df52bf..a46f65111069e 100755 --- a/benchmarks/tpch-gen.sh +++ b/benchmarks/tpch-gen.sh @@ -21,7 +21,7 @@ pushd .. . ./dev/build-set-env.sh popd -docker build -t ballistacompute/ballista-tpchgen:$BALLISTA_VERSION -f tpchgen.dockerfile . +docker build -t ballista-tpchgen:$BALLISTA_VERSION -f tpchgen.dockerfile . # Generate data into the ./data directory if it does not already exist FILE=./data/supplier.tbl @@ -29,6 +29,6 @@ if test -f "$FILE"; then echo "$FILE exists." else mkdir data 2>/dev/null - docker run -v `pwd`/data:/data -it --rm ballistacompute/ballista-tpchgen:$BALLISTA_VERSION + docker run -v `pwd`/data:/data -it --rm ballista-tpchgen:$BALLISTA_VERSION ls -l data fi \ No newline at end of file diff --git a/dev/build-rust.sh b/dev/build-ballista-docker.sh similarity index 83% rename from dev/build-rust.sh rename to dev/build-ballista-docker.sh index 479cb2a135fbb..bc028da9e716a 100755 --- a/dev/build-rust.sh +++ b/dev/build-ballista-docker.sh @@ -20,4 +20,5 @@ set -e . ./dev/build-set-env.sh -docker build -t ballistacompute/ballista-rust:$BALLISTA_VERSION -f dev/docker/rust.dockerfile . +docker build -t ballista-base:$BALLISTA_VERSION -f dev/docker/ballista-base.dockerfile . +docker build -t ballista:$BALLISTA_VERSION -f dev/docker/ballista.dockerfile . diff --git a/dev/build-rust-base.sh b/dev/build-rust-base.sh deleted file mode 100755 index f2a4cc32385bd..0000000000000 --- a/dev/build-rust-base.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -set -e - -. ./dev/build-set-env.sh -docker build -t ballistacompute/rust-base:$BALLISTA_VERSION -f dev/docker/rust-base.dockerfile . 
diff --git a/dev/build-ui.sh b/dev/build-ui.sh index bb5bff3d180a5..070839702500e 100755 --- a/dev/build-ui.sh +++ b/dev/build-ui.sh @@ -20,4 +20,4 @@ set -e . ./dev/build-set-env.sh -docker build -t ballistacompute/ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ui.scheduler.dockerfile ballista/ui/scheduler +docker build -t ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ballista-scheduler-ui.dockerfile ballista/ui/scheduler diff --git a/dev/docker/rust-base.dockerfile b/dev/docker/ballista-base.dockerfile similarity index 100% rename from dev/docker/rust-base.dockerfile rename to dev/docker/ballista-base.dockerfile diff --git a/dev/docker/ui.scheduler.dockerfile b/dev/docker/ballista-scheduler-ui.dockerfile similarity index 100% rename from dev/docker/ui.scheduler.dockerfile rename to dev/docker/ballista-scheduler-ui.dockerfile diff --git a/dev/docker/rust.dockerfile b/dev/docker/ballista.dockerfile similarity index 97% rename from dev/docker/rust.dockerfile rename to dev/docker/ballista.dockerfile index 045300126359b..59f21b76d411b 100644 --- a/dev/docker/rust.dockerfile +++ b/dev/docker/ballista.dockerfile @@ -22,7 +22,7 @@ # as a mounted directory. ARG RELEASE_FLAG=--release -FROM ballistacompute/rust-base:0.5.0-SNAPSHOT AS base +FROM ballista-base:0.5.0-SNAPSHOT AS base WORKDIR /tmp/ballista RUN apt-get -y install cmake RUN cargo install cargo-chef @@ -76,7 +76,7 @@ ENV RELEASE_FLAG=${RELEASE_FLAG} RUN if [ -z "$RELEASE_FLAG" ]; then mv /tmp/ballista/target/debug/tpch /tpch; else mv /tmp/ballista/target/release/tpch /tpch; fi # Copy the binary into a new container for a smaller docker image -FROM ballistacompute/rust-base:0.5.0-SNAPSHOT +FROM ballista-base:0.5.0-SNAPSHOT COPY --from=builder /executor / diff --git a/dev/integration-tests.sh b/dev/integration-tests.sh index 06ab108c2931c..3c1306319c369 100755 --- a/dev/integration-tests.sh +++ b/dev/integration-tests.sh @@ -17,8 +17,7 @@ # specific language governing permissions and limitations # under the License. set -e -./dev/build-rust-base.sh -./dev/build-rust.sh +./dev/build-ballista-docker.sh pushd benchmarks ./tpch-gen.sh From b44238d05094ab0fa0171769ce8b890a0045e1e1 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 14 May 2021 02:38:19 +0800 Subject: [PATCH 091/329] add --quiet/-q flag and allow timing info to be turned on/off (#323) * add print options and allow timing info to be turned on/off * remove self reference * use quiet --- datafusion-cli/src/format.rs | 17 ----- datafusion-cli/src/lib.rs | 56 +++++++++++++++++ datafusion-cli/src/main.rs | 63 +++++++------------ .../src/{format => }/print_format.rs | 0 4 files changed, 77 insertions(+), 59 deletions(-) delete mode 100644 datafusion-cli/src/format.rs create mode 100644 datafusion-cli/src/lib.rs rename datafusion-cli/src/{format => }/print_format.rs (100%) diff --git a/datafusion-cli/src/format.rs b/datafusion-cli/src/format.rs deleted file mode 100644 index c5da78f17951c..0000000000000 --- a/datafusion-cli/src/format.rs +++ /dev/null @@ -1,17 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -pub mod print_format; diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs new file mode 100644 index 0000000000000..5bd16e333030c --- /dev/null +++ b/datafusion-cli/src/lib.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +pub mod print_format; + +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::Result; +use print_format::PrintFormat; +use std::time::Instant; + +#[derive(Debug, Clone)] +pub struct PrintOptions { + pub format: PrintFormat, + pub quiet: bool, +} + +fn print_timing_info(row_count: usize, now: Instant) { + println!( + "{} {} in set. 
Query took {} seconds.", + row_count, + if row_count == 1 { "row" } else { "rows" }, + now.elapsed().as_secs() + ); +} + +impl PrintOptions { + /// print the batches to stdout using the specified format + pub fn print_batches(&self, batches: &[RecordBatch]) -> Result<()> { + let now = Instant::now(); + if batches.is_empty() { + if !self.quiet { + print_timing_info(0, now); + } + } else { + self.format.print_batches(batches)?; + if !self.quiet { + let row_count: usize = batches.iter().map(|b| b.num_rows()).sum(); + print_timing_info(row_count, now); + } + } + Ok(()) + } +} diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 2360d4642484c..f36b5d93d21f6 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -17,19 +17,16 @@ #![allow(bare_trait_objects)] -mod format; - use clap::{crate_version, App, Arg}; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use format::print_format::PrintFormat; +use datafusion_cli::{print_format::PrintFormat, PrintOptions}; use rustyline::Editor; use std::env; use std::fs::File; use std::io::prelude::*; use std::io::BufReader; use std::path::Path; -use std::time::Instant; #[tokio::main] pub async fn main() { @@ -72,6 +69,13 @@ pub async fn main() { .validator(is_valid_format) .takes_value(true), ) + .arg( + Arg::with_name("quite") + .help("Reduce printing other than the results and work quietly") + .short("q") + .long("quiet") + .takes_value(false), + ) .get_matches(); if let Some(path) = matches.value_of("data-path") { @@ -88,26 +92,29 @@ pub async fn main() { execution_config = execution_config.with_batch_size(batch_size); }; - let print_format = matches + let format = matches .value_of("format") .expect("No format is specified") .parse::() .expect("Invalid format"); + let quiet = matches.is_present("quiet"); + let print_options = PrintOptions { format, quiet }; + if let Some(file_path) = matches.value_of("file") { let file = File::open(file_path) .unwrap_or_else(|err| panic!("cannot open file '{}': {}", file_path, err)); let mut reader = BufReader::new(file); - exec_from_lines(&mut reader, execution_config, print_format).await; + exec_from_lines(&mut reader, execution_config, print_options).await; } else { - exec_from_repl(execution_config, print_format).await; + exec_from_repl(execution_config, print_options).await; } } async fn exec_from_lines( reader: &mut BufReader, execution_config: ExecutionConfig, - print_format: PrintFormat, + print_options: PrintOptions, ) { let mut ctx = ExecutionContext::with_config(execution_config); let mut query = "".to_owned(); @@ -121,7 +128,7 @@ async fn exec_from_lines( let line = line.trim_end(); query.push_str(line); if line.ends_with(';') { - match exec_and_print(&mut ctx, print_format.clone(), query).await { + match exec_and_print(&mut ctx, print_options.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -138,14 +145,14 @@ async fn exec_from_lines( // run the left over query if the last statement doesn't contain ‘;’ if !query.is_empty() { - match exec_and_print(&mut ctx, print_format, query).await { + match exec_and_print(&mut ctx, print_options, query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } } } -async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFormat) { +async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintOptions) { let mut ctx = ExecutionContext::with_config(execution_config); let mut rl = Editor::<()>::new(); @@ -163,7 
+170,7 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_format: PrintFo Ok(ref line) if line.trim_end().ends_with(';') => { query.push_str(line.trim_end()); rl.add_history_entry(query.clone()); - match exec_and_print(&mut ctx, print_format.clone(), query).await { + match exec_and_print(&mut ctx, print_options.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -220,39 +227,11 @@ fn is_exit_command(line: &str) -> bool { async fn exec_and_print( ctx: &mut ExecutionContext, - print_format: PrintFormat, + print_options: PrintOptions, sql: String, ) -> Result<()> { - let now = Instant::now(); - let df = ctx.sql(&sql)?; let results = df.collect().await?; - - if results.is_empty() { - println!( - "0 rows in set. Query took {} seconds.", - now.elapsed().as_secs() - ); - return Ok(()); - } - - print_format.print_batches(&results)?; - - let row_count: usize = results.iter().map(|b| b.num_rows()).sum(); - - if row_count > 1 { - println!( - "{} row in set. Query took {} seconds.", - row_count, - now.elapsed().as_secs() - ); - } else { - println!( - "{} rows in set. Query took {} seconds.", - row_count, - now.elapsed().as_secs() - ); - } - + print_options.print_batches(&results)?; Ok(()) } diff --git a/datafusion-cli/src/format/print_format.rs b/datafusion-cli/src/print_format.rs similarity index 100% rename from datafusion-cli/src/format/print_format.rs rename to datafusion-cli/src/print_format.rs From b096539d9670d9547bfd3ae1fb47ef95b2d06f96 Mon Sep 17 00:00:00 2001 From: sathis Date: Fri, 14 May 2021 23:30:09 +0530 Subject: [PATCH 092/329] [Datafusion] NOW() function support (#288) * Add initial implementation of NOW * Run rustfmt * Change incorrect condition * Add timestamp optimizer which optimizes the logical plan and makes sure all now() return same value * Add unit tests & fix alias * Add unit tests & fix alias * Run cargo fmt * Comment out failing test * Optimize the match to fix clippy * Initialize datetime during optimize not creation * Add assertion to compare multiple now() values * Run cargo fmt * Move timestamp to execution props * Add missing prop * Add missing prop * Remove duplicated code * Fix tests & format * Fix clippy * Revert clippy fix * Update datafusion/src/execution/context.rs Co-authored-by: Andrew Lamb * Fix review comments. Move timestamp evaluation logic to constant_folding.rs * Pass ExecutionProps to scalar functions * Revert "Pass ExecutionProps to scalar functions" This reverts commit d9cb005df4a4c1bf05b18b5d9a1aefc4f9e706bb. 
* Add closure approach from @alamb * Re-enable concat test * Changing Option> to DateTime Co-authored-by: Sathis Kumar Co-authored-by: Andrew Lamb --- .../src/serde/physical_plan/from_proto.rs | 6 +- datafusion/src/execution/context.rs | 128 ++++++++++++------ datafusion/src/optimizer/constant_folding.rs | 107 ++++++++++++++- datafusion/src/optimizer/eliminate_limit.rs | 13 +- datafusion/src/optimizer/filter_push_down.rs | 7 +- .../src/optimizer/hash_build_probe_order.rs | 17 ++- datafusion/src/optimizer/limit_push_down.rs | 7 +- datafusion/src/optimizer/optimizer.rs | 7 +- .../src/optimizer/projection_push_down.rs | 41 +++++- datafusion/src/optimizer/utils.rs | 15 +- .../src/physical_plan/datetime_expressions.rs | 19 ++- datafusion/src/physical_plan/functions.rs | 55 ++++++-- datafusion/src/physical_plan/parquet.rs | 19 +-- datafusion/src/physical_plan/planner.rs | 23 ++-- datafusion/src/physical_plan/type_coercion.rs | 8 ++ datafusion/tests/sql.rs | 49 ++++++- datafusion/tests/user_defined_plan.rs | 11 +- 17 files changed, 415 insertions(+), 117 deletions(-) diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 6a33c6a43f628..9c35c9d889411 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -33,7 +33,9 @@ use arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, }; -use datafusion::execution::context::{ExecutionConfig, ExecutionContextState}; +use datafusion::execution::context::{ + ExecutionConfig, ExecutionContextState, ExecutionProps, +}; use datafusion::logical_plan::{DFSchema, Expr}; use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; use datafusion::physical_plan::expressions::col; @@ -226,6 +228,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { var_provider: Default::default(), aggregate_functions: Default::default(), config: ExecutionConfig::new(), + execution_props: ExecutionProps::new(), }; let input_schema = hash_agg @@ -391,6 +394,7 @@ fn compile_expr( var_provider: HashMap::new(), aggregate_functions: HashMap::new(), config: ExecutionConfig::new(), + execution_props: ExecutionProps::new(), }; let expr: Expr = expr.try_into()?; df_planner diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index b53f7c15e3aac..9c7a6217d7d93 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -74,6 +74,7 @@ use crate::sql::{ }; use crate::variable::{VarProvider, VarType}; use crate::{dataframe::DataFrame, physical_plan::udaf::AggregateUDF}; +use chrono::{DateTime, Utc}; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; @@ -159,6 +160,7 @@ impl ExecutionContext { var_provider: HashMap::new(), aggregate_functions: HashMap::new(), config, + execution_props: ExecutionProps::new(), })), } } @@ -454,12 +456,16 @@ impl ExecutionContext { /// Optimizes the logical plan by applying optimizer rules. 
pub fn optimize(&self, plan: &LogicalPlan) -> Result { - let optimizers = &self.state.lock().unwrap().config.optimizers; + let state = &mut self.state.lock().unwrap(); + let execution_props = &mut state.execution_props.clone(); + let optimizers = &state.config.optimizers; + + let execution_props = execution_props.start_execution(); let mut new_plan = plan.clone(); debug!("Logical plan:\n {:?}", plan); for optimizer in optimizers { - new_plan = optimizer.optimize(&new_plan)?; + new_plan = optimizer.optimize(&new_plan, execution_props)?; } debug!("Optimized logical plan:\n {:?}", new_plan); Ok(new_plan) @@ -470,7 +476,9 @@ impl ExecutionContext { &self, logical_plan: &LogicalPlan, ) -> Result> { - let state = self.state.lock().unwrap(); + let mut state = self.state.lock().unwrap(); + state.execution_props.start_execution(); + state .config .query_planner @@ -740,6 +748,15 @@ impl ExecutionConfig { } } +/// Holds per-execution properties and data (such as starting timestamps, etc). +/// An instance of this struct is created each time a [`LogicalPlan`] is prepared for +/// execution (optimized). If the same plan is optimized multiple times, a new +/// `ExecutionProps` is created each time. +#[derive(Clone)] +pub struct ExecutionProps { + pub(crate) query_execution_start_time: DateTime, +} + /// Execution context for registering data sources and executing queries #[derive(Clone)] pub struct ExecutionContextState { @@ -753,9 +770,38 @@ pub struct ExecutionContextState { pub aggregate_functions: HashMap>, /// Context configuration pub config: ExecutionConfig, + /// Execution properties + pub execution_props: ExecutionProps, +} + +impl ExecutionProps { + /// Creates a new execution props + pub fn new() -> Self { + ExecutionProps { + query_execution_start_time: chrono::Utc::now(), + } + } + + /// Marks the execution of query started timestamp + pub fn start_execution(&mut self) -> &Self { + self.query_execution_start_time = chrono::Utc::now(); + &*self + } } impl ExecutionContextState { + /// Returns new ExecutionContextState + pub fn new() -> Self { + ExecutionContextState { + catalog_list: Arc::new(MemoryCatalogList::new()), + scalar_functions: HashMap::new(), + var_provider: HashMap::new(), + aggregate_functions: HashMap::new(), + config: ExecutionConfig::new(), + execution_props: ExecutionProps::new(), + } + } + fn resolve_table_ref<'a>( &'a self, table_ref: impl Into>, @@ -1507,7 +1553,7 @@ mod tests { "+-------------------------+-------------------------+-------------------------+---------------------+", "| 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10.432 | 2021-01-01 05:11:10 |", "+-------------------------+-------------------------+-------------------------+---------------------+", -]; + ]; assert_batches_sorted_eq!(expected, &results); Ok(()) @@ -1633,7 +1679,7 @@ mod tests { let results = plan_and_collect( &mut ctx, - "SELECT date_trunc('week', t1) as week, SUM(c2) FROM test GROUP BY date_trunc('week', t1)" + "SELECT date_trunc('week', t1) as week, SUM(c2) FROM test GROUP BY date_trunc('week', t1)", ).await?; assert_eq!(results.len(), 1); @@ -1881,16 +1927,15 @@ mod tests { let results = run_count_distinct_integers_aggregated_scenario(partitions).await?; assert_eq!(results.len(), 1); - let expected = vec! 
-[ - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", - "| c_group | COUNT(c_uint64) | COUNT(DISTINCT c_int8) | COUNT(DISTINCT c_int16) | COUNT(DISTINCT c_int32) | COUNT(DISTINCT c_int64) | COUNT(DISTINCT c_uint8) | COUNT(DISTINCT c_uint16) | COUNT(DISTINCT c_uint32) | COUNT(DISTINCT c_uint64) |", - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", - "| a | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", - "| b | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", - "| c | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", -]; + let expected = vec![ + "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + "| c_group | COUNT(c_uint64) | COUNT(DISTINCT c_int8) | COUNT(DISTINCT c_int16) | COUNT(DISTINCT c_int32) | COUNT(DISTINCT c_int64) | COUNT(DISTINCT c_uint8) | COUNT(DISTINCT c_uint16) | COUNT(DISTINCT c_uint32) | COUNT(DISTINCT c_uint64) |", + "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + "| a | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", + "| b | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", + "| c | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |", + "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + ]; assert_batches_sorted_eq!(expected, &results); Ok(()) @@ -1910,14 +1955,14 @@ mod tests { assert_eq!(results.len(), 1); let expected = vec![ - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", - "| c_group | COUNT(c_uint64) | COUNT(DISTINCT c_int8) | COUNT(DISTINCT c_int16) | COUNT(DISTINCT c_int32) | COUNT(DISTINCT c_int64) | COUNT(DISTINCT c_uint8) | COUNT(DISTINCT c_uint16) | COUNT(DISTINCT c_uint32) | COUNT(DISTINCT c_uint64) |", - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", - "| a | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |", - "| b | 5 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |", - "| c | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", - "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", -]; + 
"+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + "| c_group | COUNT(c_uint64) | COUNT(DISTINCT c_int8) | COUNT(DISTINCT c_int16) | COUNT(DISTINCT c_int32) | COUNT(DISTINCT c_int64) | COUNT(DISTINCT c_uint8) | COUNT(DISTINCT c_uint16) | COUNT(DISTINCT c_uint32) | COUNT(DISTINCT c_uint64) |", + "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + "| a | 5 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |", + "| b | 5 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |", + "| c | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", + "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", + ]; assert_batches_sorted_eq!(expected, &results); Ok(()) @@ -2311,6 +2356,7 @@ mod tests { } Ok(()) } + #[test] fn ctx_sql_should_optimize_plan() -> Result<()> { let mut ctx = ExecutionContext::new(); @@ -2844,13 +2890,11 @@ mod tests { .await .unwrap(); let expected = vec![ - - "+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", - "| table_catalog | table_schema | table_name | column_name | ordinal_position | column_default | is_nullable | data_type | character_maximum_length | character_octet_length | numeric_precision | numeric_precision_radix | numeric_scale | datetime_precision | interval_type |", - "+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", - "| datafusion | public | t | i | 0 | | YES | Int32 | | | 32 | 2 | | | |", - "+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", - + "+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", + "| table_catalog | table_schema | table_name | column_name | ordinal_position | column_default | is_nullable | data_type | character_maximum_length | character_octet_length | numeric_precision | numeric_precision_radix | numeric_scale | datetime_precision | interval_type |", + "+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", + "| datafusion | public | t | i | 0 | | YES | Int32 | | | 32 | 2 | | | |", + 
"+---------------+--------------+------------+-------------+------------------+----------------+-------------+-----------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", ]; assert_batches_sorted_eq!(expected, &result); @@ -2984,18 +3028,18 @@ mod tests { .unwrap(); let expected = vec![ - "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", - "| table_catalog | table_schema | table_name | column_name | ordinal_position | column_default | is_nullable | data_type | character_maximum_length | character_octet_length | numeric_precision | numeric_precision_radix | numeric_scale | datetime_precision | interval_type |", - "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", - "| my_catalog | my_schema | t1 | i | 0 | | YES | Int32 | | | 32 | 2 | | | |", - "| my_catalog | my_schema | t2 | binary_col | 4 | | NO | Binary | | 2147483647 | | | | | |", - "| my_catalog | my_schema | t2 | float64_col | 1 | | YES | Float64 | | | 24 | 2 | | | |", - "| my_catalog | my_schema | t2 | int32_col | 0 | | NO | Int32 | | | 32 | 2 | | | |", - "| my_catalog | my_schema | t2 | large_binary_col | 5 | | NO | LargeBinary | | 9223372036854775807 | | | | | |", - "| my_catalog | my_schema | t2 | large_utf8_col | 3 | | NO | LargeUtf8 | | 9223372036854775807 | | | | | |", - "| my_catalog | my_schema | t2 | timestamp_nanos | 6 | | NO | Timestamp(Nanosecond, None) | | | | | | | |", - "| my_catalog | my_schema | t2 | utf8_col | 2 | | YES | Utf8 | | 2147483647 | | | | | |", - "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", + "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", + "| table_catalog | table_schema | table_name | column_name | ordinal_position | column_default | is_nullable | data_type | character_maximum_length | character_octet_length | numeric_precision | numeric_precision_radix | numeric_scale | datetime_precision | interval_type |", + "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", + "| my_catalog | my_schema | t1 | i | 0 | | YES | Int32 | | | 32 | 2 | | | |", + "| my_catalog | my_schema | t2 | binary_col | 4 | | NO | Binary | | 2147483647 | | | | | |", + "| my_catalog | my_schema | t2 | float64_col | 1 | | YES | Float64 | | | 24 | 2 | | | |", + "| my_catalog | my_schema | t2 | int32_col | 0 | 
| NO | Int32 | | | 32 | 2 | | | |", + "| my_catalog | my_schema | t2 | large_binary_col | 5 | | NO | LargeBinary | | 9223372036854775807 | | | | | |", + "| my_catalog | my_schema | t2 | large_utf8_col | 3 | | NO | LargeUtf8 | | 9223372036854775807 | | | | | |", + "| my_catalog | my_schema | t2 | timestamp_nanos | 6 | | NO | Timestamp(Nanosecond, None) | | | | | | | |", + "| my_catalog | my_schema | t2 | utf8_col | 2 | | YES | Utf8 | | 2147483647 | | | | | |", + "+---------------+--------------+------------+------------------+------------------+----------------+-------------+-----------------------------+--------------------------+------------------------+-------------------+-------------------------+---------------+--------------------+---------------+", ]; assert_batches_sorted_eq!(expected, &result); } diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 71c84f6153b62..51bf0ce1b5054 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -23,9 +23,11 @@ use std::sync::Arc; use arrow::datatypes::DataType; use crate::error::Result; +use crate::execution::context::ExecutionProps; use crate::logical_plan::{DFSchemaRef, Expr, ExprRewriter, LogicalPlan, Operator}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; +use crate::physical_plan::functions::BuiltinScalarFunction; use crate::scalar::ScalarValue; /// Optimizer that simplifies comparison expressions involving boolean literals. @@ -47,7 +49,11 @@ impl ConstantFolding { } impl OptimizerRule for ConstantFolding { - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { // We need to pass down the all schemas within the plan tree to `optimize_expr` in order to // to evaluate expression types. For example, a projection plan's schema will only include // projected columns. With just the projected schema, it's not possible to infer types for @@ -55,12 +61,13 @@ impl OptimizerRule for ConstantFolding { // children plans. let mut rewriter = ConstantRewriter { schemas: plan.all_schemas(), + execution_props, }; match plan { LogicalPlan::Filter { predicate, input } => Ok(LogicalPlan::Filter { predicate: predicate.clone().rewrite(&mut rewriter)?, - input: Arc::new(self.optimize(input)?), + input: Arc::new(self.optimize(input, execution_props)?), }), // Rest: recurse into plan, apply optimization where possible LogicalPlan::Projection { .. } @@ -78,7 +85,7 @@ impl OptimizerRule for ConstantFolding { let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| self.optimize(plan)) + .map(|plan| self.optimize(plan, execution_props)) .collect::>>()?; let expr = plan @@ -103,6 +110,7 @@ impl OptimizerRule for ConstantFolding { struct ConstantRewriter<'a> { /// input schemas schemas: Vec<&'a DFSchemaRef>, + execution_props: &'a ExecutionProps, } impl<'a> ConstantRewriter<'a> { @@ -200,6 +208,14 @@ impl<'a> ExprRewriter for ConstantRewriter<'a> { Expr::Not(inner) } } + Expr::ScalarFunction { + fun: BuiltinScalarFunction::Now, + .. 
+ } => Expr::Literal(ScalarValue::TimestampNanosecond(Some( + self.execution_props + .query_execution_start_time + .timestamp_nanos(), + ))), expr => { // no rewrite possible expr @@ -217,6 +233,7 @@ mod tests { }; use arrow::datatypes::*; + use chrono::{DateTime, Utc}; fn test_table_scan() -> Result { let schema = Schema::new(vec![ @@ -243,6 +260,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; assert_eq!( @@ -258,6 +276,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; // x = null is always null @@ -293,6 +312,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; assert_eq!(col("c2").get_type(&schema)?, DataType::Boolean); @@ -323,6 +343,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; // When one of the operand is not of boolean type, folding the other boolean constant will @@ -362,6 +383,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; assert_eq!(col("c2").get_type(&schema)?, DataType::Boolean); @@ -397,6 +419,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; // when one of the operand is not of boolean type, folding the other boolean constant will @@ -432,6 +455,7 @@ mod tests { let schema = expr_test_schema(); let mut rewriter = ConstantRewriter { schemas: vec![&schema], + execution_props: &ExecutionProps::new(), }; assert_eq!( @@ -459,7 +483,9 @@ mod tests { fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = ConstantFolding::new(); - let optimized_plan = rule.optimize(plan).expect("failed to optimize plan"); + let optimized_plan = rule + .optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); } @@ -589,4 +615,77 @@ mod tests { assert_optimized_plan_eq(&plan, expected); Ok(()) } + + fn get_optimized_plan_formatted( + plan: &LogicalPlan, + date_time: &DateTime, + ) -> String { + let rule = ConstantFolding::new(); + let execution_props = ExecutionProps { + query_execution_start_time: *date_time, + }; + + let optimized_plan = rule + .optimize(plan, &execution_props) + .expect("failed to optimize plan"); + return format!("{:?}", optimized_plan); + } + + #[test] + fn single_now_expr() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::ScalarFunction { + args: vec![], + fun: BuiltinScalarFunction::Now, + }]; + let time = chrono::Utc::now(); + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = format!( + "Projection: TimestampNanosecond({})\ + \n TableScan: test projection=None", + time.timestamp_nanos() + ); + let actual = get_optimized_plan_formatted(&plan, &time); + + assert_eq!(expected, actual); + } + + #[test] + fn multiple_now_expr() { + let table_scan = test_table_scan().unwrap(); + let time = chrono::Utc::now(); + let proj = vec![ + Expr::ScalarFunction { + args: vec![], + fun: BuiltinScalarFunction::Now, + }, + 
Expr::Alias( + Box::new(Expr::ScalarFunction { + args: vec![], + fun: BuiltinScalarFunction::Now, + }), + "t2".to_string(), + ), + ]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let actual = get_optimized_plan_formatted(&plan, &time); + let expected = format!( + "Projection: TimestampNanosecond({}), TimestampNanosecond({}) AS t2\ + \n TableScan: test projection=None", + time.timestamp_nanos(), + time.timestamp_nanos() + ); + + assert_eq!(actual, expected); + } } diff --git a/datafusion/src/optimizer/eliminate_limit.rs b/datafusion/src/optimizer/eliminate_limit.rs index 87b33d6f5d5bc..1b965f1d02e42 100644 --- a/datafusion/src/optimizer/eliminate_limit.rs +++ b/datafusion/src/optimizer/eliminate_limit.rs @@ -22,6 +22,7 @@ use crate::logical_plan::LogicalPlan; use crate::optimizer::optimizer::OptimizerRule; use super::utils; +use crate::execution::context::ExecutionProps; /// Optimization rule that replaces LIMIT 0 with an [LogicalPlan::EmptyRelation] pub struct EliminateLimit; @@ -34,7 +35,11 @@ impl EliminateLimit { } impl OptimizerRule for EliminateLimit { - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { match plan { LogicalPlan::Limit { n, input } if *n == 0 => { Ok(LogicalPlan::EmptyRelation { @@ -50,7 +55,7 @@ impl OptimizerRule for EliminateLimit { let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| self.optimize(plan)) + .map(|plan| self.optimize(plan, execution_props)) .collect::>>()?; utils::from_plan(plan, &expr, &new_inputs) @@ -72,7 +77,9 @@ mod tests { fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = EliminateLimit::new(); - let optimized_plan = rule.optimize(plan).expect("failed to optimize plan"); + let optimized_plan = rule + .optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); assert_eq!(plan.schema(), optimized_plan.schema()); diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 356d497491a14..4c248e2b6483d 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -15,6 +15,7 @@ //! 
Filter Push Down optimizer rule ensures that filters are applied as early as possible in the plan use crate::datasource::datasource::TableProviderFilterPushDown; +use crate::execution::context::ExecutionProps; use crate::logical_plan::{and, LogicalPlan}; use crate::logical_plan::{DFSchema, Expr}; use crate::optimizer::optimizer::OptimizerRule; @@ -413,7 +414,7 @@ impl OptimizerRule for FilterPushDown { "filter_push_down" } - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize(&self, plan: &LogicalPlan, _: &ExecutionProps) -> Result { optimize(plan, State::default()) } } @@ -456,7 +457,9 @@ mod tests { fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = FilterPushDown::new(); - let optimized_plan = rule.optimize(plan).expect("failed to optimize plan"); + let optimized_plan = rule + .optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); } diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index b27171f058ca9..168c4a17edfd0 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -27,6 +27,7 @@ use crate::optimizer::optimizer::OptimizerRule; use crate::{error::Result, prelude::JoinType}; use super::utils; +use crate::execution::context::ExecutionProps; /// BuildProbeOrder reorders the build and probe phase of /// hash joins. This uses the amount of rows that a datasource has. @@ -106,7 +107,11 @@ impl OptimizerRule for HashBuildProbeOrder { "hash_build_probe_order" } - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { match plan { // Main optimization rule, swaps order of left and right // based on number of rows in each table @@ -117,8 +122,8 @@ impl OptimizerRule for HashBuildProbeOrder { join_type, schema, } => { - let left = self.optimize(left)?; - let right = self.optimize(right)?; + let left = self.optimize(left, execution_props)?; + let right = self.optimize(right, execution_props)?; if should_swap_join_order(&left, &right) { // Swap left and right, change join type and (equi-)join key order Ok(LogicalPlan::Join { @@ -147,8 +152,8 @@ impl OptimizerRule for HashBuildProbeOrder { right, schema, } => { - let left = self.optimize(left)?; - let right = self.optimize(right)?; + let left = self.optimize(left, execution_props)?; + let right = self.optimize(right, execution_props)?; if should_swap_join_order(&left, &right) { // Swap left and right Ok(LogicalPlan::CrossJoin { @@ -184,7 +189,7 @@ impl OptimizerRule for HashBuildProbeOrder { let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| self.optimize(plan)) + .map(|plan| self.optimize(plan, execution_props)) .collect::>>()?; utils::from_plan(plan, &expr, &new_inputs) diff --git a/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs index 73a231f2248f4..e616869d7c4a4 100644 --- a/datafusion/src/optimizer/limit_push_down.rs +++ b/datafusion/src/optimizer/limit_push_down.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use super::utils; use crate::error::Result; +use crate::execution::context::ExecutionProps; use crate::logical_plan::LogicalPlan; use crate::optimizer::optimizer::OptimizerRule; @@ -125,7 +126,7 @@ fn limit_push_down( } impl OptimizerRule for LimitPushDown { - fn optimize(&self, plan: &LogicalPlan) -> 
Result { + fn optimize(&self, plan: &LogicalPlan, _: &ExecutionProps) -> Result { limit_push_down(None, plan) } @@ -143,7 +144,9 @@ mod test { fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let rule = LimitPushDown::new(); - let optimized_plan = rule.optimize(plan).expect("failed to optimize plan"); + let optimized_plan = rule + .optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan"); let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); } diff --git a/datafusion/src/optimizer/optimizer.rs b/datafusion/src/optimizer/optimizer.rs index dee8e06a5e3ba..5cf4047947044 100644 --- a/datafusion/src/optimizer/optimizer.rs +++ b/datafusion/src/optimizer/optimizer.rs @@ -18,6 +18,7 @@ //! Query optimizer traits use crate::error::Result; +use crate::execution::context::ExecutionProps; use crate::logical_plan::LogicalPlan; /// `OptimizerRule` transforms one ['LogicalPlan'] into another which @@ -25,7 +26,11 @@ use crate::logical_plan::LogicalPlan; /// way. pub trait OptimizerRule { /// Rewrite `plan` to an optimized form - fn optimize(&self, plan: &LogicalPlan) -> Result; + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result; /// A human readable name for this optimizer rule fn name(&self) -> &str; diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 7243fa52d9b32..21c9caba3316d 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -19,6 +19,7 @@ //! loaded into memory use crate::error::Result; +use crate::execution::context::ExecutionProps; use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, LogicalPlan, ToDFSchema}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; @@ -32,7 +33,11 @@ use utils::optimize_explain; pub struct ProjectionPushDown {} impl OptimizerRule for ProjectionPushDown { - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { // set of all columns refered by the plan (and thus considered required by the root) let required_columns = plan .schema() @@ -40,7 +45,7 @@ impl OptimizerRule for ProjectionPushDown { .iter() .map(|f| f.name().clone()) .collect::>(); - optimize_plan(self, plan, &required_columns, false) + optimize_plan(self, plan, &required_columns, false, execution_props) } fn name(&self) -> &str { @@ -105,6 +110,7 @@ fn optimize_plan( plan: &LogicalPlan, required_columns: &HashSet, // set of columns required up to this step has_projection: bool, + execution_props: &ExecutionProps, ) -> Result { let mut new_required_columns = required_columns.clone(); match plan { @@ -137,8 +143,13 @@ fn optimize_plan( } })?; - let new_input = - optimize_plan(optimizer, &input, &new_required_columns, true)?; + let new_input = optimize_plan( + optimizer, + &input, + &new_required_columns, + true, + execution_props, + )?; if new_fields.is_empty() { // no need for an expression at all Ok(new_input) @@ -167,12 +178,14 @@ fn optimize_plan( &left, &new_required_columns, true, + execution_props, )?), right: Arc::new(optimize_plan( optimizer, &right, &new_required_columns, true, + execution_props, )?), join_type: *join_type, @@ -226,6 +239,7 @@ fn optimize_plan( &input, &new_required_columns, true, + execution_props, )?), schema: DFSchemaRef::new(new_schema), }) @@ -259,7 +273,14 @@ fn optimize_plan( schema, } => { let schema = 
schema.as_ref().to_owned().into(); - optimize_explain(optimizer, *verbose, &*plan, stringified_plans, &schema) + optimize_explain( + optimizer, + *verbose, + &*plan, + stringified_plans, + &schema, + execution_props, + ) } // all other nodes: Add any additional columns used by // expressions in this node to the list of required columns @@ -281,7 +302,13 @@ fn optimize_plan( let new_inputs = inputs .iter() .map(|plan| { - optimize_plan(optimizer, plan, &new_required_columns, has_projection) + optimize_plan( + optimizer, + plan, + &new_required_columns, + has_projection, + execution_props, + ) }) .collect::>>()?; @@ -538,6 +565,6 @@ mod tests { fn optimize(plan: &LogicalPlan) -> Result { let rule = ProjectionPushDown::new(); - rule.optimize(plan) + rule.optimize(plan, &ExecutionProps::new()) } } diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 0ec3fa7c02a16..9288c65ac4dac 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -22,6 +22,7 @@ use std::{collections::HashSet, sync::Arc}; use arrow::datatypes::Schema; use super::optimizer::OptimizerRule; +use crate::execution::context::ExecutionProps; use crate::logical_plan::{ Expr, LogicalPlan, Operator, Partitioning, PlanType, Recursion, StringifiedPlan, ToDFSchema, @@ -101,11 +102,12 @@ pub fn optimize_explain( plan: &LogicalPlan, stringified_plans: &[StringifiedPlan], schema: &Schema, + execution_props: &ExecutionProps, ) -> Result { // These are the fields of LogicalPlan::Explain It might be nice // to transform that enum Variant into its own struct and avoid // passing the fields individually - let plan = Arc::new(optimizer.optimize(plan)?); + let plan = Arc::new(optimizer.optimize(plan, execution_props)?); let mut stringified_plans = stringified_plans.to_vec(); let optimizer_name = optimizer.name().into(); stringified_plans.push(StringifiedPlan::new( @@ -128,6 +130,7 @@ pub fn optimize_explain( pub fn optimize_children( optimizer: &impl OptimizerRule, plan: &LogicalPlan, + execution_props: &ExecutionProps, ) -> Result { if let LogicalPlan::Explain { verbose, @@ -142,6 +145,7 @@ pub fn optimize_children( &*plan, stringified_plans, &schema.as_ref().to_owned().into(), + execution_props, ); } @@ -149,7 +153,7 @@ pub fn optimize_children( let new_inputs = plan .inputs() .into_iter() - .map(|plan| optimizer.optimize(plan)) + .map(|plan| optimizer.optimize(plan, execution_props)) .collect::>>()?; from_plan(plan, &new_exprs, &new_inputs) @@ -443,7 +447,11 @@ mod tests { struct TestOptimizer {} impl OptimizerRule for TestOptimizer { - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + _: &ExecutionProps, + ) -> Result { Ok(plan.clone()) } @@ -465,6 +473,7 @@ mod tests { &empty_plan, &[StringifiedPlan::new(PlanType::LogicalPlan, "...")], schema.as_ref(), + &ExecutionProps::new(), )?; match &optimized_explain { diff --git a/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs index 7b5816186f27e..ec52e6bc4d528 100644 --- a/datafusion/src/physical_plan/datetime_expressions.rs +++ b/datafusion/src/physical_plan/datetime_expressions.rs @@ -268,6 +268,23 @@ pub fn to_timestamp(args: &[ColumnarValue]) -> Result { ) } +/// Create an implementation of `now()` that always returns the +/// specified timestamp. +/// +/// The semantics of `now()` require it to return the same value +/// whenever it is called in a query. 
This this value is chosen during +/// planning time and bound into a closure that +pub fn make_now( + now_ts: DateTime, +) -> impl Fn(&[ColumnarValue]) -> Result { + let now_ts = Some(now_ts.timestamp_nanos()); + move |_arg| { + Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( + now_ts, + ))) + } +} + fn date_trunc_single(granularity: &str, value: i64) -> Result { let value = timestamp_ns_to_datetime(value).with_nanosecond(0); let value = match granularity { @@ -300,7 +317,7 @@ fn date_trunc_single(granularity: &str, value: i64) -> Result { return Err(DataFusionError::Execution(format!( "Unsupported date_trunc granularity: {}", unsupported - ))) + ))); } }; // `with_x(0)` are infalible because `0` are always a valid diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 960d7c5d8e0d7..2e053a80976b0 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -33,6 +33,7 @@ use super::{ type_coercion::{coerce, data_types}, ColumnarValue, PhysicalExpr, }; +use crate::execution::context::ExecutionContextState; use crate::physical_plan::array_expressions; use crate::physical_plan::datetime_expressions; use crate::physical_plan::expressions::{nullif_func, SUPPORTED_NULLIF_TYPES}; @@ -194,6 +195,8 @@ pub enum BuiltinScalarFunction { ToHex, /// to_timestamp ToTimestamp, + ///now + Now, /// translate Translate, /// trim @@ -273,6 +276,7 @@ impl FromStr for BuiltinScalarFunction { "substr" => BuiltinScalarFunction::Substr, "to_hex" => BuiltinScalarFunction::ToHex, "to_timestamp" => BuiltinScalarFunction::ToTimestamp, + "now" => BuiltinScalarFunction::Now, "translate" => BuiltinScalarFunction::Translate, "trim" => BuiltinScalarFunction::Trim, "upper" => BuiltinScalarFunction::Upper, @@ -298,15 +302,6 @@ pub fn return_type( // verify that this is a valid set of data types for this function data_types(&arg_types, &signature(fun))?; - if arg_types.is_empty() { - // functions currently cannot be evaluated without arguments, as they can't - // know the number of rows to return. - return Err(DataFusionError::Plan(format!( - "Function '{}' requires at least one argument", - fun - ))); - } - // the return type of the built in function. // Some built-in functions' return type depends on the incoming type. 
match fun { @@ -582,6 +577,7 @@ pub fn return_type( BuiltinScalarFunction::ToTimestamp => { Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } + BuiltinScalarFunction::Now => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), BuiltinScalarFunction::Translate => Ok(match arg_types[0] { DataType::LargeUtf8 => DataType::LargeUtf8, DataType::Utf8 => DataType::Utf8, @@ -714,6 +710,7 @@ pub fn create_physical_expr( fun: &BuiltinScalarFunction, args: &[Arc], input_schema: &Schema, + ctx_state: &ExecutionContextState, ) -> Result> { let fun_expr: ScalarFunctionImplementation = Arc::new(match fun { // math functions @@ -805,6 +802,22 @@ pub fn create_physical_expr( } BuiltinScalarFunction::DatePart => datetime_expressions::date_part, BuiltinScalarFunction::DateTrunc => datetime_expressions::date_trunc, + BuiltinScalarFunction::Now => { + // bind value for now at plan time + let fun_expr = Arc::new(datetime_expressions::make_now( + ctx_state.execution_props.query_execution_start_time, + )); + + // TODO refactor code to not return here, but instead fall through below + let args = vec![]; + let arg_types = vec![]; // has no args + return Ok(Arc::new(ScalarFunctionExpr::new( + &format!("{}", fun), + fun_expr, + args, + &return_type(&fun, &arg_types)?, + ))); + } BuiltinScalarFunction::InitCap => |args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::initcap::)(args) @@ -1451,13 +1464,14 @@ mod tests { ($FUNC:ident, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty, $DATA_TYPE: ident, $ARRAY_TYPE:ident) => { // used to provide type annotation let expected: Result> = $EXPECTED; + let ctx_state = ExecutionContextState::new(); // any type works here: we evaluate against a literal of `value` let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; let expr = - create_physical_expr(&BuiltinScalarFunction::$FUNC, $ARGS, &schema)?; + create_physical_expr(&BuiltinScalarFunction::$FUNC, $ARGS, &schema, &ctx_state)?; // type is correct assert_eq!(expr.data_type(&schema)?, DataType::$DATA_TYPE); @@ -3618,7 +3632,20 @@ mod tests { #[test] fn test_concat_error() -> Result<()> { - let result = return_type(&BuiltinScalarFunction::Concat, &[]); + let ctx_state = ExecutionContextState::new(); + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + + let expr = create_physical_expr( + &BuiltinScalarFunction::Concat, + &[], + &schema, + &ctx_state, + )?; + + let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; + let result = expr.evaluate(&batch); + if result.is_ok() { Err(DataFusionError::Plan( "Function 'concat' cannot accept zero arguments".to_string(), @@ -3640,11 +3667,13 @@ mod tests { Field::new("b", value2.data_type().clone(), false), ]); let columns: Vec = vec![value1, value2]; + let ctx_state = ExecutionContextState::new(); let expr = create_physical_expr( &BuiltinScalarFunction::Array, &[col("a"), col("b")], &schema, + &ctx_state, )?; // type is correct @@ -3700,6 +3729,7 @@ mod tests { #[cfg(feature = "regex_expressions")] fn test_regexp_match() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); + let ctx_state = ExecutionContextState::new(); // concat(value, value) let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); @@ -3709,6 +3739,7 @@ mod tests { &BuiltinScalarFunction::RegexpMatch, &[col("a"), pattern], &schema, + &ctx_state, )?; 
// type is correct @@ -3737,6 +3768,7 @@ mod tests { #[cfg(feature = "regex_expressions")] fn test_regexp_match_all_literals() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let ctx_state = ExecutionContextState::new(); // concat(value, value) let col_value = lit(ScalarValue::Utf8(Some("aaa-555".to_string()))); @@ -3746,6 +3778,7 @@ mod tests { &BuiltinScalarFunction::RegexpMatch, &[col_value, pattern], &schema, + &ctx_state, )?; // type is correct diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 09dd48df3ed5c..dee0fc89a7a0d 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -21,25 +21,18 @@ use std::fmt; use std::fs::File; use std::sync::Arc; use std::task::{Context, Poll}; -use std::{ - any::Any, - collections::{HashMap, HashSet}, -}; +use std::{any::Any, collections::HashSet}; use super::{ planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, }; -use crate::{ - catalog::catalog::MemoryCatalogList, - physical_plan::{common, ExecutionPlan, Partitioning}, -}; +use crate::physical_plan::{common, ExecutionPlan, Partitioning}; use crate::{ error::{DataFusionError, Result}, execution::context::ExecutionContextState, logical_plan::{Expr, Operator}, optimizer::utils, - prelude::ExecutionConfig, }; use arrow::record_batch::RecordBatch; use arrow::{ @@ -393,13 +386,7 @@ impl RowGroupPredicateBuilder { .map(|(_, _, f)| f.clone()) .collect::>(); let stat_schema = Schema::new(stat_fields); - let execution_context_state = ExecutionContextState { - catalog_list: Arc::new(MemoryCatalogList::new()), - scalar_functions: HashMap::new(), - var_provider: HashMap::new(), - aggregate_functions: HashMap::new(), - config: ExecutionConfig::new(), - }; + let execution_context_state = ExecutionContextState::new(); let predicate_expr = DefaultPhysicalPlanner::default().create_physical_expr( &logical_predicate_expr, &stat_schema, diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index acbb863c604b7..664e4dccbdf9d 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -555,7 +555,12 @@ impl DefaultPhysicalPlanner { .iter() .map(|e| self.create_physical_expr(e, input_schema, ctx_state)) .collect::>>()?; - functions::create_physical_expr(fun, &physical_args, input_schema) + functions::create_physical_expr( + fun, + &physical_args, + input_schema, + ctx_state, + ) } Expr::ScalarUDF { fun, args } => { let mut physical_args = vec![]; @@ -736,13 +741,9 @@ fn tuple_err(value: (Result, Result)) -> Result<(T, R)> { #[cfg(test)] mod tests { use super::*; + use crate::logical_plan::{DFField, DFSchema, DFSchemaRef}; use crate::physical_plan::{csv::CsvReadOptions, expressions, Partitioning}; - use crate::prelude::ExecutionConfig; use crate::scalar::ScalarValue; - use crate::{ - catalog::catalog::MemoryCatalogList, - logical_plan::{DFField, DFSchema, DFSchemaRef}, - }; use crate::{ logical_plan::{col, lit, sum, LogicalPlanBuilder}, physical_plan::SendableRecordBatchStream, @@ -750,16 +751,10 @@ mod tests { use arrow::datatypes::{DataType, Field, SchemaRef}; use async_trait::async_trait; use fmt::Debug; - use std::{any::Any, collections::HashMap, fmt}; + use std::{any::Any, fmt}; fn make_ctx_state() -> ExecutionContextState { - ExecutionContextState { - catalog_list: Arc::new(MemoryCatalogList::new()), - scalar_functions: HashMap::new(), - 
var_provider: HashMap::new(), - aggregate_functions: HashMap::new(), - config: ExecutionConfig::new(), - } + ExecutionContextState::new() } fn plan(logical_plan: &LogicalPlan) -> Result> { diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index d9f84e7cb8622..98ae09cc381dc 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -46,6 +46,10 @@ pub fn coerce( schema: &Schema, signature: &Signature, ) -> Result>> { + if expressions.is_empty() { + return Ok(vec![]); + } + let current_types = expressions .iter() .map(|e| e.data_type(schema)) @@ -68,6 +72,10 @@ pub fn data_types( current_types: &[DataType], signature: &Signature, ) -> Result> { + if current_types.is_empty() { + return Ok(vec![]); + } + let valid_types = get_valid_types(signature, current_types)?; if valid_types diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 4b53e2f2e38c1..c80ffe4d34673 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2342,7 +2342,7 @@ macro_rules! test_expression { let mut ctx = ExecutionContext::new(); let sql = format!("SELECT {}", $SQL); let actual = execute(&mut ctx, sql.as_str()).await; - assert_eq!($EXPECTED, actual[0][0]); + assert_eq!(actual[0][0], $EXPECTED); }; } @@ -2863,6 +2863,53 @@ async fn test_cast_expressions() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_current_timestamp_expressions() -> Result<()> { + let t1 = chrono::Utc::now().timestamp(); + let mut ctx = ExecutionContext::new(); + let actual = execute(&mut ctx, "SELECT NOW(), NOW() as t2").await; + let res1 = actual[0][0].as_str(); + let res2 = actual[0][1].as_str(); + let t3 = chrono::Utc::now().timestamp(); + let t2_naive = + chrono::NaiveDateTime::parse_from_str(res1, "%Y-%m-%d %H:%M:%S%.6f").unwrap(); + + let t2 = t2_naive.timestamp(); + assert!(t1 <= t2 && t2 <= t3); + assert_eq!(res2, res1); + + Ok(()) +} + +#[tokio::test] +async fn test_current_timestamp_expressions_non_optimized() -> Result<()> { + let t1 = chrono::Utc::now().timestamp(); + let ctx = ExecutionContext::new(); + let sql = "SELECT NOW(), NOW() as t2"; + + let msg = format!("Creating logical plan for '{}'", sql); + let plan = ctx.create_logical_plan(sql).expect(&msg); + + let msg = format!("Creating physical plan for '{}': {:?}", sql, plan); + let plan = ctx.create_physical_plan(&plan).expect(&msg); + + let msg = format!("Executing physical plan for '{}': {:?}", sql, plan); + let res = collect(plan).await.expect(&msg); + let actual = result_vec(&res); + + let res1 = actual[0][0].as_str(); + let res2 = actual[0][1].as_str(); + let t3 = chrono::Utc::now().timestamp(); + let t2_naive = + chrono::NaiveDateTime::parse_from_str(res1, "%Y-%m-%d %H:%M:%S%.6f").unwrap(); + + let t2 = t2_naive.timestamp(); + assert!(t1 <= t2 && t2 <= t3); + assert_eq!(res2, res1); + + Ok(()) +} + #[tokio::test] async fn test_cast_expressions_error() -> Result<()> { // sin(utf8) should error diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index f9f24430104c8..5e38c57b6f1bc 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -85,6 +85,7 @@ use std::task::{Context, Poll}; use std::{any::Any, collections::BTreeMap, fmt, sync::Arc}; use async_trait::async_trait; +use datafusion::execution::context::ExecutionProps; use datafusion::logical_plan::DFSchemaRef; /// Execute the specified sql and return the resulting record batches @@ -211,7 +212,11 @@ 
impl QueryPlanner for TopKQueryPlanner { struct TopKOptimizerRule {} impl OptimizerRule for TopKOptimizerRule { // Example rewrite pass to insert a user defined LogicalPlanNode - fn optimize(&self, plan: &LogicalPlan) -> Result { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { // Note: this code simply looks for the pattern of a Limit followed by a // Sort and replaces it by a TopK node. It does not handle many // edge cases (e.g multiple sort columns, sort ASC / DESC), etc. @@ -226,7 +231,7 @@ impl OptimizerRule for TopKOptimizerRule { return Ok(LogicalPlan::Extension { node: Arc::new(TopKPlanNode { k: *n, - input: self.optimize(input.as_ref())?, + input: self.optimize(input.as_ref(), execution_props)?, expr: expr[0].clone(), }), }); @@ -236,7 +241,7 @@ impl OptimizerRule for TopKOptimizerRule { // If we didn't find the Limit/Sort combination, recurse as // normal and build the result. - optimize_children(self, plan) + optimize_children(self, plan, execution_props) } fn name(&self) -> &str { From 9cf32cf2cda8472b87130142c4eee1126d4d9cbe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 14 May 2021 15:01:15 -0400 Subject: [PATCH 093/329] Implement readable explain plans for physical plans (#337) * Implement readable explain plans for physical plans * Add apache copyright to display.rs * Set concurrency explictly in test and make it windows friendly * fix doc example test * fmt! --- datafusion/src/logical_plan/display.rs | 13 +- datafusion/src/logical_plan/plan.rs | 4 +- .../src/physical_plan/coalesce_batches.rs | 19 ++- datafusion/src/physical_plan/cross_join.rs | 19 ++- datafusion/src/physical_plan/csv.rs | 32 +++- datafusion/src/physical_plan/display.rs | 90 ++++++++++ .../src/physical_plan/distinct_expressions.rs | 4 + datafusion/src/physical_plan/empty.rs | 17 +- datafusion/src/physical_plan/explain.rs | 19 ++- .../src/physical_plan/expressions/average.rs | 4 + .../src/physical_plan/expressions/count.rs | 4 + .../src/physical_plan/expressions/min_max.rs | 8 + .../src/physical_plan/expressions/mod.rs | 13 ++ .../src/physical_plan/expressions/sum.rs | 4 + datafusion/src/physical_plan/filter.rs | 16 +- .../src/physical_plan/hash_aggregate.rs | 39 ++++- datafusion/src/physical_plan/hash_join.rs | 21 ++- datafusion/src/physical_plan/limit.rs | 28 +++- datafusion/src/physical_plan/memory.rs | 24 ++- datafusion/src/physical_plan/merge.rs | 15 +- datafusion/src/physical_plan/mod.rs | 154 +++++++++++++++++- datafusion/src/physical_plan/parquet.rs | 28 +++- datafusion/src/physical_plan/planner.rs | 7 +- datafusion/src/physical_plan/projection.rs | 29 +++- datafusion/src/physical_plan/repartition.rs | 14 +- datafusion/src/physical_plan/sort.rs | 15 +- datafusion/src/physical_plan/udaf.rs | 4 + datafusion/tests/custom_sources.rs | 17 +- datafusion/tests/sql.rs | 48 +++++- datafusion/tests/user_defined_plan.rs | 22 ++- 30 files changed, 683 insertions(+), 48 deletions(-) create mode 100644 datafusion/src/physical_plan/display.rs diff --git a/datafusion/src/logical_plan/display.rs b/datafusion/src/logical_plan/display.rs index 76749b547a8f8..f285534fdf1b6 100644 --- a/datafusion/src/logical_plan/display.rs +++ b/datafusion/src/logical_plan/display.rs @@ -29,7 +29,8 @@ pub struct IndentVisitor<'a, 'b> { f: &'a mut fmt::Formatter<'b>, /// If true, includes summarized schema information with_schema: bool, - indent: u32, + /// The current indent + indent: usize, } impl<'a, 'b> IndentVisitor<'a, 'b> { @@ -42,13 +43,6 @@ impl<'a, 'b> IndentVisitor<'a, 
'b> { indent: 0, } } - - fn write_indent(&mut self) -> fmt::Result { - for _ in 0..self.indent { - write!(self.f, " ")?; - } - Ok(()) - } } impl<'a, 'b> PlanVisitor for IndentVisitor<'a, 'b> { @@ -58,8 +52,7 @@ impl<'a, 'b> PlanVisitor for IndentVisitor<'a, 'b> { if self.indent > 0 { writeln!(self.f)?; } - self.write_indent()?; - + write!(self.f, "{:indent$}", "", indent = self.indent * 2)?; write!(self.f, "{}", plan.display())?; if self.with_schema { write!( diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 13509d13eb159..8b9aac9ea73b9 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -356,13 +356,15 @@ pub enum Partitioning { /// after all children have been visited. //// /// To use, define a struct that implements this trait and then invoke -/// "LogicalPlan::accept". +/// [`LogicalPlan::accept`]. /// /// For example, for a logical plan like: /// +/// ```text /// Projection: #id /// Filter: #state Eq Utf8(\"CO\")\ /// CsvScan: employee.csv projection=Some([0, 3])"; +/// ``` /// /// The sequence of visit operations would be: /// ```text diff --git a/datafusion/src/physical_plan/coalesce_batches.rs b/datafusion/src/physical_plan/coalesce_batches.rs index b91e0b672eb58..e25412d9d6b8b 100644 --- a/datafusion/src/physical_plan/coalesce_batches.rs +++ b/datafusion/src/physical_plan/coalesce_batches.rs @@ -25,7 +25,8 @@ use std::task::{Context, Poll}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ - ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, }; use arrow::compute::kernels::concat::concat; @@ -114,6 +115,22 @@ impl ExecutionPlan for CoalesceBatchesExec { is_closed: false, })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!( + f, + "CoalesceBatchesExec: target_batch_size={}", + self.target_batch_size + ) + } + } + } } struct CoalesceBatchesStream { diff --git a/datafusion/src/physical_plan/cross_join.rs b/datafusion/src/physical_plan/cross_join.rs index 4372352d6ecf9..f6f5da4cf8db9 100644 --- a/datafusion/src/physical_plan/cross_join.rs +++ b/datafusion/src/physical_plan/cross_join.rs @@ -21,7 +21,6 @@ use futures::{lock::Mutex, StreamExt}; use std::{any::Any, sync::Arc, task::Poll}; -use crate::physical_plan::memory::MemoryStream; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -36,8 +35,10 @@ use crate::{ use async_trait::async_trait; use std::time::Instant; -use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream}; -use crate::physical_plan::coalesce_batches::concat_batches; +use super::{ + coalesce_batches::concat_batches, memory::MemoryStream, DisplayFormatType, + ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, +}; use log::debug; /// Data of the left side @@ -192,6 +193,18 @@ impl ExecutionPlan for CrossJoinExec { join_time: 0, })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "CrossJoinExec") + } + } + } } /// A stream that issues [RecordBatch]es as they arrive from the right of the join. 
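
As an illustrative aside (not part of this patch), the hunks above show the pattern this commit repeats across operators: each `ExecutionPlan` gains a `fmt_as(DisplayFormatType, &mut Formatter)` hook that writes a one-line summary of itself, while the recursive indentation lives in the new `display.rs` further down. The sketch below is hedged and standalone: `SampleFilterExec` and its `predicate` field are invented for the example, and the local `DisplayFormatType` merely mirrors the enum added in `display.rs` so the snippet compiles on its own.

```rust
use std::fmt;

// Local stand-in for the enum added in datafusion/src/physical_plan/display.rs,
// included here only so the sketch compiles by itself.
#[derive(Debug, Clone, Copy)]
enum DisplayFormatType {
    Default,
}

// Hypothetical operator; in the patch, real operators implement this method as
// part of the `ExecutionPlan` trait (the other trait methods are elided here).
struct SampleFilterExec {
    predicate: String,
}

impl SampleFilterExec {
    // One-line, non-recursive summary of just this node.
    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
        match t {
            DisplayFormatType::Default => {
                write!(f, "SampleFilterExec: predicate={}", self.predicate)
            }
        }
    }
}

// Adapter from `fmt_as` to `Display`, roughly the role the patch's
// `DisplayableExecutionPlan::indent()` plays for a whole plan tree.
struct OneLine<'a>(&'a SampleFilterExec);

impl<'a> fmt::Display for OneLine<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.0.fmt_as(DisplayFormatType::Default, f)
    }
}

fn main() {
    let node = SampleFilterExec {
        predicate: "a < 5".to_string(),
    };
    // Prints: SampleFilterExec: predicate=a < 5
    println!("{}", OneLine(&node));
}
```

In the patch itself, the `IndentVisitor` introduced in `display.rs` below walks the plan and calls `fmt_as` on every node, indenting two spaces per level, which is what produces the multi-line EXPLAIN output.
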
diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 9ab817799954f..96b24cc33201f 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -18,8 +18,7 @@ //! Execution plan for reading CSV files use crate::error::{DataFusionError, Result}; -use crate::physical_plan::ExecutionPlan; -use crate::physical_plan::{common, Partitioning}; +use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::csv; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::Result as ArrowResult; @@ -135,6 +134,19 @@ impl std::fmt::Debug for Source { } } +impl std::fmt::Display for Source { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Source::PartitionedFiles { path, filenames } => { + write!(f, "Path({}: [{}])", path, filenames.join(",")) + } + Source::Reader(_) => { + write!(f, "Reader(...)") + } + } + } +} + impl Clone for Source { fn clone(&self) -> Self { match self { @@ -405,6 +417,22 @@ impl ExecutionPlan for CsvExec { } } } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!( + f, + "CsvExec: source={}, has_header={}", + self.source, self.has_header + ) + } + } + } } /// Iterator over batches diff --git a/datafusion/src/physical_plan/display.rs b/datafusion/src/physical_plan/display.rs new file mode 100644 index 0000000000000..bfc3cd951d21a --- /dev/null +++ b/datafusion/src/physical_plan/display.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Implementation of physical plan display. See +//! [`crate::physical_plan::displayable`] for examples of how to +//! format + +use std::fmt; + +use super::{accept, ExecutionPlan, ExecutionPlanVisitor}; + +/// Options for controlling how each [`ExecutionPlan`] should format itself +#[derive(Debug, Clone, Copy)] +pub enum DisplayFormatType { + /// Default, compact format. Example: `FilterExec: c12 < 10.0` + Default, +} + +/// Wraps an `ExecutionPlan` with various ways to display this plan +pub struct DisplayableExecutionPlan<'a> { + inner: &'a dyn ExecutionPlan, +} + +impl<'a> DisplayableExecutionPlan<'a> { + /// Create a wrapper around an [`'ExecutionPlan'] which can be + /// pretty printed in a variety of ways + pub fn new(inner: &'a dyn ExecutionPlan) -> Self { + Self { inner } + } + + /// Return a `format`able structure that produces a single line + /// per node. 
+ /// + /// ```text + /// ProjectionExec: expr=[a] + /// CoalesceBatchesExec: target_batch_size=4096 + /// FilterExec: a < 5 + /// RepartitionExec: partitioning=RoundRobinBatch(16) + /// CsvExec: source=...", + /// ``` + pub fn indent(&self) -> impl fmt::Display + 'a { + struct Wrapper<'a>(&'a dyn ExecutionPlan); + impl<'a> fmt::Display for Wrapper<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let t = DisplayFormatType::Default; + let mut visitor = IndentVisitor { t, f, indent: 0 }; + accept(self.0, &mut visitor) + } + } + Wrapper(self.inner) + } +} + +/// Formats plans with a single line per node. +struct IndentVisitor<'a, 'b> { + /// How to format each node + t: DisplayFormatType, + /// Write to this formatter + f: &'a mut fmt::Formatter<'b>, + ///with_schema: bool, + indent: usize, +} + +impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> { + type Error = fmt::Error; + fn pre_visit( + &mut self, + plan: &dyn ExecutionPlan, + ) -> std::result::Result { + write!(self.f, "{:indent$}", "", indent = self.indent * 2)?; + plan.fmt_as(self.t, self.f)?; + writeln!(self.f)?; + self.indent += 1; + Ok(true) + } +} diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 927f16fe3d216..f3513c2950e4d 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -120,6 +120,10 @@ impl AggregateExpr for DistinctCount { count_data_type: self.data_type.clone(), })) } + + fn name(&self) -> &str { + &self.name + } } #[derive(Debug)] diff --git a/datafusion/src/physical_plan/empty.rs b/datafusion/src/physical_plan/empty.rs index 3011b289507ff..391a695f45014 100644 --- a/datafusion/src/physical_plan/empty.rs +++ b/datafusion/src/physical_plan/empty.rs @@ -21,8 +21,9 @@ use std::any::Any; use std::sync::Arc; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::memory::MemoryStream; -use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning}; +use crate::physical_plan::{ + memory::MemoryStream, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, +}; use arrow::array::NullArray; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; @@ -120,6 +121,18 @@ impl ExecutionPlan for EmptyExec { None, )?)) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "EmptyExec: produce_one_row={}", self.produce_one_row) + } + } + } } #[cfg(test)] diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index 26d2c94dc80a4..3c5ef1af32366 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -20,15 +20,14 @@ use std::any::Any; use std::sync::Arc; -use crate::error::{DataFusionError, Result}; use crate::{ + error::{DataFusionError, Result}, logical_plan::StringifiedPlan, - physical_plan::{common::SizedRecordBatchStream, ExecutionPlan}, + physical_plan::Partitioning, + physical_plan::{common::SizedRecordBatchStream, DisplayFormatType, ExecutionPlan}, }; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; -use crate::physical_plan::Partitioning; - use super::SendableRecordBatchStream; use async_trait::async_trait; @@ -122,4 +121,16 @@ impl ExecutionPlan for ExplainExec { vec![Arc::new(record_batch)], ))) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut 
std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "ExplainExec") + } + } + } } diff --git a/datafusion/src/physical_plan/expressions/average.rs b/datafusion/src/physical_plan/expressions/average.rs index 38644129dcd09..6a6332042188f 100644 --- a/datafusion/src/physical_plan/expressions/average.rs +++ b/datafusion/src/physical_plan/expressions/average.rs @@ -109,6 +109,10 @@ impl AggregateExpr for Avg { fn expressions(&self) -> Vec> { vec![self.expr.clone()] } + + fn name(&self) -> &str { + &self.name + } } /// An accumulator to compute the average diff --git a/datafusion/src/physical_plan/expressions/count.rs b/datafusion/src/physical_plan/expressions/count.rs index 22459813b7e5b..4a3fbe4fa7d3d 100644 --- a/datafusion/src/physical_plan/expressions/count.rs +++ b/datafusion/src/physical_plan/expressions/count.rs @@ -83,6 +83,10 @@ impl AggregateExpr for Count { fn create_accumulator(&self) -> Result> { Ok(Box::new(CountAccumulator::new())) } + + fn name(&self) -> &str { + &self.name + } } #[derive(Debug)] diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index 5ed14610ada38..ea917d30d940d 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -88,6 +88,10 @@ impl AggregateExpr for Max { fn create_accumulator(&self) -> Result> { Ok(Box::new(MaxAccumulator::try_new(&self.data_type)?)) } + + fn name(&self) -> &str { + &self.name + } } // Statically-typed version of min/max(array) -> ScalarValue for string types. @@ -387,6 +391,10 @@ impl AggregateExpr for Min { fn create_accumulator(&self) -> Result> { Ok(Box::new(MinAccumulator::try_new(&self.data_type)?)) } + + fn name(&self) -> &str { + &self.name + } } #[derive(Debug)] diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 6e252205955dc..4d57c39bb31cc 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -74,6 +74,19 @@ pub struct PhysicalSortExpr { pub options: SortOptions, } +impl std::fmt::Display for PhysicalSortExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let opts_string = match (self.options.descending, self.options.nulls_first) { + (true, true) => "DESC", + (true, false) => "DESC NULLS LAST", + (false, true) => "ASC", + (false, false) => "ASC NULLS LAST", + }; + + write!(f, "{} {}", self.expr, opts_string) + } +} + impl PhysicalSortExpr { /// evaluate the sort expression into SortColumn that can be passed into arrow sort kernel pub fn evaluate_to_sort_column(&self, batch: &RecordBatch) -> Result { diff --git a/datafusion/src/physical_plan/expressions/sum.rs b/datafusion/src/physical_plan/expressions/sum.rs index 6f50894003da6..7bbbf99fa6598 100644 --- a/datafusion/src/physical_plan/expressions/sum.rs +++ b/datafusion/src/physical_plan/expressions/sum.rs @@ -104,6 +104,10 @@ impl AggregateExpr for Sum { fn create_accumulator(&self) -> Result> { Ok(Box::new(SumAccumulator::try_new(&self.data_type)?)) } + + fn name(&self) -> &str { + &self.name + } } #[derive(Debug)] diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs index 61af78db8ed2a..bc2b17aa4f47d 100644 --- a/datafusion/src/physical_plan/filter.rs +++ b/datafusion/src/physical_plan/filter.rs @@ -25,7 +25,9 @@ use std::task::{Context, Poll}; use super::{RecordBatchStream, 
SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{ExecutionPlan, Partitioning, PhysicalExpr}; +use crate::physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, +}; use arrow::array::BooleanArray; use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, SchemaRef}; @@ -119,6 +121,18 @@ impl ExecutionPlan for FilterExec { input: self.input.execute(partition).await?, })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "FilterExec: {}", self.predicate) + } + } + } } /// The FilterExec streams wraps the input iterator and applies the predicate expression to diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index fad4fa585034b..3059e2f746ce4 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -28,8 +28,10 @@ use futures::{ }; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{Accumulator, AggregateExpr, SQLMetric}; -use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning, PhysicalExpr}; +use crate::physical_plan::{ + Accumulator, AggregateExpr, DisplayFormatType, Distribution, ExecutionPlan, + Partitioning, PhysicalExpr, SQLMetric, +}; use arrow::{ array::{Array, UInt32Builder}, @@ -257,6 +259,39 @@ impl ExecutionPlan for HashAggregateExec { metrics.insert("outputRows".to_owned(), (*self.output_rows).clone()); metrics } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "HashAggregateExec: mode={:?}", self.mode)?; + let g: Vec = self + .group_expr + .iter() + .map(|(e, alias)| { + let e = e.to_string(); + if &e != alias { + format!("{} as {}", e, alias) + } else { + e + } + }) + .collect(); + write!(f, ", gby=[{}]", g.join(", "))?; + + let a: Vec = self + .aggr_expr + .iter() + .map(|agg| agg.name().to_string()) + .collect(); + write!(f, ", aggr=[{}]", a.join(", "))?; + } + } + Ok(()) + } } /* diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 2682623d374a6..0bf5a2857fdee 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -58,7 +58,10 @@ use super::{ }; use crate::error::{DataFusionError, Result}; -use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream}; +use super::{ + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, +}; use crate::physical_plan::coalesce_batches::concat_batches; use log::debug; @@ -393,6 +396,22 @@ impl ExecutionPlan for HashJoinExec { is_exhausted: false, })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!( + f, + "HashJoinExec: mode={:?}, join_type={:?}, on={:?}", + self.mode, self.join_type, self.on + ) + } + } + } } /// Updates `hash` with new entries from [RecordBatch] evaluated against the expressions `on`, diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index c091196483f40..c56dbe141b2d1 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -26,7 +26,9 @@ use futures::stream::Stream; use futures::stream::StreamExt; 
use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning}; +use crate::physical_plan::{ + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, +}; use arrow::array::ArrayRef; use arrow::compute::limit; use arrow::datatypes::SchemaRef; @@ -121,6 +123,18 @@ impl ExecutionPlan for GlobalLimitExec { let stream = self.input.execute(0).await?; Ok(Box::pin(LimitStream::new(stream, self.limit))) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "GlobalLimitExec: limit={}", self.limit) + } + } + } } /// LocalLimitExec applies a limit to a single partition @@ -187,6 +201,18 @@ impl ExecutionPlan for LocalLimitExec { let stream = self.input.execute(partition).await?; Ok(Box::pin(LimitStream::new(stream, self.limit))) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "LocalLimitExec: limit={}", self.limit) + } + } + } } /// Truncate a RecordBatch to maximum of n rows diff --git a/datafusion/src/physical_plan/memory.rs b/datafusion/src/physical_plan/memory.rs index 9022077559acf..85d8aeef073c1 100644 --- a/datafusion/src/physical_plan/memory.rs +++ b/datafusion/src/physical_plan/memory.rs @@ -22,7 +22,10 @@ use std::any::Any; use std::sync::Arc; use std::task::{Context, Poll}; -use super::{ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream}; +use super::{ + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, +}; use crate::error::{DataFusionError, Result}; use arrow::datatypes::SchemaRef; use arrow::error::Result as ArrowResult; @@ -88,6 +91,25 @@ impl ExecutionPlan for MemoryExec { self.projection.clone(), )?)) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let partitions: Vec<_> = + self.partitions.iter().map(|b| b.len()).collect(); + write!( + f, + "MemoryExec: partitions={}, partition_sizes={:?}", + partitions.len(), + partitions + ) + } + } + } } impl MemoryExec { diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs index c66532b73ccff..c65227c161148 100644 --- a/datafusion/src/physical_plan/merge.rs +++ b/datafusion/src/physical_plan/merge.rs @@ -36,8 +36,7 @@ use arrow::{ use super::RecordBatchStream; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::ExecutionPlan; -use crate::physical_plan::Partitioning; +use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use super::SendableRecordBatchStream; use pin_project_lite::pin_project; @@ -151,6 +150,18 @@ impl ExecutionPlan for MergeExec { } } } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "MergeExec") + } + } + } } pin_project! { diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index a8f6f0c35f00e..6ab9570790e75 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,7 +17,7 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. 
-use std::fmt::{Debug, Display}; +use std::fmt::{self, Debug, Display}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use std::{any::Any, pin::Pin}; @@ -31,9 +31,10 @@ use arrow::record_batch::RecordBatch; use arrow::{array::ArrayRef, datatypes::Field}; use async_trait::async_trait; +pub use display::DisplayFormatType; use futures::stream::Stream; -use self::merge::MergeExec; +use self::{display::DisplayableExecutionPlan, merge::MergeExec}; use hashbrown::HashMap; /// Trait for types that stream [arrow::record_batch::RecordBatch] @@ -120,7 +121,16 @@ pub trait PhysicalPlanner { ) -> Result>; } -/// Partition-aware execution plan for a relation +/// `ExecutionPlan` represent nodes in the DataFusion Physical Plan. +/// +/// Each `ExecutionPlan` is Partition-aware and is responsible for +/// creating the actual `async` [`SendableRecordBatchStream`]s +/// of [`RecordBatch`] that incrementally compute the operator's +/// output from its input partition. +/// +/// [`ExecutionPlan`] can be displayed in an simplified form using the +/// return value from [`displayable`] in addition to the (normally +/// quite verbose) `Debug` output. #[async_trait] pub trait ExecutionPlan: Debug + Send + Sync { /// Returns the execution plan as [`Any`](std::any::Any) so that it can be @@ -152,6 +162,137 @@ pub trait ExecutionPlan: Debug + Send + Sync { fn metrics(&self) -> HashMap { HashMap::new() } + + /// Format this `ExecutionPlan` to `f` in the specified type. + /// + /// Should not include a newline + /// + /// Note this function prints a placeholder by default to preserve + /// backwards compatibility. + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "ExecutionPlan(PlaceHolder)") + } +} + +/// Return a [wrapper](DisplayableExecutionPlan) around an +/// [`ExecutionPlan`] which can be displayed in various easier to +/// understand ways. +/// +/// ``` +/// use datafusion::prelude::*; +/// use datafusion::physical_plan::displayable; +/// +/// // Hard code concurrency as it appears in the RepartitionExec output +/// let config = ExecutionConfig::new() +/// .with_concurrency(3); +/// let mut ctx = ExecutionContext::with_config(config); +/// +/// // register the a table +/// ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new()).unwrap(); +/// +/// // create a plan to run a SQL query +/// let plan = ctx +/// .create_logical_plan("SELECT a FROM example WHERE a < 5") +/// .unwrap(); +/// let plan = ctx.optimize(&plan).unwrap(); +/// let physical_plan = ctx.create_physical_plan(&plan).unwrap(); +/// +/// // Format using display string +/// let displayable_plan = displayable(physical_plan.as_ref()); +/// let plan_string = format!("{}", displayable_plan.indent()); +/// +/// assert_eq!("ProjectionExec: expr=[a]\ +/// \n CoalesceBatchesExec: target_batch_size=4096\ +/// \n FilterExec: a < 5\ +/// \n RepartitionExec: partitioning=RoundRobinBatch(3)\ +/// \n CsvExec: source=Path(tests/example.csv: [tests/example.csv]), has_header=true", +/// plan_string.trim()); +/// ``` +/// +pub fn displayable(plan: &dyn ExecutionPlan) -> DisplayableExecutionPlan<'_> { + DisplayableExecutionPlan::new(plan) +} + +/// Visit all children of this plan, according to the order defined on `ExecutionPlanVisitor`. 
+// Note that this would be really nice if it were a method on +// ExecutionPlan, but it can not be because it takes a generic +// parameter and `ExecutionPlan` is a trait +pub fn accept( + plan: &dyn ExecutionPlan, + visitor: &mut V, +) -> std::result::Result<(), V::Error> { + visitor.pre_visit(plan)?; + for child in plan.children() { + visit_execution_plan(child.as_ref(), visitor)?; + } + visitor.post_visit(plan)?; + Ok(()) +} + +/// Trait that implements the [Visitor +/// pattern](https://en.wikipedia.org/wiki/Visitor_pattern) for a +/// depth first walk of `ExecutionPlan` nodes. `pre_visit` is called +/// before any children are visited, and then `post_visit` is called +/// after all children have been visited. +//// +/// To use, define a struct that implements this trait and then invoke +/// ['accept']. +/// +/// For example, for an execution plan that looks like: +/// +/// ```text +/// ProjectionExec: #id +/// FilterExec: state = CO +/// CsvExec: +/// ``` +/// +/// The sequence of visit operations would be: +/// ```text +/// visitor.pre_visit(ProjectionExec) +/// visitor.pre_visit(FilterExec) +/// visitor.pre_visit(CsvExec) +/// visitor.post_visit(CsvExec) +/// visitor.post_visit(FilterExec) +/// visitor.post_visit(ProjectionExec) +/// ``` +pub trait ExecutionPlanVisitor { + /// The type of error returned by this visitor + type Error; + + /// Invoked on an `ExecutionPlan` plan before any of its child + /// inputs have been visited. If Ok(true) is returned, the + /// recursion continues. If Err(..) or Ok(false) are returned, the + /// recursion stops immediately and the error, if any, is returned + /// to `accept` + fn pre_visit( + &mut self, + plan: &dyn ExecutionPlan, + ) -> std::result::Result; + + /// Invoked on an `ExecutionPlan` plan *after* all of its child + /// inputs have been visited. The return value is handled the same + /// as the return value of `pre_visit`. The provided default + /// implementation returns `Ok(true)`. + fn post_visit( + &mut self, + _plan: &dyn ExecutionPlan, + ) -> std::result::Result { + Ok(true) + } +} + +/// Recursively calls `pre_visit` and `post_visit` for this node and +/// all of its children, as described on [`ExecutionPlanVisitor`] +pub fn visit_execution_plan( + plan: &dyn ExecutionPlan, + visitor: &mut V, +) -> std::result::Result<(), V::Error> { + visitor.pre_visit(plan)?; + for child in plan.children() { + visit_execution_plan(child.as_ref(), visitor)?; + } + visitor.post_visit(plan)?; + Ok(()) } /// Execute the [ExecutionPlan] and collect the results in memory @@ -290,6 +431,12 @@ pub trait AggregateExpr: Send + Sync + Debug { /// expressions that are passed to the Accumulator. /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. fn expressions(&self) -> Vec>; + + /// Human readable name such as `"MIN(c2)"`. The default + /// implementation returns placeholder text. 
+ fn name(&self) -> &str { + "AggregateExpr: default name" + } } /// An accumulator represents a stateful object that lives throughout the evaluation of multiple rows and @@ -351,6 +498,7 @@ pub mod cross_join; pub mod crypto_expressions; pub mod csv; pub mod datetime_expressions; +pub mod display; pub mod distinct_expressions; pub mod empty; pub mod explain; diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index dee0fc89a7a0d..dd5e77bc21eb9 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -27,7 +27,7 @@ use super::{ planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, }; -use crate::physical_plan::{common, ExecutionPlan, Partitioning}; +use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; use crate::{ error::{DataFusionError, Result}, execution::context::ExecutionContextState, @@ -864,6 +864,32 @@ impl ExecutionPlan for ParquetExec { inner: ReceiverStream::new(response_rx), })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let files: Vec<_> = self + .partitions + .iter() + .map(|pp| pp.filenames.iter()) + .flatten() + .map(|s| s.as_str()) + .collect(); + + write!( + f, + "ParquetExec: batch_size={}, limit={:?}, partitions=[{}]", + self.batch_size, + self.limit, + files.join(", ") + ) + } + } + } } fn send_result( diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 664e4dccbdf9d..d11e8e93d199c 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -23,7 +23,6 @@ use super::{ aggregates, cross_join::CrossJoinExec, empty::EmptyExec, expressions::binary, functions, hash_join::PartitionMode, udaf, union::UnionExec, }; -use crate::error::{DataFusionError, Result}; use crate::execution::context::ExecutionContextState; use crate::logical_plan::{ DFSchema, Expr, LogicalPlan, Operator, Partitioning as LogicalPartitioning, PlanType, @@ -45,6 +44,10 @@ use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalP use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::variable::VarType; +use crate::{ + error::{DataFusionError, Result}, + physical_plan::displayable, +}; use arrow::compute::can_cast_types; use arrow::compute::SortOptions; @@ -383,7 +386,7 @@ impl DefaultPhysicalPlanner { if *verbose { stringified_plans.push(StringifiedPlan::new( PlanType::PhysicalPlan, - format!("{:#?}", input), + displayable(input.as_ref()).indent().to_string(), )); } Ok(Arc::new(ExplainExec::new( diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index a881beb453a0c..c0d78ff7168bf 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -26,7 +26,9 @@ use std::sync::Arc; use std::task::{Context, Poll}; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{ExecutionPlan, Partitioning, PhysicalExpr}; +use crate::physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, +}; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -130,6 +132,31 @@ impl ExecutionPlan for ProjectionExec { input: self.input.execute(partition).await?, })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: 
&mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let expr: Vec = self + .expr + .iter() + .map(|(e, alias)| { + let e = e.to_string(); + if &e != alias { + format!("{} as {}", e, alias) + } else { + e + } + }) + .collect(); + + write!(f, "ProjectionExec: expr=[{}]", expr.join(", ")) + } + } + } } fn batch_project( diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 7243550127bde..2599690bfc003 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -24,7 +24,7 @@ use std::task::{Context, Poll}; use std::{any::Any, collections::HashMap, vec}; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{ExecutionPlan, Partitioning}; +use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::record_batch::RecordBatch; use arrow::{array::Array, error::Result as ArrowResult}; use arrow::{compute::take, datatypes::SchemaRef}; @@ -235,6 +235,18 @@ impl ExecutionPlan for RepartitionExec { input: UnboundedReceiverStream::new(channels.remove(&partition).unwrap().1), })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "RepartitionExec: partitioning={:?}", self.partitioning) + } + } + } } impl RepartitionExec { diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 010e4068638ba..8229060190215 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -41,7 +41,7 @@ use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::{ - common, Distribution, ExecutionPlan, Partitioning, SQLMetric, + common, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SQLMetric, }; /// Sort execution plan @@ -145,6 +145,19 @@ impl ExecutionPlan for SortExec { ))) } + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let expr: Vec = self.expr.iter().map(|e| e.to_string()).collect(); + write!(f, "SortExec: [{}]", expr.join(",")) + } + } + } + fn metrics(&self) -> HashMap { let mut metrics = HashMap::new(); metrics.insert("outputRows".to_owned(), (*self.output_rows).clone()); diff --git a/datafusion/src/physical_plan/udaf.rs b/datafusion/src/physical_plan/udaf.rs index 3dc6aa402f527..f7515d326d0a5 100644 --- a/datafusion/src/physical_plan/udaf.rs +++ b/datafusion/src/physical_plan/udaf.rs @@ -165,4 +165,8 @@ impl AggregateExpr for AggregateFunctionExpr { fn create_accumulator(&self) -> Result> { (self.fun.accumulator)() } + + fn name(&self) -> &str { + &self.name + } } diff --git a/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs index a00dd6ac28216..b39f47bba07b1 100644 --- a/datafusion/tests/custom_sources.rs +++ b/datafusion/tests/custom_sources.rs @@ -20,11 +20,14 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; -use datafusion::error::{DataFusionError, Result}; use datafusion::{ datasource::{datasource::Statistics, TableProvider}, physical_plan::collect, }; +use datafusion::{ + error::{DataFusionError, Result}, + physical_plan::DisplayFormatType, +}; use 
datafusion::execution::context::ExecutionContext; use datafusion::logical_plan::{col, Expr, LogicalPlan, LogicalPlanBuilder}; @@ -128,6 +131,18 @@ impl ExecutionPlan for CustomExecutionPlan { async fn execute(&self, _partition: usize) -> Result { Ok(Box::pin(TestCustomRecordBatchStream { nb_batch: 1 })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "CustomExecutionPlan: projection={:#?}", self.projection) + } + } + } } impl TableProvider for CustomTableProvider { diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index c80ffe4d34673..0b9cc2ae18b95 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -31,9 +31,8 @@ use arrow::{ util::display::array_value_to_string, }; -use datafusion::execution::context::ExecutionContext; use datafusion::logical_plan::LogicalPlan; -use datafusion::prelude::create_udf; +use datafusion::prelude::*; use datafusion::{ datasource::{csv::CsvReadOptions, MemTable}, physical_plan::collect, @@ -42,6 +41,7 @@ use datafusion::{ error::{DataFusionError, Result}, physical_plan::ColumnarValue, }; +use datafusion::{execution::context::ExecutionContext, physical_plan::displayable}; #[tokio::test] async fn nyc() -> Result<()> { @@ -2932,3 +2932,47 @@ async fn test_cast_expressions_error() -> Result<()> { Ok(()) } + +#[tokio::test] +async fn test_physical_plan_display_indent() { + // Hard code concurrency as it appears in the RepartitionExec output + let config = ExecutionConfig::new().with_concurrency(3); + let mut ctx = ExecutionContext::with_config(config); + register_aggregate_csv(&mut ctx).unwrap(); + let sql = "SELECT c1, MAX(c12), MIN(c12) as the_min \ + FROM aggregate_test_100 \ + WHERE c12 < 10 \ + GROUP BY c1 \ + ORDER BY the_min DESC \ + LIMIT 10"; + let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.optimize(&plan).unwrap(); + + let physical_plan = ctx.create_physical_plan(&plan).unwrap(); + let expected = vec![ + "GlobalLimitExec: limit=10", + " SortExec: [the_min DESC]", + " ProjectionExec: expr=[c1, MAX(c12), MIN(c12) as the_min]", + " HashAggregateExec: mode=Final, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " MergeExec", + " HashAggregateExec: mode=Partial, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " CoalesceBatchesExec: target_batch_size=4096", + " FilterExec: c12 < CAST(10 AS Float64)", + " RepartitionExec: partitioning=RoundRobinBatch(3)", + " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", + ]; + + let data_path = arrow::util::test_util::arrow_test_data(); + let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) + .trim() + .lines() + // normalize paths + .map(|s| s.replace(&data_path, "ARROW_TEST_DATA")) + .collect::>(); + + assert_eq!( + expected, actual, + "expected:\n{:#?}\nactual:\n\n{:#?}\n", + expected, actual + ); +} diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index 5e38c57b6f1bc..8914c05e8f88f 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -75,8 +75,8 @@ use datafusion::{ optimizer::{optimizer::OptimizerRule, utils::optimize_children}, physical_plan::{ planner::{DefaultPhysicalPlanner, ExtensionPlanner}, - Distribution, ExecutionPlan, Partitioning, PhysicalPlanner, RecordBatchStream, - SendableRecordBatchStream, + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalPlanner, + 
RecordBatchStream, SendableRecordBatchStream, }, prelude::{ExecutionConfig, ExecutionContext}, }; @@ -163,9 +163,9 @@ async fn topk_plan() -> Result<()> { let mut ctx = setup_table(make_topk_context()).await?; let expected = vec![ - "| logical_plan after topk | TopK: k=3 |", - "| | Projection: #customer_id, #revenue |", - "| | TableScan: sales projection=Some([0, 1]) |", + "| logical_plan after topk | TopK: k=3 |", + "| | Projection: #customer_id, #revenue |", + "| | TableScan: sales projection=Some([0, 1]) |", ].join("\n"); let explain_query = format!("EXPLAIN VERBOSE {}", QUERY); @@ -397,6 +397,18 @@ impl ExecutionPlan for TopKExec { state: BTreeMap::new(), })) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "TopKExec: k={}", self.k) + } + } + } } // A very specialized TopK implementation From 874132ec752c251dcdd2541f80e6889d307219d6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 14 May 2021 17:03:28 -0400 Subject: [PATCH 094/329] Update arrow dependencies again (#341) --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 8b5d7af3f2a24..f7ed273ec8b90 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index d98cc7e49e83b..7eec207b096e8 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index a5e40341981d8..31fd9d0137b20 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml 
b/ballista/rust/scheduler/Cargo.toml index 43dc4285add5a..4793534e9ecad 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 2551b775adfb2..747a6b0287ebc 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -31,4 +31,4 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 8d8f20eb97931..28175f842e9bd 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c" } +arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 610ffdaa4886d..4d98fdb1b2075 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "8226219fe7104f6c8a2740806f96f02c960d991c", features = ["arrow"] } +arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["prettyprint"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" From 1702d6c85ebfdbc968b1dc427a9799e74b64ff96 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 14 May 2021 15:03:53 -0600 Subject: [PATCH 095/329] Make it easer for developers to find Ballista documentation (#330) --- DEVELOPERS.md | 3 + README.md | 3 + ballista/README.md | 15 ++-- ballista/docs/README.md | 7 +- ballista/docs/{dev-env-rust.md => dev-env.md} | 0 ballista/docs/integration-testing.md | 10 +-- ballista/docs/release-process.md | 68 ------------------- ballista/docs/rust-docker.md | 66 ------------------ 8 files changed, 18 insertions(+), 154 deletions(-) rename ballista/docs/{dev-env-rust.md => dev-env.md} (100%) delete mode 100644 ballista/docs/release-process.md delete mode 100644 ballista/docs/rust-docker.md diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 1dc9304651c81..be8bb61b148f5 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -21,6 +21,9 @@ This section describes how you can get started at developing DataFusion. +For information on developing with Ballista, see the +[Ballista developer documentation](ballista/docs/README.md). 
+ ### Bootstrap environment DataFusion is written in Rust and it uses a standard rust toolkit: diff --git a/README.md b/README.md index ded264a003f43..f72c73bb80372 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,9 @@ logical query plans as well as a query optimizer and execution engine capable of parallel execution against partitioned data sources (CSV and Parquet) using threads. +DataFusion also supports distributed query execution via the +[Ballista](ballista/README.md) crate. + ## Use Cases DataFusion is used to create modern, fast and efficient data diff --git a/ballista/README.md b/ballista/README.md index 288386f01619a..276af3c4d9b2d 100644 --- a/ballista/README.md +++ b/ballista/README.md @@ -50,15 +50,14 @@ Although Ballista is largely inspired by Apache Spark, there are some key differ - The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors in any programming language with minimal serialization overhead. -# Status +## Status -The Ballista project was donated to Apache Arrow in April 2021 and work is underway to integrate more tightly with -DataFusion. - -One of the goals is to implement a common scheduler that can seamlessly scale queries across cores in DataFusion and -across nodes in Ballista. - -Ballista issues are tracked in ASF JIRA [here](https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20component%20%3D%20%22Rust%20-%20Ballista%22) +Ballista was [donated](https://arrow.apache.org/blog/2021/04/12/ballista-donation/) to the Apache Arrow project in +April 2021 and should be considered experimental. +## Getting Started +The [Ballista Developer Documentation](docs/README.md) and the +[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the +best sources of information for getting started with Ballista. diff --git a/ballista/docs/README.md b/ballista/docs/README.md index 44c831d37800d..6588c1d4d37b6 100644 --- a/ballista/docs/README.md +++ b/ballista/docs/README.md @@ -20,7 +20,7 @@ This directory contains documentation for developers that are contributing to Ballista. If you are looking for end-user documentation for a published release, please start with the -[Ballista User Guide](https://ballistacompute.org/docs/) instead. +[DataFusion User Guide](../../docs/user-guide) instead. ## Architecture & Design @@ -29,9 +29,6 @@ end-user documentation for a published release, please start with the ## Build, Test, Release -- Setting up a [Rust development environment](dev-env-rust.md). -- Setting up a [Java development environment](dev-env-jvm.md). -- Notes on building [Rust docker images](rust-docker.md) +- Setting up a [development environment](dev-env.md). - [Integration Testing](integration-testing.md) -- [Release process](release-process.md) diff --git a/ballista/docs/dev-env-rust.md b/ballista/docs/dev-env.md similarity index 100% rename from ballista/docs/dev-env-rust.md rename to ballista/docs/dev-env.md diff --git a/ballista/docs/integration-testing.md b/ballista/docs/integration-testing.md index 2a979b6ec3487..3f818a4596b0f 100644 --- a/ballista/docs/integration-testing.md +++ b/ballista/docs/integration-testing.md @@ -18,15 +18,11 @@ --> # Integration Testing -Ballista has a [benchmark crate](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) which is -derived from TPC-H and this is currently the main form of integration testing. 
+We use the [DataFusion Benchmarks](https://github.com/apache/arrow-datafusion/tree/master/benchmarks) for integration +testing. -The following command can be used to run the integration tests. +The integration tests can be executed by running the following command from the root of the DataFusion repository. ```bash ./dev/integration-tests.sh ``` - -Please refer to the -[benchmark documentation](https://github.com/ballista-compute/ballista/blob/main/rust/benchmarks/tpch/README.md) -for more information. diff --git a/ballista/docs/release-process.md b/ballista/docs/release-process.md deleted file mode 100644 index c6c45c3cf1770..0000000000000 --- a/ballista/docs/release-process.md +++ /dev/null @@ -1,68 +0,0 @@ - -# Release Process - -These instructions are for project maintainers wishing to create public releases of Ballista. - -- Create a `release-0.4` branch or merge latest from `main` into an existing `release-0.4` branch. -- Update version numbers using `./dev/bump-version.sh` -- Run integration tests with `./dev/integration-tests.sh` -- Push changes -- Create `v0.4.x` release tag from the `release-0.4` branch -- Publish Docker images -- Publish crate if possible (if we're using a published version of Arrow) - -## Publishing Java artifacts to Maven Central - -The JVM artifacts are published to Maven central by uploading to sonatype. You will need to set the environment -variables `SONATYPE_USERNAME` and `SONATYPE_PASSWORD` to the correct values for your account and you will also need -verified GPG keys available for signing the artifacts (instructions tbd). - -Run the follow commands to publish the artifacts to a sonatype staging repository. - -```bash -./dev/publish-jvm.sh -``` - -## Publishing Rust Artifacts - -Run the following script to publish the Rust crate to crates.io. - -``` -./dev/publish-rust.sh -``` - -## Publishing Docker Images - -Run the following script to publish the executor Docker images to Docker Hub. - -``` -./dev/publish-docker-images.sh -``` - -## GPG Notes - -Refer to [this article](https://help.github.com/en/github/authenticating-to-github/generating-a-new-gpg-key) for -instructions on setting up GPG keys. Some useful commands are: - -```bash -gpg --full-generate-key -gpg --export-secret-keys > ~/.gnupg/secring.gpg -gpg --key-server keys.openpgp.org --send-keys KEYID -``` \ No newline at end of file diff --git a/ballista/docs/rust-docker.md b/ballista/docs/rust-docker.md deleted file mode 100644 index 0b94a1499a0c3..0000000000000 --- a/ballista/docs/rust-docker.md +++ /dev/null @@ -1,66 +0,0 @@ - -### How to build rust's docker image - -To build the docker image in development, use - -``` -docker build -f docker/rust.dockerfile -t ballistacompute/ballista-rust:latest . -``` - -This uses a multi-stage build, on which the build stage is called `builder`. -Our github has this target cached, that we use to speed-up the build time: - -``` -export BUILDER_IMAGE=docker.pkg.github.com/ballista-compute/ballista/ballista-rust-builder:main - -docker login docker.pkg.github.com -u ... -p ... # a personal access token to read from the read:packages -docker pull $BUILDER_IMAGE - -docker build --cache-from $BUILDER_IMAGE -f docker/rust.dockerfile -t ballista:latest . -``` - -will build the image by re-using a cached image. - -### Docker images for development - -This project often requires testing on kubernetes. For this reason, we have a github workflow to push images to -github's registry, both from this repo and its forks. 
- -The basic principle is that every push to a git reference builds and publishes a docker image. -Specifically, given a branch or tag `${REF}`, - -* `docker.pkg.github.com/ballista-compute/ballista/ballista-rust:${REF}` is the latest image from $REF -* `docker.pkg.github.com/${USER}/ballista/ballista-rust:${REF}` is the latest image from $REF on your fork - -To pull them from a kubernetes cluster or your computer, you need to have a personal access token with scope `read:packages`, -and login to the registry `docker.pkg.github.com`. - -The builder image - the large image with all the cargo caches - is available on the same registry as described above, and is also -available in all forks and for all references. - -Please refer to the [rust workflow](.github/workflows/rust.yaml) and [rust dockerfile](docker/rust.dockerfile) for details on how we build and publish these images. - -### Get the binary - -If you do not aim to run this in docker but any linux-based machine, you can get the latest binary from a docker image on the registry: the binary is statically linked and thus runs on any linux-based machine. You can get it using - -``` -id=$(docker create $BUILDER_IMAGE) && docker cp $id:/executor executor && docker rm -v $id -``` From 1c5037138305f5b6204a1dbbfb17f9521f6995d3 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 16 May 2021 18:02:53 +0800 Subject: [PATCH 096/329] Use NullArray to Pass row count to ScalarFunctions that take 0 arguments (#328) * add docs and comments * use supports_zero_argument --- datafusion/src/physical_plan/functions.rs | 49 +++++++++++++++++++---- datafusion/src/physical_plan/udf.rs | 7 ++++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 2e053a80976b0..c0c915f29a72a 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -44,13 +44,14 @@ use crate::{ scalar::ScalarValue, }; use arrow::{ - array::ArrayRef, + array::{ArrayRef, NullArray}, compute::kernels::length::{bit_length, length}, datatypes::TimeUnit, datatypes::{DataType, Field, Int32Type, Int64Type, Schema}, record_batch::RecordBatch, }; use fmt::{Debug, Formatter}; +use std::convert::From; use std::{any::Any, fmt, str::FromStr, sync::Arc}; /// A function's signature, which defines the function's supported argument types. @@ -76,6 +77,13 @@ pub enum Signature { } /// Scalar function +/// +/// The Fn param is the wrapped function but be aware that the function will +/// be passed with the slice / vec of columnar values (either scalar or array) +/// with the exception of zero param function, where a singular element vec +/// will be passed. In that case the single element is a null array to indicate +/// the batch's row count (so that the generative zero-argument function can know +/// the result array size). pub type ScalarFunctionImplementation = Arc Result + Send + Sync>; @@ -207,6 +215,14 @@ pub enum BuiltinScalarFunction { RegexpMatch, } +impl BuiltinScalarFunction { + /// an allowlist of functions to take zero arguments, so that they will get special treatment + /// while executing. + fn supports_zero_argument(&self) -> bool { + matches!(self, BuiltinScalarFunction::Now) + } +} + impl fmt::Display for BuiltinScalarFunction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // lowercase of the debug. 
@@ -1371,6 +1387,17 @@ impl fmt::Display for ScalarFunctionExpr { } } +/// null columnar values are implemented as a null array in order to pass batch +/// num_rows +type NullColumnarValue = ColumnarValue; + +impl From<&RecordBatch> for NullColumnarValue { + fn from(batch: &RecordBatch) -> Self { + let num_rows = batch.num_rows(); + ColumnarValue::Array(Arc::new(NullArray::new(num_rows))) + } +} + impl PhysicalExpr for ScalarFunctionExpr { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { @@ -1386,12 +1413,18 @@ impl PhysicalExpr for ScalarFunctionExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - // evaluate the arguments - let inputs = self - .args - .iter() - .map(|e| e.evaluate(batch)) - .collect::>>()?; + // evaluate the arguments, if there are no arguments we'll instead pass in a null array + // indicating the batch size (as a convention) + let inputs = match (self.args.len(), self.name.parse::()) { + (0, Ok(scalar_fun)) if scalar_fun.supports_zero_argument() => { + vec![NullColumnarValue::from(batch)] + } + _ => self + .args + .iter() + .map(|e| e.evaluate(batch)) + .collect::>>()?, + }; // evaluate the function let fun = self.fun.as_ref(); @@ -1399,7 +1432,7 @@ impl PhysicalExpr for ScalarFunctionExpr { } } -/// decorates a function to handle [`ScalarValue`]s by coverting them to arrays before calling the function +/// decorates a function to handle [`ScalarValue`]s by converting them to arrays before calling the function /// and vice-versa after evaluation. pub fn make_scalar_function(inner: F) -> ScalarFunctionImplementation where diff --git a/datafusion/src/physical_plan/udf.rs b/datafusion/src/physical_plan/udf.rs index 9189da47bd6f8..a79c0a8a36059 100644 --- a/datafusion/src/physical_plan/udf.rs +++ b/datafusion/src/physical_plan/udf.rs @@ -43,6 +43,13 @@ pub struct ScalarUDF { /// Return type pub return_type: ReturnTypeFunction, /// actual implementation + /// + /// The fn param is the wrapped function but be aware that the function will + /// be passed with the slice / vec of columnar values (either scalar or array) + /// with the exception of zero param function, where a singular element vec + /// will be passed. In that case the single element is a null array to indicate + /// the batch's row count (so that the generative zero-argument function can know + /// the result array size). 
pub fun: ScalarFunctionImplementation, } From ed92673e19f1b20e0ea35397f73da49bdb304be4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 16 May 2021 21:33:03 +0200 Subject: [PATCH 097/329] Implement hash partitioned aggregation (#320) * Implement hash partitioned aggregation * Ballista * Make configurable and use configured concurrency * WIP * Add some hash types * Fmt * Disable repartition aggregations in ballista * fmt * Clippy, ballista * Fix test * Revert test ode * Update datafusion/src/physical_plan/hash_aggregate.rs Co-authored-by: Andrew Lamb * Add info about required child partitioning * Add test * Test fix * Set concurrency Co-authored-by: Andrew Lamb --- ballista/rust/core/proto/ballista.proto | 1 + .../src/serde/physical_plan/from_proto.rs | 3 + .../core/src/serde/physical_plan/to_proto.rs | 3 + ballista/rust/core/src/utils.rs | 1 + ballista/rust/scheduler/src/planner.rs | 7 +- ballista/rust/scheduler/src/test_utils.rs | 6 +- datafusion/src/execution/context.rs | 19 ++-- .../src/physical_optimizer/merge_exec.rs | 1 + .../src/physical_optimizer/repartition.rs | 5 +- .../src/physical_plan/hash_aggregate.rs | 22 ++++- datafusion/src/physical_plan/hash_join.rs | 98 ++++++++++++++++++- datafusion/src/physical_plan/mod.rs | 5 +- datafusion/src/physical_plan/planner.rs | 86 +++++++++++++--- .../src/physical_plan/unicode_expressions.rs | 1 - datafusion/tests/sql.rs | 24 ++--- 15 files changed, 229 insertions(+), 53 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 07419d09b7a91..3da0e85437d76 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -396,6 +396,7 @@ message ProjectionExecNode { enum AggregateMode { PARTIAL = 0; FINAL = 1; + FINAL_PARTITIONED = 2; } message HashAggregateExecNode { diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 9c35c9d889411..97f03948f7bd9 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -201,6 +201,9 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let agg_mode: AggregateMode = match mode { protobuf::AggregateMode::Partial => AggregateMode::Partial, protobuf::AggregateMode::Final => AggregateMode::Final, + protobuf::AggregateMode::FinalPartitioned => { + AggregateMode::FinalPartitioned + } }; let group = hash_agg diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 8a5fd71083f75..9571f3de2e76b 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -172,6 +172,9 @@ impl TryInto for Arc { let agg_mode = match exec.mode() { AggregateMode::Partial => protobuf::AggregateMode::Partial, AggregateMode::Final => protobuf::AggregateMode::Final, + AggregateMode::FinalPartitioned => { + protobuf::AggregateMode::FinalPartitioned + } }; let input_schema = exec.input_schema(); let input: protobuf::PhysicalPlanNode = exec.input().to_owned().try_into()?; diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index ee9c9557e7899..55541d5fd0148 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -322,6 +322,7 @@ pub fn create_datafusion_context() -> ExecutionContext { let config = ExecutionConfig::new() .with_concurrency(1) .with_repartition_joins(false) + 
.with_repartition_aggregations(false) .with_physical_optimizer_rules(rules); ExecutionContext::with_config(config) } diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 20dd0d36d9ab9..b81d7de355ef0 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -128,7 +128,7 @@ impl DistributedPlanner { //TODO should insert query stages in more generic way based on partitioning metadata // and not specifically for this operator match agg.mode() { - AggregateMode::Final => { + AggregateMode::Final | AggregateMode::FinalPartitioned => { let mut new_children: Vec> = vec![]; for child in &children { let new_stage = create_query_stage( @@ -237,10 +237,9 @@ mod test { use ballista_core::serde::protobuf; use ballista_core::utils::format_plan; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; - use datafusion::physical_plan::merge::MergeExec; - use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sort::SortExec; use datafusion::physical_plan::ExecutionPlan; + use datafusion::physical_plan::{merge::MergeExec, projection::ProjectionExec}; use std::convert::TryInto; use std::sync::Arc; use uuid::Uuid; @@ -278,11 +277,9 @@ mod test { QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=1 HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] CsvExec: testdata/lineitem; partitions=2 - QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2 MergeExec UnresolvedShuffleExec: stages=[1] - QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3 SortExec { input: ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_ext ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_extendedprice Multip diff --git a/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs index 330cc9a9332cb..0989060503869 100644 --- a/ballista/rust/scheduler/src/test_utils.rs +++ b/ballista/rust/scheduler/src/test_utils.rs @@ -33,10 +33,12 @@ pub const TPCH_TABLES: &[&str] = &[ pub fn datafusion_test_context(path: &str) -> Result { // remove Repartition rule because that isn't supported yet let rules: Vec> = vec![ - Arc::new(CoalesceBatches::new()), Arc::new(AddMergeExec::new()), + Arc::new(CoalesceBatches::new()), ]; - let config = ExecutionConfig::new().with_physical_optimizer_rules(rules); + let config = ExecutionConfig::new() + .with_physical_optimizer_rules(rules) + .with_repartition_aggregations(false); let mut ctx = ExecutionContext::with_config(config); for table in TPCH_TABLES { diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 9c7a6217d7d93..272e75acba6fd 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -636,6 +636,9 @@ pub struct ExecutionConfig { /// Should DataFusion repartition data using the join keys to execute joins in parallel /// using the provided `concurrency` level pub repartition_joins: bool, + /// Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel + /// using the provided `concurrency` level + pub repartition_aggregations: bool, } impl ExecutionConfig { @@ -663,6 +666,7 @@ impl ExecutionConfig { create_default_catalog_and_schema: true, information_schema: false, repartition_joins: true, + repartition_aggregations: true, 
} } @@ -746,6 +750,11 @@ impl ExecutionConfig { self.repartition_joins = enabled; self } + /// Enables or disables the use of repartitioning for aggregations to improve parallelism + pub fn with_repartition_aggregations(mut self, enabled: bool) -> Self { + self.repartition_aggregations = enabled; + self + } } /// Holds per-execution properties and data (such as starting timestamps, etc). @@ -1351,7 +1360,6 @@ mod tests { #[tokio::test] async fn aggregate_grouped() -> Result<()> { let results = execute("SELECT c1, SUM(c2) FROM test GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+---------+", @@ -1371,7 +1379,6 @@ mod tests { #[tokio::test] async fn aggregate_grouped_avg() -> Result<()> { let results = execute("SELECT c1, AVG(c2) FROM test GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+---------+", @@ -1392,7 +1399,6 @@ mod tests { async fn boolean_literal() -> Result<()> { let results = execute("SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+------+", @@ -1414,7 +1420,6 @@ mod tests { async fn aggregate_grouped_empty() -> Result<()> { let results = execute("SELECT c1, AVG(c2) FROM test WHERE c1 = 123 GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec!["++", "||", "++", "++"]; assert_batches_sorted_eq!(expected, &results); @@ -1425,7 +1430,6 @@ mod tests { #[tokio::test] async fn aggregate_grouped_max() -> Result<()> { let results = execute("SELECT c1, MAX(c2) FROM test GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+---------+", @@ -1445,7 +1449,6 @@ mod tests { #[tokio::test] async fn aggregate_grouped_min() -> Result<()> { let results = execute("SELECT c1, MIN(c2) FROM test GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+---------+", @@ -1629,7 +1632,6 @@ mod tests { #[tokio::test] async fn count_aggregated() -> Result<()> { let results = execute("SELECT c1, COUNT(c2) FROM test GROUP BY c1", 4).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+----+-----------+", @@ -1681,7 +1683,6 @@ mod tests { &mut ctx, "SELECT date_trunc('week', t1) as week, SUM(c2) FROM test GROUP BY date_trunc('week', t1)", ).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+---------------------+---------+", @@ -1925,7 +1926,6 @@ mod tests { ]; let results = run_count_distinct_integers_aggregated_scenario(partitions).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", @@ -1952,7 +1952,6 @@ mod tests { ]; let results = run_count_distinct_integers_aggregated_scenario(partitions).await?; - assert_eq!(results.len(), 1); let expected = vec![ "+---------+-----------------+------------------------+-------------------------+-------------------------+-------------------------+-------------------------+--------------------------+--------------------------+--------------------------+", diff --git a/datafusion/src/physical_optimizer/merge_exec.rs b/datafusion/src/physical_optimizer/merge_exec.rs index 255d1bc245872..877c0be00e1b8 100644 --- a/datafusion/src/physical_optimizer/merge_exec.rs +++ b/datafusion/src/physical_optimizer/merge_exec.rs @@ -52,6 +52,7 @@ impl PhysicalOptimizerRule for AddMergeExec { 
.collect::>>()?; match plan.required_child_distribution() { Distribution::UnspecifiedDistribution => plan.with_new_children(children), + Distribution::HashPartitioned(_) => plan.with_new_children(children), Distribution::SinglePartition => plan.with_new_children( children .iter() diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index 82f46f9cbbbb6..fee4b3e11e5d2 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -52,7 +52,10 @@ fn optimize_concurrency( .map(|child| { optimize_concurrency( concurrency, - plan.required_child_distribution() == Distribution::SinglePartition, + matches!( + plan.required_child_distribution(), + Distribution::SinglePartition + ), child.clone(), ) }) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 3059e2f746ce4..0a822dc898afb 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -78,6 +78,13 @@ pub enum AggregateMode { Partial, /// Final aggregate that produces a single partition of output Final, + /// Final aggregate that works on pre-partitioned data. + /// + /// This requires the invariant that all rows with a particular + /// grouping key are in the same partitions, such as is the case + /// with Hash repartitioning on the group keys. If a group key is + /// duplicated, duplicate groups would be produced + FinalPartitioned, } /// Hash aggregate execution plan @@ -123,7 +130,7 @@ fn create_schema( fields.extend(expr.state_fields()?.iter().cloned()) } } - AggregateMode::Final => { + AggregateMode::Final | AggregateMode::FinalPartitioned => { // in final mode, the field with the final result of the accumulator for expr in aggr_expr { fields.push(expr.field()?) 
@@ -204,6 +211,9 @@ impl ExecutionPlan for HashAggregateExec { fn required_child_distribution(&self) -> Distribution { match &self.mode { AggregateMode::Partial => Distribution::UnspecifiedDistribution, + AggregateMode::FinalPartitioned => Distribution::HashPartitioned( + self.group_expr.iter().map(|x| x.0.clone()).collect(), + ), AggregateMode::Final => Distribution::SinglePartition, } } @@ -454,7 +464,7 @@ fn group_aggregate_batch( }) .try_for_each(|(accumulator, values)| match mode { AggregateMode::Partial => accumulator.update_batch(&values), - AggregateMode::Final => { + AggregateMode::FinalPartitioned | AggregateMode::Final => { // note: the aggregation here is over states, not values, thus the merge accumulator.merge_batch(&values) } @@ -807,7 +817,7 @@ fn aggregate_expressions( Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) } // in this mode, we build the merge expressions of the aggregation - AggregateMode::Final => Ok(aggr_expr + AggregateMode::Final | AggregateMode::FinalPartitioned => Ok(aggr_expr .iter() .map(|agg| merge_expressions(agg)) .collect::>>()?), @@ -901,7 +911,9 @@ fn aggregate_batch( // 1.3 match mode { AggregateMode::Partial => accum.update_batch(values), - AggregateMode::Final => accum.merge_batch(values), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + accum.merge_batch(values) + } } }) } @@ -1074,7 +1086,7 @@ fn finalize_aggregation( .collect::>>()?; Ok(a.iter().flatten().cloned().collect::>()) } - AggregateMode::Final => { + AggregateMode::Final | AggregateMode::FinalPartitioned => { // merge the state to the final value accumulators .iter() diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 0bf5a2857fdee..01551cd4daf4c 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -23,8 +23,9 @@ use ahash::RandomState; use arrow::{ array::{ - ArrayData, ArrayRef, BooleanArray, LargeStringArray, PrimitiveArray, - TimestampMicrosecondArray, TimestampNanosecondArray, UInt32BufferBuilder, + ArrayData, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, + Float64Array, LargeStringArray, PrimitiveArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, UInt32BufferBuilder, UInt32Builder, UInt64BufferBuilder, UInt64Builder, }, compute, @@ -862,6 +863,49 @@ macro_rules! hash_array_primitive { }; } +macro_rules! 
hash_array_float { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + if $multi_col { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = combine_hashes( + $ty::get_hash(&value.to_le_bytes(), $random_state), + *hash, + ); + } + } else { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = $ty::get_hash(&value.to_le_bytes(), $random_state) + } + } + } else { + if $multi_col { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = combine_hashes( + $ty::get_hash(&value.to_le_bytes(), $random_state), + *hash, + ); + } + } + } else { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = $ty::get_hash(&value.to_le_bytes(), $random_state); + } + } + } + } + }; +} + /// Creates hash values for every element in the row based on the values in the columns pub fn create_hashes<'a>( arrays: &[ArrayRef], @@ -953,6 +997,36 @@ pub fn create_hashes<'a>( multi_col ); } + DataType::Float32 => { + hash_array_float!( + Float32Array, + col, + u32, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Float64 => { + hash_array_float!( + Float64Array, + col, + u64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + hash_array_primitive!( + TimestampMillisecondArray, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } DataType::Timestamp(TimeUnit::Microsecond, None) => { hash_array_primitive!( TimestampMicrosecondArray, @@ -973,6 +1047,26 @@ pub fn create_hashes<'a>( multi_col ); } + DataType::Date32 => { + hash_array_primitive!( + Date32Array, + col, + i32, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Date64 => { + hash_array_primitive!( + Date64Array, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } DataType::Boolean => { hash_array!( BooleanArray, diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 6ab9570790e75..e915b2c257ddc 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -359,12 +359,15 @@ impl Partitioning { } /// Distribution schemes -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub enum Distribution { /// Unspecified distribution UnspecifiedDistribution, /// A single partition is required SinglePartition, + /// Requires children to be distributed in such a way that the same + /// values of the keys end up in the same partition + HashPartitioned(Vec>), } /// Represents the result from an expression diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index d11e8e93d199c..9e7dc7172b820 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -48,7 +48,7 @@ use crate::{ error::{DataFusionError, Result}, physical_plan::displayable, }; -use arrow::compute::can_cast_types; +use arrow::{compute::can_cast_types, datatypes::DataType}; use arrow::compute::SortOptions; use arrow::datatypes::{Schema, SchemaRef}; @@ -187,19 +187,54 @@ impl DefaultPhysicalPlanner { let final_group: Vec> = (0..groups.len()).map(|i| col(&groups[i].1)).collect(); - // construct a second aggregation, keeping the final column name equal to the first aggregation - // 
and the expressions corresponding to the respective aggregate - Ok(Arc::new(HashAggregateExec::try_new( - AggregateMode::Final, - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) - .collect(), - aggregates, - initial_aggr, - input_schema, - )?)) + // TODO: dictionary type not yet supported in Hash Repartition + let contains_dict = groups + .iter() + .flat_map(|x| x.0.data_type(physical_input_schema.as_ref())) + .any(|x| matches!(x, DataType::Dictionary(_, _))); + + if !groups.is_empty() + && ctx_state.config.concurrency > 1 + && ctx_state.config.repartition_aggregations + && !contains_dict + { + // Divide partial hash aggregates into multiple partitions by hash key + let hash_repartition = Arc::new(RepartitionExec::try_new( + initial_aggr, + Partitioning::Hash( + final_group.clone(), + ctx_state.config.concurrency, + ), + )?); + + // Combine hashaggregates within the partition + Ok(Arc::new(HashAggregateExec::try_new( + AggregateMode::FinalPartitioned, + final_group + .iter() + .enumerate() + .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) + .collect(), + aggregates, + hash_repartition, + input_schema, + )?)) + } else { + // construct a second aggregation, keeping the final column name equal to the first aggregation + // and the expressions corresponding to the respective aggregate + + Ok(Arc::new(HashAggregateExec::try_new( + AggregateMode::Final, + final_group + .iter() + .enumerate() + .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) + .collect(), + aggregates, + initial_aggr, + input_schema, + )?)) + } } LogicalPlan::Projection { input, expr, .. } => { let input_exec = self.create_initial_plan(input, ctx_state)?; @@ -761,7 +796,8 @@ mod tests { } fn plan(logical_plan: &LogicalPlan) -> Result> { - let ctx_state = make_ctx_state(); + let mut ctx_state = make_ctx_state(); + ctx_state.config.concurrency = 4; let planner = DefaultPhysicalPlanner::default(); planner.create_physical_plan(logical_plan, &ctx_state) } @@ -998,6 +1034,26 @@ mod tests { Ok(()) } + #[test] + fn hash_agg_group_by_partitioned() -> Result<()> { + let testdata = arrow::util::test_util::arrow_test_data(); + let path = format!("{}/csv/aggregate_test_100.csv", testdata); + + let options = CsvReadOptions::new().schema_infer_max_records(100); + let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + .aggregate(vec![col("c1")], vec![sum(col("c2"))])? 
+ .build()?; + + let execution_plan = plan(&logical_plan)?; + let formatted = format!("{:?}", execution_plan); + + // Make sure the plan contains a FinalPartitioned, which means it will not use the Final + // mode in HashAggregate (which is slower) + assert!(formatted.contains("FinalPartitioned")); + + Ok(()) + } + /// An example extension node that doesn't do anything struct NoOpExtensionNode { schema: DFSchemaRef, diff --git a/datafusion/src/physical_plan/unicode_expressions.rs b/datafusion/src/physical_plan/unicode_expressions.rs index 787ea7ea26730..3852fd7c931fa 100644 --- a/datafusion/src/physical_plan/unicode_expressions.rs +++ b/datafusion/src/physical_plan/unicode_expressions.rs @@ -93,7 +93,6 @@ where pub fn left(args: &[ArrayRef]) -> Result { let string_array = downcast_string_arg!(args[0], "string", T); let n_array = downcast_arg!(args[1], "n", Int64Array); - let result = string_array .iter() .zip(n_array.iter()) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 0b9cc2ae18b95..6edb757334900 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2950,17 +2950,19 @@ async fn test_physical_plan_display_indent() { let physical_plan = ctx.create_physical_plan(&plan).unwrap(); let expected = vec![ - "GlobalLimitExec: limit=10", - " SortExec: [the_min DESC]", - " ProjectionExec: expr=[c1, MAX(c12), MIN(c12) as the_min]", - " HashAggregateExec: mode=Final, gby=[c1], aggr=[MAX(c12), MIN(c12)]", - " MergeExec", - " HashAggregateExec: mode=Partial, gby=[c1], aggr=[MAX(c12), MIN(c12)]", - " CoalesceBatchesExec: target_batch_size=4096", - " FilterExec: c12 < CAST(10 AS Float64)", - " RepartitionExec: partitioning=RoundRobinBatch(3)", - " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", - ]; + "GlobalLimitExec: limit=10", + " SortExec: [the_min DESC]", + " MergeExec", + " ProjectionExec: expr=[c1, MAX(c12), MIN(c12) as the_min]", + " HashAggregateExec: mode=FinalPartitioned, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\" }], 3)", + " HashAggregateExec: mode=Partial, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " CoalesceBatchesExec: target_batch_size=4096", + " FilterExec: c12 < CAST(10 AS Float64)", + " RepartitionExec: partitioning=RoundRobinBatch(3)", + " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", + ]; let data_path = arrow::util::test_util::arrow_test_data(); let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) From 26b78c60a928a8866a5aa89aa309423795e20e6e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 17 May 2021 08:37:01 -0600 Subject: [PATCH 098/329] Instructions for cross-compiling Ballista to the Raspberry Pi (#263) --- dev/build-ballista-docker-arm64.sh | 34 +++++ dev/docker/ballista-arm64.Dockerfile | 27 ++++ docs/user-guide/src/SUMMARY.md | 1 + .../user-guide/src/distributed/raspberrypi.md | 128 ++++++++++++++++++ 4 files changed, 190 insertions(+) create mode 100755 dev/build-ballista-docker-arm64.sh create mode 100644 dev/docker/ballista-arm64.Dockerfile create mode 100644 docs/user-guide/src/distributed/raspberrypi.md diff --git a/dev/build-ballista-docker-arm64.sh b/dev/build-ballista-docker-arm64.sh new file mode 100755 index 0000000000000..5d951773ada41 --- /dev/null +++ b/dev/build-ballista-docker-arm64.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Licensed to 
the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if [ -z "${DOCKER_REPO}" ]; then + echo "DOCKER_REPO env var must be set" + exit -1 +fi +cargo install cross +cross build --release --target aarch64-unknown-linux-gnu +rm -rf temp-ballista-docker +mkdir temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/ballista-executor temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/ballista-scheduler temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/tpch temp-ballista-docker +mkdir temp-ballista-docker/queries/ +cp benchmarks/queries/*.sql temp-ballista-docker/queries/ +docker buildx build --push -t $DOCKER_REPO/ballista-arm64 --platform=linux/arm64 -f dev/docker/ballista-arm64.Dockerfile temp-ballista-docker +rm -rf temp-ballista-docker \ No newline at end of file diff --git a/dev/docker/ballista-arm64.Dockerfile b/dev/docker/ballista-arm64.Dockerfile new file mode 100644 index 0000000000000..eb82c03e311aa --- /dev/null +++ b/dev/docker/ballista-arm64.Dockerfile @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+FROM arm64v8/ubuntu + +ADD ballista-scheduler / +ADD ballista-executor / + +# Add benchmarks +ADD tpch / +RUN mkdir /queries +ADD queries/*.sql /queries/ + +ENV RUST_LOG=info \ No newline at end of file diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md index 0fced3bb3deab..903dad1732a15 100644 --- a/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/SUMMARY.md @@ -31,6 +31,7 @@ - [Docker](distributed/standalone.md) - [Docker Compose](distributed/docker-compose.md) - [Kubernetes](distributed/kubernetes.md) + - [Raspberry Pi](distributed/raspberrypi.md) - [Ballista Configuration](distributed/configuration.md) - [Clients](distributed/clients.md) - [Rust](distributed/client-rust.md) diff --git a/docs/user-guide/src/distributed/raspberrypi.md b/docs/user-guide/src/distributed/raspberrypi.md new file mode 100644 index 0000000000000..d4d2079bb5ccd --- /dev/null +++ b/docs/user-guide/src/distributed/raspberrypi.md @@ -0,0 +1,128 @@ + +# Running Ballista on Raspberry Pi + +The Raspberry Pi single-board computer provides a fun and relatively inexpensive way to get started with distributed +computing. + +These instructions have been tested using an Ubuntu Linux desktop as the host, and a +[Raspberry Pi 4 Model B](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) with 4 GB RAM as the target. + +## Preparing the Raspberry Pi + +We recommend installing the 64-bit version of [Ubuntu for Raspberry Pi](https://ubuntu.com/raspberry-pi). + +The Rust implementation of Arrow does not work correctly on 32-bit ARM architectures +([issue](https://github.com/apache/arrow-rs/issues/109)). + +## Cross Compiling DataFusion for the Raspberry Pi + +We do not yet publish official Docker images as part of the release process, although we do plan to do this in the +future ([issue #228](https://github.com/apache/arrow-datafusion/issues/228)). + +Although it is technically possible to build DataFusion directly on a Raspberry Pi, it really isn't very practical. +It is much faster to use [cross](https://github.com/rust-embedded/cross) to cross-compile from a more powerful +desktop computer. + +Docker must be installed and the Docker daemon must be running before cross-compiling with cross. See the +[cross](https://github.com/rust-embedded/cross) project for more detailed instructions. + +Run the following command to install cross. + +```bash +cargo install cross +``` + +From the root of the DataFusion project, run the following command to cross-compile for ARM 64 architecture. + +```bash +cross build --release --target aarch64-unknown-linux-gnu +``` + +It is even possible to cross-test from your desktop computer: + +```bash +cross test --target aarch64-unknown-linux-gnu +``` + +## Deploying the binaries to Raspberry Pi + +You should now be able to copy the executable to the Raspberry Pi using scp on Linux. You will need to change the IP +address in these commands to be the IP address for your Raspberry Pi. The easiest way to find this is to connect a +keyboard and monitor to the Pi and run `ifconfig`. + +```bash +scp ./target/aarch64-unknown-linux-gnu/release/ballista-scheduler ubuntu@10.0.0.186: +scp ./target/aarch64-unknown-linux-gnu/release/ballista-executor ubuntu@10.0.0.186: +``` + +Finally, ssh into the Pi and make the binaries executable: + +```bash +ssh ubuntu@10.0.0.186 +chmod +x ballista-scheduler ballista-executor +``` + +It is now possible to run the Ballista scheduler and executor natively on the Pi. 
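
For a quick smoke test, the binaries can be launched directly from the home directory on the Pi. This is a minimal sketch that assumes the default configuration (the scheduler listening on its default port 50050 and the executor connecting to a scheduler on the same host); pass the appropriate command-line options if your network setup differs.

```bash
# start the scheduler in the background (default port 50050 assumed)
./ballista-scheduler &

# start an executor that registers with the local scheduler (defaults assumed)
./ballista-executor
```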
+ +## Docker + +Using Docker's `buildx` cross-platform functionality, we can also build a docker image targeting ARM64 +from any desktop environment. This will require write access to a Docker repository +on [Docker Hub](https://hub.docker.com/) because the resulting Docker image will be pushed directly +to the repo. + +```bash +DOCKER_REPO=myrepo ./dev/build-ballista-docker-arm64.sh +``` + +On the Raspberry Pi: + +```bash +docker pull myrepo/ballista-arm64 +``` + +Run a scheduler: + +```bash +docker run -it myrepo/ballista-arm64 /ballista-scheduler +``` + +Run an executor: + +```bash +docker run -it myrepo/ballista-arm64 /ballista-executor +``` + +Run the benchmarks: + +```bash +docker run -it myrepo/ballista-arm64 \ + /tpch benchmark --query=1 --path=/path/to/data --format=parquet \ + --concurrency=24 --iterations=1 --debug --host=ballista-scheduler --port=50050 +``` + +Note that it will be necessary to mount appropriate volumes into the containers and also configure networking +so that the Docker containers can communicate with each other. This can be achieved using Docker compose or Kubernetes. + +## Kubernetes + +With Docker images built using the instructions above, it is now possible to deploy Ballista to a Kubernetes cluster +running on one of more Raspberry Pi computers. Refer to the instructions in the [Kubernetes](kubernetes.md) chapter +for more information, and remember to change the Docker image name to `myrepo/ballista-arm64`. \ No newline at end of file From 6c050b896215717233dafe441b5c08ae40baf174 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 18 May 2021 05:29:50 +0800 Subject: [PATCH 099/329] add random SQL function (#303) * fix 305 * add supports_zero_argument * fix unit test --- datafusion/Cargo.toml | 2 +- datafusion/src/physical_plan/functions.rs | 11 +++++- .../src/physical_plan/math_expressions.rs | 36 ++++++++++++++++++- datafusion/src/physical_plan/type_coercion.rs | 1 - datafusion/tests/sql.rs | 11 ++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 4d98fdb1b2075..a127076135f12 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -65,9 +65,9 @@ unicode-segmentation = { version = "^1.7.1", optional = true } regex = { version = "^1.4.3", optional = true } lazy_static = { version = "^1.4.0", optional = true } smallvec = { version = "1.6", features = ["union"] } +rand = "0.8" [dev-dependencies] -rand = "0.8" criterion = "0.3" tempfile = "3" doc-comment = "0.3" diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index c0c915f29a72a..18becf2c8e4e5 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -169,6 +169,8 @@ pub enum BuiltinScalarFunction { NullIf, /// octet_length OctetLength, + /// random + Random, /// regexp_replace RegexpReplace, /// repeat @@ -219,7 +221,10 @@ impl BuiltinScalarFunction { /// an allowlist of functions to take zero arguments, so that they will get special treatment /// while executing. 
fn supports_zero_argument(&self) -> bool { - matches!(self, BuiltinScalarFunction::Now) + matches!( + self, + BuiltinScalarFunction::Random | BuiltinScalarFunction::Now + ) } } @@ -275,6 +280,7 @@ impl FromStr for BuiltinScalarFunction { "md5" => BuiltinScalarFunction::MD5, "nullif" => BuiltinScalarFunction::NullIf, "octet_length" => BuiltinScalarFunction::OctetLength, + "random" => BuiltinScalarFunction::Random, "regexp_replace" => BuiltinScalarFunction::RegexpReplace, "repeat" => BuiltinScalarFunction::Repeat, "replace" => BuiltinScalarFunction::Replace, @@ -438,6 +444,7 @@ pub fn return_type( )); } }), + BuiltinScalarFunction::Random => Ok(DataType::Float64), BuiltinScalarFunction::RegexpReplace => Ok(match arg_types[0] { DataType::LargeUtf8 => DataType::LargeUtf8, DataType::Utf8 => DataType::Utf8, @@ -742,6 +749,7 @@ pub fn create_physical_expr( BuiltinScalarFunction::Ln => math_expressions::ln, BuiltinScalarFunction::Log10 => math_expressions::log10, BuiltinScalarFunction::Log2 => math_expressions::log2, + BuiltinScalarFunction::Random => math_expressions::random, BuiltinScalarFunction::Round => math_expressions::round, BuiltinScalarFunction::Signum => math_expressions::signum, BuiltinScalarFunction::Sin => math_expressions::sin, @@ -1307,6 +1315,7 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature { Signature::Exact(vec![DataType::Utf8, DataType::Utf8, DataType::Utf8]), Signature::Exact(vec![DataType::LargeUtf8, DataType::Utf8, DataType::Utf8]), ]), + BuiltinScalarFunction::Random => Signature::Exact(vec![]), // math expressions expect 1 argument of type f64 or f32 // priority is given to f64 because e.g. `sqrt(1i32)` is in IR (real numbers) and thus we // return the best approximation for it (in f64). diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index 0e0bed2deac2f..cfc239cde6613 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -16,11 +16,12 @@ // under the License. //! Math expressions - use super::{ColumnarValue, ScalarValue}; use crate::error::{DataFusionError, Result}; use arrow::array::{Float32Array, Float64Array}; use arrow::datatypes::DataType; +use rand::{thread_rng, Rng}; +use std::iter; use std::sync::Arc; macro_rules! 
downcast_compute_op { @@ -100,3 +101,36 @@ math_unary_function!("exp", exp); math_unary_function!("ln", ln); math_unary_function!("log2", log2); math_unary_function!("log10", log10); + +/// random SQL function +pub fn random(args: &[ColumnarValue]) -> Result { + let len: usize = match &args[0] { + ColumnarValue::Array(array) => array.len(), + _ => { + return Err(DataFusionError::Internal( + "Expect random function to take no param".to_string(), + )) + } + }; + let mut rng = thread_rng(); + let values = iter::repeat_with(|| rng.gen_range(0.0..1.0)).take(len); + let array = Float64Array::from_iter_values(values); + Ok(ColumnarValue::Array(Arc::new(array))) +} + +#[cfg(test)] +mod tests { + + use super::*; + use arrow::array::{Float64Array, NullArray}; + + #[test] + fn test_random_expression() { + let args = vec![ColumnarValue::Array(Arc::new(NullArray::new(1)))]; + let array = random(&args).expect("fail").into_array(1); + let floats = array.as_any().downcast_ref::().expect("fail"); + + assert_eq!(floats.len(), 1); + assert!(0.0 <= floats.value(0) && floats.value(0) < 1.0); + } +} diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index 98ae09cc381dc..06d3739b53b27 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -75,7 +75,6 @@ pub fn data_types( if current_types.is_empty() { return Ok(vec![]); } - let valid_types = get_valid_types(signature, current_types)?; if valid_types diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 6edb757334900..eb50661b42e6c 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2910,6 +2910,17 @@ async fn test_current_timestamp_expressions_non_optimized() -> Result<()> { Ok(()) } +#[tokio::test] +async fn test_random_expression() -> Result<()> { + let mut ctx = create_ctx()?; + let sql = "SELECT random() r1"; + let actual = execute(&mut ctx, sql).await; + let r1 = actual[0][0].parse::().unwrap(); + assert!(0.0 <= r1); + assert!(r1 < 1.0); + Ok(()) +} + #[tokio::test] async fn test_cast_expressions_error() -> Result<()> { // sin(utf8) should error From aa26112525164a81c28f879c80e650452e396306 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 19 May 2021 05:33:31 +0800 Subject: [PATCH 100/329] cleanup function return type fn (#350) --- datafusion/src/physical_plan/functions.rs | 325 ++++------------------ 1 file changed, 55 insertions(+), 270 deletions(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 18becf2c8e4e5..367e594f6e977 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -313,6 +313,28 @@ impl FromStr for BuiltinScalarFunction { } } +macro_rules! make_utf8_to_return_type { + ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { + fn $FUNC(arg_type: &DataType, name: &str) -> Result { + Ok(match arg_type { + DataType::LargeUtf8 => $largeUtf8Type, + DataType::Utf8 => $utf8Type, + _ => { + // this error is internal as `data_types` should have captured this. 
+ return Err(DataFusionError::Internal(format!( + "The {:?} function can only accept strings.", + name + ))); + } + }) + } + }; +} + +make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); +make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); +make_utf8_to_return_type!(utf8_to_binary_type, DataType::Binary, DataType::Binary); + /// Returns the datatype of the scalar function pub fn return_type( fun: &BuiltinScalarFunction, @@ -332,36 +354,11 @@ pub fn return_type( arg_types.len() as i32, )), BuiltinScalarFunction::Ascii => Ok(DataType::Int32), - BuiltinScalarFunction::BitLength => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Int64, - DataType::Utf8 => DataType::Int32, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The bit_length function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Btrim => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The btrim function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::CharacterLength => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Int64, - DataType::Utf8 => DataType::Int32, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The character_length function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::BitLength => utf8_to_int_type(&arg_types[0], "bit_length"), + BuiltinScalarFunction::Btrim => utf8_to_str_type(&arg_types[0], "btrim"), + BuiltinScalarFunction::CharacterLength => { + utf8_to_int_type(&arg_types[0], "character_length") + } BuiltinScalarFunction::Chr => Ok(DataType::Utf8), BuiltinScalarFunction::Concat => Ok(DataType::Utf8), BuiltinScalarFunction::ConcatWithSeparator => Ok(DataType::Utf8), @@ -369,223 +366,38 @@ pub fn return_type( BuiltinScalarFunction::DateTrunc => { Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } - BuiltinScalarFunction::InitCap => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The initcap function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Left => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The left function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Lower => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The upper function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Lpad => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. 
- return Err(DataFusionError::Internal( - "The lpad function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Ltrim => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The ltrim function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::MD5 => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The md5 function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::InitCap => utf8_to_str_type(&arg_types[0], "initcap"), + BuiltinScalarFunction::Left => utf8_to_str_type(&arg_types[0], "left"), + BuiltinScalarFunction::Lower => utf8_to_str_type(&arg_types[0], "lower"), + BuiltinScalarFunction::Lpad => utf8_to_str_type(&arg_types[0], "lpad"), + BuiltinScalarFunction::Ltrim => utf8_to_str_type(&arg_types[0], "ltrim"), + BuiltinScalarFunction::MD5 => utf8_to_str_type(&arg_types[0], "md5"), BuiltinScalarFunction::NullIf => { // NULLIF has two args and they might get coerced, get a preview of this let coerced_types = data_types(arg_types, &signature(fun)); coerced_types.map(|typs| typs[0].clone()) } - BuiltinScalarFunction::OctetLength => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Int64, - DataType::Utf8 => DataType::Int32, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The octet_length function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::OctetLength => { + utf8_to_int_type(&arg_types[0], "octet_length") + } BuiltinScalarFunction::Random => Ok(DataType::Float64), - BuiltinScalarFunction::RegexpReplace => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The regexp_replace function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Repeat => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The repeat function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Replace => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The replace function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Reverse => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The reverse function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Right => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. 
- return Err(DataFusionError::Internal( - "The right function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Rpad => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The rpad function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Rtrim => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The rtrim function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::SHA224 => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Binary, - DataType::Utf8 => DataType::Binary, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The sha224 function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::SHA256 => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Binary, - DataType::Utf8 => DataType::Binary, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The sha256 function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::SHA384 => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Binary, - DataType::Utf8 => DataType::Binary, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The sha384 function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::SHA512 => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Binary, - DataType::Utf8 => DataType::Binary, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The sha512 function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::SplitPart => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. 
- return Err(DataFusionError::Internal( - "The split_part function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::RegexpReplace => { + utf8_to_str_type(&arg_types[0], "regex_replace") + } + BuiltinScalarFunction::Repeat => utf8_to_str_type(&arg_types[0], "repeat"), + BuiltinScalarFunction::Replace => utf8_to_str_type(&arg_types[0], "replace"), + BuiltinScalarFunction::Reverse => utf8_to_str_type(&arg_types[0], "reverse"), + BuiltinScalarFunction::Right => utf8_to_str_type(&arg_types[0], "right"), + BuiltinScalarFunction::Rpad => utf8_to_str_type(&arg_types[0], "rpad"), + BuiltinScalarFunction::Rtrim => utf8_to_str_type(&arg_types[0], "rtrimp"), + BuiltinScalarFunction::SHA224 => utf8_to_binary_type(&arg_types[0], "sha224"), + BuiltinScalarFunction::SHA256 => utf8_to_binary_type(&arg_types[0], "sha256"), + BuiltinScalarFunction::SHA384 => utf8_to_binary_type(&arg_types[0], "sha384"), + BuiltinScalarFunction::SHA512 => utf8_to_binary_type(&arg_types[0], "sha512"), + BuiltinScalarFunction::SplitPart => utf8_to_str_type(&arg_types[0], "split_part"), BuiltinScalarFunction::StartsWith => Ok(DataType::Boolean), - BuiltinScalarFunction::Strpos => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::Int64, - DataType::Utf8 => DataType::Int32, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The strpos function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Substr => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The substr function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::Strpos => utf8_to_int_type(&arg_types[0], "strpos"), + BuiltinScalarFunction::Substr => utf8_to_str_type(&arg_types[0], "substr"), BuiltinScalarFunction::ToHex => Ok(match arg_types[0] { DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 => { DataType::Utf8 @@ -601,36 +413,9 @@ pub fn return_type( Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } BuiltinScalarFunction::Now => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), - BuiltinScalarFunction::Translate => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The translate function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Trim => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. - return Err(DataFusionError::Internal( - "The trim function can only accept strings.".to_string(), - )); - } - }), - BuiltinScalarFunction::Upper => Ok(match arg_types[0] { - DataType::LargeUtf8 => DataType::LargeUtf8, - DataType::Utf8 => DataType::Utf8, - _ => { - // this error is internal as `data_types` should have captured this. 
- return Err(DataFusionError::Internal( - "The upper function can only accept strings.".to_string(), - )); - } - }), + BuiltinScalarFunction::Translate => utf8_to_str_type(&arg_types[0], "translate"), + BuiltinScalarFunction::Trim => utf8_to_str_type(&arg_types[0], "trim"), + BuiltinScalarFunction::Upper => utf8_to_str_type(&arg_types[0], "upper"), BuiltinScalarFunction::RegexpMatch => Ok(match arg_types[0] { DataType::LargeUtf8 => { DataType::List(Box::new(Field::new("item", DataType::LargeUtf8, true))) From 371c240821f5639c364abe3bfb95056e62e477cc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 19 May 2021 09:52:23 -0600 Subject: [PATCH 101/329] Update Ballista to use new physical plan formatter utility (#344) --- ballista/rust/core/src/utils.rs | 95 -------------------- ballista/rust/executor/src/flight_service.rs | 6 +- ballista/rust/scheduler/src/planner.rs | 5 +- 3 files changed, 5 insertions(+), 101 deletions(-) diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 55541d5fd0148..dc570f81f2c7e 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -102,101 +102,6 @@ pub async fn collect_stream( Ok(batches) } -pub fn format_plan(plan: &dyn ExecutionPlan, indent: usize) -> Result { - let operator_str = - if let Some(exec) = plan.as_any().downcast_ref::() { - format!( - "HashAggregateExec: groupBy={:?}, aggrExpr={:?}", - exec.group_expr() - .iter() - .map(|e| format_expr(e.0.as_ref())) - .collect::>(), - exec.aggr_expr() - .iter() - .map(|e| format_agg_expr(e.as_ref())) - .collect::>>()? - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!( - "HashJoinExec: joinType={:?}, on={:?}", - exec.join_type(), - exec.on() - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - let mut num_files = 0; - for part in exec.partitions() { - num_files += part.filenames().len(); - } - format!( - "ParquetExec: partitions={}, files={}", - exec.partitions().len(), - num_files - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!( - "CsvExec: {}; partitions={}", - &exec.path(), - exec.output_partitioning().partition_count() - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!("FilterExec: {}", format_expr(exec.predicate().as_ref())) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!( - "QueryStageExec: job={}, stage={}", - exec.job_id, exec.stage_id - ) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!("UnresolvedShuffleExec: stages={:?}", exec.query_stage_ids) - } else if let Some(exec) = plan.as_any().downcast_ref::() { - format!( - "CoalesceBatchesExec: batchSize={}", - exec.target_batch_size() - ) - } else if plan.as_any().downcast_ref::().is_some() { - "MergeExec".to_string() - } else { - let str = format!("{:?}", plan); - String::from(&str[0..120]) - }; - - let children_str = plan - .children() - .iter() - .map(|c| format_plan(c.as_ref(), indent + 1)) - .collect::>>()? 
- .join("\n"); - - let indent_str = " ".repeat(indent); - if plan.children().is_empty() { - Ok(format!("{}{}{}", indent_str, &operator_str, children_str)) - } else { - Ok(format!("{}{}\n{}", indent_str, &operator_str, children_str)) - } -} - -pub fn format_agg_expr(expr: &dyn AggregateExpr) -> Result { - Ok(format!( - "{} {:?}", - expr.field()?.name(), - expr.expressions() - .iter() - .map(|e| format_expr(e.as_ref())) - .collect::>() - )) -} - -pub fn format_expr(expr: &dyn PhysicalExpr) -> String { - if let Some(e) = expr.as_any().downcast_ref::() { - e.name().to_string() - } else if let Some(e) = expr.as_any().downcast_ref::() { - e.to_string() - } else if let Some(e) = expr.as_any().downcast_ref::() { - format!("{} {} {}", e.left(), e.op(), e.right()) - } else { - format!("{}", expr) - } -} - pub fn produce_diagram(filename: &str, stages: &[Arc]) -> Result<()> { let write_file = File::create(filename)?; let mut w = BufWriter::new(&write_file); diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 115e1ab0d800e..62aaf7f93a44c 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -26,7 +26,7 @@ use std::time::Instant; use ballista_core::error::BallistaError; use ballista_core::serde::decode_protobuf; use ballista_core::serde::scheduler::{Action as BallistaAction, PartitionStats}; -use ballista_core::utils::{self, format_plan}; +use ballista_core::utils; use arrow::array::{ArrayRef, StringBuilder}; use arrow::datatypes::{DataType, Field, Schema}; @@ -40,6 +40,7 @@ use arrow_flight::{ PutResult, SchemaResult, Ticket, }; use datafusion::error::DataFusionError; +use datafusion::physical_plan::displayable; use futures::{Stream, StreamExt}; use log::{info, warn}; use std::io::{Read, Seek}; @@ -97,8 +98,7 @@ impl FlightService for BallistaFlightService { partition.job_id, partition.stage_id, partition.partition_id, - format_plan(partition.plan.as_ref(), 0) - .map_err(|e| from_ballista_err(&e))? 
+ displayable(partition.plan.as_ref()).indent().to_string() ); let mut tasks: Vec>> = vec![]; diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index b81d7de355ef0..2f01e73e60591 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -235,10 +235,9 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::protobuf; - use ballista_core::utils::format_plan; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::sort::SortExec; - use datafusion::physical_plan::ExecutionPlan; + use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::physical_plan::{merge::MergeExec, projection::ProjectionExec}; use std::convert::TryInto; use std::sync::Arc; @@ -270,7 +269,7 @@ mod test { let job_uuid = Uuid::new_v4(); let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?; for stage in &stages { - println!("{}", format_plan(stage.as_ref(), 0)?); + println!("{}", displayable(stage.as_ref()).indent().to_string()); } /* Expected result: From 5c5a0bb0af8130069071c193ca8c214022666a4e Mon Sep 17 00:00:00 2001 From: Agata Naomichi Date: Thu, 20 May 2021 01:44:07 +0900 Subject: [PATCH 102/329] Fix SQL planner to support multibyte column names (#357) * Fix SQL planner to support multibyte column names * Fix test cases for multibyte column name support * Use starts_with --- datafusion/src/sql/planner.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 036c66da16cc5..34c5901b450a2 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -857,7 +857,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ), SQLExpr::Identifier(ref id) => { - if &id.value[0..1] == "@" { + if id.value.starts_with('@') { let var_names = vec![id.value.clone()]; Ok(Expr::ScalarVariable(var_names)) } else { @@ -1520,7 +1520,7 @@ mod tests { use functions::ScalarFunctionImplementation; const PERSON_COLUMN_NAMES: &str = - "id, first_name, last_name, age, state, salary, birth_date"; + "id, first_name, last_name, age, state, salary, birth_date, 😀"; #[test] fn select_no_relation() { @@ -1559,7 +1559,7 @@ mod tests { let sql = "SELECT *, age FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#age\\\" at position 3 and \\\"#age\\\" at position 7 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + "Plan(\"Projections require unique expression names but the expression \\\"#age\\\" at position 3 and \\\"#age\\\" at position 8 have the same name. 
Consider aliasing (\\\"AS\\\") one of them.\")", format!("{:?}", err) ); } @@ -1568,7 +1568,7 @@ mod tests { fn select_wildcard_with_repeated_column_but_is_aliased() { quick_test( "SELECT *, first_name AS fn from person", - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #first_name AS fn\ + "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀, #first_name AS fn\ \n TableScan: person projection=None", ); } @@ -2044,9 +2044,9 @@ mod tests { #[test] fn select_wildcard_with_groupby() { quick_test( - "SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date", - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date\ - \n Aggregate: groupBy=[[#id, #first_name, #last_name, #age, #state, #salary, #birth_date]], aggr=[[]]\ + r#"SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date, "😀""#, + "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀\ + \n Aggregate: groupBy=[[#id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀]], aggr=[[]]\ \n TableScan: person projection=None", ); quick_test( @@ -2365,7 +2365,7 @@ mod tests { fn test_wildcard() { quick_test( "SELECT * from person", - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date\ + "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀\ \n TableScan: person projection=None", ); } @@ -2679,6 +2679,14 @@ mod tests { quick_test(sql, expected); } + #[test] + fn select_multibyte_column() { + let sql = r#"SELECT "😀" FROM person"#; + let expected = "Projection: #😀\ + \n TableScan: person projection=None"; + quick_test(sql, expected); + } + fn logical_plan(sql: &str) -> Result { let planner = SqlToRel::new(&MockContextProvider {}); let result = DFParser::parse_sql(&sql); @@ -2712,6 +2720,7 @@ mod tests { DataType::Timestamp(TimeUnit::Nanosecond, None), false, ), + Field::new("😀", DataType::Int32, false), ])), "orders" => Some(Schema::new(vec![ Field::new("order_id", DataType::UInt32, false), From 2f73558b6ed68974f2b63d64bef4628b0776d3d5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 19 May 2021 14:00:14 -0400 Subject: [PATCH 103/329] Fix indented display for multi-child nodes (#358) --- datafusion/src/physical_plan/display.rs | 5 +++ datafusion/tests/sql.rs | 49 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/datafusion/src/physical_plan/display.rs b/datafusion/src/physical_plan/display.rs index bfc3cd951d21a..e178ea18bb439 100644 --- a/datafusion/src/physical_plan/display.rs +++ b/datafusion/src/physical_plan/display.rs @@ -87,4 +87,9 @@ impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> { self.indent += 1; Ok(true) } + + fn post_visit(&mut self, _plan: &dyn ExecutionPlan) -> Result { + self.indent -= 1; + Ok(true) + } } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index eb50661b42e6c..17e0f13609a38 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2989,3 +2989,52 @@ async fn test_physical_plan_display_indent() { expected, actual ); } + +#[tokio::test] +async fn test_physical_plan_display_indent_multi_children() { + // Hard code concurrency as it appears in the RepartitionExec output + let config = ExecutionConfig::new().with_concurrency(3); + let mut ctx = ExecutionContext::with_config(config); + // ensure indenting works for nodes with multiple children + register_aggregate_csv(&mut ctx).unwrap(); + let sql = "SELECT c1 \ + FROM 
(select c1 from aggregate_test_100)\ + JOIN\ + (select c1 as c2 from aggregate_test_100)\ + ON c1=c2\ + "; + + let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.optimize(&plan).unwrap(); + + let physical_plan = ctx.create_physical_plan(&plan).unwrap(); + let expected = vec![ + "ProjectionExec: expr=[c1]", + " CoalesceBatchesExec: target_batch_size=4096", + " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(\"c1\", \"c2\")]", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\" }], 3)", + " ProjectionExec: expr=[c1]", + " RepartitionExec: partitioning=RoundRobinBatch(3)", + " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", + " CoalesceBatchesExec: target_batch_size=4096", + " RepartitionExec: partitioning=Hash([Column { name: \"c2\" }], 3)", + " ProjectionExec: expr=[c1 as c2]", + " RepartitionExec: partitioning=RoundRobinBatch(3)", + " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", + ]; + + let data_path = arrow::util::test_util::arrow_test_data(); + let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) + .trim() + .lines() + // normalize paths + .map(|s| s.replace(&data_path, "ARROW_TEST_DATA")) + .collect::>(); + + assert_eq!( + expected, actual, + "expected:\n{:#?}\nactual:\n\n{:#?}\n", + expected, actual + ); +} From 913bf86147773863ec96b76f1f7cfc1cfbb84a6e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 20 May 2021 17:29:24 -0400 Subject: [PATCH 104/329] Refactor: move RowGroupPredicateBuilder into its own module, rename to PruningPredicateBuilder (#365) * Move row group pruning logic into pruning.rs * Rename RowGroup Predicates --> Pruning Predicates, touch up comments * add rat * Remove commented out println! and code * use .any instead of filter/count --- datafusion/src/physical_optimizer/mod.rs | 1 + datafusion/src/physical_optimizer/pruning.rs | 751 ++++++++++++++++++ datafusion/src/physical_plan/parquet.rs | 781 +------------------ 3 files changed, 784 insertions(+), 749 deletions(-) create mode 100644 datafusion/src/physical_optimizer/pruning.rs diff --git a/datafusion/src/physical_optimizer/mod.rs b/datafusion/src/physical_optimizer/mod.rs index eca63db9f3de7..8e79fe9328741 100644 --- a/datafusion/src/physical_optimizer/mod.rs +++ b/datafusion/src/physical_optimizer/mod.rs @@ -21,4 +21,5 @@ pub mod coalesce_batches; pub mod merge_exec; pub mod optimizer; +pub mod pruning; pub mod repartition; diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs new file mode 100644 index 0000000000000..a13ca56630bc0 --- /dev/null +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -0,0 +1,751 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module contains code to rule out row groups / partitions / +//! etc based on statistics prior in order to skip evaluating entire +//! swaths of rows. +//! +//! This code is currently specific to Parquet, but soon (TM), via +//! https://github.com/apache/arrow-datafusion/issues/363 it will +//! be genericized. + +use std::{collections::HashSet, sync::Arc}; + +use arrow::{ + array::{ + make_array, new_null_array, ArrayData, ArrayRef, BooleanArray, + BooleanBufferBuilder, + }, + buffer::MutableBuffer, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; + +use parquet::file::{ + metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics, +}; + +use crate::{ + error::{DataFusionError, Result}, + execution::context::ExecutionContextState, + logical_plan::{Expr, Operator}, + optimizer::utils, + physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr}, +}; + +#[derive(Debug, Clone)] +/// Builder used for generating predicate functions that can be used +/// to prune data based on statistics (e.g. parquet row group metadata) +pub struct PruningPredicateBuilder { + schema: Schema, + predicate_expr: Arc, + stat_column_req: Vec<(String, StatisticsType, Field)>, +} + +impl PruningPredicateBuilder { + /// Try to create a new instance of [`PruningPredicateBuilder`] + /// + /// This will translate the filter expression into a statistics predicate expression + /// + /// For example, `(column / 2) = 4` becomes `(column_min / 2) <= 4 && 4 <= (column_max / 2))` + pub fn try_new(expr: &Expr, schema: Schema) -> Result { + // build predicate expression once + let mut stat_column_req = Vec::<(String, StatisticsType, Field)>::new(); + let logical_predicate_expr = + build_predicate_expression(expr, &schema, &mut stat_column_req)?; + let stat_fields = stat_column_req + .iter() + .map(|(_, _, f)| f.clone()) + .collect::>(); + let stat_schema = Schema::new(stat_fields); + let execution_context_state = ExecutionContextState::new(); + let predicate_expr = DefaultPhysicalPlanner::default().create_physical_expr( + &logical_predicate_expr, + &stat_schema, + &execution_context_state, + )?; + Ok(Self { + schema, + predicate_expr, + stat_column_req, + }) + } + + /// Generate a predicate function used to filter based on + /// statistics + /// + /// This function takes a slice of statistics as parameter, so + /// that DataFusion's physical expressions can be executed once + /// against a single RecordBatch, containing statistics arrays, on + /// which the physical predicate expression is executed to + /// generate a row group filter array. + /// + /// The generated filter function is then used in the returned + /// closure to filter row groups. 
NOTE this is parquet specific at the moment + pub fn build_pruning_predicate( + &self, + row_group_metadata: &[RowGroupMetaData], + ) -> Box bool> { + // build statistics record batch + let predicate_result = build_statistics_record_batch( + row_group_metadata, + &self.schema, + &self.stat_column_req, + ) + .and_then(|statistics_batch| { + // execute predicate expression + self.predicate_expr.evaluate(&statistics_batch) + }) + .and_then(|v| match v { + ColumnarValue::Array(array) => Ok(array), + ColumnarValue::Scalar(_) => Err(DataFusionError::Plan( + "predicate expression didn't return an array".to_string(), + )), + }); + + let predicate_array = match predicate_result { + Ok(array) => array, + // row group filter array could not be built + // return a closure which will not filter out any row groups + _ => return Box::new(|_r, _i| true), + }; + + let predicate_array = predicate_array.as_any().downcast_ref::(); + match predicate_array { + // return row group predicate function + Some(array) => { + // when the result of the predicate expression for a row group is null / undefined, + // e.g. due to missing statistics, this row group can't be filtered out, + // so replace with true + let predicate_values = + array.iter().map(|x| x.unwrap_or(true)).collect::>(); + Box::new(move |_, i| predicate_values[i]) + } + // predicate result is not a BooleanArray + // return a closure which will not filter out any row groups + _ => Box::new(|_r, _i| true), + } + } +} + +/// Build a RecordBatch from a list of statistics (currently parquet +/// [`RowGroupMetadata`] structs), creating arrays, one for each +/// statistics column, as requested in the stat_column_req parameter. +fn build_statistics_record_batch( + row_groups: &[RowGroupMetaData], + schema: &Schema, + stat_column_req: &[(String, StatisticsType, Field)], +) -> Result { + let mut fields = Vec::::new(); + let mut arrays = Vec::::new(); + for (column_name, statistics_type, stat_field) in stat_column_req { + if let Some((column_index, _)) = schema.column_with_name(column_name) { + let statistics = row_groups + .iter() + .map(|g| g.column(column_index).statistics()) + .collect::>(); + let array = build_statistics_array( + &statistics, + *statistics_type, + stat_field.data_type(), + ); + fields.push(stat_field.clone()); + arrays.push(array); + } + } + let schema = Arc::new(Schema::new(fields)); + RecordBatch::try_new(schema, arrays) + .map_err(|err| DataFusionError::Plan(err.to_string())) +} + +struct StatisticsExpressionBuilder<'a> { + column_name: String, + column_expr: &'a Expr, + scalar_expr: &'a Expr, + field: &'a Field, + stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, + reverse_operator: bool, +} + +impl<'a> StatisticsExpressionBuilder<'a> { + fn try_new( + left: &'a Expr, + right: &'a Expr, + schema: &'a Schema, + stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, + ) -> Result { + // find column name; input could be a more complicated expression + let mut left_columns = HashSet::::new(); + utils::expr_to_column_names(left, &mut left_columns)?; + let mut right_columns = HashSet::::new(); + utils::expr_to_column_names(right, &mut right_columns)?; + let (column_expr, scalar_expr, column_names, reverse_operator) = + match (left_columns.len(), right_columns.len()) { + (1, 0) => (left, right, left_columns, false), + (0, 1) => (right, left, right_columns, true), + _ => { + // if more than one column used in expression - not supported + return Err(DataFusionError::Plan( + "Multi-column expressions are not currently 
supported" + .to_string(), + )); + } + }; + let column_name = column_names.iter().next().unwrap().clone(); + let field = match schema.column_with_name(&column_name) { + Some((_, f)) => f, + _ => { + return Err(DataFusionError::Plan( + "Field not found in schema".to_string(), + )); + } + }; + + Ok(Self { + column_name, + column_expr, + scalar_expr, + field, + stat_column_req, + reverse_operator, + }) + } + + fn correct_operator(&self, op: Operator) -> Operator { + if !self.reverse_operator { + return op; + } + + match op { + Operator::Lt => Operator::Gt, + Operator::Gt => Operator::Lt, + Operator::LtEq => Operator::GtEq, + Operator::GtEq => Operator::LtEq, + _ => op, + } + } + + fn scalar_expr(&self) -> &Expr { + self.scalar_expr + } + + fn is_stat_column_missing(&self, statistics_type: StatisticsType) -> bool { + !self + .stat_column_req + .iter() + .any(|(c, t, _f)| c == &self.column_name && t == &statistics_type) + } + + fn stat_column_expr( + &mut self, + stat_type: StatisticsType, + suffix: &str, + ) -> Result { + let stat_column_name = format!("{}_{}", self.column_name, suffix); + let stat_field = Field::new( + stat_column_name.as_str(), + self.field.data_type().clone(), + self.field.is_nullable(), + ); + if self.is_stat_column_missing(stat_type) { + // only add statistics column if not previously added + self.stat_column_req + .push((self.column_name.clone(), stat_type, stat_field)); + } + rewrite_column_expr( + self.column_expr, + self.column_name.as_str(), + stat_column_name.as_str(), + ) + } + + fn min_column_expr(&mut self) -> Result { + self.stat_column_expr(StatisticsType::Min, "min") + } + + fn max_column_expr(&mut self) -> Result { + self.stat_column_expr(StatisticsType::Max, "max") + } +} + +/// replaces a column with an old name with a new name in an expression +fn rewrite_column_expr( + expr: &Expr, + column_old_name: &str, + column_new_name: &str, +) -> Result { + let expressions = utils::expr_sub_expressions(&expr)?; + let expressions = expressions + .iter() + .map(|e| rewrite_column_expr(e, column_old_name, column_new_name)) + .collect::>>()?; + + if let Expr::Column(name) = expr { + if name == column_old_name { + return Ok(Expr::Column(column_new_name.to_string())); + } + } + utils::rewrite_expression(&expr, &expressions) +} + +/// Translate logical filter expression into statistics predicate expression +fn build_predicate_expression( + expr: &Expr, + schema: &Schema, + stat_column_req: &mut Vec<(String, StatisticsType, Field)>, +) -> Result { + use crate::logical_plan; + // predicate expression can only be a binary expression + let (left, op, right) = match expr { + Expr::BinaryExpr { left, op, right } => (left, *op, right), + _ => { + // unsupported expression - replace with TRUE + // this can still be useful when multiple conditions are joined using AND + // such as: column > 10 AND TRUE + return Ok(logical_plan::lit(true)); + } + }; + + if op == Operator::And || op == Operator::Or { + let left_expr = build_predicate_expression(left, schema, stat_column_req)?; + let right_expr = build_predicate_expression(right, schema, stat_column_req)?; + return Ok(logical_plan::binary_expr(left_expr, op, right_expr)); + } + + let expr_builder = + StatisticsExpressionBuilder::try_new(left, right, schema, stat_column_req); + let mut expr_builder = match expr_builder { + Ok(builder) => builder, + // allow partial failure in predicate expression generation + // this can still produce a useful predicate when multiple conditions are joined using AND + Err(_) => { + return 
Ok(logical_plan::lit(true)); + } + }; + let corrected_op = expr_builder.correct_operator(op); + let statistics_expr = match corrected_op { + Operator::Eq => { + // column = literal => (min, max) = literal => min <= literal && literal <= max + // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .lt_eq(expr_builder.scalar_expr().clone()) + .and(expr_builder.scalar_expr().clone().lt_eq(max_column_expr)) + } + Operator::Gt => { + // column > literal => (min, max) > literal => max > literal + expr_builder + .max_column_expr()? + .gt(expr_builder.scalar_expr().clone()) + } + Operator::GtEq => { + // column >= literal => (min, max) >= literal => max >= literal + expr_builder + .max_column_expr()? + .gt_eq(expr_builder.scalar_expr().clone()) + } + Operator::Lt => { + // column < literal => (min, max) < literal => min < literal + expr_builder + .min_column_expr()? + .lt(expr_builder.scalar_expr().clone()) + } + Operator::LtEq => { + // column <= literal => (min, max) <= literal => min <= literal + expr_builder + .min_column_expr()? + .lt_eq(expr_builder.scalar_expr().clone()) + } + // other expressions are not supported + _ => logical_plan::lit(true), + }; + Ok(statistics_expr) +} + +#[derive(Debug, Copy, Clone, PartialEq)] +enum StatisticsType { + Min, + Max, +} + +fn build_statistics_array( + statistics: &[Option<&ParquetStatistics>], + statistics_type: StatisticsType, + data_type: &DataType, +) -> ArrayRef { + let statistics_count = statistics.len(); + let first_group_stats = statistics.iter().find(|s| s.is_some()); + let first_group_stats = if let Some(Some(statistics)) = first_group_stats { + // found first row group with statistics defined + statistics + } else { + // no row group has statistics defined + return new_null_array(data_type, statistics_count); + }; + + let (data_size, arrow_type) = match first_group_stats { + ParquetStatistics::Int32(_) => (std::mem::size_of::(), DataType::Int32), + ParquetStatistics::Int64(_) => (std::mem::size_of::(), DataType::Int64), + ParquetStatistics::Float(_) => (std::mem::size_of::(), DataType::Float32), + ParquetStatistics::Double(_) => (std::mem::size_of::(), DataType::Float64), + ParquetStatistics::ByteArray(_) if data_type == &DataType::Utf8 => { + (0, DataType::Utf8) + } + _ => { + // type of statistics not supported + return new_null_array(data_type, statistics_count); + } + }; + + let statistics = statistics.iter().map(|s| { + s.filter(|s| s.has_min_max_set()) + .map(|s| match statistics_type { + StatisticsType::Min => s.min_bytes(), + StatisticsType::Max => s.max_bytes(), + }) + }); + + if arrow_type == DataType::Utf8 { + let data_size = statistics + .clone() + .map(|x| x.map(|b| b.len()).unwrap_or(0)) + .sum(); + let mut builder = + arrow::array::StringBuilder::with_capacity(statistics_count, data_size); + let string_statistics = + statistics.map(|x| x.and_then(|bytes| std::str::from_utf8(bytes).ok())); + for maybe_string in string_statistics { + match maybe_string { + Some(string_value) => builder.append_value(string_value).unwrap(), + None => builder.append_null().unwrap(), + }; + } + return Arc::new(builder.finish()); + } + + let mut data_buffer = MutableBuffer::new(statistics_count * data_size); + let mut bitmap_builder = BooleanBufferBuilder::new(statistics_count); + let mut null_count = 0; + for s in statistics { + if let Some(stat_data) = s { + bitmap_builder.append(true); + 
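// copy this row group's raw min/max statistics bytes into the fixed-width values buffer +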
data_buffer.extend_from_slice(stat_data); + } else { + bitmap_builder.append(false); + data_buffer.resize(data_buffer.len() + data_size, 0); + null_count += 1; + } + } + + let mut builder = ArrayData::builder(arrow_type) + .len(statistics_count) + .add_buffer(data_buffer.into()); + if null_count > 0 { + builder = builder.null_bit_buffer(bitmap_builder.finish()); + } + let array_data = builder.build(); + let statistics_array = make_array(array_data); + if statistics_array.data_type() == data_type { + return statistics_array; + } + // cast statistics array to required data type + arrow::compute::cast(&statistics_array, data_type) + .unwrap_or_else(|_| new_null_array(data_type, statistics_count)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::physical_optimizer::pruning::StatisticsType; + use arrow::{ + array::{Int32Array, StringArray}, + datatypes::DataType, + }; + use parquet::file::statistics::Statistics as ParquetStatistics; + + #[test] + fn build_statistics_array_int32() { + // build row group metadata array + let s1 = ParquetStatistics::int32(None, Some(10), None, 0, false); + let s2 = ParquetStatistics::int32(Some(2), Some(20), None, 0, false); + let s3 = ParquetStatistics::int32(Some(3), Some(30), None, 0, false); + let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; + + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Min, &DataType::Int32); + let int32_array = statistics_array + .as_any() + .downcast_ref::() + .unwrap(); + let int32_vec = int32_array.into_iter().collect::>(); + assert_eq!(int32_vec, vec![None, Some(2), Some(3)]); + + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Max, &DataType::Int32); + let int32_array = statistics_array + .as_any() + .downcast_ref::() + .unwrap(); + let int32_vec = int32_array.into_iter().collect::>(); + // here the first max value is None and not the Some(10) value which was actually set + // because the min value is None + assert_eq!(int32_vec, vec![None, Some(20), Some(30)]); + } + + #[test] + fn build_statistics_array_utf8() { + // build row group metadata array + let s1 = ParquetStatistics::byte_array(None, Some("10".into()), None, 0, false); + let s2 = ParquetStatistics::byte_array( + Some("2".into()), + Some("20".into()), + None, + 0, + false, + ); + let s3 = ParquetStatistics::byte_array( + Some("3".into()), + Some("30".into()), + None, + 0, + false, + ); + let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; + + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Min, &DataType::Utf8); + let string_array = statistics_array + .as_any() + .downcast_ref::() + .unwrap(); + let string_vec = string_array.into_iter().collect::>(); + assert_eq!(string_vec, vec![None, Some("2"), Some("3")]); + + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Max, &DataType::Utf8); + let string_array = statistics_array + .as_any() + .downcast_ref::() + .unwrap(); + let string_vec = string_array.into_iter().collect::>(); + // here the first max value is None and not the Some("10") value which was actually set + // because the min value is None + assert_eq!(string_vec, vec![None, Some("20"), Some("30")]); + } + + #[test] + fn build_statistics_array_empty_stats() { + let data_type = DataType::Int32; + let statistics = vec![]; + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Min, &data_type); + assert_eq!(statistics_array.len(), 0); + + let statistics = vec![None, None]; + let statistics_array = + 
build_statistics_array(&statistics, StatisticsType::Min, &data_type); + assert_eq!(statistics_array.len(), statistics.len()); + assert_eq!(statistics_array.data_type(), &data_type); + for i in 0..statistics_array.len() { + assert_eq!(statistics_array.is_null(i), true); + assert_eq!(statistics_array.is_valid(i), false); + } + } + + #[test] + fn build_statistics_array_unsupported_type() { + // boolean is not currently a supported type for statistics + let s1 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); + let s2 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); + let statistics = vec![Some(&s1), Some(&s2)]; + let data_type = DataType::Boolean; + let statistics_array = + build_statistics_array(&statistics, StatisticsType::Min, &data_type); + assert_eq!(statistics_array.len(), statistics.len()); + assert_eq!(statistics_array.data_type(), &data_type); + for i in 0..statistics_array.len() { + assert_eq!(statistics_array.is_null(i), true); + assert_eq!(statistics_array.is_valid(i), false); + } + } + + #[test] + fn row_group_predicate_eq() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_min LtEq Int32(1) And Int32(1) LtEq #c1_max"; + + // test column on the left + let expr = col("c1").eq(lit(1)); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + // test column on the right + let expr = lit(1).eq(col("c1")); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_gt() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_max Gt Int32(1)"; + + // test column on the left + let expr = col("c1").gt(lit(1)); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + // test column on the right + let expr = lit(1).lt(col("c1")); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_gt_eq() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_max GtEq Int32(1)"; + + // test column on the left + let expr = col("c1").gt_eq(lit(1)); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + // test column on the right + let expr = lit(1).lt_eq(col("c1")); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_min Lt Int32(1)"; + + // test column on the left + let expr = col("c1").lt(lit(1)); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + // test column on the right + let expr = lit(1).gt(col("c1")); + let 
predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt_eq() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_min LtEq Int32(1)"; + + // test column on the left + let expr = col("c1").lt_eq(lit(1)); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + // test column on the right + let expr = lit(1).gt_eq(col("c1")); + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_and() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + Field::new("c3", DataType::Int32, false), + ]); + // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression + let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); + let expected_expr = "#c1_min Lt Int32(1) And Boolean(true)"; + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_or() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test OR operator joining supported c1 < 1 expression and unsupported c2 % 2 expression + let expr = col("c1").lt(lit(1)).or(col("c2").modulus(lit(2))); + let expected_expr = "#c1_min Lt Int32(1) Or Boolean(true)"; + let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_stat_column_req() -> Result<()> { + use crate::logical_plan::{col, lit}; + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + let mut stat_column_req = vec![]; + // c1 < 1 and (c2 = 2 or c2 = 3) + let expr = col("c1") + .lt(lit(1)) + .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); + let expected_expr = "#c1_min Lt Int32(1) And #c2_min LtEq Int32(2) And Int32(2) LtEq #c2_max Or #c2_min LtEq Int32(3) And Int32(3) LtEq #c2_max"; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut stat_column_req)?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + // c1 < 1 should add c1_min + let c1_min_field = Field::new("c1_min", DataType::Int32, false); + assert_eq!( + stat_column_req[0], + ("c1".to_owned(), StatisticsType::Min, c1_min_field) + ); + // c2 = 2 should add c2_min and c2_max + let c2_min_field = Field::new("c2_min", DataType::Int32, false); + assert_eq!( + stat_column_req[1], + ("c2".to_owned(), StatisticsType::Min, c2_min_field) + ); + let c2_max_field = Field::new("c2_max", DataType::Int32, false); + assert_eq!( + stat_column_req[2], + ("c2".to_owned(), StatisticsType::Max, c2_max_field) + ); + // c2 = 3 shouldn't add any new statistics fields + assert_eq!(stat_column_req.len(), 3); + + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 
dd5e77bc21eb9..66b1253db3d40 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -17,38 +17,28 @@ //! Execution plan for reading Parquet files +use std::any::Any; use std::fmt; use std::fs::File; use std::sync::Arc; use std::task::{Context, Poll}; -use std::{any::Any, collections::HashSet}; -use super::{ - planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr, RecordBatchStream, - SendableRecordBatchStream, -}; -use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; use crate::{ error::{DataFusionError, Result}, - execution::context::ExecutionContextState, - logical_plan::{Expr, Operator}, - optimizer::utils, + logical_plan::Expr, + physical_optimizer::pruning::PruningPredicateBuilder, + physical_plan::{ + common, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, + }, }; -use arrow::record_batch::RecordBatch; + use arrow::{ - array::new_null_array, + datatypes::{Schema, SchemaRef}, error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, }; -use arrow::{ - array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder}, - buffer::MutableBuffer, - datatypes::{DataType, Field, Schema, SchemaRef}, -}; -use parquet::file::{ - metadata::RowGroupMetaData, - reader::{FileReader, SerializedFileReader}, - statistics::Statistics as ParquetStatistics, -}; +use parquet::file::reader::{FileReader, SerializedFileReader}; use fmt::Debug; use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; @@ -76,7 +66,7 @@ pub struct ParquetExec { /// Statistics for the data set (sum of statistics for all partitions) statistics: Statistics, /// Optional predicate builder - predicate_builder: Option, + predicate_builder: Option, /// Optional limit of the number of rows limit: Option, } @@ -229,7 +219,7 @@ impl ParquetExec { } let schema = schemas[0].clone(); let predicate_builder = predicate.and_then(|predicate_expr| { - RowGroupPredicateBuilder::try_new(&predicate_expr, schema.clone()).ok() + PruningPredicateBuilder::try_new(&predicate_expr, schema.clone()).ok() }); Ok(Self::new( @@ -247,7 +237,7 @@ impl ParquetExec { partitions: Vec, schema: Schema, projection: Option>, - predicate_builder: Option, + predicate_builder: Option, batch_size: usize, limit: Option, ) -> Self { @@ -358,445 +348,6 @@ impl ParquetPartition { } } -#[derive(Debug, Clone)] -/// Predicate builder used for generating of predicate functions, used to filter row group metadata -pub struct RowGroupPredicateBuilder { - parquet_schema: Schema, - predicate_expr: Arc, - stat_column_req: Vec<(String, StatisticsType, Field)>, -} - -impl RowGroupPredicateBuilder { - /// Try to create a new instance of PredicateExpressionBuilder. - /// This will translate the filter expression into a statistics predicate expression - /// (for example (column / 2) = 4 becomes (column_min / 2) <= 4 && 4 <= (column_max / 2)), - /// then convert it to a DataFusion PhysicalExpression and cache it for later use by build_row_group_predicate. 
- pub fn try_new(expr: &Expr, parquet_schema: Schema) -> Result { - // build predicate expression once - let mut stat_column_req = Vec::<(String, StatisticsType, Field)>::new(); - let logical_predicate_expr = - build_predicate_expression(expr, &parquet_schema, &mut stat_column_req)?; - // println!( - // "RowGroupPredicateBuilder::try_new, logical_predicate_expr: {:?}", - // logical_predicate_expr - // ); - // build physical predicate expression - let stat_fields = stat_column_req - .iter() - .map(|(_, _, f)| f.clone()) - .collect::>(); - let stat_schema = Schema::new(stat_fields); - let execution_context_state = ExecutionContextState::new(); - let predicate_expr = DefaultPhysicalPlanner::default().create_physical_expr( - &logical_predicate_expr, - &stat_schema, - &execution_context_state, - )?; - // println!( - // "RowGroupPredicateBuilder::try_new, predicate_expr: {:?}", - // predicate_expr - // ); - Ok(Self { - parquet_schema, - predicate_expr, - stat_column_req, - }) - } - - /// Generate a predicate function used to filter row group metadata. - /// This function takes a list of all row groups as parameter, - /// so that DataFusion's physical expressions can be re-used by - /// generating a RecordBatch, containing statistics arrays, - /// on which the physical predicate expression is executed to generate a row group filter array. - /// The generated filter array is then used in the returned closure to filter row groups. - pub fn build_row_group_predicate( - &self, - row_group_metadata: &[RowGroupMetaData], - ) -> Box bool> { - // build statistics record batch - let predicate_result = build_statistics_record_batch( - row_group_metadata, - &self.parquet_schema, - &self.stat_column_req, - ) - .and_then(|statistics_batch| { - // execute predicate expression - self.predicate_expr.evaluate(&statistics_batch) - }) - .and_then(|v| match v { - ColumnarValue::Array(array) => Ok(array), - ColumnarValue::Scalar(_) => Err(DataFusionError::Plan( - "predicate expression didn't return an array".to_string(), - )), - }); - - let predicate_array = match predicate_result { - Ok(array) => array, - // row group filter array could not be built - // return a closure which will not filter out any row groups - _ => return Box::new(|_r, _i| true), - }; - - let predicate_array = predicate_array.as_any().downcast_ref::(); - match predicate_array { - // return row group predicate function - Some(array) => { - // when the result of the predicate expression for a row group is null / undefined, - // e.g. due to missing statistics, this row group can't be filtered out, - // so replace with true - let predicate_values = - array.iter().map(|x| x.unwrap_or(true)).collect::>(); - Box::new(move |_, i| predicate_values[i]) - } - // predicate result is not a BooleanArray - // return a closure which will not filter out any row groups - _ => Box::new(|_r, _i| true), - } - } -} - -/// Build a RecordBatch from a list of RowGroupMetadata structs, -/// creating arrays, one for each statistics column, -/// as requested in the stat_column_req parameter. 
-fn build_statistics_record_batch( - row_groups: &[RowGroupMetaData], - parquet_schema: &Schema, - stat_column_req: &[(String, StatisticsType, Field)], -) -> Result { - let mut fields = Vec::::new(); - let mut arrays = Vec::::new(); - for (column_name, statistics_type, stat_field) in stat_column_req { - if let Some((column_index, _)) = parquet_schema.column_with_name(column_name) { - let statistics = row_groups - .iter() - .map(|g| g.column(column_index).statistics()) - .collect::>(); - let array = build_statistics_array( - &statistics, - *statistics_type, - stat_field.data_type(), - ); - fields.push(stat_field.clone()); - arrays.push(array); - } - } - let schema = Arc::new(Schema::new(fields)); - RecordBatch::try_new(schema, arrays) - .map_err(|err| DataFusionError::Plan(err.to_string())) -} - -struct StatisticsExpressionBuilder<'a> { - column_name: String, - column_expr: &'a Expr, - scalar_expr: &'a Expr, - parquet_field: &'a Field, - stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, - reverse_operator: bool, -} - -impl<'a> StatisticsExpressionBuilder<'a> { - fn try_new( - left: &'a Expr, - right: &'a Expr, - parquet_schema: &'a Schema, - stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, - ) -> Result { - // find column name; input could be a more complicated expression - let mut left_columns = HashSet::::new(); - utils::expr_to_column_names(left, &mut left_columns)?; - let mut right_columns = HashSet::::new(); - utils::expr_to_column_names(right, &mut right_columns)?; - let (column_expr, scalar_expr, column_names, reverse_operator) = - match (left_columns.len(), right_columns.len()) { - (1, 0) => (left, right, left_columns, false), - (0, 1) => (right, left, right_columns, true), - _ => { - // if more than one column used in expression - not supported - return Err(DataFusionError::Plan( - "Multi-column expressions are not currently supported" - .to_string(), - )); - } - }; - let column_name = column_names.iter().next().unwrap().clone(); - let field = match parquet_schema.column_with_name(&column_name) { - Some((_, f)) => f, - _ => { - // field not found in parquet schema - return Err(DataFusionError::Plan( - "Field not found in parquet schema".to_string(), - )); - } - }; - - Ok(Self { - column_name, - column_expr, - scalar_expr, - parquet_field: field, - stat_column_req, - reverse_operator, - }) - } - - fn correct_operator(&self, op: Operator) -> Operator { - if !self.reverse_operator { - return op; - } - - match op { - Operator::Lt => Operator::Gt, - Operator::Gt => Operator::Lt, - Operator::LtEq => Operator::GtEq, - Operator::GtEq => Operator::LtEq, - _ => op, - } - } - - // fn column_expr(&self) -> &Expr { - // self.column_expr - // } - - fn scalar_expr(&self) -> &Expr { - self.scalar_expr - } - - // fn column_name(&self) -> &String { - // &self.column_name - // } - - fn is_stat_column_missing(&self, statistics_type: StatisticsType) -> bool { - self.stat_column_req - .iter() - .filter(|(c, t, _f)| c == &self.column_name && t == &statistics_type) - .count() - == 0 - } - - fn stat_column_expr( - &mut self, - stat_type: StatisticsType, - suffix: &str, - ) -> Result { - let stat_column_name = format!("{}_{}", self.column_name, suffix); - let stat_field = Field::new( - stat_column_name.as_str(), - self.parquet_field.data_type().clone(), - self.parquet_field.is_nullable(), - ); - if self.is_stat_column_missing(stat_type) { - // only add statistics column if not previously added - self.stat_column_req - .push((self.column_name.clone(), stat_type, stat_field)); - } - 
rewrite_column_expr( - self.column_expr, - self.column_name.as_str(), - stat_column_name.as_str(), - ) - } - - fn min_column_expr(&mut self) -> Result { - self.stat_column_expr(StatisticsType::Min, "min") - } - - fn max_column_expr(&mut self) -> Result { - self.stat_column_expr(StatisticsType::Max, "max") - } -} - -/// replaces a column with an old name with a new name in an expression -fn rewrite_column_expr( - expr: &Expr, - column_old_name: &str, - column_new_name: &str, -) -> Result { - let expressions = utils::expr_sub_expressions(&expr)?; - let expressions = expressions - .iter() - .map(|e| rewrite_column_expr(e, column_old_name, column_new_name)) - .collect::>>()?; - - if let Expr::Column(name) = expr { - if name == column_old_name { - return Ok(Expr::Column(column_new_name.to_string())); - } - } - utils::rewrite_expression(&expr, &expressions) -} - -/// Translate logical filter expression into parquet statistics predicate expression -fn build_predicate_expression( - expr: &Expr, - parquet_schema: &Schema, - stat_column_req: &mut Vec<(String, StatisticsType, Field)>, -) -> Result { - use crate::logical_plan; - // predicate expression can only be a binary expression - let (left, op, right) = match expr { - Expr::BinaryExpr { left, op, right } => (left, *op, right), - _ => { - // unsupported expression - replace with TRUE - // this can still be useful when multiple conditions are joined using AND - // such as: column > 10 AND TRUE - return Ok(logical_plan::lit(true)); - } - }; - - if op == Operator::And || op == Operator::Or { - let left_expr = - build_predicate_expression(left, parquet_schema, stat_column_req)?; - let right_expr = - build_predicate_expression(right, parquet_schema, stat_column_req)?; - return Ok(logical_plan::binary_expr(left_expr, op, right_expr)); - } - - let expr_builder = StatisticsExpressionBuilder::try_new( - left, - right, - parquet_schema, - stat_column_req, - ); - let mut expr_builder = match expr_builder { - Ok(builder) => builder, - // allow partial failure in predicate expression generation - // this can still produce a useful predicate when multiple conditions are joined using AND - Err(_) => { - return Ok(logical_plan::lit(true)); - } - }; - let corrected_op = expr_builder.correct_operator(op); - let statistics_expr = match corrected_op { - Operator::Eq => { - // column = literal => (min, max) = literal => min <= literal && literal <= max - // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) - let min_column_expr = expr_builder.min_column_expr()?; - let max_column_expr = expr_builder.max_column_expr()?; - min_column_expr - .lt_eq(expr_builder.scalar_expr().clone()) - .and(expr_builder.scalar_expr().clone().lt_eq(max_column_expr)) - } - Operator::Gt => { - // column > literal => (min, max) > literal => max > literal - expr_builder - .max_column_expr()? - .gt(expr_builder.scalar_expr().clone()) - } - Operator::GtEq => { - // column >= literal => (min, max) >= literal => max >= literal - expr_builder - .max_column_expr()? - .gt_eq(expr_builder.scalar_expr().clone()) - } - Operator::Lt => { - // column < literal => (min, max) < literal => min < literal - expr_builder - .min_column_expr()? - .lt(expr_builder.scalar_expr().clone()) - } - Operator::LtEq => { - // column <= literal => (min, max) <= literal => min <= literal - expr_builder - .min_column_expr()? 
- .lt_eq(expr_builder.scalar_expr().clone()) - } - // other expressions are not supported - _ => logical_plan::lit(true), - }; - Ok(statistics_expr) -} - -#[derive(Debug, Copy, Clone, PartialEq)] -enum StatisticsType { - Min, - Max, -} - -fn build_statistics_array( - statistics: &[Option<&ParquetStatistics>], - statistics_type: StatisticsType, - data_type: &DataType, -) -> ArrayRef { - let statistics_count = statistics.len(); - let first_group_stats = statistics.iter().find(|s| s.is_some()); - let first_group_stats = if let Some(Some(statistics)) = first_group_stats { - // found first row group with statistics defined - statistics - } else { - // no row group has statistics defined - return new_null_array(data_type, statistics_count); - }; - - let (data_size, arrow_type) = match first_group_stats { - ParquetStatistics::Int32(_) => (std::mem::size_of::(), DataType::Int32), - ParquetStatistics::Int64(_) => (std::mem::size_of::(), DataType::Int64), - ParquetStatistics::Float(_) => (std::mem::size_of::(), DataType::Float32), - ParquetStatistics::Double(_) => (std::mem::size_of::(), DataType::Float64), - ParquetStatistics::ByteArray(_) if data_type == &DataType::Utf8 => { - (0, DataType::Utf8) - } - _ => { - // type of statistics not supported - return new_null_array(data_type, statistics_count); - } - }; - - let statistics = statistics.iter().map(|s| { - s.filter(|s| s.has_min_max_set()) - .map(|s| match statistics_type { - StatisticsType::Min => s.min_bytes(), - StatisticsType::Max => s.max_bytes(), - }) - }); - - if arrow_type == DataType::Utf8 { - let data_size = statistics - .clone() - .map(|x| x.map(|b| b.len()).unwrap_or(0)) - .sum(); - let mut builder = - arrow::array::StringBuilder::with_capacity(statistics_count, data_size); - let string_statistics = - statistics.map(|x| x.and_then(|bytes| std::str::from_utf8(bytes).ok())); - for maybe_string in string_statistics { - match maybe_string { - Some(string_value) => builder.append_value(string_value).unwrap(), - None => builder.append_null().unwrap(), - }; - } - return Arc::new(builder.finish()); - } - - let mut data_buffer = MutableBuffer::new(statistics_count * data_size); - let mut bitmap_builder = BooleanBufferBuilder::new(statistics_count); - let mut null_count = 0; - for s in statistics { - if let Some(stat_data) = s { - bitmap_builder.append(true); - data_buffer.extend_from_slice(stat_data); - } else { - bitmap_builder.append(false); - data_buffer.resize(data_buffer.len() + data_size, 0); - null_count += 1; - } - } - - let mut builder = ArrayData::builder(arrow_type) - .len(statistics_count) - .add_buffer(data_buffer.into()); - if null_count > 0 { - builder = builder.null_bit_buffer(bitmap_builder.finish()); - } - let array_data = builder.build(); - let statistics_array = make_array(array_data); - if statistics_array.data_type() == data_type { - return statistics_array; - } - // cast statistics array to required data type - arrow::compute::cast(&statistics_array, data_type) - .unwrap_or_else(|_| new_null_array(data_type, statistics_count)) -} - #[async_trait] impl ExecutionPlan for ParquetExec { /// Return a reference to Any that can be used for downcasting @@ -906,7 +457,7 @@ fn send_result( fn read_files( filenames: &[String], projection: &[usize], - predicate_builder: &Option, + predicate_builder: &Option, batch_size: usize, response_tx: Sender>, limit: Option, @@ -917,7 +468,7 @@ fn read_files( let mut file_reader = SerializedFileReader::new(file)?; if let Some(predicate_builder) = predicate_builder { let row_group_predicate = 
predicate_builder - .build_row_group_predicate(file_reader.metadata().row_groups()); + .build_pruning_predicate(file_reader.metadata().row_groups()); file_reader.filter_row_groups(&row_group_predicate); } let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); @@ -992,10 +543,13 @@ impl RecordBatchStream for ParquetStream { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field}; use futures::StreamExt; - use parquet::basic::Type as PhysicalType; - use parquet::schema::types::SchemaDescPtr; + use parquet::{ + basic::Type as PhysicalType, + file::{metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics}, + schema::types::SchemaDescPtr, + }; #[test] fn test_split_files() { @@ -1070,284 +624,13 @@ mod tests { Ok(()) } - #[test] - fn build_statistics_array_int32() { - // build row group metadata array - let s1 = ParquetStatistics::int32(None, Some(10), None, 0, false); - let s2 = ParquetStatistics::int32(Some(2), Some(20), None, 0, false); - let s3 = ParquetStatistics::int32(Some(3), Some(30), None, 0, false); - let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &DataType::Int32); - let int32_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let int32_vec = int32_array.into_iter().collect::>(); - assert_eq!(int32_vec, vec![None, Some(2), Some(3)]); - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Max, &DataType::Int32); - let int32_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let int32_vec = int32_array.into_iter().collect::>(); - // here the first max value is None and not the Some(10) value which was actually set - // because the min value is None - assert_eq!(int32_vec, vec![None, Some(20), Some(30)]); - } - - #[test] - fn build_statistics_array_utf8() { - // build row group metadata array - let s1 = ParquetStatistics::byte_array(None, Some("10".into()), None, 0, false); - let s2 = ParquetStatistics::byte_array( - Some("2".into()), - Some("20".into()), - None, - 0, - false, - ); - let s3 = ParquetStatistics::byte_array( - Some("3".into()), - Some("30".into()), - None, - 0, - false, - ); - let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &DataType::Utf8); - let string_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let string_vec = string_array.into_iter().collect::>(); - assert_eq!(string_vec, vec![None, Some("2"), Some("3")]); - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Max, &DataType::Utf8); - let string_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let string_vec = string_array.into_iter().collect::>(); - // here the first max value is None and not the Some("10") value which was actually set - // because the min value is None - assert_eq!(string_vec, vec![None, Some("20"), Some("30")]); - } - - #[test] - fn build_statistics_array_empty_stats() { - let data_type = DataType::Int32; - let statistics = vec![]; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), 0); - - let statistics = vec![None, None]; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), statistics.len()); - 
assert_eq!(statistics_array.data_type(), &data_type); - for i in 0..statistics_array.len() { - assert_eq!(statistics_array.is_null(i), true); - assert_eq!(statistics_array.is_valid(i), false); - } - } - - #[test] - fn build_statistics_array_unsupported_type() { - // boolean is not currently a supported type for statistics - let s1 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); - let s2 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); - let statistics = vec![Some(&s1), Some(&s2)]; - let data_type = DataType::Boolean; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), statistics.len()); - assert_eq!(statistics_array.data_type(), &data_type); - for i in 0..statistics_array.len() { - assert_eq!(statistics_array.is_null(i), true); - assert_eq!(statistics_array.is_valid(i), false); - } - } - - #[test] - fn row_group_predicate_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min LtEq Int32(1) And Int32(1) LtEq #c1_max"; - - // test column on the left - let expr = col("c1").eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - // test column on the right - let expr = lit(1).eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_gt() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_max Gt Int32(1)"; - - // test column on the left - let expr = col("c1").gt(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - // test column on the right - let expr = lit(1).lt(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_gt_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_max GtEq Int32(1)"; - - // test column on the left - let expr = col("c1").gt_eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - // test column on the right - let expr = lit(1).lt_eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_lt() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min Lt Int32(1)"; - - // test column on the left - let expr = col("c1").lt(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - // test column on the right - let expr = lit(1).gt(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); 
- - Ok(()) - } - - #[test] - fn row_group_predicate_lt_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min LtEq Int32(1)"; - - // test column on the left - let expr = col("c1").lt_eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - // test column on the right - let expr = lit(1).gt_eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_and() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int32, false), - Field::new("c3", DataType::Int32, false), - ]); - // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression - let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); - let expected_expr = "#c1_min Lt Int32(1) And Boolean(true)"; - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_or() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int32, false), - ]); - // test OR operator joining supported c1 < 1 expression and unsupported c2 % 2 expression - let expr = col("c1").lt(lit(1)).or(col("c2").modulus(lit(2))); - let expected_expr = "#c1_min Lt Int32(1) Or Boolean(true)"; - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_stat_column_req() -> Result<()> { - use crate::logical_plan::{col, lit}; - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int32, false), - ]); - let mut stat_column_req = vec![]; - // c1 < 1 and (c2 = 2 or c2 = 3) - let expr = col("c1") - .lt(lit(1)) - .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); - let expected_expr = "#c1_min Lt Int32(1) And #c2_min LtEq Int32(2) And Int32(2) LtEq #c2_max Or #c2_min LtEq Int32(3) And Int32(3) LtEq #c2_max"; - let predicate_expr = - build_predicate_expression(&expr, &schema, &mut stat_column_req)?; - assert_eq!(format!("{:?}", predicate_expr), expected_expr); - // c1 < 1 should add c1_min - let c1_min_field = Field::new("c1_min", DataType::Int32, false); - assert_eq!( - stat_column_req[0], - ("c1".to_owned(), StatisticsType::Min, c1_min_field) - ); - // c2 = 2 should add c2_min and c2_max - let c2_min_field = Field::new("c2_min", DataType::Int32, false); - assert_eq!( - stat_column_req[1], - ("c2".to_owned(), StatisticsType::Min, c2_min_field) - ); - let c2_max_field = Field::new("c2_max", DataType::Int32, false); - assert_eq!( - stat_column_req[2], - ("c2".to_owned(), StatisticsType::Max, c2_max_field) - ); - // c2 = 3 shouldn't add any new statistics fields - assert_eq!(stat_column_req.len(), 3); - - Ok(()) - } - #[test] fn row_group_predicate_builder_simple_expr() -> Result<()> { use crate::logical_plan::{col, lit}; // int > 1 => c1_max > 1 let expr = col("c1").gt(lit(15)); let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let 
predicate_builder = RowGroupPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; let schema_descr = get_test_schema_descr(vec![("c1", PhysicalType::INT32)]); let rgm1 = get_row_group_meta_data( @@ -1360,7 +643,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_row_group_predicate(&row_group_metadata); + predicate_builder.build_pruning_predicate(&row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -1377,7 +660,7 @@ mod tests { // int > 1 => c1_max > 1 let expr = col("c1").gt(lit(15)); let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let predicate_builder = RowGroupPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; let schema_descr = get_test_schema_descr(vec![("c1", PhysicalType::INT32)]); let rgm1 = get_row_group_meta_data( @@ -1390,7 +673,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_row_group_predicate(&row_group_metadata); + predicate_builder.build_pruning_predicate(&row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -1413,7 +696,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), ]); - let predicate_builder = RowGroupPredicateBuilder::try_new(&expr, schema.clone())?; + let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema.clone())?; let schema_descr = get_test_schema_descr(vec![ ("c1", PhysicalType::INT32), @@ -1435,7 +718,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_row_group_predicate(&row_group_metadata); + predicate_builder.build_pruning_predicate(&row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -1448,9 +731,9 @@ mod tests { // if conditions in predicate are joined with OR and an unsupported expression is used // this bypasses the entire predicate expression and no row groups are filtered out let expr = col("c1").gt(lit(15)).or(col("c2").modulus(lit(2))); - let predicate_builder = RowGroupPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; let row_group_predicate = - predicate_builder.build_row_group_predicate(&row_group_metadata); + predicate_builder.build_pruning_predicate(&row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -1472,7 +755,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Boolean, false), ]); - let predicate_builder = RowGroupPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; let schema_descr = get_test_schema_descr(vec![ ("c1", PhysicalType::INT32), @@ -1494,7 +777,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_row_group_predicate(&row_group_metadata); + predicate_builder.build_pruning_predicate(&row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() From db4f098d38993b96ce1134c4bc7bf5c6579509cf Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 21 May 2021 18:07:56 +0800 Subject: [PATCH 105/329] Add window expression part 1 - logical and physical planning, structure, to/from proto, and explain, for empty over clause only (#334) * add 
window expr * fix unused imports * fix clippy * fix unit test * Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb * Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb * Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb * Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb * adding more built-in functions * adding filter by todo * enrich unit test * update * add more tests * fix test * fix unit test * fix error * fix unit test * fix unit test * use upper case * fix unit test * comment out test Co-authored-by: Andrew Lamb --- ballista/rust/core/proto/ballista.proto | 80 +++- .../core/src/serde/logical_plan/from_proto.rs | 197 +++++++++- .../core/src/serde/logical_plan/to_proto.rs | 126 ++++++- .../src/serde/physical_plan/from_proto.rs | 81 ++++- ballista/rust/scheduler/src/planner.rs | 8 + datafusion/src/logical_plan/builder.rs | 57 ++- datafusion/src/logical_plan/expr.rs | 33 +- datafusion/src/logical_plan/plan.rs | 66 +++- datafusion/src/optimizer/constant_folding.rs | 1 + .../src/optimizer/hash_build_probe_order.rs | 5 + .../src/optimizer/projection_push_down.rs | 55 +++ datafusion/src/optimizer/utils.rs | 23 ++ datafusion/src/physical_plan/aggregates.rs | 3 +- datafusion/src/physical_plan/mod.rs | 19 + datafusion/src/physical_plan/planner.rs | 67 +++- datafusion/src/physical_plan/sort.rs | 1 + .../src/physical_plan/window_functions.rs | 342 ++++++++++++++++++ datafusion/src/physical_plan/windows.rs | 195 ++++++++++ datafusion/src/sql/planner.rs | 211 ++++++++--- datafusion/src/sql/utils.rs | 15 + datafusion/tests/sql.rs | 15 + 21 files changed, 1498 insertions(+), 102 deletions(-) create mode 100644 datafusion/src/physical_plan/window_functions.rs create mode 100644 datafusion/src/physical_plan/windows.rs diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 3da0e85437d76..da0c615e3b23e 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -39,7 +39,6 @@ message LogicalExprNode { ScalarValue literal = 3; - // binary expressions BinaryExprNode binary_expr = 4; @@ -60,6 +59,9 @@ message LogicalExprNode { bool wildcard = 15; ScalarFunctionNode scalar_function = 16; TryCastNode try_cast = 17; + + // window expressions + WindowExprNode window_expr = 18; } } @@ -151,6 +153,29 @@ message AggregateExprNode { LogicalExprNode expr = 2; } +enum BuiltInWindowFunction { + ROW_NUMBER = 0; + RANK = 1; + DENSE_RANK = 2; + PERCENT_RANK = 3; + CUME_DIST = 4; + NTILE = 5; + LAG = 6; + LEAD = 7; + FIRST_VALUE = 8; + LAST_VALUE = 9; + NTH_VALUE = 10; +} + +message WindowExprNode { + oneof window_function { + AggregateFunction aggr_function = 1; + BuiltInWindowFunction built_in_function = 2; + // udaf = 3 + } + LogicalExprNode expr = 4; +} + message BetweenNode { LogicalExprNode expr = 1; bool negated = 2; @@ -200,6 +225,7 @@ message LogicalPlanNode { EmptyRelationNode empty_relation = 10; CreateExternalTableNode create_external_table = 11; ExplainNode explain = 12; + WindowNode window = 13; } } @@ -288,6 +314,50 @@ message AggregateNode { repeated LogicalExprNode aggr_expr = 3; } +message WindowNode { + LogicalPlanNode input = 1; + repeated LogicalExprNode window_expr = 2; + repeated LogicalExprNode partition_by_expr = 3; + repeated LogicalExprNode order_by_expr = 4; + // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/danburkert/prost/issues/430) + // this 
syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) + oneof window_frame { + WindowFrame frame = 5; + } + // TODO add filter by expr +} + +enum WindowFrameUnits { + ROWS = 0; + RANGE = 1; + GROUPS = 2; +} + +message WindowFrame { + WindowFrameUnits window_frame_units = 1; + WindowFrameBound start_bound = 2; + // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/danburkert/prost/issues/430) + // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) + oneof end_bound { + WindowFrameBound bound = 3; + } +} + +enum WindowFrameBoundType { + CURRENT_ROW = 0; + PRECEDING = 1; + FOLLOWING = 2; +} + +message WindowFrameBound { + WindowFrameBoundType window_frame_bound_type = 1; + // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/danburkert/prost/issues/430) + // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) + oneof bound_value { + uint64 value = 2; + } +} + enum JoinType { INNER = 0; LEFT = 1; @@ -334,6 +404,7 @@ message PhysicalPlanNode { MergeExecNode merge = 14; UnresolvedShuffleExecNode unresolved = 15; RepartitionExecNode repartition = 16; + WindowAggExecNode window = 17; } } @@ -399,6 +470,13 @@ enum AggregateMode { FINAL_PARTITIONED = 2; } +message WindowAggExecNode { + PhysicalPlanNode input = 1; + repeated LogicalExprNode window_expr = 2; + repeated string window_expr_name = 3; + Schema input_schema = 4; +} + message HashAggregateExecNode { repeated LogicalExprNode group_expr = 1; repeated LogicalExprNode aggr_expr = 2; diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 6987035394c6d..020858fbfc3fe 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -17,15 +17,15 @@ //! Serde code to convert from protocol buffers to Rust data structures. 
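
The conversions that follow give the new protobuf messages above (WindowExprNode, WindowNode, WindowFrame and friends) a Rust round trip: to_proto builds the message from a logical Expr, and from_proto rebuilds the Expr. A minimal sketch of that round trip, assuming it runs inside the ballista core crate so the crate::serde::protobuf and crate::error paths used in this patch apply, and using MAX(c2) OVER () purely as an example expression:

    use std::convert::TryInto;

    use datafusion::logical_plan::{col, Expr};
    use datafusion::physical_plan::aggregates::AggregateFunction;
    use datafusion::physical_plan::window_functions::WindowFunction;

    use crate::error::BallistaError;
    use crate::serde::protobuf;

    fn window_expr_round_trip() -> Result<(), BallistaError> {
        // MAX(c2) OVER () as a logical window expression
        let expr = Expr::WindowFunction {
            fun: WindowFunction::AggregateFunction(AggregateFunction::Max),
            args: vec![col("c2")],
        };
        // to_proto: &Expr -> protobuf::LogicalExprNode (ExprType::WindowExpr)
        let node: protobuf::LogicalExprNode = (&expr).try_into()?;
        // from_proto: &protobuf::LogicalExprNode -> Expr
        let decoded: Expr = (&node).try_into()?;
        assert_eq!(format!("{:?}", decoded), format!("{:?}", expr));
        Ok(())
    }
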
+use crate::error::BallistaError; +use crate::serde::{proto_error, protobuf}; +use crate::{convert_box_required, convert_required}; +use sqlparser::ast::{WindowFrame, WindowFrameBound, WindowFrameUnits}; use std::{ convert::{From, TryInto}, unimplemented, }; -use crate::error::BallistaError; -use crate::serde::{proto_error, protobuf}; -use crate::{convert_box_required, convert_required}; - use arrow::datatypes::{DataType, Field, Schema}; use datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, @@ -33,6 +33,7 @@ use datafusion::logical_plan::{ }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; +use datafusion::physical_plan::window_functions::BuiltInWindowFunction; use datafusion::scalar::ScalarValue; use protobuf::logical_plan_node::LogicalPlanType; use protobuf::{logical_expr_node::ExprType, scalar_type}; @@ -75,6 +76,34 @@ impl TryInto for &protobuf::LogicalPlanNode { .build() .map_err(|e| e.into()) } + LogicalPlanType::Window(window) => { + let input: LogicalPlan = convert_box_required!(window.input)?; + let window_expr = window + .window_expr + .iter() + .map(|expr| expr.try_into()) + .collect::, _>>()?; + + // let partition_by_expr = window + // .partition_by_expr + // .iter() + // .map(|expr| expr.try_into()) + // .collect::, _>>()?; + // let order_by_expr = window + // .order_by_expr + // .iter() + // .map(|expr| expr.try_into()) + // .collect::, _>>()?; + // // FIXME: add filter by expr + // // FIXME: parse the window_frame data + // let window_frame = None; + LogicalPlanBuilder::from(&input) + .window( + window_expr, /* filter_by_expr, partition_by_expr, order_by_expr, window_frame*/ + )? + .build() + .map_err(|e| e.into()) + } LogicalPlanType::Aggregate(aggregate) => { let input: LogicalPlan = convert_box_required!(aggregate.input)?; let group_expr = aggregate @@ -871,7 +900,10 @@ impl TryInto for &protobuf::LogicalExprNode { type Error = BallistaError; fn try_into(self) -> Result { + use datafusion::physical_plan::window_functions; use protobuf::logical_expr_node::ExprType; + use protobuf::window_expr_node; + use protobuf::WindowExprNode; let expr_type = self .expr_type @@ -889,6 +921,48 @@ impl TryInto for &protobuf::LogicalExprNode { let scalar_value: datafusion::scalar::ScalarValue = literal.try_into()?; Ok(Expr::Literal(scalar_value)) } + ExprType::WindowExpr(expr) => { + let window_function = expr + .window_function + .as_ref() + .ok_or_else(|| proto_error("Received empty window function"))?; + match window_function { + window_expr_node::WindowFunction::AggrFunction(i) => { + let aggr_function = protobuf::AggregateFunction::from_i32(*i) + .ok_or_else(|| { + proto_error(format!( + "Received an unknown aggregate window function: {}", + i + )) + })?; + + Ok(Expr::WindowFunction { + fun: window_functions::WindowFunction::AggregateFunction( + AggregateFunction::from(aggr_function), + ), + args: vec![parse_required_expr(&expr.expr)?], + }) + } + window_expr_node::WindowFunction::BuiltInFunction(i) => { + let built_in_function = + protobuf::BuiltInWindowFunction::from_i32(*i).ok_or_else( + || { + proto_error(format!( + "Received an unknown built-in window function: {}", + i + )) + }, + )?; + + Ok(Expr::WindowFunction { + fun: window_functions::WindowFunction::BuiltInWindowFunction( + BuiltInWindowFunction::from(built_in_function), + ), + args: vec![parse_required_expr(&expr.expr)?], + }) + } + } + } ExprType::AggregateExpr(expr) => { let aggr_function = 
protobuf::AggregateFunction::from_i32(expr.aggr_function) @@ -898,13 +972,7 @@ impl TryInto for &protobuf::LogicalExprNode { expr.aggr_function )) })?; - let fun = match aggr_function { - protobuf::AggregateFunction::Min => AggregateFunction::Min, - protobuf::AggregateFunction::Max => AggregateFunction::Max, - protobuf::AggregateFunction::Sum => AggregateFunction::Sum, - protobuf::AggregateFunction::Avg => AggregateFunction::Avg, - protobuf::AggregateFunction::Count => AggregateFunction::Count, - }; + let fun = AggregateFunction::from(aggr_function); Ok(Expr::AggregateFunction { fun, @@ -1152,6 +1220,7 @@ impl TryInto for &protobuf::Field { } use datafusion::physical_plan::datetime_expressions::{date_trunc, to_timestamp}; +use datafusion::physical_plan::{aggregates, windows}; use datafusion::prelude::{ array, length, lower, ltrim, md5, rtrim, sha224, sha256, sha384, sha512, trim, upper, }; @@ -1202,3 +1271,109 @@ fn parse_optional_expr( None => Ok(None), } } + +impl From for WindowFrameUnits { + fn from(units: protobuf::WindowFrameUnits) -> Self { + match units { + protobuf::WindowFrameUnits::Rows => WindowFrameUnits::Rows, + protobuf::WindowFrameUnits::Range => WindowFrameUnits::Range, + protobuf::WindowFrameUnits::Groups => WindowFrameUnits::Groups, + } + } +} + +impl TryFrom for WindowFrameBound { + type Error = BallistaError; + + fn try_from(bound: protobuf::WindowFrameBound) -> Result { + let bound_type = protobuf::WindowFrameBoundType::from_i32(bound.window_frame_bound_type).ok_or_else(|| { + proto_error(format!( + "Received a WindowFrameBound message with unknown WindowFrameBoundType {}", + bound.window_frame_bound_type + )) + })?; + match bound_type { + protobuf::WindowFrameBoundType::CurrentRow => { + Ok(WindowFrameBound::CurrentRow) + } + protobuf::WindowFrameBoundType::Preceding => { + // FIXME implement bound value parsing + Ok(WindowFrameBound::Preceding(Some(1))) + } + protobuf::WindowFrameBoundType::Following => { + // FIXME implement bound value parsing + Ok(WindowFrameBound::Following(Some(1))) + } + } + } +} + +impl TryFrom for WindowFrame { + type Error = BallistaError; + + fn try_from(window: protobuf::WindowFrame) -> Result { + let units = protobuf::WindowFrameUnits::from_i32(window.window_frame_units) + .ok_or_else(|| { + proto_error(format!( + "Received a WindowFrame message with unknown WindowFrameUnits {}", + window.window_frame_units + )) + })? + .into(); + let start_bound = window + .start_bound + .ok_or_else(|| { + proto_error( + "Received a WindowFrame message with no start_bound".to_owned(), + ) + })? 
+ .try_into()?; + // FIXME parse end bound + let end_bound = None; + Ok(WindowFrame { + units, + start_bound, + end_bound, + }) + } +} + +impl From for AggregateFunction { + fn from(aggr_function: protobuf::AggregateFunction) -> Self { + match aggr_function { + protobuf::AggregateFunction::Min => AggregateFunction::Min, + protobuf::AggregateFunction::Max => AggregateFunction::Max, + protobuf::AggregateFunction::Sum => AggregateFunction::Sum, + protobuf::AggregateFunction::Avg => AggregateFunction::Avg, + protobuf::AggregateFunction::Count => AggregateFunction::Count, + } + } +} + +impl From for BuiltInWindowFunction { + fn from(built_in_function: protobuf::BuiltInWindowFunction) -> Self { + match built_in_function { + protobuf::BuiltInWindowFunction::RowNumber => { + BuiltInWindowFunction::RowNumber + } + protobuf::BuiltInWindowFunction::Rank => BuiltInWindowFunction::Rank, + protobuf::BuiltInWindowFunction::PercentRank => { + BuiltInWindowFunction::PercentRank + } + protobuf::BuiltInWindowFunction::DenseRank => { + BuiltInWindowFunction::DenseRank + } + protobuf::BuiltInWindowFunction::Lag => BuiltInWindowFunction::Lag, + protobuf::BuiltInWindowFunction::Lead => BuiltInWindowFunction::Lead, + protobuf::BuiltInWindowFunction::FirstValue => { + BuiltInWindowFunction::FirstValue + } + protobuf::BuiltInWindowFunction::CumeDist => BuiltInWindowFunction::CumeDist, + protobuf::BuiltInWindowFunction::Ntile => BuiltInWindowFunction::Ntile, + protobuf::BuiltInWindowFunction::NthValue => BuiltInWindowFunction::NthValue, + protobuf::BuiltInWindowFunction::LastValue => { + BuiltInWindowFunction::LastValue + } + } + } +} diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 01b669d264461..47e27483ff307 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -26,16 +26,19 @@ use std::{ use crate::datasource::DfTableAdapter; use crate::serde::{protobuf, BallistaError}; - use arrow::datatypes::{DataType, Schema}; use datafusion::datasource::CsvFile; use datafusion::logical_plan::{Expr, JoinType, LogicalPlan}; use datafusion::physical_plan::aggregates::AggregateFunction; +use datafusion::physical_plan::window_functions::{ + BuiltInWindowFunction, WindowFunction, +}; use datafusion::{datasource::parquet::ParquetTable, logical_plan::exprlist_to_fields}; use protobuf::{ arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, Field, PrimitiveScalarType, ScalarListValue, ScalarType, }; +use sqlparser::ast::{WindowFrame, WindowFrameBound, WindowFrameUnits}; use super::super::proto_error; use datafusion::physical_plan::functions::BuiltinScalarFunction; @@ -772,6 +775,43 @@ impl TryInto for &LogicalPlan { ))), }) } + LogicalPlan::Window { + input, + window_expr, + // FIXME implement next + // filter_by_expr, + // FIXME implement next + // partition_by_expr, + // FIXME implement next + // order_by_expr, + // FIXME implement next + // window_frame, + .. 
+ } => { + let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?; + // FIXME: implement + // let filter_by_expr = vec![]; + // FIXME: implement + let partition_by_expr = vec![]; + // FIXME: implement + let order_by_expr = vec![]; + // FIXME: implement + let window_frame = None; + Ok(protobuf::LogicalPlanNode { + logical_plan_type: Some(LogicalPlanType::Window(Box::new( + protobuf::WindowNode { + input: Some(Box::new(input)), + window_expr: window_expr + .iter() + .map(|expr| expr.try_into()) + .collect::, BallistaError>>()?, + partition_by_expr, + order_by_expr, + window_frame, + }, + ))), + }) + } LogicalPlan::Aggregate { input, group_expr, @@ -997,6 +1037,30 @@ impl TryInto for &Expr { expr_type: Some(ExprType::BinaryExpr(binary_expr)), }) } + Expr::WindowFunction { + ref fun, ref args, .. + } => { + let window_function = match fun { + WindowFunction::AggregateFunction(fun) => { + protobuf::window_expr_node::WindowFunction::AggrFunction( + protobuf::AggregateFunction::from(fun).into(), + ) + } + WindowFunction::BuiltInWindowFunction(fun) => { + protobuf::window_expr_node::WindowFunction::BuiltInFunction( + protobuf::BuiltInWindowFunction::from(fun).into(), + ) + } + }; + let arg = &args[0]; + let window_expr = Box::new(protobuf::WindowExprNode { + expr: Some(Box::new(arg.try_into()?)), + window_function: Some(window_function), + }); + Ok(protobuf::LogicalExprNode { + expr_type: Some(ExprType::WindowExpr(window_expr)), + }) + } Expr::AggregateFunction { ref fun, ref args, .. } => { @@ -1178,6 +1242,66 @@ impl Into for &Schema { } } +impl From<&AggregateFunction> for protobuf::AggregateFunction { + fn from(value: &AggregateFunction) -> Self { + match value { + AggregateFunction::Min => Self::Min, + AggregateFunction::Max => Self::Max, + AggregateFunction::Sum => Self::Sum, + AggregateFunction::Avg => Self::Avg, + AggregateFunction::Count => Self::Count, + } + } +} + +impl From<&BuiltInWindowFunction> for protobuf::BuiltInWindowFunction { + fn from(value: &BuiltInWindowFunction) -> Self { + match value { + BuiltInWindowFunction::FirstValue => Self::FirstValue, + BuiltInWindowFunction::LastValue => Self::LastValue, + BuiltInWindowFunction::NthValue => Self::NthValue, + BuiltInWindowFunction::Ntile => Self::Ntile, + BuiltInWindowFunction::CumeDist => Self::CumeDist, + BuiltInWindowFunction::PercentRank => Self::PercentRank, + BuiltInWindowFunction::RowNumber => Self::RowNumber, + BuiltInWindowFunction::Rank => Self::Rank, + BuiltInWindowFunction::Lag => Self::Lag, + BuiltInWindowFunction::Lead => Self::Lead, + BuiltInWindowFunction::DenseRank => Self::DenseRank, + } + } +} + +impl From for protobuf::WindowFrameUnits { + fn from(units: WindowFrameUnits) -> Self { + match units { + WindowFrameUnits::Rows => protobuf::WindowFrameUnits::Rows, + WindowFrameUnits::Range => protobuf::WindowFrameUnits::Range, + WindowFrameUnits::Groups => protobuf::WindowFrameUnits::Groups, + } + } +} + +impl TryFrom for protobuf::WindowFrameBound { + type Error = BallistaError; + + fn try_from(_bound: WindowFrameBound) -> Result { + Err(BallistaError::NotImplemented( + "WindowFrameBound => protobuf::WindowFrameBound".to_owned(), + )) + } +} + +impl TryFrom for protobuf::WindowFrame { + type Error = BallistaError; + + fn try_from(_window: WindowFrame) -> Result { + Err(BallistaError::NotImplemented( + "WindowFrame => protobuf::WindowFrame".to_owned(), + )) + } +} + impl TryFrom<&arrow::datatypes::DataType> for protobuf::ScalarType { type Error = BallistaError; fn try_from(value: 
&arrow::datatypes::DataType) -> Result { diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 97f03948f7bd9..d034f3ca3bfee 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -28,7 +28,6 @@ use crate::serde::protobuf::LogicalExprNode; use crate::serde::scheduler::PartitionLocation; use crate::serde::{proto_error, protobuf}; use crate::{convert_box_required, convert_required}; - use arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, @@ -43,6 +42,11 @@ use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec use datafusion::physical_plan::hash_join::PartitionMode; use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::planner::DefaultPhysicalPlanner; +use datafusion::physical_plan::window_functions::{ + BuiltInWindowFunction, WindowFunction, +}; +use datafusion::physical_plan::windows::create_window_expr; +use datafusion::physical_plan::windows::WindowAggExec; use datafusion::physical_plan::{ coalesce_batches::CoalesceBatchesExec, csv::CsvExec, @@ -58,7 +62,7 @@ use datafusion::physical_plan::{ sort::{SortExec, SortOptions}, Partitioning, }; -use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr}; +use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, WindowExpr}; use datafusion::prelude::CsvReadOptions; use log::debug; use protobuf::logical_expr_node::ExprType; @@ -189,6 +193,77 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let input: Arc = convert_box_required!(limit.input)?; Ok(Arc::new(LocalLimitExec::new(input, limit.limit as usize))) } + PhysicalPlanType::Window(window_agg) => { + let input: Arc = + convert_box_required!(window_agg.input)?; + let input_schema = window_agg + .input_schema + .as_ref() + .ok_or_else(|| { + BallistaError::General( + "input_schema in WindowAggrNode is missing.".to_owned(), + ) + })? 
+ .clone(); + + let physical_schema: SchemaRef = + SchemaRef::new((&input_schema).try_into()?); + + let catalog_list = + Arc::new(MemoryCatalogList::new()) as Arc; + let ctx_state = ExecutionContextState { + catalog_list, + scalar_functions: Default::default(), + var_provider: Default::default(), + aggregate_functions: Default::default(), + config: ExecutionConfig::new(), + execution_props: ExecutionProps::new(), + }; + + let window_agg_expr: Vec<(Expr, String)> = window_agg + .window_expr + .iter() + .zip(window_agg.window_expr_name.iter()) + .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) + .collect::, _>>()?; + + let mut physical_window_expr = vec![]; + + let df_planner = DefaultPhysicalPlanner::default(); + + for (expr, name) in &window_agg_expr { + match expr { + Expr::WindowFunction { fun, args } => { + let arg = df_planner + .create_physical_expr( + &args[0], + &physical_schema, + &ctx_state, + ) + .map_err(|e| { + BallistaError::General(format!("{:?}", e)) + })?; + physical_window_expr.push(create_window_expr( + &fun, + &[arg], + &physical_schema, + name.to_owned(), + )?); + } + _ => { + return Err(BallistaError::General( + "Invalid expression for WindowAggrExec".to_string(), + )); + } + } + } + + Ok(Arc::new(WindowAggExec::try_new( + physical_window_expr, + input, + Arc::new((&input_schema).try_into()?), + )?)) + } PhysicalPlanType::HashAggregate(hash_agg) => { let input: Arc = convert_box_required!(hash_agg.input)?; @@ -222,7 +297,6 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) .collect::, _>>()?; - let df_planner = DefaultPhysicalPlanner::default(); let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc; let ctx_state = ExecutionContextState { @@ -248,6 +322,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let mut physical_aggr_expr = vec![]; + let df_planner = DefaultPhysicalPlanner::default(); for (expr, name) in &logical_agg_expr { match expr { Expr::AggregateFunction { fun, args, .. 
} => { diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 2f01e73e60591..b1d999b733334 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -35,6 +35,7 @@ use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::merge::MergeExec; +use datafusion::physical_plan::windows::WindowAggExec; use datafusion::physical_plan::ExecutionPlan; use log::info; @@ -150,6 +151,13 @@ impl DistributedPlanner { } else if let Some(join) = execution_plan.as_any().downcast_ref::() { Ok((join.with_new_children(children)?, stages)) + } else if let Some(window) = + execution_plan.as_any().downcast_ref::() + { + Err(BallistaError::NotImplemented(format!( + "WindowAggExec with window {:?}", + window + ))) } else { // TODO check for compatible partitioning schema, not just count if execution_plan.output_partitioning().partition_count() diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 2e69814d2634e..9515ac2ff3739 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -24,18 +24,17 @@ use arrow::{ record_batch::RecordBatch, }; +use super::dfschema::ToDFSchema; +use super::{ + col, exprlist_to_fields, Expr, JoinType, LogicalPlan, PlanType, StringifiedPlan, +}; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; +use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, Partitioning}; use crate::{ datasource::{empty::EmptyTable, parquet::ParquetTable, CsvFile, MemTable}, prelude::CsvReadOptions, }; - -use super::dfschema::ToDFSchema; -use super::{ - col, exprlist_to_fields, Expr, JoinType, LogicalPlan, PlanType, StringifiedPlan, -}; -use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, Partitioning}; use std::collections::HashSet; /// Builder for logical plans @@ -289,6 +288,52 @@ impl LogicalPlanBuilder { })) } + /// Apply a window + /// + /// NOTE: this feature is under development and this API will be changing + /// + /// - https://github.com/apache/arrow-datafusion/issues/359 basic structure + /// - https://github.com/apache/arrow-datafusion/issues/298 empty over clause + /// - https://github.com/apache/arrow-datafusion/issues/299 with partition clause + /// - https://github.com/apache/arrow-datafusion/issues/360 with order by + /// - https://github.com/apache/arrow-datafusion/issues/361 with window frame + pub fn window( + &self, + window_expr: impl IntoIterator, + // FIXME: implement next + // filter_by_expr: impl IntoIterator, + // FIXME: implement next + // partition_by_expr: impl IntoIterator, + // FIXME: implement next + // order_by_expr: impl IntoIterator, + // FIXME: implement next + // window_frame: Option, + ) -> Result { + let window_expr = window_expr.into_iter().collect::>(); + // FIXME: implement next + // let partition_by_expr = partition_by_expr.into_iter().collect::>(); + // FIXME: implement next + // let order_by_expr = order_by_expr.into_iter().collect::>(); + let all_expr = window_expr.iter(); + validate_unique_names("Windows", all_expr.clone(), self.plan.schema())?; + + let mut window_fields: Vec = + exprlist_to_fields(all_expr, self.plan.schema())?; + window_fields.extend_from_slice(self.plan.schema().fields()); + + Ok(Self::from(&LogicalPlan::Window { + input: Arc::new(self.plan.clone()), + // 
FIXME implement next + // partition_by_expr, + // FIXME implement next + // order_by_expr, + // FIXME implement next + // window_frame, + window_expr, + schema: Arc::new(DFSchema::new(window_fields)?), + })) + } + /// Apply an aggregate: grouping on the `group_expr` expressions /// and calculating `aggr_expr` aggregates for each distinct /// value of the `group_expr`; diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 3365bf2603234..ab02559175302 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -30,6 +30,7 @@ use crate::error::{DataFusionError, Result}; use crate::logical_plan::{DFField, DFSchema}; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, + window_functions, }; use crate::{physical_plan::udaf::AggregateUDF, scalar::ScalarValue}; use functions::{ReturnTypeFunction, ScalarFunctionImplementation, Signature}; @@ -190,6 +191,13 @@ pub enum Expr { /// Whether this is a DISTINCT aggregation or not distinct: bool, }, + /// Represents the call of a window function with arguments. + WindowFunction { + /// Name of the function + fun: window_functions::WindowFunction, + /// List of expressions to feed to the functions as arguments + args: Vec, + }, /// aggregate function AggregateUDF { /// The function @@ -244,6 +252,13 @@ impl Expr { .collect::>>()?; functions::return_type(fun, &data_types) } + Expr::WindowFunction { fun, args, .. } => { + let data_types = args + .iter() + .map(|e| e.get_type(schema)) + .collect::>>()?; + window_functions::return_type(fun, &data_types) + } Expr::AggregateFunction { fun, args, .. } => { let data_types = args .iter() @@ -316,6 +331,7 @@ impl Expr { Expr::TryCast { .. } => Ok(true), Expr::ScalarFunction { .. } => Ok(true), Expr::ScalarUDF { .. } => Ok(true), + Expr::WindowFunction { .. } => Ok(true), Expr::AggregateFunction { .. } => Ok(true), Expr::AggregateUDF { .. } => Ok(true), Expr::Not(expr) => expr.nullable(input_schema), @@ -571,6 +587,9 @@ impl Expr { Expr::ScalarUDF { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), + Expr::WindowFunction { args, .. } => args + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor)), Expr::AggregateFunction { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), @@ -704,6 +723,10 @@ impl Expr { args: rewrite_vec(args, rewriter)?, fun, }, + Expr::WindowFunction { args, fun } => Expr::WindowFunction { + args: rewrite_vec(args, rewriter)?, + fun, + }, Expr::AggregateFunction { args, fun, @@ -1151,7 +1174,7 @@ pub fn create_udf( } /// Creates a new UDAF with a specific signature, state type and return type. -/// The signature and state type must match the `Acumulator's implementation`. +/// The signature and state type must match the `Accumulator's implementation`. #[allow(clippy::rc_buffer)] pub fn create_udaf( name: &str, @@ -1245,6 +1268,9 @@ impl fmt::Debug for Expr { Expr::ScalarUDF { fun, ref args, .. } => { fmt_function(f, &fun.name, false, args) } + Expr::WindowFunction { fun, ref args, .. } => { + fmt_function(f, &fun.to_string(), false, args) + } Expr::AggregateFunction { fun, distinct, @@ -1360,6 +1386,9 @@ fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { Expr::ScalarUDF { fun, args, .. 
} => { create_function_name(&fun.name, false, args, input_schema) } + Expr::WindowFunction { fun, args } => { + create_function_name(&fun.to_string(), false, args, input_schema) + } Expr::AggregateFunction { fun, distinct, @@ -1387,7 +1416,7 @@ fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { } } other => Err(DataFusionError::NotImplemented(format!( - "Physical plan does not support logical expression {:?}", + "Create name does not support logical expression {:?}", other ))), } diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 8b9aac9ea73b9..4027916c8a7cd 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -17,24 +17,21 @@ //! This module contains the `LogicalPlan` enum that describes queries //! via a logical query plan. -use std::{ - cmp::min, - fmt::{self, Display}, - sync::Arc, -}; - -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; - -use crate::datasource::TableProvider; -use crate::sql::parser::FileType; - use super::expr::Expr; use super::extension::UserDefinedLogicalNode; use super::{ col, display::{GraphvizVisitor, IndentVisitor}, }; +use crate::datasource::TableProvider; use crate::logical_plan::dfschema::DFSchemaRef; +use crate::sql::parser::FileType; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use std::{ + cmp::min, + fmt::{self, Display}, + sync::Arc, +}; /// Join type #[derive(Debug, Clone, Copy)] @@ -83,6 +80,23 @@ pub enum LogicalPlan { /// The incoming logical plan input: Arc, }, + /// Window its input based on a set of window spec and window function (e.g. SUM or RANK) + Window { + /// The incoming logical plan + input: Arc, + /// The window function expression + window_expr: Vec, + /// Filter by expressions + // filter_by_expr: Vec, + /// Partition by expressions + // partition_by_expr: Vec, + /// Order by expressions + // order_by_expr: Vec, + /// Window Frame + // window_frame: Option, + /// The schema description of the window output + schema: DFSchemaRef, + }, /// Aggregates its input based on a set of grouping and aggregate /// expressions (e.g. SUM). Aggregate { @@ -211,6 +225,7 @@ impl LogicalPlan { } => &projected_schema, LogicalPlan::Projection { schema, .. } => &schema, LogicalPlan::Filter { input, .. } => input.schema(), + LogicalPlan::Window { schema, .. } => &schema, LogicalPlan::Aggregate { schema, .. } => &schema, LogicalPlan::Sort { input, .. } => input.schema(), LogicalPlan::Join { schema, .. } => &schema, @@ -230,7 +245,8 @@ impl LogicalPlan { LogicalPlan::TableScan { projected_schema, .. } => vec![&projected_schema], - LogicalPlan::Aggregate { input, schema, .. } + LogicalPlan::Window { input, schema, .. } + | LogicalPlan::Aggregate { input, schema, .. } | LogicalPlan::Projection { input, schema, .. } => { let mut schemas = input.all_schemas(); schemas.insert(0, &schema); @@ -288,6 +304,16 @@ impl LogicalPlan { Partitioning::Hash(expr, _) => expr.clone(), _ => vec![], }, + LogicalPlan::Window { + window_expr, + // FIXME implement next + // filter_by_expr, + // FIXME implement next + // partition_by_expr, + // FIXME implement next + // order_by_expr, + .. + } => window_expr.clone(), LogicalPlan::Aggregate { group_expr, aggr_expr, @@ -322,6 +348,7 @@ impl LogicalPlan { LogicalPlan::Projection { input, .. } => vec![input], LogicalPlan::Filter { input, .. } => vec![input], LogicalPlan::Repartition { input, .. } => vec![input], + LogicalPlan::Window { input, .. } => vec![input], LogicalPlan::Aggregate { input, .. 
} => vec![input], LogicalPlan::Sort { input, .. } => vec![input], LogicalPlan::Join { left, right, .. } => vec![left, right], @@ -415,6 +442,7 @@ impl LogicalPlan { LogicalPlan::Projection { input, .. } => input.accept(visitor)?, LogicalPlan::Filter { input, .. } => input.accept(visitor)?, LogicalPlan::Repartition { input, .. } => input.accept(visitor)?, + LogicalPlan::Window { input, .. } => input.accept(visitor)?, LogicalPlan::Aggregate { input, .. } => input.accept(visitor)?, LogicalPlan::Sort { input, .. } => input.accept(visitor)?, LogicalPlan::Join { left, right, .. } @@ -667,6 +695,20 @@ impl LogicalPlan { predicate: ref expr, .. } => write!(f, "Filter: {:?}", expr), + LogicalPlan::Window { + ref window_expr, + // FIXME implement next + // ref partition_by_expr, + // FIXME implement next + // ref order_by_expr, + .. + } => { + write!( + f, + "WindowAggr: windowExpr=[{:?}] partitionBy=[], orderBy=[]", + window_expr + ) + } LogicalPlan::Aggregate { ref group_expr, ref aggr_expr, diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 51bf0ce1b5054..af89aa13908c4 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -71,6 +71,7 @@ impl OptimizerRule for ConstantFolding { }), // Rest: recurse into plan, apply optimization where possible LogicalPlan::Projection { .. } + | LogicalPlan::Window { .. } | LogicalPlan::Aggregate { .. } | LogicalPlan::Repartition { .. } | LogicalPlan::CreateExternalTable { .. } diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 168c4a17edfd0..100ae4fb09b73 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -54,6 +54,10 @@ fn get_num_rows(logical_plan: &LogicalPlan) -> Option { let num_rows_input = get_num_rows(input); num_rows_input.map(|rows| std::cmp::min(*limit, rows)) } + LogicalPlan::Window { input, .. } => { + // window functions do not change num of rows + get_num_rows(input) + } LogicalPlan::Aggregate { .. } => { // we cannot yet predict how many rows will be produced by an aggregate because // we do not know the cardinality of the grouping keys @@ -172,6 +176,7 @@ impl OptimizerRule for HashBuildProbeOrder { } // Rest: recurse into plan, apply optimization where possible LogicalPlan::Projection { .. } + | LogicalPlan::Window { .. } | LogicalPlan::Aggregate { .. } | LogicalPlan::TableScan { .. } | LogicalPlan::Limit { .. } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 21c9caba3316d..e47832b07f921 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -193,6 +193,61 @@ fn optimize_plan( schema: schema.clone(), }) } + LogicalPlan::Window { + schema, + window_expr, + input, + // FIXME implement next + // filter_by_expr, + // FIXME implement next + // partition_by_expr, + // FIXME implement next + // order_by_expr, + // FIXME implement next + // window_frame, + .. 
+ } => { + // Gather all columns needed for expressions in this Window + let mut new_window_expr = Vec::new(); + window_expr.iter().try_for_each(|expr| { + let name = &expr.name(&schema)?; + if required_columns.contains(name) { + new_window_expr.push(expr.clone()); + new_required_columns.insert(name.clone()); + // add to the new set of required columns + utils::expr_to_column_names(expr, &mut new_required_columns) + } else { + Ok(()) + } + })?; + + let new_schema = DFSchema::new( + schema + .fields() + .iter() + .filter(|x| new_required_columns.contains(x.name())) + .cloned() + .collect(), + )?; + + Ok(LogicalPlan::Window { + window_expr: new_window_expr, + // FIXME implement next + // partition_by_expr: partition_by_expr.clone(), + // FIXME implement next + // order_by_expr: order_by_expr.clone(), + // FIXME implement next + // window_frame: window_frame.clone(), + input: Arc::new(optimize_plan( + optimizer, + &input, + &new_required_columns, + true, + execution_props, + )?), + schema: DFSchemaRef::new(new_schema), + }) + } LogicalPlan::Aggregate { schema, input, diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 9288c65ac4dac..284ead252ac67 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -78,6 +78,7 @@ impl ExpressionVisitor for ColumnNameVisitor<'_> { Expr::Sort { .. } => {} Expr::ScalarFunction { .. } => {} Expr::ScalarUDF { .. } => {} + Expr::WindowFunction { .. } => {} Expr::AggregateFunction { .. } => {} Expr::AggregateUDF { .. } => {} Expr::InList { .. } => {} @@ -188,6 +189,23 @@ pub fn from_plan( input: Arc::new(inputs[0].clone()), }), }, + LogicalPlan::Window { + // FIXME implement next + // filter_by_expr, + // FIXME implement next + // partition_by_expr, + // FIXME implement next + // order_by_expr, + // FIXME implement next + // window_frame, + window_expr, + schema, + .. + } => Ok(LogicalPlan::Window { + input: Arc::new(inputs[0].clone()), + window_expr: expr[0..window_expr.len()].to_vec(), + schema: schema.clone(), + }), LogicalPlan::Aggregate { group_expr, schema, .. } => Ok(LogicalPlan::Aggregate { @@ -247,6 +265,7 @@ pub fn expr_sub_expressions(expr: &Expr) -> Result> { Expr::IsNotNull(e) => Ok(vec![e.as_ref().to_owned()]), Expr::ScalarFunction { args, .. } => Ok(args.clone()), Expr::ScalarUDF { args, .. } => Ok(args.clone()), + Expr::WindowFunction { args, .. } => Ok(args.clone()), Expr::AggregateFunction { args, .. } => Ok(args.clone()), Expr::AggregateUDF { args, .. } => Ok(args.clone()), Expr::Case { @@ -319,6 +338,10 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { fun: fun.clone(), args: expressions.to_vec(), }), + Expr::WindowFunction { fun, .. } => Ok(Expr::WindowFunction { + fun: fun.clone(), + args: expressions.to_vec(), + }), Expr::AggregateFunction { fun, distinct, .. 
} => Ok(Expr::AggregateFunction { fun: fun.clone(), args: expressions.to_vec(), diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 9417c7c8f05a5..3607f29debba1 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -37,7 +37,6 @@ use crate::physical_plan::expressions; use arrow::datatypes::{DataType, Schema, TimeUnit}; use expressions::{avg_return_type, sum_return_type}; use std::{fmt, str::FromStr, sync::Arc}; - /// the implementation of an aggregate function pub type AccumulatorFunctionImplementation = Arc Result> + Send + Sync>; @@ -183,7 +182,7 @@ static TIMESTAMPS: &[DataType] = &[ ]; /// the signatures supported by the function `fun`. -fn signature(fun: &AggregateFunction) -> Signature { +pub fn signature(fun: &AggregateFunction) -> Signature { // note: the physical expression must accept the type returned by this function or the execution panics. match fun { AggregateFunction::Count => Signature::Any(1), diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index e915b2c257ddc..c053229bc000b 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -442,6 +442,23 @@ pub trait AggregateExpr: Send + Sync + Debug { } } +/// A window expression that: +/// * knows its resulting field +pub trait WindowExpr: Send + Sync + Debug { + /// Returns the window expression as [`Any`](std::any::Any) so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + /// the field of the final result of this window function. + fn field(&self) -> Result; + + /// Human readable name such as `"MIN(c2)"` or `"RANK()"`. The default + /// implementation returns placeholder text. + fn name(&self) -> &str { + "WindowExpr: default name" + } +} + /// An accumulator represents a stateful object that lives throughout the evaluation of multiple rows and /// generically accumulates values. 
An accumulator knows how to: /// * update its state from inputs via `update` @@ -530,3 +547,5 @@ pub mod udf; #[cfg(feature = "unicode_expressions")] pub mod unicode_expressions; pub mod union; +pub mod window_functions; +pub mod windows; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 9e7dc7172b820..018925d0e5356 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use super::{ aggregates, cross_join::CrossJoinExec, empty::EmptyExec, expressions::binary, - functions, hash_join::PartitionMode, udaf, union::UnionExec, + functions, hash_join::PartitionMode, udaf, union::UnionExec, windows, }; use crate::execution::context::ExecutionContextState; use crate::logical_plan::{ @@ -39,8 +39,11 @@ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sort::SortExec; use crate::physical_plan::udf; +use crate::physical_plan::windows::WindowAggExec; use crate::physical_plan::{hash_utils, Partitioning}; -use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalPlanner}; +use crate::physical_plan::{ + AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalPlanner, WindowExpr, +}; use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::variable::VarType; @@ -48,10 +51,9 @@ use crate::{ error::{DataFusionError, Result}, physical_plan::displayable, }; -use arrow::{compute::can_cast_types, datatypes::DataType}; - use arrow::compute::SortOptions; use arrow::datatypes::{Schema, SchemaRef}; +use arrow::{compute::can_cast_types, datatypes::DataType}; use expressions::col; use log::debug; @@ -139,6 +141,32 @@ impl DefaultPhysicalPlanner { limit, .. } => source.scan(projection, batch_size, filters, *limit), + LogicalPlan::Window { + input, window_expr, .. + } => { + // Initially need to perform the aggregate and then merge the partitions + let input_exec = self.create_initial_plan(input, ctx_state)?; + let input_schema = input_exec.schema(); + let physical_input_schema = input_exec.as_ref().schema(); + let logical_input_schema = input.as_ref().schema(); + let window_expr = window_expr + .iter() + .map(|e| { + self.create_window_expr( + e, + &logical_input_schema, + &physical_input_schema, + ctx_state, + ) + }) + .collect::>>()?; + + Ok(Arc::new(WindowAggExec::try_new( + window_expr, + input_exec.clone(), + input_schema, + )?)) + } LogicalPlan::Aggregate { input, group_expr, @@ -700,6 +728,37 @@ impl DefaultPhysicalPlanner { } } + /// Create a window expression from a logical expression + pub fn create_window_expr( + &self, + e: &Expr, + logical_input_schema: &DFSchema, + physical_input_schema: &Schema, + ctx_state: &ExecutionContextState, + ) -> Result> { + // unpack aliased logical expressions, e.g. 
"sum(col) over () as total" + let (name, e) = match e { + Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), + _ => (e.name(logical_input_schema)?, e), + }; + + match e { + Expr::WindowFunction { fun, args } => { + let args = args + .iter() + .map(|e| { + self.create_physical_expr(e, physical_input_schema, ctx_state) + }) + .collect::>>()?; + windows::create_window_expr(fun, &args, physical_input_schema, name) + } + other => Err(DataFusionError::Internal(format!( + "Invalid window expression '{:?}'", + other + ))), + } + } + /// Create an aggregate expression from a logical expression pub fn create_aggregate_expr( &self, diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 8229060190215..caa32cfa264e1 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -135,6 +135,7 @@ impl ExecutionPlan for SortExec { "SortExec requires a single input partition".to_owned(), )); } + let input = self.input.execute(0).await?; Ok(Box::pin(SortStream::new( diff --git a/datafusion/src/physical_plan/window_functions.rs b/datafusion/src/physical_plan/window_functions.rs new file mode 100644 index 0000000000000..65d5373d54f47 --- /dev/null +++ b/datafusion/src/physical_plan/window_functions.rs @@ -0,0 +1,342 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Window functions provide the ability to perform calculations across +//! sets of rows that are related to the current query row. +//! +//! 
see also https://www.postgresql.org/docs/current/functions-window.html + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ + aggregates, aggregates::AggregateFunction, functions::Signature, + type_coercion::data_types, +}; +use arrow::datatypes::DataType; +use std::{fmt, str::FromStr}; + +/// WindowFunction +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum WindowFunction { + /// window function that leverages an aggregate function + AggregateFunction(AggregateFunction), + /// window function that leverages a built-in window function + BuiltInWindowFunction(BuiltInWindowFunction), +} + +impl FromStr for WindowFunction { + type Err = DataFusionError; + fn from_str(name: &str) -> Result { + let name = name.to_lowercase(); + if let Ok(aggregate) = AggregateFunction::from_str(name.as_str()) { + Ok(WindowFunction::AggregateFunction(aggregate)) + } else if let Ok(built_in_function) = + BuiltInWindowFunction::from_str(name.as_str()) + { + Ok(WindowFunction::BuiltInWindowFunction(built_in_function)) + } else { + Err(DataFusionError::Plan(format!( + "There is no window function named {}", + name + ))) + } + } +} + +impl fmt::Display for BuiltInWindowFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + BuiltInWindowFunction::RowNumber => write!(f, "ROW_NUMBER"), + BuiltInWindowFunction::Rank => write!(f, "RANK"), + BuiltInWindowFunction::DenseRank => write!(f, "DENSE_RANK"), + BuiltInWindowFunction::PercentRank => write!(f, "PERCENT_RANK"), + BuiltInWindowFunction::CumeDist => write!(f, "CUME_DIST"), + BuiltInWindowFunction::Ntile => write!(f, "NTILE"), + BuiltInWindowFunction::Lag => write!(f, "LAG"), + BuiltInWindowFunction::Lead => write!(f, "LEAD"), + BuiltInWindowFunction::FirstValue => write!(f, "FIRST_VALUE"), + BuiltInWindowFunction::LastValue => write!(f, "LAST_VALUE"), + BuiltInWindowFunction::NthValue => write!(f, "NTH_VALUE"), + } + } +} + +impl fmt::Display for WindowFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + WindowFunction::AggregateFunction(fun) => fun.fmt(f), + WindowFunction::BuiltInWindowFunction(fun) => fun.fmt(f), + } + } +} + +/// An aggregate function that is part of a built-in window function +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum BuiltInWindowFunction { + /// number of the current row within its partition, counting from 1 + RowNumber, + /// rank of the current row with gaps; same as row_number of its first peer + Rank, + /// ank of the current row without gaps; this function counts peer groups + DenseRank, + /// relative rank of the current row: (rank - 1) / (total rows - 1) + PercentRank, + /// relative rank of the current row: (number of rows preceding or peer with current row) / (total rows) + CumeDist, + /// integer ranging from 1 to the argument value, dividing the partition as equally as possible + Ntile, + /// returns value evaluated at the row that is offset rows before the current row within the partition; + /// if there is no such row, instead return default (which must be of the same type as value). + /// Both offset and default are evaluated with respect to the current row. + /// If omitted, offset defaults to 1 and default to null + Lag, + /// returns value evaluated at the row that is offset rows after the current row within the partition; + /// if there is no such row, instead return default (which must be of the same type as value). + /// Both offset and default are evaluated with respect to the current row. 
+ /// If omitted, offset defaults to 1 and default to null + Lead, + /// returns value evaluated at the row that is the first row of the window frame + FirstValue, + /// returns value evaluated at the row that is the last row of the window frame + LastValue, + /// returns value evaluated at the row that is the nth row of the window frame (counting from 1); null if no such row + NthValue, +} + +impl FromStr for BuiltInWindowFunction { + type Err = DataFusionError; + fn from_str(name: &str) -> Result { + Ok(match name.to_uppercase().as_str() { + "ROW_NUMBER" => BuiltInWindowFunction::RowNumber, + "RANK" => BuiltInWindowFunction::Rank, + "DENSE_RANK" => BuiltInWindowFunction::DenseRank, + "PERCENT_RANK" => BuiltInWindowFunction::PercentRank, + "CUME_DIST" => BuiltInWindowFunction::CumeDist, + "NTILE" => BuiltInWindowFunction::Ntile, + "LAG" => BuiltInWindowFunction::Lag, + "LEAD" => BuiltInWindowFunction::Lead, + "FIRST_VALUE" => BuiltInWindowFunction::FirstValue, + "LAST_VALUE" => BuiltInWindowFunction::LastValue, + "NTH_VALUE" => BuiltInWindowFunction::NthValue, + _ => { + return Err(DataFusionError::Plan(format!( + "There is no built-in window function named {}", + name + ))) + } + }) + } +} + +/// Returns the datatype of the window function +pub fn return_type(fun: &WindowFunction, arg_types: &[DataType]) -> Result { + // Note that this function *must* return the same type that the respective physical expression returns + // or the execution panics. + + // verify that this is a valid set of data types for this function + data_types(arg_types, &signature(fun))?; + + match fun { + WindowFunction::AggregateFunction(fun) => aggregates::return_type(fun, arg_types), + WindowFunction::BuiltInWindowFunction(fun) => match fun { + BuiltInWindowFunction::RowNumber + | BuiltInWindowFunction::Rank + | BuiltInWindowFunction::DenseRank => Ok(DataType::UInt64), + BuiltInWindowFunction::PercentRank | BuiltInWindowFunction::CumeDist => { + Ok(DataType::Float64) + } + BuiltInWindowFunction::Ntile => Ok(DataType::UInt32), + BuiltInWindowFunction::Lag + | BuiltInWindowFunction::Lead + | BuiltInWindowFunction::FirstValue + | BuiltInWindowFunction::LastValue + | BuiltInWindowFunction::NthValue => Ok(arg_types[0].clone()), + }, + } +} + +/// the signatures supported by the function `fun`. +fn signature(fun: &WindowFunction) -> Signature { + // note: the physical expression must accept the type returned by this function or the execution panics. 
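+ // Illustrative mapping (comment only, not exhaustive): RANK() and ROW_NUMBER()
+ // take no arguments and match Signature::Any(0), NTH_VALUE(v, n) matches
+ // Signature::Any(2), NTILE(n) requires a UInt64 argument, and an aggregate used
+ // over a window, e.g. MAX(c2) OVER (), simply reuses aggregates::signature for MAX.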
+ match fun { + WindowFunction::AggregateFunction(fun) => aggregates::signature(fun), + WindowFunction::BuiltInWindowFunction(fun) => match fun { + BuiltInWindowFunction::RowNumber + | BuiltInWindowFunction::Rank + | BuiltInWindowFunction::DenseRank + | BuiltInWindowFunction::PercentRank + | BuiltInWindowFunction::CumeDist => Signature::Any(0), + BuiltInWindowFunction::Lag + | BuiltInWindowFunction::Lead + | BuiltInWindowFunction::FirstValue + | BuiltInWindowFunction::LastValue => Signature::Any(1), + BuiltInWindowFunction::Ntile => Signature::Exact(vec![DataType::UInt64]), + BuiltInWindowFunction::NthValue => Signature::Any(2), + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_window_function_case_insensitive() -> Result<()> { + let names = vec![ + "row_number", + "rank", + "dense_rank", + "percent_rank", + "cume_dist", + "ntile", + "lag", + "lead", + "first_value", + "last_value", + "nth_value", + "min", + "max", + "count", + "avg", + "sum", + ]; + for name in names { + let fun = WindowFunction::from_str(name)?; + let fun2 = WindowFunction::from_str(name.to_uppercase().as_str())?; + assert_eq!(fun, fun2); + assert_eq!(fun.to_string(), name.to_uppercase()); + } + Ok(()) + } + + #[test] + fn test_window_function_from_str() -> Result<()> { + assert_eq!( + WindowFunction::from_str("max")?, + WindowFunction::AggregateFunction(AggregateFunction::Max) + ); + assert_eq!( + WindowFunction::from_str("min")?, + WindowFunction::AggregateFunction(AggregateFunction::Min) + ); + assert_eq!( + WindowFunction::from_str("avg")?, + WindowFunction::AggregateFunction(AggregateFunction::Avg) + ); + assert_eq!( + WindowFunction::from_str("cume_dist")?, + WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::CumeDist) + ); + assert_eq!( + WindowFunction::from_str("first_value")?, + WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::FirstValue) + ); + assert_eq!( + WindowFunction::from_str("LAST_value")?, + WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::LastValue) + ); + assert_eq!( + WindowFunction::from_str("LAG")?, + WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lag) + ); + assert_eq!( + WindowFunction::from_str("LEAD")?, + WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lead) + ); + Ok(()) + } + + #[test] + fn test_count_return_type() -> Result<()> { + let fun = WindowFunction::from_str("count")?; + let observed = return_type(&fun, &[DataType::Utf8])?; + assert_eq!(DataType::UInt64, observed); + + let observed = return_type(&fun, &[DataType::UInt64])?; + assert_eq!(DataType::UInt64, observed); + + Ok(()) + } + + #[test] + fn test_first_value_return_type() -> Result<()> { + let fun = WindowFunction::from_str("first_value")?; + let observed = return_type(&fun, &[DataType::Utf8])?; + assert_eq!(DataType::Utf8, observed); + + let observed = return_type(&fun, &[DataType::UInt64])?; + assert_eq!(DataType::UInt64, observed); + + Ok(()) + } + + #[test] + fn test_last_value_return_type() -> Result<()> { + let fun = WindowFunction::from_str("last_value")?; + let observed = return_type(&fun, &[DataType::Utf8])?; + assert_eq!(DataType::Utf8, observed); + + let observed = return_type(&fun, &[DataType::Float64])?; + assert_eq!(DataType::Float64, observed); + + Ok(()) + } + + #[test] + fn test_lead_return_type() -> Result<()> { + let fun = WindowFunction::from_str("lead")?; + let observed = return_type(&fun, &[DataType::Utf8])?; + assert_eq!(DataType::Utf8, observed); + + let observed = return_type(&fun, &[DataType::Float64])?; + 
assert_eq!(DataType::Float64, observed); + + Ok(()) + } + + #[test] + fn test_lag_return_type() -> Result<()> { + let fun = WindowFunction::from_str("lag")?; + let observed = return_type(&fun, &[DataType::Utf8])?; + assert_eq!(DataType::Utf8, observed); + + let observed = return_type(&fun, &[DataType::Float64])?; + assert_eq!(DataType::Float64, observed); + + Ok(()) + } + + #[test] + fn test_nth_value_return_type() -> Result<()> { + let fun = WindowFunction::from_str("nth_value")?; + let observed = return_type(&fun, &[DataType::Utf8, DataType::UInt64])?; + assert_eq!(DataType::Utf8, observed); + + let observed = return_type(&fun, &[DataType::Float64, DataType::UInt64])?; + assert_eq!(DataType::Float64, observed); + + Ok(()) + } + + #[test] + fn test_cume_dist_return_type() -> Result<()> { + let fun = WindowFunction::from_str("cume_dist")?; + let observed = return_type(&fun, &[])?; + assert_eq!(DataType::Float64, observed); + + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs new file mode 100644 index 0000000000000..bdd25d69fd553 --- /dev/null +++ b/datafusion/src/physical_plan/windows.rs @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Execution plan for window functions + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ + aggregates, window_functions::WindowFunction, AggregateExpr, Distribution, + ExecutionPlan, Partitioning, PhysicalExpr, SendableRecordBatchStream, WindowExpr, +}; +use arrow::datatypes::{Field, Schema, SchemaRef}; +use async_trait::async_trait; +use std::any::Any; +use std::sync::Arc; + +/// Window execution plan +#[derive(Debug)] +pub struct WindowAggExec { + /// Input plan + input: Arc, + /// Window function expression + window_expr: Vec>, + /// Schema after the window is run + schema: SchemaRef, + /// Schema before the window + input_schema: SchemaRef, +} + +/// Create a physical expression for window function +pub fn create_window_expr( + fun: &WindowFunction, + args: &[Arc], + input_schema: &Schema, + name: String, +) -> Result> { + match fun { + WindowFunction::AggregateFunction(fun) => Ok(Arc::new(AggregateWindowExpr { + aggregate: aggregates::create_aggregate_expr( + fun, + false, + args, + input_schema, + name, + )?, + })), + WindowFunction::BuiltInWindowFunction(fun) => { + Err(DataFusionError::NotImplemented(format!( + "window function with {:?} not implemented", + fun + ))) + } + } +} + +/// A window expr that takes the form of a built in window function +#[derive(Debug)] +pub struct BuiltInWindowExpr {} + +/// A window expr that takes the form of an aggregate function +#[derive(Debug)] +pub struct AggregateWindowExpr { + aggregate: Arc, +} + +impl WindowExpr for AggregateWindowExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.aggregate.name() + } + + fn field(&self) -> Result { + self.aggregate.field() + } +} + +fn create_schema( + input_schema: &Schema, + window_expr: &[Arc], +) -> Result { + let mut fields = Vec::with_capacity(input_schema.fields().len() + window_expr.len()); + for expr in window_expr { + fields.push(expr.field()?); + } + fields.extend_from_slice(input_schema.fields()); + Ok(Schema::new(fields)) +} + +impl WindowAggExec { + /// Create a new execution plan for window aggregates + pub fn try_new( + window_expr: Vec>, + input: Arc, + input_schema: SchemaRef, + ) -> Result { + let schema = create_schema(&input.schema(), &window_expr)?; + let schema = Arc::new(schema); + Ok(WindowAggExec { + input, + window_expr, + schema, + input_schema, + }) + } + + /// Input plan + pub fn input(&self) -> &Arc { + &self.input + } + + /// Get the input schema before any aggregates are applied + pub fn input_schema(&self) -> SchemaRef { + self.input_schema.clone() + } +} + +#[async_trait] +impl ExecutionPlan for WindowAggExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn required_child_distribution(&self) -> Distribution { + Distribution::SinglePartition + } + + fn with_new_children( + &self, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(WindowAggExec::try_new( + self.window_expr.clone(), + children[0].clone(), + children[0].schema(), + )?)), + _ => Err(DataFusionError::Internal( + "WindowAggExec wrong number of children".to_owned(), + )), + } + } + + async fn execute(&self, partition: usize) -> 
Result { + if 0 != partition { + return Err(DataFusionError::Internal(format!( + "WindowAggExec invalid partition {}", + partition + ))); + } + + // window needs to operate on a single partition currently + if 1 != self.input.output_partitioning().partition_count() { + return Err(DataFusionError::Internal( + "WindowAggExec requires a single input partition".to_owned(), + )); + } + + // let input = self.input.execute(0).await?; + + Err(DataFusionError::NotImplemented( + "WindowAggExec::execute".to_owned(), + )) + } +} diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 34c5901b450a2..a3027e589985e 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -35,7 +35,7 @@ use crate::{ }; use crate::{ physical_plan::udf::ScalarUDF, - physical_plan::{aggregates, functions}, + physical_plan::{aggregates, functions, window_functions}, sql::parser::{CreateExternalTable, FileType, Statement as DFStatement}, }; @@ -57,7 +57,8 @@ use super::{ parser::DFParser, utils::{ can_columns_satisfy_exprs, expand_wildcard, expr_as_column_expr, extract_aliases, - find_aggregate_exprs, find_column_exprs, rebase_expr, resolve_aliases_to_exprs, + find_aggregate_exprs, find_column_exprs, find_window_exprs, rebase_expr, + resolve_aliases_to_exprs, }, }; @@ -413,7 +414,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { )) } JoinConstraint::None => Err(DataFusionError::NotImplemented( - "NONE contraint is not supported".to_string(), + "NONE constraint is not supported".to_string(), )), } } @@ -624,15 +625,24 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { plan }; + // window function + let window_func_exprs = find_window_exprs(&select_exprs_post_aggr); + + let (plan, exprs) = if window_func_exprs.is_empty() { + (plan, select_exprs_post_aggr) + } else { + self.window(&plan, window_func_exprs, &select_exprs_post_aggr)? + }; + let plan = if select.distinct { return LogicalPlanBuilder::from(&plan) - .aggregate(select_exprs_post_aggr, vec![])? + .aggregate(exprs, vec![])? .build(); } else { plan }; - self.project(&plan, select_exprs_post_aggr) + self.project(&plan, exprs) } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. @@ -657,10 +667,28 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in a projection fn project(&self, input: &LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(&input.schema(), &expr)?; - LogicalPlanBuilder::from(input).project(expr)?.build() } + /// Wrap a plan in a window + fn window( + &self, + input: &LogicalPlan, + window_exprs: Vec, + select_exprs: &[Expr], + ) -> Result<(LogicalPlan, Vec)> { + let plan = LogicalPlanBuilder::from(input) + .window(window_exprs)? 
+ .build()?; + let select_exprs = select_exprs + .iter() + .map(|expr| expr_as_column_expr(&expr, &plan)) + .into_iter() + .collect::>>()?; + Ok((plan, select_exprs)) + } + + /// Wrap a plan in an aggregate fn aggregate( &self, input: &LogicalPlan, @@ -1059,70 +1087,69 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // first, scalar built-in if let Ok(fun) = functions::BuiltinScalarFunction::from_str(&name) { - let args = function - .args - .iter() - .map(|a| self.sql_fn_arg_to_logical_expr(a)) - .collect::>>()?; + let args = self.function_args_to_expr(function)?; return Ok(Expr::ScalarFunction { fun, args }); }; + // then, window function + if let Some(window) = &function.over { + if window.partition_by.is_empty() + && window.order_by.is_empty() + && window.window_frame.is_none() + { + let fun = window_functions::WindowFunction::from_str(&name); + if let Ok(window_functions::WindowFunction::AggregateFunction( + aggregate_fun, + )) = fun + { + return Ok(Expr::WindowFunction { + fun: window_functions::WindowFunction::AggregateFunction( + aggregate_fun.clone(), + ), + args: self + .aggregate_fn_to_expr(&aggregate_fun, function)?, + }); + } else if let Ok( + window_functions::WindowFunction::BuiltInWindowFunction( + window_fun, + ), + ) = fun + { + return Ok(Expr::WindowFunction { + fun: window_functions::WindowFunction::BuiltInWindowFunction( + window_fun, + ), + args:self.function_args_to_expr(function)?, + }); + } + } + return Err(DataFusionError::NotImplemented(format!( + "Unsupported OVER clause ({})", + window + ))); + } + // next, aggregate built-ins if let Ok(fun) = aggregates::AggregateFunction::from_str(&name) { - let args = if fun == aggregates::AggregateFunction::Count { - function - .args - .iter() - .map(|a| match a { - FunctionArg::Unnamed(SQLExpr::Value(Value::Number( - _, - _, - ))) => Ok(lit(1_u8)), - FunctionArg::Unnamed(SQLExpr::Wildcard) => Ok(lit(1_u8)), - _ => self.sql_fn_arg_to_logical_expr(a), - }) - .collect::>>()? - } else { - function - .args - .iter() - .map(|a| self.sql_fn_arg_to_logical_expr(a)) - .collect::>>()? 
- }; - - return match &function.over { - Some(window) => Err(DataFusionError::NotImplemented(format!( - "Unsupported OVER clause ({})", - window - ))), - _ => Ok(Expr::AggregateFunction { - fun, - distinct: function.distinct, - args, - }), - }; + let args = self.aggregate_fn_to_expr(&fun, function)?; + return Ok(Expr::AggregateFunction { + fun, + distinct: function.distinct, + args, + }); }; // finally, user-defined functions (UDF) and UDAF match self.schema_provider.get_function_meta(&name) { Some(fm) => { - let args = function - .args - .iter() - .map(|a| self.sql_fn_arg_to_logical_expr(a)) - .collect::>>()?; + let args = self.function_args_to_expr(function)?; Ok(Expr::ScalarUDF { fun: fm, args }) } None => match self.schema_provider.get_aggregate_meta(&name) { Some(fm) => { - let args = function - .args - .iter() - .map(|a| self.sql_fn_arg_to_logical_expr(a)) - .collect::>>()?; - + let args = self.function_args_to_expr(function)?; Ok(Expr::AggregateUDF { fun: fm, args }) } _ => Err(DataFusionError::Plan(format!( @@ -1142,6 +1169,39 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } + fn function_args_to_expr( + &self, + function: &sqlparser::ast::Function, + ) -> Result> { + function + .args + .iter() + .map(|a| self.sql_fn_arg_to_logical_expr(a)) + .collect::>>() + } + + fn aggregate_fn_to_expr( + &self, + fun: &aggregates::AggregateFunction, + function: &sqlparser::ast::Function, + ) -> Result> { + if *fun == aggregates::AggregateFunction::Count { + function + .args + .iter() + .map(|a| match a { + FunctionArg::Unnamed(SQLExpr::Value(Value::Number(_, _))) => { + Ok(lit(1_u8)) + } + FunctionArg::Unnamed(SQLExpr::Wildcard) => Ok(lit(1_u8)), + _ => self.sql_fn_arg_to_logical_expr(a), + }) + .collect::>>() + } else { + self.function_args_to_expr(function) + } + } + fn sql_interval_to_literal( &self, value: &str, @@ -2641,13 +2701,34 @@ mod tests { } #[test] - fn over_not_supported() { + fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "NotImplemented(\"Unsupported OVER clause ()\")", - format!("{:?}", err) - ); + let expected = "\ + Projection: #order_id, #MAX(order_id)\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[], orderBy=[]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + #[test] + fn empty_over_plus() { + let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty Multiply Float64(1.1))\ + \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]] partitionBy=[], orderBy=[]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + #[test] + fn empty_over_multiple() { + let sql = + "SELECT order_id, MAX(qty) OVER (), min(qty) over (), aVg(qty) OVER () from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #MIN(qty), #AVG(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]] partitionBy=[], orderBy=[]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); } #[test] @@ -2661,6 +2742,16 @@ mod tests { ); } + #[test] + fn over_order_by_not_supported() { + let sql = "SELECT order_id, MAX(delivered) OVER (order BY order_id) from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "NotImplemented(\"Unsupported OVER clause (ORDER BY order_id)\")", + format!("{:?}", err) + ); + } + #[test] fn only_union_all_supported() { let sql = "SELECT order_id from orders 
EXCEPT SELECT order_id FROM orders"; diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index f41643d2ab449..70b9df0608397 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -46,6 +46,14 @@ pub(crate) fn find_aggregate_exprs(exprs: &[Expr]) -> Vec { }) } +/// Collect all deeply nested `Expr::WindowFunction`. They are returned in order of occurrence +/// (depth first), with duplicates omitted. +pub(crate) fn find_window_exprs(exprs: &[Expr]) -> Vec { + find_exprs_in_exprs(exprs, &|nested_expr| { + matches!(nested_expr, Expr::WindowFunction { .. }) + }) +} + /// Collect all deeply nested `Expr::Column`'s. They are returned in order of /// appearance (depth first), with duplicates omitted. pub(crate) fn find_column_exprs(exprs: &[Expr]) -> Vec { @@ -217,6 +225,13 @@ where .collect::>>()?, distinct: *distinct, }), + Expr::WindowFunction { fun, args } => Ok(Expr::WindowFunction { + fun: fun.clone(), + args: args + .iter() + .map(|e| clone_with_replacement(e, replacement_fn)) + .collect::>>()?, + }), Expr::AggregateUDF { fun, args } => Ok(Expr::AggregateUDF { fun: fun.clone(), args: args diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 17e0f13609a38..e68c53b251e6c 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -797,6 +797,21 @@ async fn csv_query_count() -> Result<()> { Ok(()) } +// FIXME uncomment this when exec is done +// #[tokio::test] +// async fn csv_query_window_with_empty_over() -> Result<()> { +// let mut ctx = ExecutionContext::new(); +// register_aggregate_csv(&mut ctx)?; +// let sql = "SELECT count(c12) over () FROM aggregate_test_100"; +// // FIXME: so far the WindowAggExec is not implemented +// // and the current behavior is to throw not implemented exception + +// let result = execute(&mut ctx, sql).await; +// let expected: Vec> = vec![]; +// assert_eq!(result, expected); +// Ok(()) +// } + #[tokio::test] async fn csv_query_group_by_int_count() -> Result<()> { let mut ctx = ExecutionContext::new(); From 76ac91440e759c59112b70450e970cc99776ec5e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Sun, 23 May 2021 09:39:17 +0100 Subject: [PATCH 106/329] Add PartialOrd and Ord to GroupByScalar (#364) (#368) Signed-off-by: Raphael Taylor-Davies --- datafusion/src/physical_plan/group_scalar.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/group_scalar.rs b/datafusion/src/physical_plan/group_scalar.rs index 943386d215c4f..d5f72b0d78176 100644 --- a/datafusion/src/physical_plan/group_scalar.rs +++ b/datafusion/src/physical_plan/group_scalar.rs @@ -24,7 +24,7 @@ use crate::error::{DataFusionError, Result}; use crate::scalar::ScalarValue; /// Enumeration of types that can be used in a GROUP BY expression -#[derive(Debug, PartialEq, Eq, Hash, Clone)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub(crate) enum GroupByScalar { Float32(OrderedFloat), Float64(OrderedFloat), From 9eca3291ce38554042ee4441b118d8b231f3d4a7 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Sun, 23 May 2021 10:43:25 +0200 Subject: [PATCH 107/329] #374: Add missing functions to python (#388) --- python/src/functions.rs | 272 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) diff --git a/python/src/functions.rs b/python/src/functions.rs index 68000cb1ecbf8..b51c76ba4b66e 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -44,6 +44,20 @@ fn lit(value: i32) -> 
expression::Expression { } } +#[pyfunction] +fn array(value: Vec) -> expression::Expression { + expression::Expression { + expr: logical_plan::array(value.into_iter().map(|x| x.expr).collect::>()), + } +} + +#[pyfunction] +fn ascii(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::ascii(value.expr), + } +} + #[pyfunction] fn sum(value: expression::Expression) -> expression::Expression { expression::Expression { @@ -51,6 +65,230 @@ fn sum(value: expression::Expression) -> expression::Expression { } } +#[pyfunction] +fn bit_length(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::bit_length(value.expr), + } +} + +#[pyfunction] +fn btrim(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::btrim(value.expr), + } +} + +#[pyfunction] +fn character_length(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::character_length(value.expr), + } +} + +#[pyfunction] +fn chr(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::chr(value.expr), + } +} + +#[pyfunction] +fn concat_ws(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::concat_ws(value.expr), + } +} + +#[pyfunction] +fn in_list(expr: expression::Expression, value: Vec, negated: bool) -> expression::Expression { + expression::Expression { + expr: logical_plan::in_list(expr.expr, value.into_iter().map(|x| x.expr).collect::>(), negated), + } +} + +#[pyfunction] +fn initcap(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::initcap(value.expr), + } +} + +#[pyfunction] +fn left(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::left(value.expr), + } +} + +#[pyfunction] +fn lower(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::lower(value.expr), + } +} + +#[pyfunction] +fn lpad(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::lpad(value.expr), + } +} + +#[pyfunction] +fn ltrim(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::ltrim(value.expr), + } +} + +#[pyfunction] +fn md5(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::md5(value.expr), + } +} + +#[pyfunction] +fn octet_length(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::octet_length(value.expr), + } +} + +#[pyfunction] +fn regexp_replace(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::regexp_replace(value.expr), + } +} + +#[pyfunction] +fn repeat(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::repeat(value.expr), + } +} + +#[pyfunction] +fn replace(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::replace(value.expr), + } +} + +#[pyfunction] +fn reverse(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::reverse(value.expr), + } +} + +#[pyfunction] +fn right(value: expression::Expression) -> expression::Expression { + 
expression::Expression { + expr: logical_plan::right(value.expr), + } +} + +#[pyfunction] +fn rpad(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::rpad(value.expr), + } +} + +#[pyfunction] +fn rtrim(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::rtrim(value.expr), + } +} + +#[pyfunction] +fn sha224(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sha224(value.expr), + } +} + +#[pyfunction] +fn sha256(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sha256(value.expr), + } +} + +#[pyfunction] +fn sha384(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sha384(value.expr), + } +} + +#[pyfunction] +fn sha512(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::sha512(value.expr), + } +} + +#[pyfunction] +fn split_part(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::split_part(value.expr), + } +} + +#[pyfunction] +fn starts_with(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::starts_with(value.expr), + } +} + +#[pyfunction] +fn strpos(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::strpos(value.expr), + } +} + +#[pyfunction] +fn substr(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::substr(value.expr), + } +} + +#[pyfunction] +fn to_hex(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::to_hex(value.expr), + } +} + +#[pyfunction] +fn translate(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::translate(value.expr), + } +} + +#[pyfunction] +fn trim(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::trim(value.expr), + } +} + +#[pyfunction] +fn upper(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::upper(value.expr), + } +} + #[pyfunction] fn avg(value: expression::Expression) -> expression::Expression { expression::Expression { @@ -155,6 +393,40 @@ pub fn init(module: &PyModule) -> PyResult<()> { // see https://github.com/apache/arrow-datafusion/issues/226 //module.add_function(wrap_pyfunction!(concat, module)?)?; module.add_function(wrap_pyfunction!(udf, module)?)?; + module.add_function(wrap_pyfunction!(array, module)?)?; + module.add_function(wrap_pyfunction!(ascii, module)?)?; + module.add_function(wrap_pyfunction!(bit_length, module)?)?; + module.add_function(wrap_pyfunction!(character_length, module)?)?; + module.add_function(wrap_pyfunction!(chr, module)?)?; + module.add_function(wrap_pyfunction!(btrim, module)?)?; + module.add_function(wrap_pyfunction!(concat_ws, module)?)?; + module.add_function(wrap_pyfunction!(in_list, module)?)?; + module.add_function(wrap_pyfunction!(initcap, module)?)?; + module.add_function(wrap_pyfunction!(left, module)?)?; + module.add_function(wrap_pyfunction!(lower, module)?)?; + module.add_function(wrap_pyfunction!(lpad, module)?)?; + module.add_function(wrap_pyfunction!(md5, module)?)?; + module.add_function(wrap_pyfunction!(ltrim, module)?)?; + 
module.add_function(wrap_pyfunction!(octet_length, module)?)?; + module.add_function(wrap_pyfunction!(regexp_replace, module)?)?; + module.add_function(wrap_pyfunction!(repeat, module)?)?; + module.add_function(wrap_pyfunction!(replace, module)?)?; + module.add_function(wrap_pyfunction!(reverse, module)?)?; + module.add_function(wrap_pyfunction!(right, module)?)?; + module.add_function(wrap_pyfunction!(rpad, module)?)?; + module.add_function(wrap_pyfunction!(rtrim, module)?)?; + module.add_function(wrap_pyfunction!(sha224, module)?)?; + module.add_function(wrap_pyfunction!(sha256, module)?)?; + module.add_function(wrap_pyfunction!(sha384, module)?)?; + module.add_function(wrap_pyfunction!(sha512, module)?)?; + module.add_function(wrap_pyfunction!(split_part, module)?)?; + module.add_function(wrap_pyfunction!(starts_with, module)?)?; + module.add_function(wrap_pyfunction!(strpos, module)?)?; + module.add_function(wrap_pyfunction!(substr, module)?)?; + module.add_function(wrap_pyfunction!(to_hex, module)?)?; + module.add_function(wrap_pyfunction!(translate, module)?)?; + module.add_function(wrap_pyfunction!(trim, module)?)?; + module.add_function(wrap_pyfunction!(upper, module)?)?; module.add_function(wrap_pyfunction!(sum, module)?)?; module.add_function(wrap_pyfunction!(count, module)?)?; module.add_function(wrap_pyfunction!(min, module)?)?; From efdbdbbf1aab897e2e94ef4ddef08feb378f8cf9 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Sun, 23 May 2021 12:18:46 +0200 Subject: [PATCH 108/329] Fixed incorrect logical type in GroupByScalar. (#391) --- datafusion/src/physical_plan/hash_aggregate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 0a822dc898afb..c9d268619cad3 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -1164,7 +1164,7 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { } DataType::LargeUtf8 => { let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) + Ok(GroupByScalar::LargeUtf8(Box::new(array.value(row).into()))) } DataType::Boolean => { let array = col.as_any().downcast_ref::().unwrap(); From 174226c086a4838eab2a238853b4871c295c0189 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 23 May 2021 18:19:20 +0800 Subject: [PATCH 109/329] export both now and random functions (#389) --- datafusion/src/logical_plan/expr.rs | 2 ++ datafusion/src/logical_plan/mod.rs | 10 +++++----- datafusion/src/prelude.rs | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index ab02559175302..29723e73d25ca 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1104,7 +1104,9 @@ unary_scalar_expr!(Acos, acos); unary_scalar_expr!(Atan, atan); unary_scalar_expr!(Floor, floor); unary_scalar_expr!(Ceil, ceil); +unary_scalar_expr!(Now, now); unary_scalar_expr!(Round, round); +unary_scalar_expr!(Random, random); unary_scalar_expr!(Trunc, trunc); unary_scalar_expr!(Abs, abs); unary_scalar_expr!(Signum, signum); diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index f9be1ff983001..f948770e6437b 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -36,11 +36,11 @@ pub use expr::{ abs, acos, and, array, ascii, asin, atan, avg, binary_expr, 
bit_length, btrim, case, ceil, character_length, chr, col, combine_filters, concat, concat_ws, cos, count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list, - initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, - octet_length, or, regexp_match, regexp_replace, repeat, replace, reverse, right, - round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, - starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, upper, when, - Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, + initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, now, + octet_length, or, random, regexp_match, regexp_replace, repeat, replace, reverse, + right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, + sqrt, starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, upper, + when, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/datafusion/src/prelude.rs b/datafusion/src/prelude.rs index 0edc82a98afbd..e1f1d7b76047f 100644 --- a/datafusion/src/prelude.rs +++ b/datafusion/src/prelude.rs @@ -30,8 +30,8 @@ pub use crate::execution::context::{ExecutionConfig, ExecutionContext}; pub use crate::logical_plan::{ array, ascii, avg, bit_length, btrim, character_length, chr, col, concat, concat_ws, count, create_udf, in_list, initcap, left, length, lit, lower, lpad, ltrim, max, md5, - min, octet_length, regexp_replace, repeat, replace, reverse, right, rpad, rtrim, - sha224, sha256, sha384, sha512, split_part, starts_with, strpos, substr, sum, to_hex, - translate, trim, upper, JoinType, Partitioning, + min, now, octet_length, random, regexp_replace, repeat, replace, reverse, right, + rpad, rtrim, sha224, sha256, sha384, sha512, split_part, starts_with, strpos, substr, + sum, to_hex, translate, trim, upper, JoinType, Partitioning, }; pub use crate::physical_plan::csv::CsvReadOptions; From aeed776986da6813a4e1c54d20e8bf0eb363d706 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 23 May 2021 14:41:28 -0600 Subject: [PATCH 110/329] Add metrics to RepartitionExec (#398) * Add metrics to RepartitionExec * Add sendTime metric * cargo fmt --- datafusion/src/physical_plan/repartition.rs | 53 +++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 2599690bfc003..e5747dda88b75 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -21,10 +21,11 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use std::{any::Any, collections::HashMap, vec}; +use std::time::Instant; +use std::{any::Any, vec}; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; +use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning, SQLMetric}; use arrow::record_batch::RecordBatch; use arrow::{array::Array, error::Result as ArrowResult}; use arrow::{compute::take, datatypes::SchemaRef}; @@ -35,6 +36,7 @@ use async_trait::async_trait; use futures::stream::Stream; use futures::StreamExt; +use hashbrown::HashMap; use tokio::sync::{ mpsc::{UnboundedReceiver, UnboundedSender}, Mutex, @@ -58,6 +60,12 @@ pub struct RepartitionExec { HashMap, UnboundedReceiver)>, >, >, + /// Time in nanos to execute child operator and fetch 
batches + fetch_time_nanos: Arc, + /// Time in nanos to perform repartitioning + repart_time_nanos: Arc, + /// Time in nanos for sending resulting batches to channels + send_time_nanos: Arc, } impl RepartitionExec { @@ -136,26 +144,46 @@ impl ExecutionPlan for RepartitionExec { for i in 0..num_input_partitions { let random_state = random.clone(); let input = self.input.clone(); + let fetch_time = self.fetch_time_nanos.clone(); + let repart_time = self.repart_time_nanos.clone(); + let send_time = self.send_time_nanos.clone(); let mut txs: HashMap<_, _> = channels .iter() .map(|(partition, (tx, _rx))| (*partition, tx.clone())) .collect(); let partitioning = self.partitioning.clone(); let _: JoinHandle> = tokio::spawn(async move { + // execute the child operator + let now = Instant::now(); let mut stream = input.execute(i).await?; + fetch_time.add(now.elapsed().as_nanos() as usize); + let mut counter = 0; let hashes_buf = &mut vec![]; - while let Some(result) = stream.next().await { + loop { + // fetch the next batch + let now = Instant::now(); + let result = stream.next().await; + fetch_time.add(now.elapsed().as_nanos() as usize); + + if result.is_none() { + break; + } + let result = result.unwrap(); + match &partitioning { Partitioning::RoundRobinBatch(_) => { + let now = Instant::now(); let output_partition = counter % num_output_partitions; let tx = txs.get_mut(&output_partition).unwrap(); tx.send(Some(result)).map_err(|e| { DataFusionError::Execution(e.to_string()) })?; + send_time.add(now.elapsed().as_nanos() as usize); } Partitioning::Hash(exprs, _) => { + let now = Instant::now(); let input_batch = result?; let arrays = exprs .iter() @@ -176,9 +204,11 @@ impl ExecutionPlan for RepartitionExec { [(*hash % num_output_partitions as u64) as usize] .push(index as u64) } + repart_time.add(now.elapsed().as_nanos() as usize); for (num_output_partition, partition_indices) in indices.into_iter().enumerate() { + let now = Instant::now(); let indices = partition_indices.into(); // Produce batches based on indices let columns = input_batch @@ -198,10 +228,13 @@ impl ExecutionPlan for RepartitionExec { input_batch.schema(), columns, ); + repart_time.add(now.elapsed().as_nanos() as usize); + let now = Instant::now(); let tx = txs.get_mut(&num_output_partition).unwrap(); tx.send(Some(output_batch)).map_err(|e| { DataFusionError::Execution(e.to_string()) })?; + send_time.add(now.elapsed().as_nanos() as usize); } } other => { @@ -236,6 +269,17 @@ impl ExecutionPlan for RepartitionExec { })) } + fn metrics(&self) -> HashMap { + let mut metrics = HashMap::new(); + metrics.insert("fetchTime".to_owned(), (*self.fetch_time_nanos).clone()); + metrics.insert( + "repartitionTime".to_owned(), + (*self.repart_time_nanos).clone(), + ); + metrics.insert("sendTime".to_owned(), (*self.send_time_nanos).clone()); + metrics + } + fn fmt_as( &self, t: DisplayFormatType, @@ -259,6 +303,9 @@ impl RepartitionExec { input, partitioning, channels: Arc::new(Mutex::new(HashMap::new())), + fetch_time_nanos: SQLMetric::time_nanos(), + repart_time_nanos: SQLMetric::time_nanos(), + send_time_nanos: SQLMetric::time_nanos(), }) } } From 4bf2b42ad4d85e65e8dbd8d99425db99302ed134 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 24 May 2021 02:54:31 -0400 Subject: [PATCH 111/329] Use arrow-rs from crates.io (#395) --- ballista/rust/client/Cargo.toml | 2 +- ballista/rust/core/Cargo.toml | 4 ++-- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion-cli/Cargo.toml | 2 +- 
datafusion-examples/Cargo.toml | 2 +- datafusion/Cargo.toml | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index f7ed273ec8b90..9ce0517ee2938 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,5 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow = { version = "4.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 7eec207b096e8..2868b60f637f5 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,8 +40,8 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow = { version = "4.0" } +arrow-flight = { version = "4.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 31fd9d0137b20..7574fca82774d 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -44,8 +44,8 @@ tokio-stream = "0.1" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow = { version = "4.0" } +arrow-flight = { version = "4.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 4793534e9ecad..19e2574fea598 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,7 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow = { version = "4.0" } datafusion = { path = "../../../datafusion" } [dev-dependencies] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 747a6b0287ebc..cd17b61984d5e 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -31,4 +31,4 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } -arrow = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow = { version = "4.0" } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 28175f842e9bd..886f8f5e74f68 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98" } +arrow-flight = { version = "4.0" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index a127076135f12..0668ec016ba1f 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { git = "https://github.com/apache/arrow-rs", 
rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["prettyprint"] } -parquet = { git = "https://github.com/apache/arrow-rs", rev = "4449ee96fe3fd4a0b275da8dd25ce2792699bc98", features = ["arrow"] } +arrow = { version = "4.0", features = ["prettyprint"] } +parquet = { version = "4.0", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" From 1ba0eb0ea4d3f2f91b70a670cda5cf395d9e3f94 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 24 May 2021 17:01:10 +0800 Subject: [PATCH 112/329] refactor datafusion/`scalar_value` to use more macro and avoid dup code (#392) --- datafusion/src/scalar.rs | 242 +++++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 109 deletions(-) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index e59d21e7fcef0..e19e274341a55 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -17,28 +17,17 @@ //! This module provides ScalarValue, an enum that can be used for storage of single elements -use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; - -use arrow::datatypes::{ArrowDictionaryKeyType, DataType, Field, IntervalUnit, TimeUnit}; +use crate::error::{DataFusionError, Result}; use arrow::{ array::*, datatypes::{ - ArrowNativeType, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, - TimestampNanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type, Int16Type, + Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, }, }; -use arrow::{ - array::{ - ArrayRef, Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, - }, - datatypes::{ - TimestampMicrosecondType, TimestampMillisecondType, TimestampSecondType, - }, -}; - -use crate::error::{DataFusionError, Result}; +use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part of arrow’s `Array`. @@ -192,6 +181,27 @@ macro_rules! build_values_list { }}; } +macro_rules! 
build_array_from_option { + ($DATA_TYPE:ident, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{ + match $EXPR { + Some(value) => Arc::new($ARRAY_TYPE::from_value(*value, $SIZE)), + None => new_null_array(&DataType::$DATA_TYPE, $SIZE), + } + }}; + ($DATA_TYPE:ident, $ENUM:expr, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{ + match $EXPR { + Some(value) => Arc::new($ARRAY_TYPE::from_value(*value, $SIZE)), + None => new_null_array(&DataType::$DATA_TYPE($ENUM), $SIZE), + } + }}; + ($DATA_TYPE:ident, $ENUM:expr, $ENUM2:expr, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{ + match $EXPR { + Some(value) => Arc::new($ARRAY_TYPE::from_value(*value, $SIZE)), + None => new_null_array(&DataType::$DATA_TYPE($ENUM, $ENUM2), $SIZE), + } + }}; +} + impl ScalarValue { /// Getter for the `DataType` of the value pub fn get_datatype(&self) -> DataType { @@ -289,80 +299,59 @@ impl ScalarValue { ScalarValue::Boolean(e) => { Arc::new(BooleanArray::from(vec![*e; size])) as ArrayRef } - ScalarValue::Float64(e) => match e { - Some(value) => Arc::new(Float64Array::from_value(*value, size)), - None => new_null_array(&DataType::Float64, size), - }, - ScalarValue::Float32(e) => match e { - Some(value) => Arc::new(Float32Array::from_value(*value, size)), - None => new_null_array(&DataType::Float32, size), - }, - ScalarValue::Int8(e) => match e { - Some(value) => Arc::new(Int8Array::from_value(*value, size)), - None => new_null_array(&DataType::Int8, size), - }, - ScalarValue::Int16(e) => match e { - Some(value) => Arc::new(Int16Array::from_value(*value, size)), - None => new_null_array(&DataType::Int16, size), - }, - ScalarValue::Int32(e) => match e { - Some(value) => Arc::new(Int32Array::from_value(*value, size)), - None => new_null_array(&DataType::Int32, size), - }, - ScalarValue::Int64(e) => match e { - Some(value) => Arc::new(Int64Array::from_value(*value, size)), - None => new_null_array(&DataType::Int64, size), - }, - ScalarValue::UInt8(e) => match e { - Some(value) => Arc::new(UInt8Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt8, size), - }, - ScalarValue::UInt16(e) => match e { - Some(value) => Arc::new(UInt16Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt16, size), - }, - ScalarValue::UInt32(e) => match e { - Some(value) => Arc::new(UInt32Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt32, size), - }, - ScalarValue::UInt64(e) => match e { - Some(value) => Arc::new(UInt64Array::from_value(*value, size)), - None => new_null_array(&DataType::UInt64, size), - }, - ScalarValue::TimestampSecond(e) => match e { - Some(value) => Arc::new(TimestampSecondArray::from_iter_values( - repeat(*value).take(size), - )), - None => { - new_null_array(&DataType::Timestamp(TimeUnit::Second, None), size) - } - }, - ScalarValue::TimestampMillisecond(e) => match e { - Some(value) => Arc::new(TimestampMillisecondArray::from_iter_values( - repeat(*value).take(size), - )), - None => new_null_array( - &DataType::Timestamp(TimeUnit::Millisecond, None), - size, - ), - }, - ScalarValue::TimestampMicrosecond(e) => match e { - Some(value) => { - Arc::new(TimestampMicrosecondArray::from_value(*value, size)) - } - None => new_null_array( - &DataType::Timestamp(TimeUnit::Microsecond, None), - size, - ), - }, - ScalarValue::TimestampNanosecond(e) => match e { - Some(value) => { - Arc::new(TimestampNanosecondArray::from_value(*value, size)) - } - None => { - new_null_array(&DataType::Timestamp(TimeUnit::Nanosecond, None), size) - } - }, + ScalarValue::Float64(e) 
=> { + build_array_from_option!(Float64, Float64Array, e, size) + } + ScalarValue::Float32(e) => { + build_array_from_option!(Float32, Float32Array, e, size) + } + ScalarValue::Int8(e) => build_array_from_option!(Int8, Int8Array, e, size), + ScalarValue::Int16(e) => build_array_from_option!(Int16, Int16Array, e, size), + ScalarValue::Int32(e) => build_array_from_option!(Int32, Int32Array, e, size), + ScalarValue::Int64(e) => build_array_from_option!(Int64, Int64Array, e, size), + ScalarValue::UInt8(e) => build_array_from_option!(UInt8, UInt8Array, e, size), + ScalarValue::UInt16(e) => { + build_array_from_option!(UInt16, UInt16Array, e, size) + } + ScalarValue::UInt32(e) => { + build_array_from_option!(UInt32, UInt32Array, e, size) + } + ScalarValue::UInt64(e) => { + build_array_from_option!(UInt64, UInt64Array, e, size) + } + ScalarValue::TimestampSecond(e) => build_array_from_option!( + Timestamp, + TimeUnit::Second, + None, + TimestampSecondArray, + e, + size + ), + ScalarValue::TimestampMillisecond(e) => build_array_from_option!( + Timestamp, + TimeUnit::Millisecond, + None, + TimestampMillisecondArray, + e, + size + ), + + ScalarValue::TimestampMicrosecond(e) => build_array_from_option!( + Timestamp, + TimeUnit::Microsecond, + None, + TimestampMicrosecondArray, + e, + size + ), + ScalarValue::TimestampNanosecond(e) => build_array_from_option!( + Timestamp, + TimeUnit::Nanosecond, + None, + TimestampNanosecondArray, + e, + size + ), ScalarValue::Utf8(e) => match e { Some(value) => { Arc::new(StringArray::from_iter_values(repeat(value).take(size))) @@ -418,24 +407,27 @@ impl ScalarValue { } dt => panic!("Unexpected DataType for list {:?}", dt), }), - ScalarValue::Date32(e) => match e { - Some(value) => Arc::new(Date32Array::from_value(*value, size)), - None => new_null_array(&DataType::Date32, size), - }, - ScalarValue::Date64(e) => match e { - Some(value) => Arc::new(Date64Array::from_value(*value, size)), - None => new_null_array(&DataType::Date64, size), - }, - ScalarValue::IntervalDayTime(e) => match e { - Some(value) => Arc::new(IntervalDayTimeArray::from_value(*value, size)), - None => new_null_array(&DataType::Interval(IntervalUnit::DayTime), size), - }, - ScalarValue::IntervalYearMonth(e) => match e { - Some(value) => Arc::new(IntervalYearMonthArray::from_value(*value, size)), - None => { - new_null_array(&DataType::Interval(IntervalUnit::YearMonth), size) - } - }, + ScalarValue::Date32(e) => { + build_array_from_option!(Date32, Date32Array, e, size) + } + ScalarValue::Date64(e) => { + build_array_from_option!(Date64, Date64Array, e, size) + } + ScalarValue::IntervalDayTime(e) => build_array_from_option!( + Interval, + IntervalUnit::DayTime, + IntervalDayTimeArray, + e, + size + ), + + ScalarValue::IntervalYearMonth(e) => build_array_from_option!( + Interval, + IntervalUnit::YearMonth, + IntervalYearMonthArray, + e, + size + ), } } @@ -880,6 +872,38 @@ impl ScalarType for TimestampNanosecondType { mod tests { use super::*; + #[test] + fn scalar_value_to_array_u64() { + let value = ScalarValue::UInt64(Some(13u64)); + let array = value.to_array(); + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array.len(), 1); + assert_eq!(false, array.is_null(0)); + assert_eq!(array.value(0), 13); + + let value = ScalarValue::UInt64(None); + let array = value.to_array(); + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array.len(), 1); + assert!(array.is_null(0)); + } + + #[test] + fn scalar_value_to_array_u32() { + let value = 
ScalarValue::UInt32(Some(13u32)); + let array = value.to_array(); + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array.len(), 1); + assert_eq!(false, array.is_null(0)); + assert_eq!(array.value(0), 13); + + let value = ScalarValue::UInt32(None); + let array = value.to_array(); + let array = array.as_any().downcast_ref::().unwrap(); + assert_eq!(array.len(), 1); + assert!(array.is_null(0)); + } + #[test] fn scalar_list_null_to_array() { let list_array_ref = ScalarValue::List(None, DataType::UInt64).to_array(); From 7fd7a115595d1ba271d95299ed030dbf788991d7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 24 May 2021 08:48:51 -0400 Subject: [PATCH 113/329] Change 'breaking change' label to 'api change' (#366) --- .github/pull_request_template.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index e32246061f813..e71d245c75301 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -23,5 +23,5 @@ If there are user-facing changes then we may require documentation to be updated --> From 8b31714d69ff08517443a1f2da41e0dc0ec00302 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 24 May 2021 08:49:37 -0400 Subject: [PATCH 114/329] Return Vec from PredicateBuilder rather than an `Fn` (#370) * Return Vec from PruningPredicateBuilder rather than an `Fn` * Update datafusion/src/physical_optimizer/pruning.rs Co-authored-by: Jorge Leitao Co-authored-by: Jorge Leitao --- datafusion/src/physical_optimizer/pruning.rs | 85 ++++++++++---------- datafusion/src/physical_plan/parquet.rs | 37 +++++++-- 2 files changed, 72 insertions(+), 50 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index a13ca56630bc0..0446904eae030 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -15,9 +15,14 @@ // specific language governing permissions and limitations // under the License. -//! This module contains code to rule out row groups / partitions / -//! etc based on statistics prior in order to skip evaluating entire -//! swaths of rows. +//! This module contains code to prune "containers" of row groups +//! based on statistics prior to execution. This can lead to +//! significant performance improvements by avoiding the need +//! to evaluate a plan on entire containers (e.g. an entire file) +//! +//! For example, it is used to prune (skip) row groups while reading +//! parquet files if it can be determined from the predicate that +//! nothing in the row group can match. //! //! This code is currently specific to Parquet, but soon (TM), via //! https://github.com/apache/arrow-datafusion/issues/363 it will @@ -85,24 +90,24 @@ impl PruningPredicateBuilder { }) } - /// Generate a predicate function used to filter based on - /// statistics + /// For each set of statistics, evalates the predicate in this + /// builder and returns a `bool` with the following meaning for a + /// container with those statistics: + /// + /// `true`: The container MAY contain rows that match the predicate /// - /// This function takes a slice of statistics as parameter, so - /// that DataFusion's physical expressions can be executed once - /// against a single RecordBatch, containing statistics arrays, on - /// which the physical predicate expression is executed to - /// generate a row group filter array. 
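A concrete illustration of how min/max statistics drive this per-container decision, using made-up values for a column `s` (a minimal fragment, not tied to any particular API):

    // illustrative per-container statistics for column `s`
    let (s_min, s_max) = (1i64, 10i64);
    // `s > 100` could only match if the maximum exceeded 100
    let keep_for_gt_100 = s_max > 100; // false: the container cannot match, prune it
    // `s < 0` could only match if the minimum were below 0
    let keep_for_lt_0 = s_min < 0;     // false: also prunable
    // `s > 5` cannot be ruled out, since the maximum exceeds 5
    let keep_for_gt_5 = s_max > 5;     // true: the container may match, keep it

Missing or unreadable statistics are treated conservatively: the container is kept (`true`), as the code below does when the predicate evaluates to null for a row group.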
+ /// `false`: The container MUST NOT contain rows that match the predicate /// - /// The generated filter function is then used in the returned - /// closure to filter row groups. NOTE this is parquet specific at the moment + /// Note this function takes a slice of statistics as a parameter + /// to amortize the cost of the evaluation of the predicate + /// against a single record batch. pub fn build_pruning_predicate( &self, - row_group_metadata: &[RowGroupMetaData], - ) -> Box bool> { + statistics: &[RowGroupMetaData], + ) -> Result> { // build statistics record batch - let predicate_result = build_statistics_record_batch( - row_group_metadata, + let predicate_array = build_statistics_record_batch( + statistics, &self.schema, &self.stat_column_req, ) @@ -112,33 +117,29 @@ impl PruningPredicateBuilder { }) .and_then(|v| match v { ColumnarValue::Array(array) => Ok(array), - ColumnarValue::Scalar(_) => Err(DataFusionError::Plan( + ColumnarValue::Scalar(_) => Err(DataFusionError::Internal( "predicate expression didn't return an array".to_string(), )), - }); - - let predicate_array = match predicate_result { - Ok(array) => array, - // row group filter array could not be built - // return a closure which will not filter out any row groups - _ => return Box::new(|_r, _i| true), - }; + })?; - let predicate_array = predicate_array.as_any().downcast_ref::(); - match predicate_array { - // return row group predicate function - Some(array) => { - // when the result of the predicate expression for a row group is null / undefined, - // e.g. due to missing statistics, this row group can't be filtered out, - // so replace with true - let predicate_values = - array.iter().map(|x| x.unwrap_or(true)).collect::>(); - Box::new(move |_, i| predicate_values[i]) - } - // predicate result is not a BooleanArray - // return a closure which will not filter out any row groups - _ => Box::new(|_r, _i| true), - } + let predicate_array = predicate_array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Expected pruning predicate evaluation to be BooleanArray, \ + but was {:?}", + predicate_array + )) + })?; + + // when the result of the predicate expression for a row group is null / undefined, + // e.g. due to missing statistics, this row group can't be filtered out, + // so replace with true + Ok(predicate_array + .into_iter() + .map(|x| x.unwrap_or(true)) + .collect::>()) } } @@ -146,7 +147,7 @@ impl PruningPredicateBuilder { /// [`RowGroupMetadata`] structs), creating arrays, one for each /// statistics column, as requested in the stat_column_req parameter. 
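A minimal sketch of how a caller might act on those per-container booleans. The names `predicate_builder` and `row_groups` are illustrative (one `RowGroupMetaData` per row group), and a `Result`-returning context is assumed so `?` can be used:

    // `keep[i]` is true when row group `i` may contain rows matching the predicate
    let keep: Vec<bool> = predicate_builder.build_pruning_predicate(row_groups)?;
    // retain only the row groups that cannot be proven empty of matches
    let pruned: Vec<&RowGroupMetaData> = row_groups
        .iter()
        .zip(keep)
        .filter_map(|(rg, keep)| if keep { Some(rg) } else { None })
        .collect();

Only the row groups in `pruned` need to be scanned and evaluated against the full predicate; the parquet reader further down wraps the same values in a closure and passes it to `filter_row_groups` instead.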
fn build_statistics_record_batch( - row_groups: &[RowGroupMetaData], + statistics: &[RowGroupMetaData], schema: &Schema, stat_column_req: &[(String, StatisticsType, Field)], ) -> Result { @@ -154,7 +155,7 @@ fn build_statistics_record_batch( let mut arrays = Vec::::new(); for (column_name, statistics_type, stat_field) in stat_column_req { if let Some((column_index, _)) = schema.column_with_name(column_name) { - let statistics = row_groups + let statistics = statistics .iter() .map(|g| g.column(column_index).statistics()) .collect::>(); diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 66b1253db3d40..f36171cdb73f4 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -38,7 +38,10 @@ use arrow::{ error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, }; -use parquet::file::reader::{FileReader, SerializedFileReader}; +use parquet::file::{ + metadata::RowGroupMetaData, + reader::{FileReader, SerializedFileReader}, +}; use fmt::Debug; use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; @@ -454,6 +457,22 @@ fn send_result( Ok(()) } +fn build_row_group_predicate( + predicate_builder: &PruningPredicateBuilder, + row_group_metadata: &[RowGroupMetaData], +) -> Box bool> { + let predicate_values = predicate_builder.build_pruning_predicate(row_group_metadata); + + let predicate_values = match predicate_values { + Ok(values) => values, + // stats filter array could not be built + // return a closure which will not filter out any row groups + _ => return Box::new(|_r, _i| true), + }; + + Box::new(move |_, i| predicate_values[i]) +} + fn read_files( filenames: &[String], projection: &[usize], @@ -467,8 +486,10 @@ fn read_files( let file = File::open(&filename)?; let mut file_reader = SerializedFileReader::new(file)?; if let Some(predicate_builder) = predicate_builder { - let row_group_predicate = predicate_builder - .build_pruning_predicate(file_reader.metadata().row_groups()); + let row_group_predicate = build_row_group_predicate( + predicate_builder, + file_reader.metadata().row_groups(), + ); file_reader.filter_row_groups(&row_group_predicate); } let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); @@ -643,7 +664,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_pruning_predicate(&row_group_metadata); + build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -673,7 +694,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_pruning_predicate(&row_group_metadata); + build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -718,7 +739,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_pruning_predicate(&row_group_metadata); + build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -733,7 +754,7 @@ mod tests { let expr = col("c1").gt(lit(15)).or(col("c2").modulus(lit(2))); let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; let row_group_predicate = - predicate_builder.build_pruning_predicate(&row_group_metadata); + build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata 
.iter() .enumerate() @@ -777,7 +798,7 @@ mod tests { ); let row_group_metadata = vec![rgm1, rgm2]; let row_group_predicate = - predicate_builder.build_pruning_predicate(&row_group_metadata); + build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata .iter() .enumerate() From 0aea0df448b6df3ec7cb489a5ed49f4840bbb551 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 24 May 2021 07:03:27 -0600 Subject: [PATCH 115/329] Implement fmt_as for ShuffleReaderExec (#400) --- .../src/execution_plans/shuffle_reader.rs | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index bd8f6fdbbea0f..107cc15bfa054 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -24,12 +24,13 @@ use crate::serde::scheduler::PartitionLocation; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; +use datafusion::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use datafusion::{ error::{DataFusionError, Result}, physical_plan::RecordBatchStream, }; use log::info; +use std::fmt::Formatter; /// ShuffleReaderExec reads partitions that have already been materialized by an executor. #[derive(Debug, Clone)] @@ -103,4 +104,31 @@ impl ExecutionPlan for ShuffleReaderExec { .await .map_err(|e| DataFusionError::Execution(format!("Ballista Error: {:?}", e))) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let loc_str = self + .partition_location + .iter() + .map(|l| { + format!( + "[executor={} part={}:{}:{} stats={:?}]", + l.executor_meta.id, + l.partition_id.job_id, + l.partition_id.stage_id, + l.partition_id.partition_id, + l.partition_stats + ) + }) + .collect::>() + .join(","); + write!(f, "ShuffleReaderExec: partition_locations={}", loc_str) + } + } + } } From 9fdc4fe7afdff8fc35b4797c8419daa928898922 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 24 May 2021 09:24:23 -0400 Subject: [PATCH 116/329] Function to create `ArrayRef` from an iterator of ScalarValues (#381) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Function to create `ArrayRef` from an iterator of ScalarValues * Apply suggestions from code review Co-authored-by: Daniël Heres * Update datafusion/src/scalar.rs Co-authored-by: Daniël Heres * Fix up code references Co-authored-by: Daniël Heres --- datafusion/src/scalar.rs | 290 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 290 insertions(+) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index e19e274341a55..f3fa5b2c5de5c 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -293,6 +293,155 @@ impl ScalarValue { self.to_array_of_size(1) } + /// Converts an iterator of references [`ScalarValue`] into an [`ArrayRef`] + /// corresponding to those values. 
For example, + /// + /// Returns an error if the iterator is empty or if the + /// [`ScalarValue`]s are not all the same type + /// + /// Example + /// ``` + /// use datafusion::scalar::ScalarValue; + /// use arrow::array::{ArrayRef, BooleanArray}; + /// + /// let scalars = vec![ + /// ScalarValue::Boolean(Some(true)), + /// ScalarValue::Boolean(None), + /// ScalarValue::Boolean(Some(false)), + /// ]; + /// + /// // Build an Array from the list of ScalarValues + /// let array = ScalarValue::iter_to_array(scalars.iter()) + /// .unwrap(); + /// + /// let expected: ArrayRef = std::sync::Arc::new( + /// BooleanArray::from(vec![ + /// Some(true), + /// None, + /// Some(false) + /// ] + /// )); + /// + /// assert_eq!(&array, &expected); + /// ``` + pub fn iter_to_array<'a>( + scalars: impl IntoIterator, + ) -> Result { + let mut scalars = scalars.into_iter().peekable(); + + // figure out the type based on the first element + let data_type = match scalars.peek() { + None => { + return Err(DataFusionError::Internal( + "Empty iterator passed to ScalarValue::iter_to_array".to_string(), + )) + } + Some(sv) => sv.get_datatype(), + }; + + /// Creates an array of $ARRAY_TY by unpacking values of + /// SCALAR_TY for primitive types + macro_rules! build_array_primitive { + ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ + { + let values = scalars + .map(|sv| { + if let ScalarValue::$SCALAR_TY(v) = sv { + Ok(*v) + } else { + Err(DataFusionError::Internal(format!( + "Inconsistent types in ScalarValue::iter_to_array. \ + Expected {:?}, got {:?}", + data_type, sv + ))) + } + }) + .collect::>>()?; + + let array: $ARRAY_TY = values.iter().collect(); + Arc::new(array) + } + }}; + } + + /// Creates an array of $ARRAY_TY by unpacking values of + /// SCALAR_TY for "string-like" types. + macro_rules! build_array_string { + ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ + { + let values = scalars + .map(|sv| { + if let ScalarValue::$SCALAR_TY(v) = sv { + Ok(v) + } else { + Err(DataFusionError::Internal(format!( + "Inconsistent types in ScalarValue::iter_to_array. 
\ + Expected {:?}, got {:?}", + data_type, sv + ))) + } + }) + .collect::>>()?; + + // it is annoying that one can not create + // StringArray et al directly from iter of &String, + // requiring this map to &str + let values = values.iter().map(|s| s.as_ref()); + + let array: $ARRAY_TY = values.collect(); + Arc::new(array) + } + }}; + } + + let array: ArrayRef = match &data_type { + DataType::Boolean => build_array_primitive!(BooleanArray, Boolean), + DataType::Float32 => build_array_primitive!(Float32Array, Float32), + DataType::Float64 => build_array_primitive!(Float64Array, Float64), + DataType::Int8 => build_array_primitive!(Int8Array, Int8), + DataType::Int16 => build_array_primitive!(Int16Array, Int16), + DataType::Int32 => build_array_primitive!(Int32Array, Int32), + DataType::Int64 => build_array_primitive!(Int64Array, Int64), + DataType::UInt8 => build_array_primitive!(UInt8Array, UInt8), + DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16), + DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32), + DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64), + DataType::Utf8 => build_array_string!(StringArray, Utf8), + DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8), + DataType::Binary => build_array_string!(BinaryArray, Binary), + DataType::LargeBinary => build_array_string!(LargeBinaryArray, LargeBinary), + DataType::Date32 => build_array_primitive!(Date32Array, Date32), + DataType::Date64 => build_array_primitive!(Date64Array, Date64), + DataType::Timestamp(TimeUnit::Second, None) => { + build_array_primitive!(TimestampSecondArray, TimestampSecond) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + build_array_primitive!(TimestampMillisecondArray, TimestampMillisecond) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + build_array_primitive!(TimestampMicrosecondArray, TimestampMicrosecond) + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + build_array_primitive!(TimestampNanosecondArray, TimestampNanosecond) + } + DataType::Interval(IntervalUnit::DayTime) => { + build_array_primitive!(IntervalDayTimeArray, IntervalDayTime) + } + DataType::Interval(IntervalUnit::YearMonth) => { + build_array_primitive!(IntervalYearMonthArray, IntervalYearMonth) + } + _ => { + return Err(DataFusionError::Internal(format!( + "Unsupported creation of {:?} array from ScalarValue {:?}", + data_type, + scalars.peek() + ))) + } + }; + + Ok(array) + } + /// Converts a scalar value into an array of `size` rows. pub fn to_array_of_size(&self, size: usize) -> ArrayRef { match self { @@ -609,6 +758,12 @@ impl From for ScalarValue { } } +impl From<&str> for ScalarValue { + fn from(value: &str) -> Self { + ScalarValue::Utf8(Some(value.to_string())) + } +} + macro_rules! impl_try_from { ($SCALAR:ident, $NATIVE:ident) => { impl TryFrom for $NATIVE { @@ -940,4 +1095,139 @@ mod tests { assert!(prim_array.is_null(1)); assert_eq!(prim_array.value(2), 101); } + + /// Creates array directly and via ScalarValue and ensures they are the same + macro_rules! check_scalar_iter { + ($SCALAR_T:ident, $ARRAYTYPE:ident, $INPUT:expr) => {{ + let scalars: Vec<_> = + $INPUT.iter().map(|v| ScalarValue::$SCALAR_T(*v)).collect(); + + let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + + let expected: ArrayRef = Arc::new($ARRAYTYPE::from($INPUT)); + + assert_eq!(&array, &expected); + }}; + } + + /// Creates array directly and via ScalarValue and ensures they + /// are the same, for string arrays + macro_rules! 
check_scalar_iter_string { + ($SCALAR_T:ident, $ARRAYTYPE:ident, $INPUT:expr) => {{ + let scalars: Vec<_> = $INPUT + .iter() + .map(|v| ScalarValue::$SCALAR_T(v.map(|v| v.to_string()))) + .collect(); + + let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + + let expected: ArrayRef = Arc::new($ARRAYTYPE::from($INPUT)); + + assert_eq!(&array, &expected); + }}; + } + + /// Creates array directly and via ScalarValue and ensures they + /// are the same, for binary arrays + macro_rules! check_scalar_iter_binary { + ($SCALAR_T:ident, $ARRAYTYPE:ident, $INPUT:expr) => {{ + let scalars: Vec<_> = $INPUT + .iter() + .map(|v| ScalarValue::$SCALAR_T(v.map(|v| v.to_vec()))) + .collect(); + + let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + + let expected: $ARRAYTYPE = + $INPUT.iter().map(|v| v.map(|v| v.to_vec())).collect(); + + let expected: ArrayRef = Arc::new(expected); + + assert_eq!(&array, &expected); + }}; + } + + #[test] + fn scalar_iter_to_array_boolean() { + check_scalar_iter!(Boolean, BooleanArray, vec![Some(true), None, Some(false)]); + check_scalar_iter!(Float32, Float32Array, vec![Some(1.9), None, Some(-2.1)]); + check_scalar_iter!(Float64, Float64Array, vec![Some(1.9), None, Some(-2.1)]); + + check_scalar_iter!(Int8, Int8Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(Int16, Int16Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(Int32, Int32Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(Int64, Int64Array, vec![Some(1), None, Some(3)]); + + check_scalar_iter!(UInt8, UInt8Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(UInt16, UInt16Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(UInt32, UInt32Array, vec![Some(1), None, Some(3)]); + check_scalar_iter!(UInt64, UInt64Array, vec![Some(1), None, Some(3)]); + + check_scalar_iter!( + TimestampSecond, + TimestampSecondArray, + vec![Some(1), None, Some(3)] + ); + check_scalar_iter!( + TimestampMillisecond, + TimestampMillisecondArray, + vec![Some(1), None, Some(3)] + ); + check_scalar_iter!( + TimestampMicrosecond, + TimestampMicrosecondArray, + vec![Some(1), None, Some(3)] + ); + check_scalar_iter!( + TimestampNanosecond, + TimestampNanosecondArray, + vec![Some(1), None, Some(3)] + ); + + check_scalar_iter_string!( + Utf8, + StringArray, + vec![Some("foo"), None, Some("bar")] + ); + check_scalar_iter_string!( + LargeUtf8, + LargeStringArray, + vec![Some("foo"), None, Some("bar")] + ); + check_scalar_iter_binary!( + Binary, + BinaryArray, + vec![Some(b"foo"), None, Some(b"bar")] + ); + check_scalar_iter_binary!( + LargeBinary, + LargeBinaryArray, + vec![Some(b"foo"), None, Some(b"bar")] + ); + } + + #[test] + fn scalar_iter_to_array_empty() { + let scalars = vec![] as Vec; + + let result = ScalarValue::iter_to_array(scalars.iter()).unwrap_err(); + assert!( + result + .to_string() + .contains("Empty iterator passed to ScalarValue::iter_to_array"), + "{}", + result + ); + } + + #[test] + fn scalar_iter_to_array_mismatched_types() { + use ScalarValue::*; + // If the scalar values are not all the correct type, error here + let scalars: Vec = vec![Boolean(Some(true)), Int32(Some(5))]; + + let result = ScalarValue::iter_to_array(scalars.iter()).unwrap_err(); + assert!(result.to_string().contains("Inconsistent types in ScalarValue::iter_to_array. 
Expected Boolean, got Int32(5)"), + "{}", result); + } } From ee8b5bf9c482e8af1a0fc6ce6bbd9d32b0949d89 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 24 May 2021 21:25:46 +0800 Subject: [PATCH 117/329] use prettier to format md files (#367) * use prettier to format md files * apply prettier * update ballista --- CODE_OF_CONDUCT.md | 4 +- DEVELOPERS.md | 72 +++++------ README.md | 115 ++++++++---------- ballista/README.md | 11 +- ballista/docs/README.md | 8 +- ballista/docs/architecture.md | 30 ++--- ballista/docs/dev-env.md | 5 +- ballista/docs/integration-testing.md | 5 +- ballista/rust/client/README.md | 2 +- ballista/rust/core/README.md | 1 + ballista/rust/executor/README.md | 3 +- ballista/rust/scheduler/README.md | 9 +- ballista/ui/scheduler/README.md | 4 + datafusion/docs/cli.md | 13 +- docs/user-guide/README.md | 3 +- docs/user-guide/src/SUMMARY.md | 6 +- .../src/distributed/client-python.md | 3 +- .../user-guide/src/distributed/client-rust.md | 5 +- docs/user-guide/src/distributed/clients.md | 1 + .../src/distributed/configuration.md | 12 +- docs/user-guide/src/distributed/deployment.md | 2 +- .../src/distributed/docker-compose.md | 10 +- .../src/distributed/introduction.md | 12 +- docs/user-guide/src/distributed/kubernetes.md | 49 ++++---- .../user-guide/src/distributed/raspberrypi.md | 31 ++--- docs/user-guide/src/distributed/standalone.md | 9 +- docs/user-guide/src/example-usage.md | 1 + docs/user-guide/src/faq.md | 9 +- docs/user-guide/src/introduction.md | 9 +- docs/user-guide/src/library.md | 1 + docs/user-guide/src/sql/select.md | 13 +- 31 files changed, 239 insertions(+), 219 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 2efe740b77c50..9a24b9b8a1109 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -19,6 +19,6 @@ # Code of Conduct -* [Code of Conduct for The Apache Software Foundation][1] +- [Code of Conduct for The Apache Software Foundation][1] -[1]: https://www.apache.org/foundation/policies/conduct.html \ No newline at end of file +[1]: https://www.apache.org/foundation/policies/conduct.html diff --git a/DEVELOPERS.md b/DEVELOPERS.md index be8bb61b148f5..60048c868e6c1 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -21,57 +21,57 @@ This section describes how you can get started at developing DataFusion. -For information on developing with Ballista, see the -[Ballista developer documentation](ballista/docs/README.md). +For information on developing with Ballista, see the +[Ballista developer documentation](ballista/docs/README.md). ### Bootstrap environment DataFusion is written in Rust and it uses a standard rust toolkit: -* `cargo build` -* `cargo fmt` to format the code -* `cargo test` to test -* etc. +- `cargo build` +- `cargo fmt` to format the code +- `cargo test` to test +- etc. 
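In practice these commands form a short edit/compile/test loop. The block below simply shows the commands already listed above run in a typical order from the repository root; nothing beyond standard Cargo tooling is assumed.

```bash
# Typical local development loop (standard Cargo commands, run from the repo root)
cargo fmt      # format the code
cargo build    # compile the workspace
cargo test     # run the test suite
```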
## How to add a new scalar function Below is a checklist of what you need to do to add a new scalar function to DataFusion: -* Add the actual implementation of the function: - * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions - * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions - * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](datafusion/src/physical_plan) for other functions -* In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: - * a new variant to `BuiltinScalarFunction` - * a new entry to `FromStr` with the name of the function as called by SQL - * a new line in `return_type` with the expected return type of the function, given an incoming type - * a new line in `signature` with the signature of the function (number and types of its arguments) - * a new line in `create_physical_expr` mapping the built-in to the implementation - * tests to the function. -* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. -* In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: - * a new entry of the `unary_scalar_expr!` macro for the new function. -* In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: - * a new entry in the `pub use expr::{}` set. +- Add the actual implementation of the function: + - [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + - [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + - [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + - create a new module [here](datafusion/src/physical_plan) for other functions +- In [src/physical_plan/functions](datafusion/src/physical_plan/functions.rs), add: + - a new variant to `BuiltinScalarFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_physical_expr` mapping the built-in to the implementation + - tests to the function. +- In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +- In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: + - a new entry of the `unary_scalar_expr!` macro for the new function. +- In [src/logical_plan/mod](datafusion/src/logical_plan/mod.rs), add: + - a new entry in the `pub use expr::{}` set. 
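To make the shape of these additions concrete, here is a deliberately simplified, standalone sketch of the first two `BuiltinScalarFunction` bullets (the new enum variant and the `FromStr` entry). The `MyFunc`/`my_func` names are placeholders, and the real definitions in `physical_plan/functions.rs` contain many more variants and details than shown; this only illustrates the pattern.

```rust
// Standalone sketch only: simplified stand-ins for the items named above,
// not the actual definitions from datafusion/src/physical_plan/functions.rs.
use std::str::FromStr;

#[derive(Debug, PartialEq)]
enum BuiltinScalarFunction {
    Sqrt,
    // 1. the new variant for the function being added (placeholder name)
    MyFunc,
}

impl FromStr for BuiltinScalarFunction {
    type Err = String;

    // 2. map the name used in SQL onto the new variant
    fn from_str(name: &str) -> Result<Self, Self::Err> {
        match name {
            "sqrt" => Ok(BuiltinScalarFunction::Sqrt),
            "my_func" => Ok(BuiltinScalarFunction::MyFunc),
            other => Err(format!("unknown scalar function: {}", other)),
        }
    }
}

fn main() {
    // the SQL planner performs a lookup along these lines
    assert_eq!(
        "my_func".parse::<BuiltinScalarFunction>().unwrap(),
        BuiltinScalarFunction::MyFunc
    );
}
```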
## How to add a new aggregate function Below is a checklist of what you need to do to add a new aggregate function to DataFusion: -* Add the actual implementation of an `Accumulator` and `AggregateExpr`: - * [here](datafusion/src/physical_plan/string_expressions.rs) for string functions - * [here](datafusion/src/physical_plan/math_expressions.rs) for math functions - * [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions - * create a new module [here](datafusion/src/physical_plan) for other functions -* In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: - * a new variant to `BuiltinAggregateFunction` - * a new entry to `FromStr` with the name of the function as called by SQL - * a new line in `return_type` with the expected return type of the function, given an incoming type - * a new line in `signature` with the signature of the function (number and types of its arguments) - * a new line in `create_aggregate_expr` mapping the built-in to the implementation - * tests to the function. -* In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. +- Add the actual implementation of an `Accumulator` and `AggregateExpr`: + - [here](datafusion/src/physical_plan/string_expressions.rs) for string functions + - [here](datafusion/src/physical_plan/math_expressions.rs) for math functions + - [here](datafusion/src/physical_plan/datetime_expressions.rs) for datetime functions + - create a new module [here](datafusion/src/physical_plan) for other functions +- In [src/physical_plan/aggregates](datafusion/src/physical_plan/aggregates.rs), add: + - a new variant to `BuiltinAggregateFunction` + - a new entry to `FromStr` with the name of the function as called by SQL + - a new line in `return_type` with the expected return type of the function, given an incoming type + - a new line in `signature` with the signature of the function (number and types of its arguments) + - a new line in `create_aggregate_expr` mapping the built-in to the implementation + - tests to the function. +- In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. ## How to display plans graphically diff --git a/README.md b/README.md index f72c73bb80372..f3ae412fde940 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ logical query plans as well as a query optimizer and execution engine capable of parallel execution against partitioned data sources (CSV and Parquet) using threads. -DataFusion also supports distributed query execution via the +DataFusion also supports distributed query execution via the [Ballista](ballista/README.md) crate. ## Use Cases @@ -42,24 +42,24 @@ the convenience of an SQL interface or a DataFrame API. ## Why DataFusion? -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. 
+- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. ## Known Uses Here are some of the projects known to use DataFusion: -* [Ballista](ballista) Distributed Compute Platform -* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) -* [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) -* [datafusion-python](https://pypi.org/project/datafusion) -* [delta-rs](https://github.com/delta-io/delta-rs) -* [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database -* [ROAPI](https://github.com/roapi/roapi) -* [Tensorbase](https://github.com/tensorbase/tensorbase) -* [Squirtle](https://github.com/DSLAM-UMD/Squirtle) +- [Ballista](ballista) Distributed Compute Platform +- [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust) +- [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust) +- [datafusion-python](https://pypi.org/project/datafusion) +- [delta-rs](https://github.com/delta-io/delta-rs) +- [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database +- [ROAPI](https://github.com/roapi/roapi) +- [Tensorbase](https://github.com/tensorbase/tensorbase) +- [Squirtle](https://github.com/DSLAM-UMD/Squirtle) (if you know of another project, please submit a PR to add a link!) @@ -122,8 +122,6 @@ Both of these examples will produce +---+--------+ ``` - - ## Using DataFusion as a library DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). @@ -230,7 +228,6 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] Parquet primitive types - [ ] Parquet nested types - ## Extensibility DataFusion is designed to be extensible at all points. To that end, you can provide your own custom: @@ -242,26 +239,24 @@ DataFusion is designed to be extensible at all points. To that end, you can prov - [x] User Defined `LogicalPlan` nodes - [x] User Defined `ExecutionPlan` nodes - # Supported SQL This library currently supports many SQL constructs, including -* `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations -* `SELECT ... FROM ...` together with any expression -* `ALIAS` to name an expression -* `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` -* most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. -* `WHERE` to filter -* `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` -* `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` - +- `CREATE EXTERNAL TABLE X STORED AS PARQUET LOCATION '...';` to register a table's locations +- `SELECT ... FROM ...` together with any expression +- `ALIAS` to name an expression +- `CAST` to change types, including e.g. `Timestamp(Nanosecond, None)` +- most mathematical unary and binary expressions such as `+`, `/`, `sqrt`, `tan`, `>=`. 
+- `WHERE` to filter +- `GROUP BY` together with one of the following aggregations: `MIN`, `MAX`, `COUNT`, `SUM`, `AVG` +- `ORDER BY` together with an expression and optional `ASC` or `DESC` and also optional `NULLS FIRST` or `NULLS LAST` ## Supported Functions DataFusion strives to implement a subset of the [PostgreSQL SQL dialect](https://www.postgresql.org/docs/current/functions.html) where possible. We explicitly choose a single dialect to maximize interoperability with other tools and allow reuse of the PostgreSQL documents and tutorials as much as possible. -Currently, only a subset of the PosgreSQL dialect is implemented, and we will document any deviations. +Currently, only a subset of the PostgreSQL dialect is implemented, and we will document any deviations. ## Schema Metadata / Information Schema Support @@ -269,8 +264,7 @@ DataFusion supports the showing metadata about the tables available. This inform More information can be found in the [Postgres docs](https://www.postgresql.org/docs/13/infoschema-schema.html)). - -To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: +To show tables available for use in DataFusion, use the `SHOW TABLES` command or the `information_schema.tables` view: ```sql > show tables; @@ -291,7 +285,7 @@ To show tables available for use in DataFusion, use the `SHOW TABLES` command o +---------------+--------------------+------------+--------------+ ``` -To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: +To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or the or `information_schema.columns` view: ```sql > show columns from t; @@ -313,8 +307,6 @@ To show the schema of a table in DataFusion, use the `SHOW COLUMNS` command or +------------+-------------+------------------+-------------+-----------+ ``` - - ## Supported Data Types DataFusion uses Arrow, and thus the Arrow type system, for query @@ -322,41 +314,38 @@ execution. 
The SQL types from [sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) are mapped to Arrow types according to the following table - -| SQL Data Type | Arrow DataType | -| --------------- | -------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | *Not yet supported* | -| `CLOB` | *Not yet supported* | -| `BINARY` | *Not yet supported* | -| `VARBINARY` | *Not yet supported* | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float64` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Date64` | -| `INTERVAL` | *Not yet supported* | -| `REGCLASS` | *Not yet supported* | -| `TEXT` | *Not yet supported* | -| `BYTEA` | *Not yet supported* | -| `CUSTOM` | *Not yet supported* | -| `ARRAY` | *Not yet supported* | - +| SQL Data Type | Arrow DataType | +| ------------- | ------------------------------- | +| `CHAR` | `Utf8` | +| `VARCHAR` | `Utf8` | +| `UUID` | _Not yet supported_ | +| `CLOB` | _Not yet supported_ | +| `BINARY` | _Not yet supported_ | +| `VARBINARY` | _Not yet supported_ | +| `DECIMAL` | `Float64` | +| `FLOAT` | `Float32` | +| `SMALLINT` | `Int16` | +| `INT` | `Int32` | +| `BIGINT` | `Int64` | +| `REAL` | `Float64` | +| `DOUBLE` | `Float64` | +| `BOOLEAN` | `Boolean` | +| `DATE` | `Date32` | +| `TIME` | `Time64(TimeUnit::Millisecond)` | +| `TIMESTAMP` | `Date64` | +| `INTERVAL` | _Not yet supported_ | +| `REGCLASS` | _Not yet supported_ | +| `TEXT` | _Not yet supported_ | +| `BYTEA` | _Not yet supported_ | +| `CUSTOM` | _Not yet supported_ | +| `ARRAY` | _Not yet supported_ | # Architecture Overview There is no formal document describing DataFusion's architecture yet, but the following presentations offer a good overview of its different components and how they interact together. -* (March 2021): The DataFusion architecture is described in *Query Engine Design and the Rust-Based DataFusion in Apache Arrow*: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) -* (Feburary 2021): How DataFusion is used within the Ballista Project is described in *Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) - +- (March 2021): The DataFusion architecture is described in _Query Engine Design and the Rust-Based DataFusion in Apache Arrow_: [recording](https://www.youtube.com/watch?v=K6eCAVEk4kU) (DataFusion content starts ~ 15 minutes in) and [slides](https://www.slideshare.net/influxdata/influxdb-iox-tech-talks-query-engine-design-and-the-rustbased-datafusion-in-apache-arrow-244161934) +- (Feburary 2021): How DataFusion is used within the Ballista Project is described in \*Ballista: Distributed Compute with Rust and Apache Arrow: [recording](https://www.youtube.com/watch?v=ZZHQaOap9pQ) # Developer's guide diff --git a/ballista/README.md b/ballista/README.md index 276af3c4d9b2d..038146a639ed8 100644 --- a/ballista/README.md +++ b/ballista/README.md @@ -19,14 +19,14 @@ # Ballista: Distributed Compute with Apache Arrow -Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. 
It is built -on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built +on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as first-class citizens without paying a penalty for serialization costs. The foundational technologies in Ballista are: - [Apache Arrow](https://arrow.apache.org/) memory model and compute kernels for efficient processing of data. -- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient +- [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient data transfer between processes. - [Google Protocol Buffers](https://developers.google.com/protocol-buffers) for serializing query plans. - [Docker](https://www.docker.com/) for packaging up executors along with user-defined code. @@ -57,7 +57,6 @@ April 2021 and should be considered experimental. ## Getting Started -The [Ballista Developer Documentation](docs/README.md) and the -[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the +The [Ballista Developer Documentation](docs/README.md) and the +[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the best sources of information for getting started with Ballista. - diff --git a/ballista/docs/README.md b/ballista/docs/README.md index 6588c1d4d37b6..38d3db5dff395 100644 --- a/ballista/docs/README.md +++ b/ballista/docs/README.md @@ -16,19 +16,19 @@ specific language governing permissions and limitations under the License. --> + # Ballista Developer Documentation -This directory contains documentation for developers that are contributing to Ballista. If you are looking for -end-user documentation for a published release, please start with the +This directory contains documentation for developers that are contributing to Ballista. If you are looking for +end-user documentation for a published release, please start with the [DataFusion User Guide](../../docs/user-guide) instead. ## Architecture & Design -- Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor +- Read the [Architecture Overview](architecture.md) to get an understanding of the scheduler and executor processes and how distributed query execution works. ## Build, Test, Release - Setting up a [development environment](dev-env.md). - [Integration Testing](integration-testing.md) - diff --git a/ballista/docs/architecture.md b/ballista/docs/architecture.md index a73b53a087016..04e1dc26bac1e 100644 --- a/ballista/docs/architecture.md +++ b/ballista/docs/architecture.md @@ -16,37 +16,38 @@ specific language governing permissions and limitations under the License. --> + # Ballista Architecture ## Overview -Ballista allows queries to be executed in a distributed cluster. A cluster consists of one or +Ballista allows queries to be executed in a distributed cluster. A cluster consists of one or more scheduler processes and one or more executor processes. See the following sections in this document for more details about these components. 
-The scheduler accepts logical query plans and translates them into physical query plans using DataFusion and then -runs a secondary planning/optimization process to translate the physical query plan into a distributed physical -query plan. +The scheduler accepts logical query plans and translates them into physical query plans using DataFusion and then +runs a secondary planning/optimization process to translate the physical query plan into a distributed physical +query plan. -This process breaks a query down into a number of query stages that can be executed independently. There are -dependencies between query stages and these dependencies form a directionally-acyclic graph (DAG) because a query +This process breaks a query down into a number of query stages that can be executed independently. There are +dependencies between query stages and these dependencies form a directionally-acyclic graph (DAG) because a query stage cannot start until its child query stages have completed. -Each query stage has one or more partitions that can be processed in parallel by the available +Each query stage has one or more partitions that can be processed in parallel by the available executors in the cluster. This is the basic unit of scalability in Ballista. -The following diagram shows the flow of requests and responses between the client, scheduler, and executor -processes. +The following diagram shows the flow of requests and responses between the client, scheduler, and executor +processes. ![Query Execution Flow](images/query-execution.png) ## Scheduler Process -The scheduler process implements a gRPC interface (defined in +The scheduler process implements a gRPC interface (defined in [ballista.proto](../rust/ballista/proto/ballista.proto)). The interface provides the following methods: | Method | Description | -|----------------------|----------------------------------------------------------------------| +| -------------------- | -------------------------------------------------------------------- | | ExecuteQuery | Submit a logical query plan or SQL query for execution | | GetExecutorsMetadata | Retrieves a list of executors that have registered with a scheduler | | GetFileMetadata | Retrieve metadata about files available in the cluster file system | @@ -60,7 +61,7 @@ The scheduler can run in standalone mode, or can be run in clustered mode using The executor process implements the Apache Arrow Flight gRPC interface and is responsible for: - Executing query stages and persisting the results to disk in Apache Arrow IPC Format -- Making query stage results available as Flights so that they can be retrieved by other executors as well as by +- Making query stage results available as Flights so that they can be retrieved by other executors as well as by clients ## Rust Client @@ -69,7 +70,6 @@ The Rust client provides a DataFrame API that is a thin wrapper around the DataF the means for a client to build a query plan for execution. The client executes the query plan by submitting an `ExecuteLogicalPlan` request to the scheduler and then calls -`GetJobStatus` to check for completion. On completion, the client receives a list of locations for the Flights -containing the results for the query and will then connect to the appropriate executor processes to retrieve +`GetJobStatus` to check for completion. On completion, the client receives a list of locations for the Flights +containing the results for the query and will then connect to the appropriate executor processes to retrieve those results. 
- diff --git a/ballista/docs/dev-env.md b/ballista/docs/dev-env.md index bf50c9d9c9137..f02d156d1ec83 100644 --- a/ballista/docs/dev-env.md +++ b/ballista/docs/dev-env.md @@ -16,13 +16,14 @@ specific language governing permissions and limitations under the License. --> + # Setting up a Rust development environment You will need a standard Rust development environment. The easiest way to achieve this is by using rustup: https://rustup.rs/ ## Install OpenSSL -Follow instructions for [setting up OpenSSL](https://docs.rs/openssl/0.10.28/openssl/). For Ubuntu users, the following +Follow instructions for [setting up OpenSSL](https://docs.rs/openssl/0.10.28/openssl/). For Ubuntu users, the following command works. ```bash @@ -35,4 +36,4 @@ You'll need cmake in order to compile some of ballista's dependencies. Ubuntu us ```bash sudo apt-get install cmake -``` \ No newline at end of file +``` diff --git a/ballista/docs/integration-testing.md b/ballista/docs/integration-testing.md index 3f818a4596b0f..5c5b9ee9b6560 100644 --- a/ballista/docs/integration-testing.md +++ b/ballista/docs/integration-testing.md @@ -16,10 +16,11 @@ specific language governing permissions and limitations under the License. --> + # Integration Testing -We use the [DataFusion Benchmarks](https://github.com/apache/arrow-datafusion/tree/master/benchmarks) for integration -testing. +We use the [DataFusion Benchmarks](https://github.com/apache/arrow-datafusion/tree/master/benchmarks) for integration +testing. The integration tests can be executed by running the following command from the root of the DataFusion repository. diff --git a/ballista/rust/client/README.md b/ballista/rust/client/README.md index 00bf3ea5ec6c2..a9fbd8efa3743 100644 --- a/ballista/rust/client/README.md +++ b/ballista/rust/client/README.md @@ -18,5 +18,5 @@ --> # Ballista - Rust -This crate contains the Ballista client library. For an example usage, please refer [here](../benchmarks/tpch/README.md). +This crate contains the Ballista client library. For an example usage, please refer [here](../benchmarks/tpch/README.md). diff --git a/ballista/rust/core/README.md b/ballista/rust/core/README.md index f97952b3f7023..d51ae2fddbf66 100644 --- a/ballista/rust/core/README.md +++ b/ballista/rust/core/README.md @@ -18,4 +18,5 @@ --> # Ballista - Rust + This crate contains the core Ballista types. diff --git a/ballista/rust/executor/README.md b/ballista/rust/executor/README.md index c0824e639fdc8..105e027c48874 100644 --- a/ballista/rust/executor/README.md +++ b/ballista/rust/executor/README.md @@ -18,6 +18,7 @@ --> # Ballista Executor - Rust + This crate contains the Ballista Executor. It can be used both as a library or as a binary. ## Run @@ -28,4 +29,4 @@ RUST_LOG=info cargo run --release [2021-02-11T05:30:13Z INFO executor] Running with config: ExecutorConfig { host: "localhost", port: 50051, work_dir: "/var/folders/y8/fc61kyjd4n53tn444n72rjrm0000gn/T/.tmpv1LjN0", concurrent_tasks: 4 } ``` -By default, the executor will bind to `localhost` and listen on port `50051`. \ No newline at end of file +By default, the executor will bind to `localhost` and listen on port `50051`. diff --git a/ballista/rust/scheduler/README.md b/ballista/rust/scheduler/README.md index d87eec30e23b7..78a800092fda9 100644 --- a/ballista/rust/scheduler/README.md +++ b/ballista/rust/scheduler/README.md @@ -18,6 +18,7 @@ --> # Ballista Scheduler + This crate contains the Ballista Scheduler. It can be used both as a library or as a binary. 
## Run @@ -32,8 +33,9 @@ $ RUST_LOG=info cargo run --release By default, the scheduler will bind to `localhost` and listen on port `50051`. ## Connecting to Scheduler -Scheduler supports REST model also using content negotiation. -For e.x if you want to get list of executors connected to the scheduler, + +Scheduler supports REST model also using content negotiation. +For e.x if you want to get list of executors connected to the scheduler, you can do (assuming you use default config) ```bash @@ -43,7 +45,8 @@ curl --request GET \ ``` ## Scheduler UI -A basic ui for the scheduler is in `ui/scheduler` of the ballista repo. + +A basic ui for the scheduler is in `ui/scheduler` of the ballista repo. It can be started using the following [yarn](https://yarnpkg.com/) command ```bash diff --git a/ballista/ui/scheduler/README.md b/ballista/ui/scheduler/README.md index 90bc2bface4c6..3fee499ead9c1 100644 --- a/ballista/ui/scheduler/README.md +++ b/ballista/ui/scheduler/README.md @@ -22,7 +22,9 @@ ## Start project from source ### Run scheduler/executor + First, run scheduler from project: + ```shell $ cd rust/scheduler $ RUST_LOG=info cargo run --release @@ -34,6 +36,7 @@ $ RUST_LOG=info cargo run --release ``` and run executor in new terminal: + ```shell $ cd rust/executor $ RUST_LOG=info cargo run --release @@ -44,6 +47,7 @@ $ RUST_LOG=info cargo run --release ``` ### Run Client project + ```shell $ cd ui/scheduler $ yarn diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md index 27605b2e98c0f..db3e0b939a81c 100644 --- a/datafusion/docs/cli.md +++ b/datafusion/docs/cli.md @@ -45,16 +45,23 @@ docker run -it -v $(your_data_location):/data datafusion-cli ## Usage ``` +DataFusion 4.0.0-SNAPSHOT +DataFusion is an in-memory query engine that uses Apache Arrow as the memory model. It supports executing SQL queries +against CSV and Parquet files as well as querying directly against in-memory data. + USAGE: - datafusion-cli [OPTIONS] + datafusion-cli [FLAGS] [OPTIONS] FLAGS: -h, --help Prints help information + -q, --quiet Reduce printing other than the results and work quietly -V, --version Prints version information OPTIONS: - -c, --batch-size The batch size of each query, default value is 1048576 + -c, --batch-size The batch size of each query, or use DataFusion default -p, --data-path Path to your data, default to current directory + -f, --file Execute commands from file, then exit + --format Output format (possible values: table, csv, tsv, json) [default: table] ``` Type `exit` or `quit` to exit the CLI. @@ -64,7 +71,7 @@ Type `exit` or `quit` to exit the CLI. Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files. ```sql -CREATE EXTERNAL TABLE taxi +CREATE EXTERNAL TABLE taxi STORED AS PARQUET LOCATION '/mnt/nyctaxi/tripdata.parquet'; ``` diff --git a/docs/user-guide/README.md b/docs/user-guide/README.md index 0b9278c593b1e..6698e5631d936 100644 --- a/docs/user-guide/README.md +++ b/docs/user-guide/README.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # DataFusion User Guide Source This directory contains the sources for the DataFusion user guide. 
@@ -27,4 +28,4 @@ To generate the user guide in HTML format, run the following commands: ```bash cargo install mdbook mdbook build -``` \ No newline at end of file +``` diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md index 903dad1732a15..aa101b3de1173 100644 --- a/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/SUMMARY.md @@ -16,12 +16,14 @@ specific language governing permissions and limitations under the License. --> + # Summary - [Introduction](introduction.md) -- [Example Usage](example-usage.md) +- [Example Usage](example-usage.md) - [Use as a Library](library.md) - [SQL Reference](sql/introduction.md) + - [SELECT](sql/select.md) - [DDL](sql/ddl.md) - [CREATE EXTERNAL TABLE](sql/ddl.md) @@ -36,4 +38,4 @@ - [Clients](distributed/clients.md) - [Rust](distributed/client-rust.md) - [Python](distributed/client-python.md) -- [Frequently Asked Questions](faq.md) \ No newline at end of file +- [Frequently Asked Questions](faq.md) diff --git a/docs/user-guide/src/distributed/client-python.md b/docs/user-guide/src/distributed/client-python.md index 7525c608ad233..dac06408bed03 100644 --- a/docs/user-guide/src/distributed/client-python.md +++ b/docs/user-guide/src/distributed/client-python.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Python -Coming soon. \ No newline at end of file +Coming soon. diff --git a/docs/user-guide/src/distributed/client-rust.md b/docs/user-guide/src/distributed/client-rust.md index 048c10fc9263d..7f7ffcb95b827 100644 --- a/docs/user-guide/src/distributed/client-rust.md +++ b/docs/user-guide/src/distributed/client-rust.md @@ -16,7 +16,8 @@ specific language governing permissions and limitations under the License. --> + ## Ballista Rust Client -The Rust client supports a `DataFrame` API as well as SQL. See the -[TPC-H Benchmark Client](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) for an example. \ No newline at end of file +The Rust client supports a `DataFrame` API as well as SQL. See the +[TPC-H Benchmark Client](https://github.com/ballista-compute/ballista/tree/main/rust/benchmarks/tpch) for an example. diff --git a/docs/user-guide/src/distributed/clients.md b/docs/user-guide/src/distributed/clients.md index 1e223dd8eb05d..7b69f195b1a2d 100644 --- a/docs/user-guide/src/distributed/clients.md +++ b/docs/user-guide/src/distributed/clients.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + ## Clients - [Rust](client-rust.md) diff --git a/docs/user-guide/src/distributed/configuration.md b/docs/user-guide/src/distributed/configuration.md index 52b05b0e91679..56ca9289511a0 100644 --- a/docs/user-guide/src/distributed/configuration.md +++ b/docs/user-guide/src/distributed/configuration.md @@ -16,8 +16,10 @@ specific language governing permissions and limitations under the License. --> -# Configuration -The rust executor and scheduler can be configured using toml files, environment variables and command line arguments. The specification for config options can be found in `rust/ballista/src/bin/[executor|scheduler]_config_spec.toml`. + +# Configuration + +The rust executor and scheduler can be configured using toml files, environment variables and command line arguments. The specification for config options can be found in `rust/ballista/src/bin/[executor|scheduler]_config_spec.toml`. Those files fully define Ballista's configuration. 
If there is a discrepancy between this documentation and the files, assume those files are correct. @@ -25,8 +27,8 @@ To get a list of command line arguments, run the binary with `--help` There is an example config file at `ballista/rust/ballista/examples/example_executor_config.toml` -The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments. +The order of precedence for arguments is: default config file < environment variables < specified config file < command line arguments. -The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml` To specify a config file use the `--config-file` argument. +The executor and scheduler will look for the default config file at `/etc/ballista/[executor|scheduler].toml` To specify a config file use the `--config-file` argument. -Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler respectively. Hyphens in command line arguments become underscores. For example, the `--scheduler-host` argument for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST` \ No newline at end of file +Environment variables are prefixed by `BALLISTA_EXECUTOR` or `BALLISTA_SCHEDULER` for the executor and scheduler respectively. Hyphens in command line arguments become underscores. For example, the `--scheduler-host` argument for the executor becomes `BALLISTA_EXECUTOR_SCHEDULER_HOST` diff --git a/docs/user-guide/src/distributed/deployment.md b/docs/user-guide/src/distributed/deployment.md index 2432f2bebb1a5..3a00c96822e6b 100644 --- a/docs/user-guide/src/distributed/deployment.md +++ b/docs/user-guide/src/distributed/deployment.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Deployment Ballista is packaged as Docker images. Refer to the following guides to create a Ballista cluster: @@ -23,4 +24,3 @@ Ballista is packaged as Docker images. Refer to the following guides to create a - [Create a cluster using Docker](standalone.md) - [Create a cluster using Docker Compose](docker-compose.md) - [Create a cluster using Kubernetes](kubernetes.md) - diff --git a/docs/user-guide/src/distributed/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md index 2548e57e5a759..de27364fc2528 100644 --- a/docs/user-guide/src/distributed/docker-compose.md +++ b/docs/user-guide/src/distributed/docker-compose.md @@ -19,12 +19,12 @@ # Installing Ballista with Docker Compose -Docker Compose is a convenient way to launch a cluister when testing locally. The following Docker Compose example -demonstrates how to start a cluster using a single process that acts as both a scheduler and an executor, with a data +Docker Compose is a convenient way to launch a cluister when testing locally. The following Docker Compose example +demonstrates how to start a cluster using a single process that acts as both a scheduler and an executor, with a data volume mounted into the container so that Ballista can access the host file system. ```yaml -version: '2.0' +version: "2.0" services: etcd: image: quay.io/coreos/etcd:v3.4.9 @@ -41,11 +41,9 @@ services: - "50051:50051" volumes: - ./data:/data - - ``` -With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the single +With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the single node cluster. 
```bash diff --git a/docs/user-guide/src/distributed/introduction.md b/docs/user-guide/src/distributed/introduction.md index 59d7a1a2a5c12..0d96c26219583 100644 --- a/docs/user-guide/src/distributed/introduction.md +++ b/docs/user-guide/src/distributed/introduction.md @@ -19,7 +19,7 @@ ## Overview -Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built on an architecture that allows other programming languages to be supported as first-class citizens without paying a penalty for serialization costs. @@ -41,12 +41,12 @@ The following diagram highlights some of the integrations that will be possible Although Ballista is largely inspired by Apache Spark, there are some key differences. - The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead of GC pauses. -- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized -processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still -largely row-based today. +- Ballista is designed from the ground up to use columnar data, enabling a number of efficiencies such as vectorized + processing (SIMD and GPU) and efficient compression. Although Spark does have some columnar support, it is still + largely row-based today. - The combination of Rust and Arrow provides excellent memory efficiency and memory usage can be 5x - 10x lower than Apache Spark in some cases, which means that more processing can fit on a single node, reducing the overhead of distributed compute. - The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors in any programming language with minimal serialization overhead. - + ## Status -Ballista is at the proof-of-concept phase currently but is under active development by a growing community. \ No newline at end of file +Ballista is at the proof-of-concept phase currently but is under active development by a growing community. diff --git a/docs/user-guide/src/distributed/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md index 027a44d469682..7b9b356dfa428 100644 --- a/docs/user-guide/src/distributed/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Deploying Ballista with Kubernetes Ballista can be deployed to any Kubernetes cluster using the following instructions. These instructions assume that @@ -32,15 +33,15 @@ The k8s deployment consists of: Ballista is at an early stage of development and therefore has some significant limitations: -- There is no support for shared object stores such as S3. All data must exist locally on each node in the +- There is no support for shared object stores such as S3. All data must exist locally on each node in the cluster, including where any client process runs. -- Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a +- Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a backing store. 
-## Create Persistent Volume and Persistent Volume Claim +## Create Persistent Volume and Persistent Volume Claim -Copy the following yaml to a `pv.yaml` file and apply to the cluster to create a persistent volume and a persistent -volume claim so that the specified host directory is available to the containers. This is where any data should be +Copy the following yaml to a `pv.yaml` file and apply to the cluster to create a persistent volume and a persistent +volume claim so that the specified host directory is available to the containers. This is where any data should be located so that Ballista can execute queries against it. ```yaml @@ -121,20 +122,20 @@ spec: ballista-cluster: ballista spec: containers: - - name: ballista-scheduler - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT - command: ["/scheduler"] - args: ["--port=50050"] - ports: - - containerPort: 50050 - name: flight - volumeMounts: - - mountPath: /mnt - name: data + - name: ballista-scheduler + image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + command: ["/scheduler"] + args: ["--port=50050"] + ports: + - containerPort: 50050 + name: flight + volumeMounts: + - mountPath: /mnt + name: data volumes: - - name: data - persistentVolumeClaim: - claimName: data-pv-claim + - name: data + persistentVolumeClaim: + claimName: data-pv-claim --- apiVersion: apps/v1 kind: StatefulSet @@ -156,12 +157,18 @@ spec: - name: ballista-executor image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT command: ["/executor"] - args: ["--port=50051", "--scheduler-host=ballista-scheduler", "--scheduler-port=50050", "--external-host=$(MY_POD_IP)"] + args: + [ + "--port=50051", + "--scheduler-host=ballista-scheduler", + "--scheduler-port=50050", + "--external-host=$(MY_POD_IP)", + ] env: - name: MY_POD_IP valueFrom: fieldRef: - fieldPath: status.podIP + fieldPath: status.podIP ports: - containerPort: 50051 name: flight @@ -212,4 +219,4 @@ Run the following kubectl command to delete the cluster. ```bash kubectl delete -f cluster.yaml -``` \ No newline at end of file +``` diff --git a/docs/user-guide/src/distributed/raspberrypi.md b/docs/user-guide/src/distributed/raspberrypi.md index d4d2079bb5ccd..c7e429aec3c8c 100644 --- a/docs/user-guide/src/distributed/raspberrypi.md +++ b/docs/user-guide/src/distributed/raspberrypi.md @@ -16,31 +16,32 @@ specific language governing permissions and limitations under the License. --> + # Running Ballista on Raspberry Pi The Raspberry Pi single-board computer provides a fun and relatively inexpensive way to get started with distributed computing. -These instructions have been tested using an Ubuntu Linux desktop as the host, and a +These instructions have been tested using an Ubuntu Linux desktop as the host, and a [Raspberry Pi 4 Model B](https://www.raspberrypi.org/products/raspberry-pi-4-model-b/) with 4 GB RAM as the target. ## Preparing the Raspberry Pi We recommend installing the 64-bit version of [Ubuntu for Raspberry Pi](https://ubuntu.com/raspberry-pi). -The Rust implementation of Arrow does not work correctly on 32-bit ARM architectures +The Rust implementation of Arrow does not work correctly on 32-bit ARM architectures ([issue](https://github.com/apache/arrow-rs/issues/109)). ## Cross Compiling DataFusion for the Raspberry Pi -We do not yet publish official Docker images as part of the release process, although we do plan to do this in the -future ([issue #228](https://github.com/apache/arrow-datafusion/issues/228)). 
+We do not yet publish official Docker images as part of the release process, although we do plan to do this in the +future ([issue #228](https://github.com/apache/arrow-datafusion/issues/228)). -Although it is technically possible to build DataFusion directly on a Raspberry Pi, it really isn't very practical. -It is much faster to use [cross](https://github.com/rust-embedded/cross) to cross-compile from a more powerful +Although it is technically possible to build DataFusion directly on a Raspberry Pi, it really isn't very practical. +It is much faster to use [cross](https://github.com/rust-embedded/cross) to cross-compile from a more powerful desktop computer. -Docker must be installed and the Docker daemon must be running before cross-compiling with cross. See the +Docker must be installed and the Docker daemon must be running before cross-compiling with cross. See the [cross](https://github.com/rust-embedded/cross) project for more detailed instructions. Run the following command to install cross. @@ -63,9 +64,9 @@ cross test --target aarch64-unknown-linux-gnu ## Deploying the binaries to Raspberry Pi -You should now be able to copy the executable to the Raspberry Pi using scp on Linux. You will need to change the IP -address in these commands to be the IP address for your Raspberry Pi. The easiest way to find this is to connect a -keyboard and monitor to the Pi and run `ifconfig`. +You should now be able to copy the executable to the Raspberry Pi using scp on Linux. You will need to change the IP +address in these commands to be the IP address for your Raspberry Pi. The easiest way to find this is to connect a +keyboard and monitor to the Pi and run `ifconfig`. ```bash scp ./target/aarch64-unknown-linux-gnu/release/ballista-scheduler ubuntu@10.0.0.186: @@ -83,9 +84,9 @@ It is now possible to run the Ballista scheduler and executor natively on the Pi ## Docker -Using Docker's `buildx` cross-platform functionality, we can also build a docker image targeting ARM64 -from any desktop environment. This will require write access to a Docker repository -on [Docker Hub](https://hub.docker.com/) because the resulting Docker image will be pushed directly +Using Docker's `buildx` cross-platform functionality, we can also build a docker image targeting ARM64 +from any desktop environment. This will require write access to a Docker repository +on [Docker Hub](https://hub.docker.com/) because the resulting Docker image will be pushed directly to the repo. ```bash @@ -118,11 +119,11 @@ docker run -it myrepo/ballista-arm64 \ --concurrency=24 --iterations=1 --debug --host=ballista-scheduler --port=50050 ``` -Note that it will be necessary to mount appropriate volumes into the containers and also configure networking +Note that it will be necessary to mount appropriate volumes into the containers and also configure networking so that the Docker containers can communicate with each other. This can be achieved using Docker compose or Kubernetes. ## Kubernetes With Docker images built using the instructions above, it is now possible to deploy Ballista to a Kubernetes cluster running on one of more Raspberry Pi computers. Refer to the instructions in the [Kubernetes](kubernetes.md) chapter -for more information, and remember to change the Docker image name to `myrepo/ballista-arm64`. \ No newline at end of file +for more information, and remember to change the Docker image name to `myrepo/ballista-arm64`. 
diff --git a/docs/user-guide/src/distributed/standalone.md b/docs/user-guide/src/distributed/standalone.md index e4c24fedd3198..e9db425dc1119 100644 --- a/docs/user-guide/src/distributed/standalone.md +++ b/docs/user-guide/src/distributed/standalone.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + ## Deploying a standalone Ballista cluster ### Start a Scheduler @@ -50,7 +51,7 @@ Start one or more executor processes. Each executor process will need to listen ```bash docker run --network=host \ -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \ - /executor --external-host localhost --port 50051 + /executor --external-host localhost --port 50051 ``` Use `docker ps` to check that both the scheduer and executor(s) are now running: @@ -71,14 +72,14 @@ $ docker logs 0746ce262a19 [2021-02-14T18:36:25Z INFO executor] Starting registration with scheduler ``` -The external host and port will be registered with the scheduler. The executors will discover other executors by +The external host and port will be registered with the scheduler. The executors will discover other executors by requesting a list of executors from the scheduler. ### Using etcd as backing store _NOTE: This functionality is currently experimental_ -Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the scheduler. +Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the scheduler. ```bash docker run --network=host \ @@ -88,5 +89,5 @@ docker run --network=host \ --etcd-urls etcd:2379 ``` -Please refer to the [etcd](https://etcd.io/) web site for installation instructions. Etcd version 3.4.9 or later is +Please refer to the [etcd](https://etcd.io/) web site for installation instructions. Etcd version 3.4.9 or later is recommended. diff --git a/docs/user-guide/src/example-usage.md b/docs/user-guide/src/example-usage.md index ff23c96de362e..2ea7dca9bdede 100644 --- a/docs/user-guide/src/example-usage.md +++ b/docs/user-guide/src/example-usage.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Example Usage Run a SQL query against data stored in a CSV: diff --git a/docs/user-guide/src/faq.md b/docs/user-guide/src/faq.md index b73a376988b51..5e1a72c0d0338 100644 --- a/docs/user-guide/src/faq.md +++ b/docs/user-guide/src/faq.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Frequently Asked Questions ## What is the relationship between Apache Arrow, DataFusion, and Ballista? @@ -23,9 +24,9 @@ Apache Arrow is a library which provides a standardized memory representation for columnar data. It also provides "kernels" for performing common operations on this data. -DataFusion is a library for executing queries in-process using the Apache Arrow memory -model and computational kernels. It is designed to run within a single process, using threads -for parallel query execution. +DataFusion is a library for executing queries in-process using the Apache Arrow memory +model and computational kernels. It is designed to run within a single process, using threads +for parallel query execution. Ballista is a distributed compute platform design to leverage DataFusion and other query -execution libraries. \ No newline at end of file +execution libraries. 
diff --git a/docs/user-guide/src/introduction.md b/docs/user-guide/src/introduction.md index c67fb90103d88..7ba3c963cc867 100644 --- a/docs/user-guide/src/introduction.md +++ b/docs/user-guide/src/introduction.md @@ -37,8 +37,7 @@ the convenience of an SQL interface or a DataFrame API. ## Why DataFusion? -* *High Performance*: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance -* *Easy to Connect*: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem -* *Easy to Embed*: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase -* *High Quality*: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. - +- _High Performance_: Leveraging Rust and Arrow's memory model, DataFusion achieves very high performance +- _Easy to Connect_: Being part of the Apache Arrow ecosystem (Arrow, Parquet and Flight), DataFusion works well with the rest of the big data ecosystem +- _Easy to Embed_: Allowing extension at almost any point in its design, DataFusion can be tailored for your specific usecase +- _High Quality_: Extensively tested, both by itself and with the rest of the Arrow ecosystem, DataFusion can be used as the foundation for production systems. diff --git a/docs/user-guide/src/library.md b/docs/user-guide/src/library.md index 12879b160c8f1..d35a4b74bbb81 100644 --- a/docs/user-guide/src/library.md +++ b/docs/user-guide/src/library.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. --> + # Using DataFusion as a library DataFusion is [published on crates.io](https://crates.io/crates/datafusion), and is [well documented on docs.rs](https://docs.rs/datafusion/). diff --git a/docs/user-guide/src/sql/select.md b/docs/user-guide/src/sql/select.md index 78d0cb58531d4..348ffff2887f7 100644 --- a/docs/user-guide/src/sql/select.md +++ b/docs/user-guide/src/sql/select.md @@ -20,7 +20,7 @@ # SELECT syntax The queries in DataFusion scan data from tables and return 0 or more rows. -In this documentation we describe the SQL syntax in DataFusion. +In this documentation we describe the SQL syntax in DataFusion. DataFusion supports the following syntax for queries: @@ -32,7 +32,7 @@ DataFusion supports the following syntax for queries: [ [GROUP BY](#group-by-clause) grouping_element [, ...] ]
[ [HAVING](#having-clause) condition]
[ [UNION](#union-clause) [ ALL | select ]
-[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ] [, ...] ]
+[ [ORDER BY](#order-by-clause) expression [ ASC | DESC ][, ...] ]
[ [LIMIT](#limit-clause) count ]
@@ -48,11 +48,10 @@ SELECT a, b FROM x; # SELECT clause - Example: ```sql -SELECT a, b, a + b FROM table +SELECT a, b, a + b FROM table ``` The `DISTINCT` quantifier can be added to make the query return all distinct rows. @@ -65,11 +64,11 @@ SELECT DISTINCT person, age FROM employees # FROM clause Example: + ```sql SELECT t.a FROM table AS t ``` - # WHERE clause Example: @@ -86,7 +85,6 @@ Example: SELECT a, b, MAX(c) FROM table GROUP BY a, b ``` - # HAVING clause Example: @@ -126,7 +124,6 @@ SELECT age, person FROM table ORDER BY age DESC; SELECT age, person FROM table ORDER BY age, person DESC; ``` - # LIMIT clause Limits the number of rows to be a maximum of `count` rows. `count` should be a non-negative integer. @@ -136,4 +133,4 @@ Example: ```sql SELECT age, person FROM table LIMIT 10 -``` \ No newline at end of file +``` From 7359e4b4df0836b8de970d551c6eeae22d1cc810 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 24 May 2021 07:40:58 -0600 Subject: [PATCH 118/329] Update TPC-H benchmark to show physical plan when debug mode is enabled (#386) --- benchmarks/src/bin/tpch.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index cee555fe675e0..433bf2d269934 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -38,7 +38,7 @@ use datafusion::datasource::parquet::ParquetTable; use datafusion::datasource::{CsvFile, MemTable, TableProvider}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; -use datafusion::physical_plan::collect; +use datafusion::physical_plan::{collect, displayable}; use datafusion::prelude::*; use datafusion::parquet::basic::Compression; @@ -310,6 +310,12 @@ async fn execute_query( println!("Optimized logical plan:\n{:?}", plan); } let physical_plan = ctx.create_physical_plan(&plan)?; + if debug { + println!( + "Physical plan:\n{}", + displayable(physical_plan.as_ref()).indent().to_string() + ); + } let result = collect(physical_plan).await?; if debug { pretty::print_batches(&result)?; From eeb69af9b37686f494f2e09fe52aa428407238ae Mon Sep 17 00:00:00 2001 From: Parth Sarthy Date: Mon, 24 May 2021 15:41:18 +0100 Subject: [PATCH 119/329] add some padding around the logo (#411) --- datafusion/docs/images/DataFusion-Logo-Background-White.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/docs/images/DataFusion-Logo-Background-White.svg b/datafusion/docs/images/DataFusion-Logo-Background-White.svg index 026c70855a71f..b3bb47c5e07f4 100644 --- a/datafusion/docs/images/DataFusion-Logo-Background-White.svg +++ b/datafusion/docs/images/DataFusion-Logo-Background-White.svg @@ -1 +1 @@ -DataFUSION-Logo-Dark \ No newline at end of file +DataFUSION-Logo-Dark \ No newline at end of file From 68ad9902b8b507b36f196b991556b845d04e8b7e Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 25 May 2021 01:44:08 +0800 Subject: [PATCH 120/329] add csv mode to datafusion cli (#281) --- .github/workflows/rust.yml | 90 +++++++++++++++---- datafusion-cli/src/main.rs | 27 +++--- datafusion-cli/src/print_format.rs | 30 +++++++ datafusion/docs/cli.md | 4 +- integration-tests/__init__.py | 15 ++++ .../sqls/simple_math_expressions.sql | 22 +++++ integration-tests/sqls/simple_select.sql | 17 ++++ integration-tests/test_psql_parity.py | 87 ++++++++++++++++++ 8 files changed, 260 insertions(+), 32 deletions(-) create mode 100644 integration-tests/__init__.py create mode 100644 integration-tests/sqls/simple_math_expressions.sql 
create mode 100644 integration-tests/sqls/simple_select.sql create mode 100644 integration-tests/test_psql_parity.py diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f76873ef77aed..f492b2e31d0bc 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -23,7 +23,6 @@ on: pull_request: jobs: - # build the library, a compilation step used by multiple steps below linux-build-lib: name: Build Libraries on AMD64 Rust ${{ matrix.rust }} @@ -61,17 +60,19 @@ jobs: rustup component add rustfmt - name: Build Workspace run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cargo build + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" # Ballista is currently not part of the main workspace so requires a separate build step - name: Build Ballista run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cd ballista/rust # snmalloc requires cmake so build without default features cargo build --no-default-features + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" # test the crate linux-test: @@ -111,8 +112,6 @@ jobs: rustup component add rustfmt - name: Run tests run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" export ARROW_TEST_DATA=$(pwd)/testing/data export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data # run tests on all workspace members with default feature list @@ -122,16 +121,69 @@ jobs: cargo test --no-default-features cargo run --example csv_sql cargo run --example parquet_sql + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" # Ballista is currently not part of the main workspace so requires a separate test step - name: Run Ballista tests run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" export ARROW_TEST_DATA=$(pwd)/testing/data export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data cd ballista/rust # snmalloc requires cmake so build without default features cargo test --no-default-features + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" + + integration-test: + name: "Integration Test" + needs: [linux-build-lib] + runs-on: ubuntu-latest + services: + postgres: + image: postgres:13 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: db_test + ports: + - 5432/tcp + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install --upgrade numpy==1.20.3 pandas==1.2.4 + - name: Allow access of psql + run: | + # make sure psql can access the server + echo "$POSTGRES_HOST:$POSTGRES_PORT:$POSTGRES_DB:$POSTGRES_USER:$POSTGRES_PASSWORD" | tee ~/.pgpass + chmod 0600 ~/.pgpass + psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'select now() as now' + env: + POSTGRES_HOST: localhost + POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }} + POSTGRES_DB: db_test + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + - name: Build datafusion-cli + run: cargo build --bin datafusion-cli + - name: Test Psql Parity + run: python -m unittest -v integration-tests/test_psql_parity.py + env: + POSTGRES_HOST: localhost + POSTGRES_PORT: ${{ 
job.services.postgres.ports[5432] }} + POSTGRES_DB: db_test + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres windows-and-macos: name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} @@ -156,9 +208,10 @@ jobs: run: | export ARROW_TEST_DATA=$(pwd)/testing/data export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data - # do not produce debug symbols to keep memory usage down - export RUSTFLAGS="-C debuginfo=0" cargo test + env: + # do not produce debug symbols to keep memory usage down + RUSTFLAGS: "-C debuginfo=0" lint: name: Lint @@ -212,9 +265,10 @@ jobs: rustup component add rustfmt clippy - name: Run clippy run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" cargo clippy --all-targets --workspace -- -D warnings + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" miri-checks: name: MIRI @@ -242,9 +296,9 @@ jobs: - name: Run Miri Checks env: RUST_BACKTRACE: full - RUST_LOG: 'trace' + RUST_LOG: "trace" + MIRIFLAGS: "-Zmiri-disable-isolation" run: | - export MIRIFLAGS="-Zmiri-disable-isolation" cargo miri setup cargo clean # Ignore MIRI errors until we can get a clean run @@ -275,9 +329,6 @@ jobs: key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}- - name: Run coverage run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - export ARROW_TEST_DATA=$(pwd)/testing/data export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data @@ -285,6 +336,9 @@ jobs: # see https://github.com/xd009642/tarpaulin/issues/618 cargo install --version 0.16.0 cargo-tarpaulin cargo tarpaulin --out Xml + env: + CARGO_HOME: "/home/runner/.cargo" + CARGO_TARGET_DIR: "/home/runner/target" - name: Report coverage continue-on-error: true run: bash <(curl -s https://codecov.io/bash) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index f36b5d93d21f6..5b35880580b20 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -20,7 +20,10 @@ use clap::{crate_version, App, Arg}; use datafusion::error::Result; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use datafusion_cli::{print_format::PrintFormat, PrintOptions}; +use datafusion_cli::{ + print_format::{all_print_formats, PrintFormat}, + PrintOptions, +}; use rustyline::Editor; use std::env; use std::fs::File; @@ -63,14 +66,22 @@ pub async fn main() { ) .arg( Arg::with_name("format") - .help("Output format (possible values: table, csv, tsv, json)") + .help("Output format") .long("format") .default_value("table") - .validator(is_valid_format) + .possible_values( + &all_print_formats() + .iter() + .map(|format| format.to_string()) + .collect::>() + .iter() + .map(|i| i.as_str()) + .collect::>(), + ) .takes_value(true), ) .arg( - Arg::with_name("quite") + Arg::with_name("quiet") .help("Reduce printing other than the results and work quietly") .short("q") .long("quiet") @@ -189,14 +200,6 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintO rl.save_history(".history").ok(); } -fn is_valid_format(format: String) -> std::result::Result<(), String> { - if format.parse::().is_ok() { - Ok(()) - } else { - Err(format!("Format '{}' not supported", format)) - } -} - fn is_valid_file(dir: String) -> std::result::Result<(), String> { if Path::new(&dir).is_file() { Ok(()) diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs index 85caaa3c52767..c7aa06149678a 100644 --- a/datafusion-cli/src/print_format.rs +++ 
b/datafusion-cli/src/print_format.rs @@ -21,6 +21,7 @@ use arrow::json::ArrayWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::util::pretty; use datafusion::error::{DataFusionError, Result}; +use std::fmt; use std::str::FromStr; /// Allow records to be printed in different formats @@ -32,6 +33,16 @@ pub enum PrintFormat { Json, } +/// returns all print formats +pub fn all_print_formats() -> Vec { + vec![ + PrintFormat::Csv, + PrintFormat::Tsv, + PrintFormat::Table, + PrintFormat::Json, + ] +} + impl FromStr for PrintFormat { type Err = (); fn from_str(s: &str) -> std::result::Result { @@ -45,6 +56,17 @@ impl FromStr for PrintFormat { } } +impl fmt::Display for PrintFormat { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Self::Csv => write!(f, "csv"), + Self::Tsv => write!(f, "tsv"), + Self::Table => write!(f, "table"), + Self::Json => write!(f, "json"), + } + } +} + fn print_batches_to_json(batches: &[RecordBatch]) -> Result { let mut bytes = vec![]; { @@ -108,6 +130,14 @@ mod tests { assert_eq!(PrintFormat::Table, format); } + #[test] + fn test_to_str() { + assert_eq!("csv", PrintFormat::Csv.to_string()); + assert_eq!("table", PrintFormat::Table.to_string()); + assert_eq!("tsv", PrintFormat::Tsv.to_string()); + assert_eq!("json", PrintFormat::Json.to_string()); + } + #[test] fn test_from_str_failure() { assert_eq!(true, "pretty".parse::().is_err()); diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md index db3e0b939a81c..a1fd652b7a047 100644 --- a/datafusion/docs/cli.md +++ b/datafusion/docs/cli.md @@ -25,7 +25,7 @@ The DataFusion CLI is a command-line interactive SQL utility that allows queries Use the following commands to clone this repository and run the CLI. This will require the Rust toolchain to be installed. Rust can be installed from [https://rustup.rs/](https://rustup.rs/). -```sh +```bash git clone https://github.com/apache/arrow-datafusion cd arrow-datafusion/datafusion-cli cargo run --release @@ -35,7 +35,7 @@ cargo run --release Use the following commands to clone this repository and build a Docker image containing the CLI tool. Note that there is `.dockerignore` file in the root of the repository that may need to be deleted in order for this to work. -```sh +```bash git clone https://github.com/apache/arrow-datafusion cd arrow-datafusion docker build -f datafusion-cli/Dockerfile . --tag datafusion-cli diff --git a/integration-tests/__init__.py b/integration-tests/__init__.py new file mode 100644 index 0000000000000..8516388ee6213 --- /dev/null +++ b/integration-tests/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
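The `--format` flag wiring earlier in this patch relies on `all_print_formats()` together with the `Display` impl to produce the strings advertised to clap as possible values, and on `FromStr` to turn the chosen string back into a `PrintFormat`. The following self-contained sketch is not the actual datafusion-cli code; it is a simplified stand-in that shows the round-trip invariant the flag depends on.

```rust
use std::fmt;
use std::str::FromStr;

// Simplified stand-in for the CLI's PrintFormat enum, for illustration only.
#[derive(Debug, Clone, Copy, PartialEq)]
enum PrintFormat {
    Csv,
    Tsv,
    Table,
    Json,
}

fn all_print_formats() -> Vec<PrintFormat> {
    vec![
        PrintFormat::Csv,
        PrintFormat::Tsv,
        PrintFormat::Table,
        PrintFormat::Json,
    ]
}

impl fmt::Display for PrintFormat {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Csv => write!(f, "csv"),
            Self::Tsv => write!(f, "tsv"),
            Self::Table => write!(f, "table"),
            Self::Json => write!(f, "json"),
        }
    }
}

impl FromStr for PrintFormat {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "csv" => Ok(Self::Csv),
            "tsv" => Ok(Self::Tsv),
            "table" => Ok(Self::Table),
            "json" => Ok(Self::Json),
            _ => Err(()),
        }
    }
}

fn main() {
    // Render each variant once; these strings are what clap's
    // `possible_values` would advertise for the `--format` flag.
    let names: Vec<String> = all_print_formats().iter().map(|f| f.to_string()).collect();
    assert_eq!(names, ["csv", "tsv", "table", "json"]);

    // Whatever Display produces must parse back through FromStr, otherwise a
    // value accepted on the command line could not be converted later.
    for format in all_print_formats() {
        assert_eq!(format.to_string().parse::<PrintFormat>(), Ok(format));
    }
    println!("all print formats round-trip: {:?}", names);
}
```

If a new variant is added to the enum but not to `all_print_formats()` or to one of the two impls, the round-trip check fails, which is essentially what the unit tests added in this patch guard against.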
diff --git a/integration-tests/sqls/simple_math_expressions.sql b/integration-tests/sqls/simple_math_expressions.sql new file mode 100644 index 0000000000000..504689d51130f --- /dev/null +++ b/integration-tests/sqls/simple_math_expressions.sql @@ -0,0 +1,22 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + abs(-1.1) as abs, + exp(2.0) as exp, + sin(3.0) as sin, + cos(4.0) as cos, + tan(5.0) as tan; diff --git a/integration-tests/sqls/simple_select.sql b/integration-tests/sqls/simple_select.sql new file mode 100644 index 0000000000000..78f54bdd8ece5 --- /dev/null +++ b/integration-tests/sqls/simple_select.sql @@ -0,0 +1,17 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT 1 as num; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py new file mode 100644 index 0000000000000..204f9063297e9 --- /dev/null +++ b/integration-tests/test_psql_parity.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import numpy as np +import io +import os +import subprocess +from pathlib import Path +import unittest + +pg_db, pg_user, pg_host, pg_port = [ + os.environ.get(i) + for i in ( + "POSTGRES_DB", + "POSTGRES_USER", + "POSTGRES_HOST", + "POSTGRES_PORT", + ) +] + + +def generate_csv_from_datafusion(fname: str): + return subprocess.check_output( + [ + "./target/debug/datafusion-cli", + "-f", + fname, + "--format", + "csv", + "-q", + ], + ) + + +def generate_csv_from_psql(fname: str): + return subprocess.check_output( + [ + "psql", + "-d", + pg_db, + "-h", + pg_host, + "-p", + pg_port, + "-U", + pg_user, + "-X", + "--csv", + "-f", + fname, + ] + ) + + +class PsqlParityTest(unittest.TestCase): + def test_parity(self): + root = Path(os.path.dirname(__file__)) / "sqls" + files = set(root.glob("*.sql")) + self.assertEqual(len(files), 2, msg="tests are missed") + for fname in files: + with self.subTest(fname=fname): + datafusion_output = pd.read_csv( + io.BytesIO(generate_csv_from_datafusion(fname)) + ) + psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname))) + self.assertTrue( + np.allclose(datafusion_output, psql_output), + msg=f"data fusion output={datafusion_output}, psql_output={psql_output}", + ) + + +if __name__ == "__main__": + unittest.main() From ea59d05b6390a0f676956db9160805b3f660cb54 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Tue, 25 May 2021 14:25:04 +0200 Subject: [PATCH 121/329] Benchmark subcommand to distinguish between DataFusion and Ballista (#402) * #401: Add subcommand to TPC-H benchmark args to distinguish between DataFusion and Ballista * fix benchmark subcommand name * Fix lint * fix benchmark tests using DatafusionBenchmarkOpts * Fix DataFusionBenchmarkOpts name and update doc --- benchmarks/README.md | 8 +- benchmarks/run.sh | 2 +- benchmarks/src/bin/tpch.rs | 77 +++++++++++++++---- .../user-guide/src/distributed/raspberrypi.md | 2 +- 4 files changed, 66 insertions(+), 23 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index e003d9687c9c1..e347130689b3d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -44,13 +44,13 @@ to the `.gitignore` file. 
The benchmark can then be run (assuming the data created from `dbgen` is in `./data`) with a command such as: ```bash -cargo run --release --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 +cargo run --release --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 ``` You can enable the features `simd` (to use SIMD instructions) and/or `mimalloc` or `snmalloc` (to use either the mimalloc or snmalloc allocator) as features by passing them in as `--features`: ``` -cargo run --release --features "simd mimalloc" --bin tpch -- benchmark --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 +cargo run --release --features "simd mimalloc" --bin tpch -- benchmark datafusion --iterations 3 --path ./data --format tbl --query 1 --batch-size 4096 ``` The benchmark program also supports CSV and Parquet input file formats and a utility is provided to convert from `tbl` @@ -123,7 +123,7 @@ To run the benchmarks: ```bash cd $ARROW_HOME/ballista/rust/benchmarks/tpch -cargo run --release benchmark --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl +cargo run --release benchmark ballista --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl ``` ## Running the Ballista Benchmarks on docker-compose @@ -140,7 +140,7 @@ docker-compose up Then you can run the benchmark with: ```bash -docker-compose run ballista-client cargo run benchmark --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl +docker-compose run ballista-client cargo run benchmark ballista --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl ``` ## Expected output diff --git a/benchmarks/run.sh b/benchmarks/run.sh index fd97ff9a9a6a5..8e36424da89f0 100755 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -22,5 +22,5 @@ set -e cd / for query in 1 3 5 6 10 12 do - /tpch benchmark --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug + /tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug done diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 433bf2d269934..9ac66e136dbdb 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -54,7 +54,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; #[derive(Debug, StructOpt, Clone)] -struct BenchmarkOpt { +struct BallistaBenchmarkOpt { /// Query number #[structopt(short, long)] query: usize, @@ -67,10 +67,6 @@ struct BenchmarkOpt { #[structopt(short = "i", long = "iterations", default_value = "3")] iterations: usize, - /// Number of threads to use for parallel execution - #[structopt(short = "c", long = "concurrency", default_value = "2")] - concurrency: usize, - /// Batch size when reading CSV or Parquet files #[structopt(short = "s", long = "batch-size", default_value = "8192")] batch_size: usize, @@ -100,6 +96,45 @@ struct BenchmarkOpt { port: Option, } +#[derive(Debug, StructOpt, Clone)] +struct DataFusionBenchmarkOpt { + /// Query number + #[structopt(short, long)] + query: usize, + + /// Activate debug mode to see query results + #[structopt(short, long)] + debug: bool, + + /// Number of iterations of each test run + #[structopt(short = "i", long = "iterations", default_value = "3")] + iterations: usize, + + /// Number of threads to use for parallel execution + #[structopt(short = "c", 
long = "concurrency", default_value = "2")] + concurrency: usize, + + /// Batch size when reading CSV or Parquet files + #[structopt(short = "s", long = "batch-size", default_value = "8192")] + batch_size: usize, + + /// Path to data files + #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + path: PathBuf, + + /// File format: `csv` or `parquet` + #[structopt(short = "f", long = "format", default_value = "csv")] + file_format: String, + + /// Load the data into a MemTable before executing the query + #[structopt(short = "m", long = "mem-table")] + mem_table: bool, + + /// Number of partitions to create when using MemTable as input + #[structopt(short = "n", long = "partitions", default_value = "8")] + partitions: usize, +} + #[derive(Debug, StructOpt)] struct ConvertOpt { /// Path to csv files @@ -127,10 +162,19 @@ struct ConvertOpt { batch_size: usize, } +#[derive(Debug, StructOpt)] +#[structopt(about = "benchmark command")] +enum BenchmarkSubCommandOpt { + #[structopt(name = "ballista")] + BallistaBenchmark(BallistaBenchmarkOpt), + #[structopt(name = "datafusion")] + DataFusionBenchmark(DataFusionBenchmarkOpt), +} + #[derive(Debug, StructOpt)] #[structopt(name = "TPC-H", about = "TPC-H Benchmarks.")] enum TpchOpt { - Benchmark(BenchmarkOpt), + Benchmark(BenchmarkSubCommandOpt), Convert(ConvertOpt), } @@ -140,20 +184,21 @@ const TABLES: &[&str] = &[ #[tokio::main] async fn main() -> Result<()> { + use BenchmarkSubCommandOpt::*; + env_logger::init(); match TpchOpt::from_args() { - TpchOpt::Benchmark(opt) => { - if opt.host.is_some() && opt.port.is_some() { - benchmark_ballista(opt).await.map(|_| ()) - } else { - benchmark_datafusion(opt).await.map(|_| ()) - } + TpchOpt::Benchmark(BallistaBenchmark(opt)) => { + benchmark_ballista(opt).await.map(|_| ()) + } + TpchOpt::Benchmark(DataFusionBenchmark(opt)) => { + benchmark_datafusion(opt).await.map(|_| ()) } TpchOpt::Convert(opt) => convert_tbl(opt).await, } } -async fn benchmark_datafusion(opt: BenchmarkOpt) -> Result> { +async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result> { println!("Running benchmarks with the following options: {:?}", opt); let config = ExecutionConfig::new() .with_concurrency(opt.concurrency) @@ -204,7 +249,7 @@ async fn benchmark_datafusion(opt: BenchmarkOpt) -> Result> { Ok(result) } -async fn benchmark_ballista(opt: BenchmarkOpt) -> Result<()> { +async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> { println!("Running benchmarks with the following options: {:?}", opt); let mut settings = HashMap::new(); @@ -956,7 +1001,7 @@ mod tests { let expected = df.collect().await?; // run the query to compute actual results of the query - let opt = BenchmarkOpt { + let opt = DataFusionBenchmarkOpt { query: n, debug: false, iterations: 1, @@ -966,8 +1011,6 @@ mod tests { file_format: "tbl".to_string(), mem_table: false, partitions: 16, - host: None, - port: None, }; let actual = benchmark_datafusion(opt).await?; diff --git a/docs/user-guide/src/distributed/raspberrypi.md b/docs/user-guide/src/distributed/raspberrypi.md index c7e429aec3c8c..0083d191770b6 100644 --- a/docs/user-guide/src/distributed/raspberrypi.md +++ b/docs/user-guide/src/distributed/raspberrypi.md @@ -115,7 +115,7 @@ Run the benchmarks: ```bash docker run -it myrepo/ballista-arm64 \ - /tpch benchmark --query=1 --path=/path/to/data --format=parquet \ + /tpch benchmark datafusion --query=1 --path=/path/to/data --format=parquet \ --concurrency=24 --iterations=1 --debug --host=ballista-scheduler 
--port=50050 ``` From 880650d853f4d824686bd23fa65bb75a96eaaa6b Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 25 May 2021 16:53:12 +0100 Subject: [PATCH 122/329] fix: don't duplicate existing filters (#409) --- datafusion/src/optimizer/filter_push_down.rs | 33 +++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 4c248e2b6483d..2056e1972950a 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -379,6 +379,11 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { } if add_to_provider { + // Don't add expression again if it's already present in + // pushed down filters. + if new_filters.contains(filter_expr) { + break; + } new_filters.push(filter_expr.clone()); } } @@ -455,11 +460,14 @@ mod tests { use crate::{logical_plan::col, prelude::JoinType}; use arrow::datatypes::SchemaRef; - fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { + fn optimize_plan(plan: &LogicalPlan) -> LogicalPlan { let rule = FilterPushDown::new(); - let optimized_plan = rule - .optimize(plan, &ExecutionProps::new()) - .expect("failed to optimize plan"); + rule.optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan") + } + + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { + let optimized_plan = optimize_plan(plan); let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); } @@ -1037,6 +1045,23 @@ mod tests { Ok(()) } + #[test] + fn filter_with_table_provider_multiple_invocations() -> Result<()> { + let plan = + table_scan_with_pushdown_provider(TableProviderFilterPushDown::Inexact)?; + + let optimised_plan = optimize_plan(&plan); + + let expected = "\ + Filter: #a Eq Int64(1)\ + \n TableScan: projection=None, filters=[#a Eq Int64(1)]"; + + // Optimizing the same plan multiple times should produce the same plan + // each time. + assert_optimized_plan_eq(&optimised_plan, expected); + Ok(()) + } + #[test] fn filter_with_table_provider_unsupported() -> Result<()> { let plan = From d9b044787ce465e2597f9ab37f601ae8515921ee Mon Sep 17 00:00:00 2001 From: sathis Date: Wed, 26 May 2021 00:01:53 +0530 Subject: [PATCH 123/329] Constant fold / optimize `to_timestamp` function during planning (#387) * optimize to_timestamp * cargo fmt * fix clippy * Added testcase & removed optimization for invalid timestamps * Added negative testcase * Fix clippy * fix clippy Co-authored-by: Sathis Kumar Co-authored-by: Andrew Lamb --- datafusion/src/optimizer/constant_folding.rs | 92 ++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index af89aa13908c4..97cc23264bda1 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -29,6 +29,7 @@ use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; use crate::physical_plan::functions::BuiltinScalarFunction; use crate::scalar::ScalarValue; +use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; /// Optimizer that simplifies comparison expressions involving boolean literals. 
/// @@ -217,6 +218,35 @@ impl<'a> ExprRewriter for ConstantRewriter<'a> { .query_execution_start_time .timestamp_nanos(), ))), + Expr::ScalarFunction { + fun: BuiltinScalarFunction::ToTimestamp, + args, + } => { + if !args.is_empty() { + match &args[0] { + Expr::Literal(ScalarValue::Utf8(Some(val))) => { + match string_to_timestamp_nanos(val) { + Ok(timestamp) => Expr::Literal( + ScalarValue::TimestampNanosecond(Some(timestamp)), + ), + _ => Expr::ScalarFunction { + fun: BuiltinScalarFunction::ToTimestamp, + args, + }, + } + } + _ => Expr::ScalarFunction { + fun: BuiltinScalarFunction::ToTimestamp, + args, + }, + } + } else { + Expr::ScalarFunction { + fun: BuiltinScalarFunction::ToTimestamp, + args, + } + } + } expr => { // no rewrite possible expr @@ -632,6 +662,68 @@ mod tests { return format!("{:?}", optimized_plan); } + #[test] + fn to_timestamp_expr() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::ScalarFunction { + args: vec![Expr::Literal(ScalarValue::Utf8(Some( + "2020-09-08T12:00:00+00:00".to_string(), + )))], + fun: BuiltinScalarFunction::ToTimestamp, + }]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = "Projection: TimestampNanosecond(1599566400000000000)\ + \n TableScan: test projection=None" + .to_string(); + let actual = get_optimized_plan_formatted(&plan, &chrono::Utc::now()); + assert_eq!(expected, actual); + } + + #[test] + fn to_timestamp_expr_wrong_arg() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::ScalarFunction { + args: vec![Expr::Literal(ScalarValue::Utf8(Some( + "I'M NOT A TIMESTAMP".to_string(), + )))], + fun: BuiltinScalarFunction::ToTimestamp, + }]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = "Projection: totimestamp(Utf8(\"I\'M NOT A TIMESTAMP\"))\ + \n TableScan: test projection=None"; + let actual = get_optimized_plan_formatted(&plan, &chrono::Utc::now()); + assert_eq!(expected, actual); + } + + #[test] + fn to_timestamp_expr_no_arg() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::ScalarFunction { + args: vec![], + fun: BuiltinScalarFunction::ToTimestamp, + }]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = "Projection: totimestamp()\ + \n TableScan: test projection=None"; + let actual = get_optimized_plan_formatted(&plan, &chrono::Utc::now()); + assert_eq!(expected, actual); + } + #[test] fn single_now_expr() { let table_scan = test_table_scan().unwrap(); From 3593d1f358b54bd83cfbafdd18e3664f6f7ab73d Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 25 May 2021 23:04:08 +0100 Subject: [PATCH 124/329] Add support for multiple partitions with SortExec (#362) (#378) * Add support for multiple partitions with SortExec * make SortExec partitioning optional --- datafusion/src/physical_plan/sort.rs | 55 ++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index caa32cfa264e1..7cd4d9df78759 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -55,6 +55,8 @@ pub struct SortExec { output_rows: Arc, /// Time to sort batches sort_time_nanos: Arc, + /// Preserve partitions of input plan + preserve_partitioning: bool, } impl SortExec { @@ -63,12 +65,23 @@ 
impl SortExec { expr: Vec, input: Arc, ) -> Result { - Ok(Self { + Ok(Self::new_with_partitioning(expr, input, false)) + } + + /// Create a new sort execution plan with the option to preserve + /// the partitioning of the input plan + pub fn new_with_partitioning( + expr: Vec, + input: Arc, + preserve_partitioning: bool, + ) -> Self { + Self { expr, input, + preserve_partitioning, output_rows: SQLMetric::counter(), sort_time_nanos: SQLMetric::time_nanos(), - }) + } } /// Input schema @@ -99,11 +112,19 @@ impl ExecutionPlan for SortExec { /// Get the output partitioning of this plan fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) + if self.preserve_partitioning { + self.input.output_partitioning() + } else { + Partitioning::UnknownPartitioning(1) + } } fn required_child_distribution(&self) -> Distribution { - Distribution::SinglePartition + if self.preserve_partitioning { + Distribution::UnspecifiedDistribution + } else { + Distribution::SinglePartition + } } fn with_new_children( @@ -122,21 +143,23 @@ impl ExecutionPlan for SortExec { } async fn execute(&self, partition: usize) -> Result { - if 0 != partition { - return Err(DataFusionError::Internal(format!( - "SortExec invalid partition {}", - partition - ))); - } + if !self.preserve_partitioning { + if 0 != partition { + return Err(DataFusionError::Internal(format!( + "SortExec invalid partition {}", + partition + ))); + } - // sort needs to operate on a single partition currently - if 1 != self.input.output_partitioning().partition_count() { - return Err(DataFusionError::Internal( - "SortExec requires a single input partition".to_owned(), - )); + // sort needs to operate on a single partition currently + if 1 != self.input.output_partitioning().partition_count() { + return Err(DataFusionError::Internal( + "SortExec requires a single input partition".to_owned(), + )); + } } - let input = self.input.execute(0).await?; + let input = self.input.execute(partition).await?; Ok(Box::pin(SortStream::new( input, From 4b1e9e6fae0e200debda215f6ad78c654c37c1a8 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 27 May 2021 04:03:20 +0800 Subject: [PATCH 125/329] add window expression stream, delegated window aggregation to aggregate functions, and implement `row_number` (#375) * Squashed commit of the following: commit 7fb3640e733bfbbdbf18d58000896f378ba9644c Author: Jiayu Liu Date: Fri May 21 16:38:25 2021 +0800 row number done commit 17239267cd2fbcbb676d5731beeffd0321bbd3ba Author: Jiayu Liu Date: Fri May 21 16:05:50 2021 +0800 add row number commit bf5b8a56f6f33d8eedf3e3009e7fcdb3c388ea5b Author: Jiayu Liu Date: Fri May 21 15:04:49 2021 +0800 save commit d2ce852ead5d8ae3d15962b4dd3062e24bce51de Author: Jiayu Liu Date: Fri May 21 14:53:05 2021 +0800 add streams commit 0a861a76bde0bb43e5561f1cf1ef14fd64e0c08b Author: Jiayu Liu Date: Thu May 20 22:28:34 2021 +0800 save stream commit a9121af7e2e9104d0e4b6ca3ef4f484aaf8baf42 Author: Jiayu Liu Date: Thu May 20 22:01:51 2021 +0800 update unit test commit 2af2a270262ff1bc759af39153d7cd681c32dc0a Author: Jiayu Liu Date: Fri May 21 14:25:12 2021 +0800 fix unit test commit bb57c762b0a1fabc35e207e681bca2bfff7fcf01 Author: Jiayu Liu Date: Fri May 21 14:23:34 2021 +0800 use upper case commit 5d96e525f587fbfaf3e5e9762c9bb10315fcbc3a Author: Jiayu Liu Date: Fri May 21 14:16:16 2021 +0800 fix unit test commit 1ecae8f6cbc6c1898ccf0b38b1e596b6c2e9bb46 Author: Jiayu Liu Date: Fri May 21 12:27:26 2021 +0800 fix unit test commit bc2271d58fd4a9a9cc96126f8abcd6e8f10272ca Author: 
Jiayu Liu Date: Fri May 21 10:04:29 2021 +0800 fix error commit 880b94f6e27df61b4d3877366f71a51b9b2f5d5d Author: Jiayu Liu Date: Fri May 21 08:24:00 2021 +0800 fix unit test commit 4e792e123a33fd0dcb5f701c679566b55589b0c0 Author: Jiayu Liu Date: Fri May 21 08:05:17 2021 +0800 fix test commit c36c04abf06c74d016597983bf3d3a2a5b5cbdd5 Author: Jiayu Liu Date: Fri May 21 00:07:54 2021 +0800 add more tests commit f5e64de7192a1916df78a4c2fbab7d471c906720 Author: Jiayu Liu Date: Thu May 20 23:41:36 2021 +0800 update commit a1eae864926a6acfeeebe995a12de4ad725ea869 Author: Jiayu Liu Date: Thu May 20 23:36:15 2021 +0800 enrich unit test commit 0d2a214131fe69e19e22144c68fbb992228db6b3 Author: Jiayu Liu Date: Thu May 20 23:25:43 2021 +0800 adding filter by todo commit 8b486d53b09ff1c7a6b9cf4687796ba1c13d6160 Author: Jiayu Liu Date: Thu May 20 23:17:22 2021 +0800 adding more built-in functions commit abf08cd137a80c1381af7de9ae2b3dab05cb4512 Author: Jiayu Liu Date: Thu May 20 22:36:27 2021 +0800 Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb commit 0cbca53dac642233520f7d32289b1dfad77b882e Author: Jiayu Liu Date: Thu May 20 22:34:57 2021 +0800 Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb commit 831c069f02236a953653b8f1ca25124e393ce20b Author: Jiayu Liu Date: Thu May 20 22:34:04 2021 +0800 Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb commit f70c739fd40e30c4b476253e58b24b9297b42859 Author: Jiayu Liu Date: Thu May 20 22:33:04 2021 +0800 Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb commit 3ee87aa3477c160f17a86628d71a353e03d736b3 Author: Jiayu Liu Date: Wed May 19 22:55:08 2021 +0800 fix unit test commit 5c4d92dc9f570ba6919d84cb8ac70a736d73f40f Author: Jiayu Liu Date: Wed May 19 22:48:26 2021 +0800 fix clippy commit a0b7526c413abbdd4aadab4af8ca9ad8f323f03b Author: Jiayu Liu Date: Wed May 19 22:46:38 2021 +0800 fix unused imports commit 1d3b076acc1c0f248a19c6149c0634e63a5b836e Author: Jiayu Liu Date: Thu May 13 18:51:14 2021 +0800 add window expr * fix unit test --- datafusion/src/execution/context.rs | 29 ++ .../src/physical_plan/expressions/mod.rs | 2 + .../physical_plan/expressions/row_number.rs | 174 +++++++++ .../src/physical_plan/hash_aggregate.rs | 7 +- datafusion/src/physical_plan/mod.rs | 81 +++- datafusion/src/physical_plan/planner.rs | 4 +- datafusion/src/physical_plan/sort.rs | 1 + .../src/physical_plan/window_functions.rs | 107 +++-- datafusion/src/physical_plan/windows.rs | 365 +++++++++++++++++- datafusion/tests/sql.rs | 39 +- parquet-testing | 2 +- 11 files changed, 736 insertions(+), 75 deletions(-) create mode 100644 datafusion/src/physical_plan/expressions/row_number.rs diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 272e75acba6fd..cfd3b7194429e 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1268,6 +1268,35 @@ mod tests { Ok(()) } + #[tokio::test] + async fn window() -> Result<()> { + let results = execute( + "SELECT c1, c2, SUM(c2) OVER (), COUNT(c2) OVER (), MAX(c2) OVER (), MIN(c2) OVER (), AVG(c2) OVER () FROM test ORDER BY c1, c2 LIMIT 5", + 4, + ) + .await?; + // result in one batch, although e.g. 
having 2 batches do not change + // result semantics, having a len=1 assertion upfront keeps surprises + // at bay + assert_eq!(results.len(), 1); + + let expected = vec![ + "+----+----+---------+-----------+---------+---------+---------+", + "| c1 | c2 | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", + "+----+----+---------+-----------+---------+---------+---------+", + "| 0 | 1 | 220 | 40 | 10 | 1 | 5.5 |", + "| 0 | 2 | 220 | 40 | 10 | 1 | 5.5 |", + "| 0 | 3 | 220 | 40 | 10 | 1 | 5.5 |", + "| 0 | 4 | 220 | 40 | 10 | 1 | 5.5 |", + "| 0 | 5 | 220 | 40 | 10 | 1 | 5.5 |", + "+----+----+---------+-----------+---------+---------+---------+", + ]; + + // window function shall respect ordering + assert_batches_eq!(expected, &results); + Ok(()) + } + #[tokio::test] async fn aggregate() -> Result<()> { let results = execute("SELECT SUM(c1), SUM(c2) FROM test", 4).await?; diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 4d57c39bb31cc..803870f3f7840 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -41,6 +41,7 @@ mod min_max; mod negative; mod not; mod nullif; +mod row_number; mod sum; mod try_cast; @@ -58,6 +59,7 @@ pub use min_max::{Max, Min}; pub use negative::{negative, NegativeExpr}; pub use not::{not, NotExpr}; pub use nullif::{nullif_func, SUPPORTED_NULLIF_TYPES}; +pub use row_number::RowNumber; pub use sum::{sum_return_type, Sum}; pub use try_cast::{try_cast, TryCastExpr}; /// returns the name of the state diff --git a/datafusion/src/physical_plan/expressions/row_number.rs b/datafusion/src/physical_plan/expressions/row_number.rs new file mode 100644 index 0000000000000..f399995461f70 --- /dev/null +++ b/datafusion/src/physical_plan/expressions/row_number.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines physical expression for `row_number` that can evaluated at runtime during query execution + +use crate::error::Result; +use crate::physical_plan::{ + window_functions::BuiltInWindowFunctionExpr, PhysicalExpr, WindowAccumulator, +}; +use crate::scalar::ScalarValue; +use arrow::array::{ArrayRef, UInt64Array}; +use arrow::datatypes::{DataType, Field}; +use std::any::Any; +use std::sync::Arc; + +/// row_number expression +#[derive(Debug)] +pub struct RowNumber { + name: String, +} + +impl RowNumber { + /// Create a new ROW_NUMBER function + pub fn new(name: String) -> Self { + Self { name } + } +} + +impl BuiltInWindowFunctionExpr for RowNumber { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = false; + let data_type = DataType::UInt64; + Ok(Field::new(&self.name(), data_type, nullable)) + } + + fn expressions(&self) -> Vec> { + vec![] + } + + fn name(&self) -> &str { + self.name.as_str() + } + + fn create_accumulator(&self) -> Result> { + Ok(Box::new(RowNumberAccumulator::new())) + } +} + +#[derive(Debug)] +struct RowNumberAccumulator { + row_number: u64, +} + +impl RowNumberAccumulator { + /// new row_number accumulator + pub fn new() -> Self { + // row number is 1 based + Self { row_number: 1 } + } +} + +impl WindowAccumulator for RowNumberAccumulator { + fn scan(&mut self, _values: &[ScalarValue]) -> Result> { + let result = Some(ScalarValue::UInt64(Some(self.row_number))); + self.row_number += 1; + Ok(result) + } + + fn scan_batch( + &mut self, + num_rows: usize, + _values: &[ArrayRef], + ) -> Result> { + let new_row_number = self.row_number + (num_rows as u64); + // TODO: probably would be nice to have a (optimized) kernel for this at some point to + // generate an array like this. 
+ let result = UInt64Array::from_iter_values(self.row_number..new_row_number); + self.row_number = new_row_number; + Ok(Some(Arc::new(result))) + } + + fn evaluate(&self) -> Result> { + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Result; + use arrow::record_batch::RecordBatch; + use arrow::{array::*, datatypes::*}; + + #[test] + fn row_number_all_null() -> Result<()> { + let arr: ArrayRef = Arc::new(BooleanArray::from(vec![ + None, None, None, None, None, None, None, None, + ])); + let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; + + let row_number = Arc::new(RowNumber::new("row_number".to_owned())); + + let mut acc = row_number.create_accumulator()?; + let expr = row_number.expressions(); + let values = expr + .iter() + .map(|e| e.evaluate(&batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + + let result = acc.scan_batch(batch.num_rows(), &values)?; + assert_eq!(true, result.is_some()); + + let result = result.unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + let result = result.values(); + assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); + + let result = acc.evaluate()?; + assert_eq!(false, result.is_some()); + Ok(()) + } + + #[test] + fn row_number_all_values() -> Result<()> { + let arr: ArrayRef = Arc::new(BooleanArray::from(vec![ + true, false, true, false, false, true, false, true, + ])); + let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; + + let row_number = Arc::new(RowNumber::new("row_number".to_owned())); + + let mut acc = row_number.create_accumulator()?; + let expr = row_number.expressions(); + let values = expr + .iter() + .map(|e| e.evaluate(&batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + + let result = acc.scan_batch(batch.num_rows(), &values)?; + assert_eq!(true, result.is_some()); + + let result = result.unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + let result = result.values(); + assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); + + let result = acc.evaluate()?; + assert_eq!(false, result.is_some()); + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index c9d268619cad3..5008f49250b0b 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -712,7 +712,7 @@ impl GroupedHashAggregateStream { tx.send(result) }); - GroupedHashAggregateStream { + Self { schema, output: rx, finished: false, @@ -825,7 +825,8 @@ fn aggregate_expressions( } pin_project! { - struct HashAggregateStream { + /// stream struct for hash aggregation + pub struct HashAggregateStream { schema: SchemaRef, #[pin] output: futures::channel::oneshot::Receiver>, @@ -878,7 +879,7 @@ impl HashAggregateStream { tx.send(result) }); - HashAggregateStream { + Self { schema, output: rx, finished: false, diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index c053229bc000b..4f90a8cf7d6ec 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,22 +17,23 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. 
-use std::fmt::{self, Debug, Display}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use std::{any::Any, pin::Pin}; - use crate::execution::context::ExecutionContextState; use crate::logical_plan::LogicalPlan; -use crate::{error::Result, scalar::ScalarValue}; +use crate::{ + error::{DataFusionError, Result}, + scalar::ScalarValue, +}; use arrow::datatypes::{DataType, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; use arrow::{array::ArrayRef, datatypes::Field}; - use async_trait::async_trait; pub use display::DisplayFormatType; use futures::stream::Stream; +use std::fmt::{self, Debug, Display}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::{any::Any, pin::Pin}; use self::{display::DisplayableExecutionPlan, merge::MergeExec}; use hashbrown::HashMap; @@ -457,10 +458,22 @@ pub trait WindowExpr: Send + Sync + Debug { fn name(&self) -> &str { "WindowExpr: default name" } + + /// the accumulator used to accumulate values from the expressions. + /// the accumulator expects the same number of arguments as `expressions` and must + /// return states with the same description as `state_fields` + fn create_accumulator(&self) -> Result>; + + /// expressions that are passed to the WindowAccumulator. + /// Functions which take a single input argument, such as `sum`, return a single [`Expr`], + /// others (e.g. `cov`) return many. + fn expressions(&self) -> Vec>; } /// An accumulator represents a stateful object that lives throughout the evaluation of multiple rows and -/// generically accumulates values. An accumulator knows how to: +/// generically accumulates values. +/// +/// An accumulator knows how to: /// * update its state from inputs via `update` /// * convert its internal state to a vector of scalar values /// * update its state from multiple accumulators' states via `merge` @@ -509,6 +522,58 @@ pub trait Accumulator: Send + Sync + Debug { fn evaluate(&self) -> Result; } +/// A window accumulator represents a stateful object that lives throughout the evaluation of multiple +/// rows and generically accumulates values. +/// +/// An accumulator knows how to: +/// * update its state from inputs via `update` +/// * convert its internal state to a vector of scalar values +/// * update its state from multiple accumulators' states via `merge` +/// * compute the final value from its internal state via `evaluate` +pub trait WindowAccumulator: Send + Sync + Debug { + /// scans the accumulator's state from a vector of scalars, similar to Accumulator it also + /// optionally generates values. + fn scan(&mut self, values: &[ScalarValue]) -> Result>; + + /// scans the accumulator's state from a vector of arrays. + fn scan_batch( + &mut self, + num_rows: usize, + values: &[ArrayRef], + ) -> Result> { + if values.is_empty() { + return Ok(None); + }; + // transpose columnar to row based so that we can apply window + let result = (0..num_rows) + .map(|index| { + let v = values + .iter() + .map(|array| ScalarValue::try_from_array(array, index)) + .collect::>>()?; + self.scan(&v) + }) + .collect::>>>()? + .into_iter() + .collect::>>(); + + Ok(match result { + Some(arr) if num_rows == arr.len() => Some(ScalarValue::iter_to_array(&arr)?), + None => None, + Some(arr) => { + return Err(DataFusionError::Internal(format!( + "expect scan batch to return {:?} rows, but got {:?}", + num_rows, + arr.len() + ))) + } + }) + } + + /// returns its value based on its current state. 
+ fn evaluate(&self) -> Result>; +} + pub mod aggregates; pub mod array_expressions; pub mod coalesce_batches; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 018925d0e5356..7ddfaf8f68972 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -147,8 +147,10 @@ impl DefaultPhysicalPlanner { // Initially need to perform the aggregate and then merge the partitions let input_exec = self.create_initial_plan(input, ctx_state)?; let input_schema = input_exec.schema(); - let physical_input_schema = input_exec.as_ref().schema(); + let logical_input_schema = input.as_ref().schema(); + let physical_input_schema = input_exec.as_ref().schema(); + let window_expr = window_expr .iter() .map(|e| { diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 7cd4d9df78759..c5b838c6e84bb 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -250,6 +250,7 @@ fn sort_batches( } pin_project! { + /// stream for sort plan struct SortStream { #[pin] output: futures::channel::oneshot::Receiver>>, diff --git a/datafusion/src/physical_plan/window_functions.rs b/datafusion/src/physical_plan/window_functions.rs index 65d5373d54f47..e6afcaad8ad6b 100644 --- a/datafusion/src/physical_plan/window_functions.rs +++ b/datafusion/src/physical_plan/window_functions.rs @@ -20,12 +20,15 @@ //! //! see also https://www.postgresql.org/docs/current/functions-window.html +use crate::arrow::datatypes::Field; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ aggregates, aggregates::AggregateFunction, functions::Signature, - type_coercion::data_types, + type_coercion::data_types, PhysicalExpr, WindowAccumulator, }; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::Arc; use std::{fmt, str::FromStr}; /// WindowFunction @@ -143,52 +146,92 @@ impl FromStr for BuiltInWindowFunction { /// Returns the datatype of the window function pub fn return_type(fun: &WindowFunction, arg_types: &[DataType]) -> Result { + match fun { + WindowFunction::AggregateFunction(fun) => aggregates::return_type(fun, arg_types), + WindowFunction::BuiltInWindowFunction(fun) => { + return_type_for_built_in(fun, arg_types) + } + } +} + +/// Returns the datatype of the built-in window function +pub(super) fn return_type_for_built_in( + fun: &BuiltInWindowFunction, + arg_types: &[DataType], +) -> Result { // Note that this function *must* return the same type that the respective physical expression returns // or the execution panics. 
// verify that this is a valid set of data types for this function - data_types(arg_types, &signature(fun))?; + data_types(arg_types, &signature_for_built_in(fun))?; match fun { - WindowFunction::AggregateFunction(fun) => aggregates::return_type(fun, arg_types), - WindowFunction::BuiltInWindowFunction(fun) => match fun { - BuiltInWindowFunction::RowNumber - | BuiltInWindowFunction::Rank - | BuiltInWindowFunction::DenseRank => Ok(DataType::UInt64), - BuiltInWindowFunction::PercentRank | BuiltInWindowFunction::CumeDist => { - Ok(DataType::Float64) - } - BuiltInWindowFunction::Ntile => Ok(DataType::UInt32), - BuiltInWindowFunction::Lag - | BuiltInWindowFunction::Lead - | BuiltInWindowFunction::FirstValue - | BuiltInWindowFunction::LastValue - | BuiltInWindowFunction::NthValue => Ok(arg_types[0].clone()), - }, + BuiltInWindowFunction::RowNumber + | BuiltInWindowFunction::Rank + | BuiltInWindowFunction::DenseRank => Ok(DataType::UInt64), + BuiltInWindowFunction::PercentRank | BuiltInWindowFunction::CumeDist => { + Ok(DataType::Float64) + } + BuiltInWindowFunction::Ntile => Ok(DataType::UInt32), + BuiltInWindowFunction::Lag + | BuiltInWindowFunction::Lead + | BuiltInWindowFunction::FirstValue + | BuiltInWindowFunction::LastValue + | BuiltInWindowFunction::NthValue => Ok(arg_types[0].clone()), } } /// the signatures supported by the function `fun`. -fn signature(fun: &WindowFunction) -> Signature { - // note: the physical expression must accept the type returned by this function or the execution panics. +pub fn signature(fun: &WindowFunction) -> Signature { match fun { WindowFunction::AggregateFunction(fun) => aggregates::signature(fun), - WindowFunction::BuiltInWindowFunction(fun) => match fun { - BuiltInWindowFunction::RowNumber - | BuiltInWindowFunction::Rank - | BuiltInWindowFunction::DenseRank - | BuiltInWindowFunction::PercentRank - | BuiltInWindowFunction::CumeDist => Signature::Any(0), - BuiltInWindowFunction::Lag - | BuiltInWindowFunction::Lead - | BuiltInWindowFunction::FirstValue - | BuiltInWindowFunction::LastValue => Signature::Any(1), - BuiltInWindowFunction::Ntile => Signature::Exact(vec![DataType::UInt64]), - BuiltInWindowFunction::NthValue => Signature::Any(2), - }, + WindowFunction::BuiltInWindowFunction(fun) => signature_for_built_in(fun), + } +} + +/// the signatures supported by the built-in window function `fun`. +pub(super) fn signature_for_built_in(fun: &BuiltInWindowFunction) -> Signature { + // note: the physical expression must accept the type returned by this function or the execution panics. + match fun { + BuiltInWindowFunction::RowNumber + | BuiltInWindowFunction::Rank + | BuiltInWindowFunction::DenseRank + | BuiltInWindowFunction::PercentRank + | BuiltInWindowFunction::CumeDist => Signature::Any(0), + BuiltInWindowFunction::Lag + | BuiltInWindowFunction::Lead + | BuiltInWindowFunction::FirstValue + | BuiltInWindowFunction::LastValue => Signature::Any(1), + BuiltInWindowFunction::Ntile => Signature::Exact(vec![DataType::UInt64]), + BuiltInWindowFunction::NthValue => Signature::Any(2), } } +/// A window expression that is a built-in window function +pub trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { + /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; + + /// the field of the final result of this aggregation. + fn field(&self) -> Result; + + /// expressions that are passed to the Accumulator. 
+ /// Single-column aggregations such as `sum` return a single value, others (e.g. `cov`) return many. + fn expressions(&self) -> Vec>; + + /// Human readable name such as `"MIN(c2)"` or `"RANK()"`. The default + /// implementation returns placeholder text. + fn name(&self) -> &str { + "BuiltInWindowFunctionExpr: default name" + } + + /// the accumulator used to accumulate values from the expressions. + /// the accumulator expects the same number of arguments as `expressions` and must + /// return states with the same description as `state_fields` + fn create_accumulator(&self) -> Result>; +} + #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index bdd25d69fd553..8ced3aec8ec11 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -19,13 +19,30 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ - aggregates, window_functions::WindowFunction, AggregateExpr, Distribution, - ExecutionPlan, Partitioning, PhysicalExpr, SendableRecordBatchStream, WindowExpr, + aggregates, + expressions::RowNumber, + window_functions::BuiltInWindowFunctionExpr, + window_functions::{BuiltInWindowFunction, WindowFunction}, + Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, + RecordBatchStream, SendableRecordBatchStream, WindowAccumulator, WindowExpr, +}; +use crate::scalar::ScalarValue; +use arrow::compute::concat; +use arrow::{ + array::{Array, ArrayRef}, + datatypes::{Field, Schema, SchemaRef}, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, }; -use arrow::datatypes::{Field, Schema, SchemaRef}; use async_trait::async_trait; +use futures::stream::{Stream, StreamExt}; +use futures::Future; +use pin_project_lite::pin_project; use std::any::Any; +use std::iter; +use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll}; /// Window execution plan #[derive(Debug)] @@ -57,18 +74,55 @@ pub fn create_window_expr( name, )?, })), - WindowFunction::BuiltInWindowFunction(fun) => { - Err(DataFusionError::NotImplemented(format!( - "window function with {:?} not implemented", - fun - ))) - } + WindowFunction::BuiltInWindowFunction(fun) => Ok(Arc::new(BuiltInWindowExpr { + window: create_built_in_window_expr(fun, args, input_schema, name)?, + })), + } +} + +fn create_built_in_window_expr( + fun: &BuiltInWindowFunction, + _args: &[Arc], + _input_schema: &Schema, + name: String, +) -> Result> { + match fun { + BuiltInWindowFunction::RowNumber => Ok(Arc::new(RowNumber::new(name))), + _ => Err(DataFusionError::NotImplemented(format!( + "Window function with {:?} not yet implemented", + fun + ))), } } /// A window expr that takes the form of a built in window function #[derive(Debug)] -pub struct BuiltInWindowExpr {} +pub struct BuiltInWindowExpr { + window: Arc, +} + +impl WindowExpr for BuiltInWindowExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.window.name() + } + + fn field(&self) -> Result { + self.window.field() + } + + fn expressions(&self) -> Vec> { + self.window.expressions() + } + + fn create_accumulator(&self) -> Result> { + self.window.create_accumulator() + } +} /// A window expr that takes the form of an aggregate function #[derive(Debug)] @@ -76,6 +130,23 @@ pub struct AggregateWindowExpr { aggregate: Arc, } +#[derive(Debug)] +struct AggregateWindowAccumulator { + accumulator: Box, +} + 
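To make the new `WindowAccumulator` contract concrete, here is a minimal sketch of an accumulator that behaves like `ROW_NUMBER()`: it emits one value from `scan` for every input row and nothing from `evaluate`, so the stream concatenates the per-row values into the output column. The `RowCounter` type is hypothetical and illustrative only; it is not the `RowNumber` expression shipped in this patch.

```rust
use datafusion::error::Result;
use datafusion::physical_plan::WindowAccumulator;
use datafusion::scalar::ScalarValue;

/// Illustrative only: counts rows as they are scanned.
#[derive(Debug)]
struct RowCounter {
    count: u64,
}

impl WindowAccumulator for RowCounter {
    fn scan(&mut self, _values: &[ScalarValue]) -> Result<Option<ScalarValue>> {
        // emit one value per input row; the default `scan_batch` collects
        // these per-row values into a single output array
        self.count += 1;
        Ok(Some(ScalarValue::UInt64(Some(self.count))))
    }

    fn evaluate(&self) -> Result<Option<ScalarValue>> {
        // everything was already produced row-by-row in `scan`
        Ok(None)
    }
}
```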
+impl WindowAccumulator for AggregateWindowAccumulator { + fn scan(&mut self, values: &[ScalarValue]) -> Result> { + self.accumulator.update(values)?; + Ok(None) + } + + /// returns its value based on its current state. + fn evaluate(&self) -> Result> { + Ok(Some(self.accumulator.evaluate()?)) + } +} + impl WindowExpr for AggregateWindowExpr { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { @@ -89,6 +160,15 @@ impl WindowExpr for AggregateWindowExpr { fn field(&self) -> Result { self.aggregate.field() } + + fn expressions(&self) -> Vec> { + self.aggregate.expressions() + } + + fn create_accumulator(&self) -> Result> { + let accumulator = self.aggregate.create_accumulator()?; + Ok(Box::new(AggregateWindowAccumulator { accumulator })) + } } fn create_schema( @@ -120,12 +200,17 @@ impl WindowAggExec { }) } + /// Window expressions + pub fn window_expr(&self) -> &[Arc] { + &self.window_expr + } + /// Input plan pub fn input(&self) -> &Arc { &self.input } - /// Get the input schema before any aggregates are applied + /// Get the input schema before any window functions are applied pub fn input_schema(&self) -> SchemaRef { self.input_schema.clone() } @@ -163,7 +248,7 @@ impl ExecutionPlan for WindowAggExec { 1 => Ok(Arc::new(WindowAggExec::try_new( self.window_expr.clone(), children[0].clone(), - children[0].schema(), + self.input_schema.clone(), )?)), _ => Err(DataFusionError::Internal( "WindowAggExec wrong number of children".to_owned(), @@ -186,10 +271,258 @@ impl ExecutionPlan for WindowAggExec { )); } - // let input = self.input.execute(0).await?; + let input = self.input.execute(partition).await?; + + let stream = Box::pin(WindowAggStream::new( + self.schema.clone(), + self.window_expr.clone(), + input, + )); + Ok(stream) + } +} + +pin_project! 
{ + /// stream for window aggregation plan + pub struct WindowAggStream { + schema: SchemaRef, + #[pin] + output: futures::channel::oneshot::Receiver>, + finished: bool, + } +} + +type WindowAccumulatorItem = Box; + +fn window_expressions( + window_expr: &[Arc], +) -> Result>>> { + Ok(window_expr + .iter() + .map(|expr| expr.expressions()) + .collect::>()) +} + +fn window_aggregate_batch( + batch: &RecordBatch, + window_accumulators: &mut [WindowAccumulatorItem], + expressions: &[Vec>], +) -> Result>> { + // 1.1 iterate accumulators and respective expressions together + // 1.2 evaluate expressions + // 1.3 update / merge window accumulators with the expressions' values + + // 1.1 + window_accumulators + .iter_mut() + .zip(expressions) + .map(|(window_acc, expr)| { + // 1.2 + let values = &expr + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + + window_acc.scan_batch(batch.num_rows(), values) + }) + .into_iter() + .collect::>>() +} + +/// returns a vector of ArrayRefs, where each entry corresponds to one window expr +fn finalize_window_aggregation( + window_accumulators: &[WindowAccumulatorItem], +) -> Result>> { + window_accumulators + .iter() + .map(|window_accumulator| window_accumulator.evaluate()) + .collect::>>() +} + +fn create_window_accumulators( + window_expr: &[Arc], +) -> Result> { + window_expr + .iter() + .map(|expr| expr.create_accumulator()) + .collect::>>() +} + +async fn compute_window_aggregate( + schema: SchemaRef, + window_expr: Vec>, + mut input: SendableRecordBatchStream, +) -> ArrowResult { + let mut window_accumulators = create_window_accumulators(&window_expr) + .map_err(DataFusionError::into_arrow_external_error)?; + + let expressions = window_expressions(&window_expr) + .map_err(DataFusionError::into_arrow_external_error)?; + + let expressions = Arc::new(expressions); + + // TODO each element shall have some size hint + let mut accumulator: Vec> = + iter::repeat(vec![]).take(window_expr.len()).collect(); + + let mut original_batches: Vec = vec![]; + + let mut total_num_rows = 0; + + while let Some(batch) = input.next().await { + let batch = batch?; + total_num_rows += batch.num_rows(); + original_batches.push(batch.clone()); + + let batch_aggregated = + window_aggregate_batch(&batch, &mut window_accumulators, &expressions) + .map_err(DataFusionError::into_arrow_external_error)?; + accumulator.iter_mut().zip(batch_aggregated).for_each( + |(acc_for_window, window_batch)| { + if let Some(data) = window_batch { + acc_for_window.push(data); + } + }, + ); + } + + let aggregated_mapped = finalize_window_aggregation(&window_accumulators) + .map_err(DataFusionError::into_arrow_external_error)?; + + let mut columns: Vec = accumulator + .iter() + .zip(aggregated_mapped) + .map(|(acc, agg)| { + Ok(match (acc, agg) { + (acc, Some(scalar_value)) if acc.is_empty() => { + scalar_value.to_array_of_size(total_num_rows) + } + (acc, None) if !acc.is_empty() => { + let vec_array: Vec<&dyn Array> = + acc.iter().map(|arc| arc.as_ref()).collect(); + concat(&vec_array)? 
+ } + _ => { + return Err(DataFusionError::Execution( + "Invalid window function behavior".to_owned(), + )) + } + }) + }) + .collect::>>() + .map_err(DataFusionError::into_arrow_external_error)?; + + for i in 0..(schema.fields().len() - window_expr.len()) { + let col = concat( + &original_batches + .iter() + .map(|batch| batch.column(i).as_ref()) + .collect::>(), + )?; + columns.push(col); + } + + RecordBatch::try_new(schema.clone(), columns) +} + +impl WindowAggStream { + /// Create a new WindowAggStream + pub fn new( + schema: SchemaRef, + window_expr: Vec>, + input: SendableRecordBatchStream, + ) -> Self { + let (tx, rx) = futures::channel::oneshot::channel(); + let schema_clone = schema.clone(); + tokio::spawn(async move { + let result = compute_window_aggregate(schema_clone, window_expr, input).await; + tx.send(result) + }); + + Self { + output: rx, + finished: false, + schema, + } + } +} + +impl Stream for WindowAggStream { + type Item = ArrowResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.finished { + return Poll::Ready(None); + } - Err(DataFusionError::NotImplemented( - "WindowAggExec::execute".to_owned(), - )) + // is the output ready? + let this = self.project(); + let output_poll = this.output.poll(cx); + + match output_poll { + Poll::Ready(result) => { + *this.finished = true; + // check for error in receiving channel and unwrap actual result + let result = match result { + Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))), // error receiving + Ok(result) => Some(result), + }; + Poll::Ready(result) + } + Poll::Pending => Poll::Pending, + } + } +} + +impl RecordBatchStream for WindowAggStream { + /// Get the schema + fn schema(&self) -> SchemaRef { + self.schema.clone() } } + +#[cfg(test)] +mod tests { + // use super::*; + + // /// some mock data to test windows + // fn some_data() -> (Arc, Vec) { + // // define a schema. + // let schema = Arc::new(Schema::new(vec![ + // Field::new("a", DataType::UInt32, false), + // Field::new("b", DataType::Float64, false), + // ])); + + // // define data. 
+ // ( + // schema.clone(), + // vec![ + // RecordBatch::try_new( + // schema.clone(), + // vec![ + // Arc::new(UInt32Array::from(vec![2, 3, 4, 4])), + // Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + // ], + // ) + // .unwrap(), + // RecordBatch::try_new( + // schema, + // vec![ + // Arc::new(UInt32Array::from(vec![2, 3, 3, 4])), + // Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + // ], + // ) + // .unwrap(), + // ], + // ) + // } + + // #[tokio::test] + // async fn window_function() -> Result<()> { + // let input: Arc = unimplemented!(); + // let input_schema = input.schema(); + // let window_expr = vec![]; + // WindowAggExec::try_new(window_expr, input, input_schema); + // } +} diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index e68c53b251e6c..55bc88eedf9ab 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -797,20 +797,31 @@ async fn csv_query_count() -> Result<()> { Ok(()) } -// FIXME uncomment this when exec is done -// #[tokio::test] -// async fn csv_query_window_with_empty_over() -> Result<()> { -// let mut ctx = ExecutionContext::new(); -// register_aggregate_csv(&mut ctx)?; -// let sql = "SELECT count(c12) over () FROM aggregate_test_100"; -// // FIXME: so far the WindowAggExec is not implemented -// // and the current behavior is to throw not implemented exception - -// let result = execute(&mut ctx, sql).await; -// let expected: Vec> = vec![]; -// assert_eq!(result, expected); -// Ok(()) -// } +#[tokio::test] +async fn csv_query_window_with_empty_over() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_csv(&mut ctx)?; + let sql = "select \ + c2, \ + sum(c3) over (), \ + avg(c3) over (), \ + count(c3) over (), \ + max(c3) over (), \ + min(c3) over () \ + from aggregate_test_100 \ + order by c2 \ + limit 5"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["1", "781", "7.81", "100", "125", "-117"], + vec!["1", "781", "7.81", "100", "125", "-117"], + vec!["1", "781", "7.81", "100", "125", "-117"], + vec!["1", "781", "7.81", "100", "125", "-117"], + vec!["1", "781", "7.81", "100", "125", "-117"], + ]; + assert_eq!(expected, actual); + Ok(()) +} #[tokio::test] async fn csv_query_group_by_int_count() -> Result<()> { diff --git a/parquet-testing b/parquet-testing index 8e7badc6a3817..ddd898958803c 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit 8e7badc6a3817a02e06d17b5d8ab6b6dc356e890 +Subproject commit ddd898958803cb89b7156c6350584d1cda0fe8de From b38282990a3a3ec3c3c3963e96158f879df0ffe2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 27 May 2021 04:26:33 +0800 Subject: [PATCH 126/329] Update more docs and also the developer.md doc (#414) * update dev.md * update docs --- DEVELOPERS.md | 17 +++++++++++++++++ datafusion-examples/examples/README.md | 2 +- dev/README.md | 14 ++++++++++++-- .../tests/fixtures/crossbow-success-message.md | 12 ++++++------ dev/benchmarking/README.md | 9 +++++---- dev/release/VERIFY.md | 10 +++++----- dev/tasks/conda-recipes/README.md | 2 -- dev/tasks/gandiva-jars/README.md | 2 +- dev/tasks/linux-packages/README.md | 6 +++--- 9 files changed, 50 insertions(+), 24 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 60048c868e6c1..c2daf3a72e5e0 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -93,3 +93,20 @@ can be displayed. 
For example, the following command creates a ```bash dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf ``` + +## How to format `.md` document + +We are using `prettier` to format `.md` files. + +You can either use `npm i -g prettier` to install it globally or use `npx` to run it as a standalone binary. Using `npx` required a working node environment. Upgrading to the latest prettier is recommended (by adding `--upgrade` to the `npm` command). + +```bash +$ prettier --version +2.3.0 +``` + +After you've confirmed your prettier version, you can format all the `.md` files: + +```bash +prettier -w {ballista,datafusion,datafusion-examples,dev,docs,python}/**/*.md +``` diff --git a/datafusion-examples/examples/README.md b/datafusion-examples/examples/README.md index 163ef3d952bf5..2b24a22382a19 100644 --- a/datafusion-examples/examples/README.md +++ b/datafusion-examples/examples/README.md @@ -25,4 +25,4 @@ The examples `csv_sql.rs` and `parquet_sql.rs` demonstrate building a query plan ## Distributed -The `flight-client.rs` and `flight-server.rs` examples demonstrate how to run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. \ No newline at end of file +The `flight-client.rs` and `flight-server.rs` examples demonstrate how to run DataFusion as a standalone process and execute SQL queries from a client using the Flight protocol. diff --git a/dev/README.md b/dev/README.md index 258792b805a0b..27440878bbcec 100644 --- a/dev/README.md +++ b/dev/README.md @@ -33,14 +33,16 @@ committer. ## How to merge a Pull request -Please don't merge PRs using the Github Web interface. Instead, set up -your git clone such as to have a remote named ``apache`` pointing to the +Please don't merge PRs using the Github Web interface. Instead, set up +your git clone such as to have a remote named `apache` pointing to the official Arrow repository: + ``` git remote add apache git@github.com:apache/arrow.git ``` and then run the following command: + ``` ./dev/merge_arrow_pr.sh ``` @@ -64,10 +66,13 @@ If these aren't supplied, the script will ask you the values of them. Note that the directory name of your Arrow git clone must be called `arrow`. example output: + ``` Which pull request would you like to merge? (e.g. 34): ``` + Type the pull request number (from https://github.com/apache/arrow/pulls) and hit enter. + ``` === Pull Request #X === title Blah Blah Blah @@ -77,7 +82,9 @@ url https://api.github.com/repos/apache/arrow/pulls/X Proceed with merging pull request #3? (y/n): ``` + If this looks good, type y and hit enter. + ``` From git-wip-us.apache.org:/repos/asf/arrow.git * [new branch] master -> PR_TOOL_MERGE_PR_3_MASTER @@ -85,8 +92,10 @@ Switched to branch 'PR_TOOL_MERGE_PR_3_MASTER' Merge complete (local ref PR_TOOL_MERGE_PR_3_MASTER). Push to apache? (y/n): ``` + A local branch with the merge has been created. type y and hit enter to push it to apache master + ``` Counting objects: 67, done. Delta compression using up to 4 threads. @@ -115,6 +124,7 @@ Merge hash: 485658a5 Would you like to pick 485658a5 into another branch? 
(y/n): ``` + For now just say n as we have 1 branch ## Verifying Release Candidates diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md b/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md index 15825218c13ac..f914287dcc092 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md +++ b/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md @@ -2,9 +2,9 @@ Revision: {revision} Submitted crossbow builds: [{repo} @ {branch}](https://github.com/{repo}/branches/all?query={branch}) -|Task|Status| -|----|------| -|docker-cpp-cmake32|[![CircleCI](https://img.shields.io/circleci/build/github/{repo}/{branch}-circle-docker-cpp-cmake32.svg)](https://circleci.com/gh/{repo}/tree/{branch}-circle-docker-cpp-cmake32)| -|wheel-osx-cp36m|[![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp36m.svg)](https://travis-ci.com/{repo}/branches)| -|wheel-osx-cp37m|[![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp37m.svg)](https://travis-ci.com/{repo}/branches)| -|wheel-win-cp36m|[![Appveyor](https://img.shields.io/appveyor/ci/{repo}/{branch}-appveyor-wheel-win-cp36m.svg)](https://ci.appveyor.com/project/{repo}/history)| +| Task | Status | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| docker-cpp-cmake32 | [![CircleCI](https://img.shields.io/circleci/build/github/{repo}/{branch}-circle-docker-cpp-cmake32.svg)](https://circleci.com/gh/{repo}/tree/{branch}-circle-docker-cpp-cmake32) | +| wheel-osx-cp36m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp36m.svg)](https://travis-ci.com/{repo}/branches) | +| wheel-osx-cp37m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp37m.svg)](https://travis-ci.com/{repo}/branches) | +| wheel-win-cp36m | [![Appveyor](https://img.shields.io/appveyor/ci/{repo}/{branch}-appveyor-wheel-win-cp36m.svg)](https://ci.appveyor.com/project/{repo}/history) | diff --git a/dev/benchmarking/README.md b/dev/benchmarking/README.md index c5ddd62e026fd..0c49baf3a9f2f 100644 --- a/dev/benchmarking/README.md +++ b/dev/benchmarking/README.md @@ -17,7 +17,6 @@ ~ under the License. --> - > NOTE: For those deploying this database, Postgres does not by default use > UTF-8, however it is [required for the jsonb][pg-jsonb] format used in > some columns to always work. This [stackoverflow post][so-utf8] describes @@ -81,7 +80,6 @@ database will be set up automatically once the container is running. To start the containers, be sure to have [Docker installed][docker], and then run the following from this directory (arrow/dev/benchmarking). - ``` docker-compose up ``` @@ -111,9 +109,11 @@ The `psql` shell client is bundled with the PostgreSQL core distribution available from the [Postgres download page][postgres-downloads]. Using the `PG_USER` defined in the `.env` file (currently "benchmark"), the command to connect to the container is: + ```shell psql -h localhost -p 5432 -U benchmark ``` + There is an example script in [examples/example.sql](examples/example.sql) that runs some queries against the database. To run it in the psql client, type the following in the psql command-line interface: @@ -168,7 +168,7 @@ The script [graphql_submit.sh](./graphql_submit.sh) simplifies submission to the database via curl. 
Examples: ```shell -./graphql_submit.sh benchmarks examples/benchmark_example.json +./graphql_submit.sh benchmarks examples/benchmark_example.json ./graphql_submit.sh runs examples/benchmark_run_example.json ``` @@ -176,6 +176,7 @@ to the database via curl. Examples: The output of the query is a JSON object that is hard to read on the command line. Here is an example query in the shell: + ```shell curl -X POST \ -H "Content-Type: application/json" \ @@ -190,6 +191,7 @@ which (if you have previously run the "examples.sql" command) yields ``` Here is an example query using Python: + ```python import json import requests @@ -219,7 +221,6 @@ for row in response.json()['data']['allEnvironmentViews']['edges']: > how to do it for Amazon RDS. This [section of the docs][pg-charset] > states how to do it in general, i.e.: `initdb -E UTF8`. - ## Quick reference - String variables `'have single quotes'` diff --git a/dev/release/VERIFY.md b/dev/release/VERIFY.md index 5b441ac13f1ca..ec77bccaf5b41 100644 --- a/dev/release/VERIFY.md +++ b/dev/release/VERIFY.md @@ -34,11 +34,11 @@ GLib, Java and JavaScript builds on Linux and macOS. Read the comments in You need the followings to verify C GLib build: - * GLib - * GObject Introspection - * Ruby (not EOL-ed version is required) - * gobject-introspection gem - * test-unit gem +- GLib +- GObject Introspection +- Ruby (not EOL-ed version is required) +- gobject-introspection gem +- test-unit gem You can install them by the followings on Debian GNU/Linux and Ubuntu: diff --git a/dev/tasks/conda-recipes/README.md b/dev/tasks/conda-recipes/README.md index 39f82f1b01a9d..074cefe52de01 100644 --- a/dev/tasks/conda-recipes/README.md +++ b/dev/tasks/conda-recipes/README.md @@ -54,14 +54,12 @@ related parts (the cloning of arrow and the jinja templated variables) and moving the matrix definitions like [this][matrix-definition] to the crossbow [tasks.yml][../tasks.yml] config file. - ### Porting recipes from crossbow to the upstream feedstocks Theoretically these recipes should be up to date with the actual version of Arrow, so during the release procedure the content of these recipes should be copied to the upstream feedstocks. - [arrow-cpp-feedstock]: https://github.com/conda-forge/arrow-cpp-feedstock [parquet-cpp-feedstock]: https://github.com/conda-forge/parquet-cpp-feedstock [matrix-definition]: https://github.com/conda-forge/arrow-cpp-feedstock/blob/master/.azure-pipelines/azure-pipelines-linux.yml#L12 diff --git a/dev/tasks/gandiva-jars/README.md b/dev/tasks/gandiva-jars/README.md index 2f4c694d799b3..5de59a08debb7 100644 --- a/dev/tasks/gandiva-jars/README.md +++ b/dev/tasks/gandiva-jars/README.md @@ -26,4 +26,4 @@ Do the following to update arrow manylinux docker image for building Gandiva Jar - Export JAVA_HOME environment variable. - Then update build_boost.sh under python/manylinux1/scripts to build boost statically. -Please look at https://github.com/praveenbingo/arrow/tree/buildGandivaDocker that already has these changes. \ No newline at end of file +Please look at https://github.com/praveenbingo/arrow/tree/buildGandivaDocker that already has these changes. 
diff --git a/dev/tasks/linux-packages/README.md b/dev/tasks/linux-packages/README.md index cafcc04ed0469..a1a14d1531aad 100644 --- a/dev/tasks/linux-packages/README.md +++ b/dev/tasks/linux-packages/README.md @@ -21,9 +21,9 @@ ## Requirements - * Ruby - * Docker - * Tools to build tar.gz for Apache Arrow C++ and GLib +- Ruby +- Docker +- Tools to build tar.gz for Apache Arrow C++ and GLib ## How to build .deb packages From 3f7736c4efc1199b0ee63f7ce2f9ac6eb3b5b2e0 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Thu, 27 May 2021 02:57:10 +0200 Subject: [PATCH 127/329] #352: BallistaContext::collect() logging is too noisy (#394) --- ballista/rust/client/src/context.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index e26dcac256d28..df97e3a22984c 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -186,6 +186,8 @@ impl BallistaContext { .into_inner() .job_id; + let mut prev_status: Option = None; + loop { let GetJobStatusResult { status } = scheduler .get_job_status(GetJobStatusParams { @@ -198,14 +200,21 @@ impl BallistaContext { DataFusionError::Internal("Received empty status message".to_owned()) })?; let wait_future = tokio::time::sleep(Duration::from_millis(100)); + let has_status_change = prev_status.map(|x| x != status).unwrap_or(true); match status { job_status::Status::Queued(_) => { - info!("Job {} still queued...", job_id); + if has_status_change { + info!("Job {} still queued...", job_id); + } wait_future.await; + prev_status = Some(status); } job_status::Status::Running(_) => { - info!("Job {} is running...", job_id); + if has_status_change { + info!("Job {} is running...", job_id); + } wait_future.await; + prev_status = Some(status); } job_status::Status::Failed(err) => { let msg = format!("Job {} failed: {}", job_id, err.error); From 9e7bd2d13643c81e474e023749998ec8efa770a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 27 May 2021 06:57:48 +0200 Subject: [PATCH 128/329] Speed up `create_batch_from_map` (#339) --- .../src/physical_plan/hash_aggregate.rs | 156 ++++++++---------- datafusion/src/scalar.rs | 140 +++++++++++++--- 2 files changed, 182 insertions(+), 114 deletions(-) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 5008f49250b0b..ffb51b2e8a1f2 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -20,6 +20,7 @@ use std::any::Any; use std::sync::Arc; use std::task::{Context, Poll}; +use std::vec; use ahash::RandomState; use futures::{ @@ -32,6 +33,7 @@ use crate::physical_plan::{ Accumulator, AggregateExpr, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, SQLMetric, }; +use crate::scalar::ScalarValue; use arrow::{ array::{Array, UInt32Builder}, @@ -623,10 +625,12 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( DataType::UInt64 => { dictionary_create_key_for_col::(col, row, vec)?; } - _ => return Err(DataFusionError::Internal(format!( + _ => { + return Err(DataFusionError::Internal(format!( "Unsupported GROUP BY type (dictionary index type not supported creating key) {}", col.data_type(), - ))), + ))) + } }, _ => { // This is internal because we should have caught this before. 
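The speed-up in this commit comes from building each output column of `create_batch_from_map` in a single pass with `ScalarValue::iter_to_array` (whose signature also changes to take owned values, as shown in the hunks that follow) instead of materializing a one-row array per group and concatenating them. A rough sketch of the pattern, using made-up group keys rather than anything from the patch:

```rust
use arrow::array::ArrayRef;
use datafusion::error::Result;
use datafusion::scalar::ScalarValue;

/// Illustrative only: turn one scalar per group into a single column.
fn group_keys_to_column(keys: Vec<i32>) -> Result<ArrayRef> {
    // one ScalarValue per group key ...
    let scalars = keys.into_iter().map(|k| ScalarValue::Int32(Some(k)));
    // ... becomes a single Int32 array with one row per group
    ScalarValue::iter_to_array(scalars)
}
```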
@@ -957,20 +961,6 @@ impl RecordBatchStream for HashAggregateStream { } } -/// Given Vec>, concatenates the inners `Vec` into `ArrayRef`, returning `Vec` -/// This assumes that `arrays` is not empty. -fn concatenate(arrays: Vec>) -> ArrowResult> { - (0..arrays[0].len()) - .map(|column| { - let array_list = arrays - .iter() - .map(|a| a[column].as_ref()) - .collect::>(); - compute::concat(&array_list) - }) - .collect::>>() -} - /// Create a RecordBatch with all group keys and accumulator' states or values. fn create_batch_from_map( mode: &AggregateMode, @@ -978,84 +968,72 @@ fn create_batch_from_map( num_group_expr: usize, output_schema: &Schema, ) -> ArrowResult { - // 1. for each key - // 2. create single-row ArrayRef with all group expressions - // 3. create single-row ArrayRef with all aggregate states or values - // 4. collect all in a vector per key of vec, vec[i][j] - // 5. concatenate the arrays over the second index [j] into a single vec. - let arrays = accumulators - .iter() - .map(|(_, (group_by_values, accumulator_set, _))| { - // 2. - let mut groups = (0..num_group_expr) - .map(|i| match &group_by_values[i] { - GroupByScalar::Float32(n) => { - Arc::new(Float32Array::from(vec![(*n).into()] as Vec)) - as ArrayRef - } - GroupByScalar::Float64(n) => { - Arc::new(Float64Array::from(vec![(*n).into()] as Vec)) - as ArrayRef - } - GroupByScalar::Int8(n) => { - Arc::new(Int8Array::from(vec![*n])) as ArrayRef - } - GroupByScalar::Int16(n) => Arc::new(Int16Array::from(vec![*n])), - GroupByScalar::Int32(n) => Arc::new(Int32Array::from(vec![*n])), - GroupByScalar::Int64(n) => Arc::new(Int64Array::from(vec![*n])), - GroupByScalar::UInt8(n) => Arc::new(UInt8Array::from(vec![*n])), - GroupByScalar::UInt16(n) => Arc::new(UInt16Array::from(vec![*n])), - GroupByScalar::UInt32(n) => Arc::new(UInt32Array::from(vec![*n])), - GroupByScalar::UInt64(n) => Arc::new(UInt64Array::from(vec![*n])), - GroupByScalar::Utf8(str) => { - Arc::new(StringArray::from(vec![&***str])) - } - GroupByScalar::LargeUtf8(str) => { - Arc::new(LargeStringArray::from(vec![&***str])) - } - GroupByScalar::Boolean(b) => Arc::new(BooleanArray::from(vec![*b])), - GroupByScalar::TimeMillisecond(n) => { - Arc::new(TimestampMillisecondArray::from(vec![*n])) - } - GroupByScalar::TimeMicrosecond(n) => { - Arc::new(TimestampMicrosecondArray::from(vec![*n])) - } - GroupByScalar::TimeNanosecond(n) => { - Arc::new(TimestampNanosecondArray::from_vec(vec![*n], None)) - } - GroupByScalar::Date32(n) => Arc::new(Date32Array::from(vec![*n])), - }) - .collect::>(); + if accumulators.is_empty() { + return Ok(RecordBatch::new_empty(Arc::new(output_schema.to_owned()))); + } + let (_, (_, accs, _)) = accumulators.iter().next().unwrap(); + let mut acc_data_types: Vec = vec![]; - // 3. - groups.extend( - finalize_aggregation(accumulator_set, mode) - .map_err(DataFusionError::into_arrow_external_error)?, - ); + // Calculate number/shape of state arrays + match mode { + AggregateMode::Partial => { + for acc in accs.iter() { + let state = acc + .state() + .map_err(DataFusionError::into_arrow_external_error)?; + acc_data_types.push(state.len()); + } + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + acc_data_types = vec![1; accs.len()]; + } + } - Ok(groups) + let mut columns = (0..num_group_expr) + .map(|i| { + ScalarValue::iter_to_array(accumulators.into_iter().map( + |(_, (group_by_values, _, _))| ScalarValue::from(&group_by_values[i]), + )) }) - // 4. 
- .collect::>>>()?; + .collect::>>() + .map_err(|x| x.into_arrow_external_error())?; + + // add state / evaluated arrays + for (x, &state_len) in acc_data_types.iter().enumerate() { + for y in 0..state_len { + match mode { + AggregateMode::Partial => { + let res = ScalarValue::iter_to_array(accumulators.into_iter().map( + |(_, (_, accumulator, _))| { + let x = accumulator[x].state().unwrap(); + x[y].clone() + }, + )) + .map_err(DataFusionError::into_arrow_external_error)?; + + columns.push(res); + } + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let res = ScalarValue::iter_to_array(accumulators.into_iter().map( + |(_, (_, accumulator, _))| accumulator[x].evaluate().unwrap(), + )) + .map_err(DataFusionError::into_arrow_external_error)?; + columns.push(res); + } + } + } + } - let batch = if !arrays.is_empty() { - // 5. - let columns = concatenate(arrays)?; + // cast output if needed (e.g. for types like Dictionary where + // the intermediate GroupByScalar type was not the same as the + // output + let columns = columns + .iter() + .zip(output_schema.fields().iter()) + .map(|(col, desired_field)| cast(col, desired_field.data_type())) + .collect::>>()?; - // cast output if needed (e.g. for types like Dictionary where - // the intermediate GroupByScalar type was not the same as the - // output - let columns = columns - .iter() - .zip(output_schema.fields().iter()) - .map(|(col, desired_field)| cast(col, desired_field.data_type())) - .collect::>>()?; - - RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns)? - } else { - RecordBatch::new_empty(Arc::new(output_schema.to_owned())) - }; - Ok(batch) + RecordBatch::try_new(Arc::new(output_schema.to_owned()), columns) } fn create_accumulators( diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index f3fa5b2c5de5c..ac7deeed22c74 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -21,10 +21,10 @@ use crate::error::{DataFusionError, Result}; use arrow::{ array::*, datatypes::{ - ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type, Int16Type, - Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, - UInt16Type, UInt32Type, UInt64Type, UInt8Type, + ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Float32Type, + Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalUnit, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, + TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }, }; use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; @@ -311,7 +311,7 @@ impl ScalarValue { /// ]; /// /// // Build an Array from the list of ScalarValues - /// let array = ScalarValue::iter_to_array(scalars.iter()) + /// let array = ScalarValue::iter_to_array(scalars.into_iter()) /// .unwrap(); /// /// let expected: ArrayRef = std::sync::Arc::new( @@ -324,8 +324,8 @@ impl ScalarValue { /// /// assert_eq!(&array, &expected); /// ``` - pub fn iter_to_array<'a>( - scalars: impl IntoIterator, + pub fn iter_to_array( + scalars: impl IntoIterator, ) -> Result { let mut scalars = scalars.into_iter().peekable(); @@ -344,10 +344,10 @@ impl ScalarValue { macro_rules! 
build_array_primitive { ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ { - let values = scalars + let array = scalars .map(|sv| { if let ScalarValue::$SCALAR_TY(v) = sv { - Ok(*v) + Ok(v) } else { Err(DataFusionError::Internal(format!( "Inconsistent types in ScalarValue::iter_to_array. \ @@ -356,9 +356,8 @@ impl ScalarValue { ))) } }) - .collect::>>()?; + .collect::>()?; - let array: $ARRAY_TY = values.iter().collect(); Arc::new(array) } }}; @@ -369,7 +368,7 @@ impl ScalarValue { macro_rules! build_array_string { ($ARRAY_TY:ident, $SCALAR_TY:ident) => {{ { - let values = scalars + let array = scalars .map(|sv| { if let ScalarValue::$SCALAR_TY(v) = sv { Ok(v) @@ -381,19 +380,74 @@ impl ScalarValue { ))) } }) - .collect::>>()?; - - // it is annoying that one can not create - // StringArray et al directly from iter of &String, - // requiring this map to &str - let values = values.iter().map(|s| s.as_ref()); - - let array: $ARRAY_TY = values.collect(); + .collect::>()?; Arc::new(array) } }}; } + macro_rules! build_array_list_primitive { + ($ARRAY_TY:ident, $SCALAR_TY:ident, $NATIVE_TYPE:ident) => {{ + Arc::new(ListArray::from_iter_primitive::<$ARRAY_TY, _, _>( + scalars.into_iter().map(|x| match x { + ScalarValue::List(xs, _) => xs.map(|x| { + x.iter() + .map(|x| match x { + ScalarValue::$SCALAR_TY(i) => *i, + sv => panic!("Inconsistent types in ScalarValue::iter_to_array. \ + Expected {:?}, got {:?}", data_type, sv), + }) + .collect::>>() + }), + sv => panic!("Inconsistent types in ScalarValue::iter_to_array. \ + Expected {:?}, got {:?}", data_type, sv), + }), + )) + }}; + } + + macro_rules! build_array_list_string { + ($BUILDER:ident, $SCALAR_TY:ident) => {{ + let mut builder = ListBuilder::new($BUILDER::new(0)); + + for scalar in scalars.into_iter() { + match scalar { + ScalarValue::List(Some(xs), _) => { + for s in xs { + match s { + ScalarValue::$SCALAR_TY(Some(val)) => { + builder.values().append_value(val)?; + } + ScalarValue::$SCALAR_TY(None) => { + builder.values().append_null()?; + } + sv => return Err(DataFusionError::Internal(format!( + "Inconsistent types in ScalarValue::iter_to_array. \ + Expected Utf8, got {:?}", + sv + ))), + } + } + builder.append(true)?; + } + ScalarValue::List(None, _) => { + builder.append(false)?; + } + sv => { + return Err(DataFusionError::Internal(format!( + "Inconsistent types in ScalarValue::iter_to_array. 
\ + Expected List, got {:?}", + sv + ))) + } + } + } + + Arc::new(builder.finish()) + + }} + } + let array: ArrayRef = match &data_type { DataType::Boolean => build_array_primitive!(BooleanArray, Boolean), DataType::Float32 => build_array_primitive!(Float32Array, Float32), @@ -430,6 +484,42 @@ impl ScalarValue { DataType::Interval(IntervalUnit::YearMonth) => { build_array_primitive!(IntervalYearMonthArray, IntervalYearMonth) } + DataType::List(fields) if fields.data_type() == &DataType::Int8 => { + build_array_list_primitive!(Int8Type, Int8, i8) + } + DataType::List(fields) if fields.data_type() == &DataType::Int16 => { + build_array_list_primitive!(Int16Type, Int16, i16) + } + DataType::List(fields) if fields.data_type() == &DataType::Int32 => { + build_array_list_primitive!(Int32Type, Int32, i32) + } + DataType::List(fields) if fields.data_type() == &DataType::Int64 => { + build_array_list_primitive!(Int64Type, Int64, i64) + } + DataType::List(fields) if fields.data_type() == &DataType::UInt8 => { + build_array_list_primitive!(UInt8Type, UInt8, u8) + } + DataType::List(fields) if fields.data_type() == &DataType::UInt16 => { + build_array_list_primitive!(UInt16Type, UInt16, u16) + } + DataType::List(fields) if fields.data_type() == &DataType::UInt32 => { + build_array_list_primitive!(UInt32Type, UInt32, u32) + } + DataType::List(fields) if fields.data_type() == &DataType::UInt64 => { + build_array_list_primitive!(UInt64Type, UInt64, u64) + } + DataType::List(fields) if fields.data_type() == &DataType::Float32 => { + build_array_list_primitive!(Float32Type, Float32, f32) + } + DataType::List(fields) if fields.data_type() == &DataType::Float64 => { + build_array_list_primitive!(Float64Type, Float64, f64) + } + DataType::List(fields) if fields.data_type() == &DataType::Utf8 => { + build_array_list_string!(StringBuilder, Utf8) + } + DataType::List(fields) if fields.data_type() == &DataType::LargeUtf8 => { + build_array_list_string!(LargeStringBuilder, LargeUtf8) + } _ => { return Err(DataFusionError::Internal(format!( "Unsupported creation of {:?} array from ScalarValue {:?}", @@ -1102,7 +1192,7 @@ mod tests { let scalars: Vec<_> = $INPUT.iter().map(|v| ScalarValue::$SCALAR_T(*v)).collect(); - let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); let expected: ArrayRef = Arc::new($ARRAYTYPE::from($INPUT)); @@ -1119,7 +1209,7 @@ mod tests { .map(|v| ScalarValue::$SCALAR_T(v.map(|v| v.to_string()))) .collect(); - let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); let expected: ArrayRef = Arc::new($ARRAYTYPE::from($INPUT)); @@ -1136,7 +1226,7 @@ mod tests { .map(|v| ScalarValue::$SCALAR_T(v.map(|v| v.to_vec()))) .collect(); - let array = ScalarValue::iter_to_array(scalars.iter()).unwrap(); + let array = ScalarValue::iter_to_array(scalars.into_iter()).unwrap(); let expected: $ARRAYTYPE = $INPUT.iter().map(|v| v.map(|v| v.to_vec())).collect(); @@ -1210,7 +1300,7 @@ mod tests { fn scalar_iter_to_array_empty() { let scalars = vec![] as Vec; - let result = ScalarValue::iter_to_array(scalars.iter()).unwrap_err(); + let result = ScalarValue::iter_to_array(scalars.into_iter()).unwrap_err(); assert!( result .to_string() @@ -1226,7 +1316,7 @@ mod tests { // If the scalar values are not all the correct type, error here let scalars: Vec = vec![Boolean(Some(true)), Int32(Some(5))]; - let result = 
ScalarValue::iter_to_array(scalars.iter()).unwrap_err(); + let result = ScalarValue::iter_to_array(scalars.into_iter()).unwrap_err(); assert!(result.to_string().contains("Inconsistent types in ScalarValue::iter_to_array. Expected Boolean, got Int32(5)"), "{}", result); } From 9c0ad7b68387181e9d35d34c5dd55b6fe43b94d3 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Thu, 27 May 2021 08:23:05 +0200 Subject: [PATCH 129/329] Fixed master. (#433) --- datafusion/src/physical_plan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 4f90a8cf7d6ec..b1234a0314aa6 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -558,7 +558,7 @@ pub trait WindowAccumulator: Send + Sync + Debug { .collect::>>(); Ok(match result { - Some(arr) if num_rows == arr.len() => Some(ScalarValue::iter_to_array(&arr)?), + Some(arr) if num_rows == arr.len() => Some(ScalarValue::iter_to_array(arr)?), None => None, Some(arr) => { return Err(DataFusionError::Internal(format!( From bc1385d000b1c87700b1145406277abdec815aa7 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 28 May 2021 04:21:54 +0800 Subject: [PATCH 130/329] add support for ndjson for datafusion-cli (#427) --- datafusion-cli/src/print_format.rs | 50 ++++++++++++++++++++---------- datafusion/docs/cli.md | 2 +- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs index c7aa06149678a..34cf5e1f65e91 100644 --- a/datafusion-cli/src/print_format.rs +++ b/datafusion-cli/src/print_format.rs @@ -17,7 +17,7 @@ //! Print format variants use arrow::csv::writer::WriterBuilder; -use arrow::json::ArrayWriter; +use arrow::json::{ArrayWriter, LineDelimitedWriter}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::util::pretty; use datafusion::error::{DataFusionError, Result}; @@ -31,6 +31,7 @@ pub enum PrintFormat { Tsv, Table, Json, + NdJson, } /// returns all print formats @@ -40,17 +41,19 @@ pub fn all_print_formats() -> Vec { PrintFormat::Tsv, PrintFormat::Table, PrintFormat::Json, + PrintFormat::NdJson, ] } impl FromStr for PrintFormat { type Err = (); fn from_str(s: &str) -> std::result::Result { - match s { + match s.to_lowercase().as_str() { "csv" => Ok(Self::Csv), "tsv" => Ok(Self::Tsv), "table" => Ok(Self::Table), "json" => Ok(Self::Json), + "ndjson" => Ok(Self::NdJson), _ => Err(()), } } @@ -63,20 +66,21 @@ impl fmt::Display for PrintFormat { Self::Tsv => write!(f, "tsv"), Self::Table => write!(f, "table"), Self::Json => write!(f, "json"), + Self::NdJson => write!(f, "ndjson"), } } } -fn print_batches_to_json(batches: &[RecordBatch]) -> Result { - let mut bytes = vec![]; - { - let mut writer = ArrayWriter::new(&mut bytes); - writer.write_batches(batches)?; - writer.finish()?; - } - let formatted = String::from_utf8(bytes) - .map_err(|e| DataFusionError::Execution(e.to_string()))?; - Ok(formatted) +macro_rules! batches_to_json { + ($WRITER: ident, $batches: expr) => {{ + let mut bytes = vec![]; + { + let mut writer = $WRITER::new(&mut bytes); + writer.write_batches($batches)?; + writer.finish()?; + } + String::from_utf8(bytes).map_err(|e| DataFusionError::Execution(e.to_string()))? 
+ }}; } fn print_batches_with_sep(batches: &[RecordBatch], delimiter: u8) -> Result { @@ -102,7 +106,10 @@ impl PrintFormat { Self::Csv => println!("{}", print_batches_with_sep(batches, b',')?), Self::Tsv => println!("{}", print_batches_with_sep(batches, b'\t')?), Self::Table => pretty::print_batches(batches)?, - Self::Json => println!("{}", print_batches_to_json(batches)?), + Self::Json => println!("{}", batches_to_json!(ArrayWriter, batches)), + Self::NdJson => { + println!("{}", batches_to_json!(LineDelimitedWriter, batches)) + } } Ok(()) } @@ -126,6 +133,9 @@ mod tests { let format = "json".parse::().unwrap(); assert_eq!(PrintFormat::Json, format); + let format = "ndjson".parse::().unwrap(); + assert_eq!(PrintFormat::NdJson, format); + let format = "table".parse::().unwrap(); assert_eq!(PrintFormat::Table, format); } @@ -136,6 +146,7 @@ mod tests { assert_eq!("table", PrintFormat::Table.to_string()); assert_eq!("tsv", PrintFormat::Tsv.to_string()); assert_eq!("json", PrintFormat::Json.to_string()); + assert_eq!("ndjson", PrintFormat::NdJson.to_string()); } #[test] @@ -170,9 +181,12 @@ mod tests { } #[test] - fn test_print_batches_to_json_empty() { + fn test_print_batches_to_json_empty() -> Result<()> { let batches = vec![]; - let r = print_batches_to_json(&batches).unwrap(); + let r = batches_to_json!(ArrayWriter, &batches); + assert_eq!("", r); + + let r = batches_to_json!(LineDelimitedWriter, &batches); assert_eq!("", r); let schema = Arc::new(Schema::new(vec![ @@ -192,7 +206,11 @@ mod tests { .unwrap(); let batches = vec![batch]; - let r = print_batches_to_json(&batches).unwrap(); + let r = batches_to_json!(ArrayWriter, &batches); assert_eq!("[{\"a\":1,\"b\":4,\"c\":7},{\"a\":2,\"b\":5,\"c\":8},{\"a\":3,\"b\":6,\"c\":9}]", r); + + let r = batches_to_json!(LineDelimitedWriter, &batches); + assert_eq!("{\"a\":1,\"b\":4,\"c\":7}\n{\"a\":2,\"b\":5,\"c\":8}\n{\"a\":3,\"b\":6,\"c\":9}\n", r); + Ok(()) } } diff --git a/datafusion/docs/cli.md b/datafusion/docs/cli.md index a1fd652b7a047..d62dcdd5b4f1d 100644 --- a/datafusion/docs/cli.md +++ b/datafusion/docs/cli.md @@ -61,7 +61,7 @@ OPTIONS: -c, --batch-size The batch size of each query, or use DataFusion default -p, --data-path Path to your data, default to current directory -f, --file Execute commands from file, then exit - --format Output format (possible values: table, csv, tsv, json) [default: table] + --format Output format [default: table] [possible values: csv, tsv, table, json, ndjson] ``` Type `exit` or `quit` to exit the CLI. From 7007f8e4e1a4d8df3a984088c6e2273de4e70b9e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 28 May 2021 05:56:09 -0400 Subject: [PATCH 131/329] Rewrite pruning logic in terms of PruningStatistics using Array trait (option 2) (#426) * Rewrite pruning logic in terms of PruningStatistics using Array trait * avoid a collect * Revert "avoid a collect" This reverts commit 86f80797041ad08b236ac72a8cb810c0d9bd1c26. 
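The pruning rewrite that follows splits the problem into two pieces: a `PruningStatistics` trait that exposes per-container min/max arrays, and a `PruningPredicate` that rewrites a filter in terms of those statistics and evaluates it. A minimal sketch of how a caller might wire the two together; the `MyStats` type, the `prune_example` function, and the sample numbers are hypothetical and not part of the patch:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::logical_plan::{col, lit};
use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};

/// Illustrative only: min/max of column "a" for three containers (e.g. row groups).
struct MyStats {
    min_a: ArrayRef,
    max_a: ArrayRef,
}

impl PruningStatistics for MyStats {
    fn min_values(&self, column: &str) -> Option<ArrayRef> {
        if column == "a" {
            Some(self.min_a.clone())
        } else {
            None
        }
    }

    fn max_values(&self, column: &str) -> Option<ArrayRef> {
        if column == "a" {
            Some(self.max_a.clone())
        } else {
            None
        }
    }

    fn num_containers(&self) -> usize {
        self.min_a.len()
    }
}

fn prune_example() -> Result<Vec<bool>> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    // the filter `a > 15` is rewritten internally against the max statistics
    let predicate = PruningPredicate::try_new(&col("a").gt(lit(15)), schema)?;
    let stats = MyStats {
        min_a: Arc::new(Int32Array::from(vec![0, 20, 5])),
        max_a: Arc::new(Int32Array::from(vec![10, 30, 50])),
    };
    // expected: [false, true, true]; the first container cannot satisfy a > 15
    predicate.prune(&stats)
}
```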
* update for new api --- datafusion/src/physical_optimizer/pruning.rs | 702 +++++++++++------- .../src/physical_optimizer/repartition.rs | 6 +- datafusion/src/physical_plan/parquet.rs | 133 +++- 3 files changed, 571 insertions(+), 270 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 0446904eae030..3a5a64c6f6689 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -31,19 +31,11 @@ use std::{collections::HashSet, sync::Arc}; use arrow::{ - array::{ - make_array, new_null_array, ArrayData, ArrayRef, BooleanArray, - BooleanBufferBuilder, - }, - buffer::MutableBuffer, - datatypes::{DataType, Field, Schema}, + array::{new_null_array, ArrayRef, BooleanArray}, + datatypes::{Field, Schema, SchemaRef}, record_batch::RecordBatch, }; -use parquet::file::{ - metadata::RowGroupMetaData, statistics::Statistics as ParquetStatistics, -}; - use crate::{ error::{DataFusionError, Result}, execution::context::ExecutionContextState, @@ -52,26 +44,81 @@ use crate::{ physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr}, }; +/// Interface to pass statistics information to [`PruningPredicates`] +/// +/// Returns statistics for containers / files of data in Arrays. +/// +/// For example, for the following three files with a single column +/// ```text +/// file1: column a: min=5, max=10 +/// file2: column a: No stats +/// file2: column a: min=20, max=30 +/// ``` +/// +/// PruningStatistics should return: +/// +/// ```text +/// min_values("a") -> Some([5, Null, 20]) +/// max_values("a") -> Some([20, Null, 30]) +/// min_values("X") -> None +/// ``` +pub trait PruningStatistics { + /// return the minimum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows + fn min_values(&self, column: &str) -> Option; + + /// return the maximum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows. + fn max_values(&self, column: &str) -> Option; + + /// return the number of containers (e.g. row groups) being + /// pruned with these statistics + fn num_containers(&self) -> usize; +} + +/// Evaluates filter expressions on statistics in order to +/// prune data containers (e.g. parquet row group) +/// +/// See [`try_new`] for more information. #[derive(Debug, Clone)] -/// Builder used for generating predicate functions that can be used -/// to prune data based on statistics (e.g. parquet row group metadata) -pub struct PruningPredicateBuilder { - schema: Schema, +pub struct PruningPredicate { + /// The input schema against which the predicate will be evaluated + schema: SchemaRef, + /// Actual pruning predicate (rewritten in terms of column min/max statistics) predicate_expr: Arc, + /// The statistics required to evaluate this predicate: + /// * The column name in the input schema + /// * Statistics type (e.g. Min or Max) + /// * The field the statistics value should be placed in for + /// pruning predicate evaluation stat_column_req: Vec<(String, StatisticsType, Field)>, } -impl PruningPredicateBuilder { - /// Try to create a new instance of [`PruningPredicateBuilder`] +impl PruningPredicate { + /// Try to create a new instance of [`PruningPredicate`] + /// + /// This will translate the provided `expr` filter expression into + /// a *pruning predicate*. 
+ /// + /// A pruning predicate is one that has been rewritten in terms of + /// the min and max values of column references and that evaluates + /// to FALSE if the filter predicate would evaluate FALSE *for + /// every row* whose values fell within the min / max ranges (aka + /// could be pruned). /// - /// This will translate the filter expression into a statistics predicate expression + /// The pruning predicate evaluates to TRUE or NULL + /// if the filter predicate *might* evaluate to TRUE for at least + /// one row whose vaules fell within the min/max ranges (in other + /// words they might pass the predicate) /// - /// For example, `(column / 2) = 4` becomes `(column_min / 2) <= 4 && 4 <= (column_max / 2))` - pub fn try_new(expr: &Expr, schema: Schema) -> Result { + /// For example, the filter expression `(column / 2) = 4` becomes + /// the pruning predicate + /// `(column_min / 2) <= 4 && 4 <= (column_max / 2))` + pub fn try_new(expr: &Expr, schema: SchemaRef) -> Result { // build predicate expression once let mut stat_column_req = Vec::<(String, StatisticsType, Field)>::new(); let logical_predicate_expr = - build_predicate_expression(expr, &schema, &mut stat_column_req)?; + build_predicate_expression(expr, schema.as_ref(), &mut stat_column_req)?; let stat_fields = stat_column_req .iter() .map(|(_, _, f)| f.clone()) @@ -90,37 +137,31 @@ impl PruningPredicateBuilder { }) } - /// For each set of statistics, evalates the predicate in this - /// builder and returns a `bool` with the following meaning for a - /// container with those statistics: + /// For each set of statistics, evalates the pruning predicate + /// and returns a `bool` with the following meaning for a + /// all rows whose values match the statistics: /// - /// `true`: The container MAY contain rows that match the predicate + /// `true`: There MAY be rows that match the predicate /// - /// `false`: The container MUST NOT contain rows that match the predicate + /// `false`: There are no rows that could match the predicate /// /// Note this function takes a slice of statistics as a parameter /// to amortize the cost of the evaluation of the predicate /// against a single record batch. 
- pub fn build_pruning_predicate( - &self, - statistics: &[RowGroupMetaData], - ) -> Result> { + pub fn prune(&self, statistics: &S) -> Result> { // build statistics record batch - let predicate_array = build_statistics_record_batch( - statistics, - &self.schema, - &self.stat_column_req, - ) - .and_then(|statistics_batch| { - // execute predicate expression - self.predicate_expr.evaluate(&statistics_batch) - }) - .and_then(|v| match v { - ColumnarValue::Array(array) => Ok(array), - ColumnarValue::Scalar(_) => Err(DataFusionError::Internal( - "predicate expression didn't return an array".to_string(), - )), - })?; + let predicate_array = + build_statistics_record_batch(statistics, &self.stat_column_req) + .and_then(|statistics_batch| { + // execute predicate expression + self.predicate_expr.evaluate(&statistics_batch) + }) + .and_then(|v| match v { + ColumnarValue::Array(array) => Ok(array), + ColumnarValue::Scalar(_) => Err(DataFusionError::Internal( + "predicate expression didn't return an array".to_string(), + )), + })?; let predicate_array = predicate_array .as_any() @@ -141,39 +182,78 @@ impl PruningPredicateBuilder { .map(|x| x.unwrap_or(true)) .collect::>()) } + + /// Return a reference to the input schema + pub fn schema(&self) -> &SchemaRef { + &self.schema + } } -/// Build a RecordBatch from a list of statistics (currently parquet -/// [`RowGroupMetadata`] structs), creating arrays, one for each -/// statistics column, as requested in the stat_column_req parameter. -fn build_statistics_record_batch( - statistics: &[RowGroupMetaData], - schema: &Schema, +/// Build a RecordBatch from a list of statistics, creating arrays, +/// with one row for each PruningStatistics and columns specified in +/// in the stat_column_req parameter. +/// +/// For example, if the requested columns are +/// ```text +/// ("s1", Min, Field:s1_min) +/// ("s2", Max, field:s2_max) +///``` +/// +/// And the input statistics had +/// ```text +/// S1(Min: 5, Max: 10) +/// S2(Min: 99, Max: 1000) +/// S3(Min: 1, Max: 2) +/// ``` +/// +/// Then this function would build a record batch with 2 columns and +/// one row s1_min and s2_max as follows (s3 is not requested): +/// +/// ```text +/// s1_min | s2_max +/// -------+-------- +/// 5 | 1000 +/// ``` +fn build_statistics_record_batch( + statistics: &S, stat_column_req: &[(String, StatisticsType, Field)], ) -> Result { let mut fields = Vec::::new(); let mut arrays = Vec::::new(); + // For each needed statistics column: for (column_name, statistics_type, stat_field) in stat_column_req { - if let Some((column_index, _)) = schema.column_with_name(column_name) { - let statistics = statistics - .iter() - .map(|g| g.column(column_index).statistics()) - .collect::>(); - let array = build_statistics_array( - &statistics, - *statistics_type, - stat_field.data_type(), - ); - fields.push(stat_field.clone()); - arrays.push(array); + let data_type = stat_field.data_type(); + + let num_containers = statistics.num_containers(); + + let array = match statistics_type { + StatisticsType::Min => statistics.min_values(column_name), + StatisticsType::Max => statistics.max_values(column_name), + }; + let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers)); + + if num_containers != array.len() { + return Err(DataFusionError::Internal(format!( + "mismatched statistics length. Expected {}, got {}", + num_containers, + array.len() + ))); } + + // cast statistics array to required data type (e.g. 
parquet + // provides timestamp statistics as "Int64") + let array = arrow::compute::cast(&array, data_type)?; + + fields.push(stat_field.clone()); + arrays.push(array); } + let schema = Arc::new(Schema::new(fields)); RecordBatch::try_new(schema, arrays) .map_err(|err| DataFusionError::Plan(err.to_string())) } -struct StatisticsExpressionBuilder<'a> { +struct PruningExpressionBuilder<'a> { column_name: String, column_expr: &'a Expr, scalar_expr: &'a Expr, @@ -182,7 +262,7 @@ struct StatisticsExpressionBuilder<'a> { reverse_operator: bool, } -impl<'a> StatisticsExpressionBuilder<'a> { +impl<'a> PruningExpressionBuilder<'a> { fn try_new( left: &'a Expr, right: &'a Expr, @@ -303,7 +383,11 @@ fn rewrite_column_expr( utils::rewrite_expression(&expr, &expressions) } -/// Translate logical filter expression into statistics predicate expression +/// Translate logical filter expression into pruning predicate +/// expression that will evaluate to FALSE if it can be determined no +/// rows between the min/max values could pass the predicates. +/// +/// Returns the pruning predicate as an [`Expr`] fn build_predicate_expression( expr: &Expr, schema: &Schema, @@ -328,7 +412,7 @@ fn build_predicate_expression( } let expr_builder = - StatisticsExpressionBuilder::try_new(left, right, schema, stat_column_req); + PruningExpressionBuilder::try_new(left, right, schema, stat_column_req); let mut expr_builder = match expr_builder { Ok(builder) => builder, // allow partial failure in predicate expression generation @@ -384,210 +468,307 @@ enum StatisticsType { Max, } -fn build_statistics_array( - statistics: &[Option<&ParquetStatistics>], - statistics_type: StatisticsType, - data_type: &DataType, -) -> ArrayRef { - let statistics_count = statistics.len(); - let first_group_stats = statistics.iter().find(|s| s.is_some()); - let first_group_stats = if let Some(Some(statistics)) = first_group_stats { - // found first row group with statistics defined - statistics - } else { - // no row group has statistics defined - return new_null_array(data_type, statistics_count); +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + use crate::logical_plan::{col, lit}; + use crate::{assert_batches_eq, physical_optimizer::pruning::StatisticsType}; + use arrow::{ + array::{BinaryArray, Int32Array, Int64Array, StringArray}, + datatypes::{DataType, TimeUnit}, }; - let (data_size, arrow_type) = match first_group_stats { - ParquetStatistics::Int32(_) => (std::mem::size_of::(), DataType::Int32), - ParquetStatistics::Int64(_) => (std::mem::size_of::(), DataType::Int64), - ParquetStatistics::Float(_) => (std::mem::size_of::(), DataType::Float32), - ParquetStatistics::Double(_) => (std::mem::size_of::(), DataType::Float64), - ParquetStatistics::ByteArray(_) if data_type == &DataType::Utf8 => { - (0, DataType::Utf8) + #[derive(Debug)] + /// Test for container stats + struct ContainerStats { + min: ArrayRef, + max: ArrayRef, + } + + impl ContainerStats { + fn new_i32( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self { + min: Arc::new(min.into_iter().collect::()), + max: Arc::new(max.into_iter().collect::()), + } } - _ => { - // type of statistics not supported - return new_null_array(data_type, statistics_count); + + fn new_utf8<'a>( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self { + min: Arc::new(min.into_iter().collect::()), + max: Arc::new(max.into_iter().collect::()), + } } - }; - let statistics = statistics.iter().map(|s| { - s.filter(|s| 
s.has_min_max_set()) - .map(|s| match statistics_type { - StatisticsType::Min => s.min_bytes(), - StatisticsType::Max => s.max_bytes(), - }) - }); - - if arrow_type == DataType::Utf8 { - let data_size = statistics - .clone() - .map(|x| x.map(|b| b.len()).unwrap_or(0)) - .sum(); - let mut builder = - arrow::array::StringBuilder::with_capacity(statistics_count, data_size); - let string_statistics = - statistics.map(|x| x.and_then(|bytes| std::str::from_utf8(bytes).ok())); - for maybe_string in string_statistics { - match maybe_string { - Some(string_value) => builder.append_value(string_value).unwrap(), - None => builder.append_null().unwrap(), - }; + fn min(&self) -> Option { + Some(self.min.clone()) + } + + fn max(&self) -> Option { + Some(self.max.clone()) } - return Arc::new(builder.finish()); - } - - let mut data_buffer = MutableBuffer::new(statistics_count * data_size); - let mut bitmap_builder = BooleanBufferBuilder::new(statistics_count); - let mut null_count = 0; - for s in statistics { - if let Some(stat_data) = s { - bitmap_builder.append(true); - data_buffer.extend_from_slice(stat_data); - } else { - bitmap_builder.append(false); - data_buffer.resize(data_buffer.len() + data_size, 0); - null_count += 1; + + fn len(&self) -> usize { + assert_eq!(self.min.len(), self.max.len()); + self.min.len() } } - let mut builder = ArrayData::builder(arrow_type) - .len(statistics_count) - .add_buffer(data_buffer.into()); - if null_count > 0 { - builder = builder.null_bit_buffer(bitmap_builder.finish()); + #[derive(Debug, Default)] + struct TestStatistics { + // key: column name + stats: HashMap, } - let array_data = builder.build(); - let statistics_array = make_array(array_data); - if statistics_array.data_type() == data_type { - return statistics_array; + + impl TestStatistics { + fn new() -> Self { + Self::default() + } + + fn with( + mut self, + name: impl Into, + container_stats: ContainerStats, + ) -> Self { + self.stats.insert(name.into(), container_stats); + self + } } - // cast statistics array to required data type - arrow::compute::cast(&statistics_array, data_type) - .unwrap_or_else(|_| new_null_array(data_type, statistics_count)) -} -#[cfg(test)] -mod tests { - use super::*; - use crate::physical_optimizer::pruning::StatisticsType; - use arrow::{ - array::{Int32Array, StringArray}, - datatypes::DataType, - }; - use parquet::file::statistics::Statistics as ParquetStatistics; + impl PruningStatistics for TestStatistics { + fn min_values(&self, column: &str) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.min()) + .unwrap_or(None) + } + + fn max_values(&self, column: &str) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.max()) + .unwrap_or(None) + } + + fn num_containers(&self) -> usize { + self.stats + .values() + .next() + .map(|container_stats| container_stats.len()) + .unwrap_or(0) + } + } + + /// Returns the specified min/max container values + struct OneContainerStats { + min_values: Option, + max_values: Option, + num_containers: usize, + } + + impl PruningStatistics for OneContainerStats { + fn min_values(&self, _column: &str) -> Option { + self.min_values.clone() + } + + fn max_values(&self, _column: &str) -> Option { + self.max_values.clone() + } + + fn num_containers(&self) -> usize { + self.num_containers + } + } #[test] - fn build_statistics_array_int32() { - // build row group metadata array - let s1 = ParquetStatistics::int32(None, Some(10), None, 0, false); - let s2 = ParquetStatistics::int32(Some(2), 
Some(20), None, 0, false); - let s3 = ParquetStatistics::int32(Some(3), Some(30), None, 0, false); - let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &DataType::Int32); - let int32_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let int32_vec = int32_array.into_iter().collect::>(); - assert_eq!(int32_vec, vec![None, Some(2), Some(3)]); - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Max, &DataType::Int32); - let int32_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let int32_vec = int32_array.into_iter().collect::>(); - // here the first max value is None and not the Some(10) value which was actually set - // because the min value is None - assert_eq!(int32_vec, vec![None, Some(20), Some(30)]); + fn test_build_statistics_record_batch() { + // Request a record batch with of s1_min, s2_max, s3_max, s3_min + let stat_column_req = vec![ + // min of original column s1, named s1_min + ( + "s1".to_string(), + StatisticsType::Min, + Field::new("s1_min", DataType::Int32, true), + ), + // max of original column s2, named s2_max + ( + "s2".to_string(), + StatisticsType::Max, + Field::new("s2_max", DataType::Int32, true), + ), + // max of original column s3, named s3_max + ( + "s3".to_string(), + StatisticsType::Max, + Field::new("s3_max", DataType::Utf8, true), + ), + // min of original column s3, named s3_min + ( + "s3".to_string(), + StatisticsType::Min, + Field::new("s3_min", DataType::Utf8, true), + ), + ]; + + let statistics = TestStatistics::new() + .with( + "s1", + ContainerStats::new_i32( + vec![None, None, Some(9), None], // min + vec![Some(10), None, None, None], // max + ), + ) + .with( + "s2", + ContainerStats::new_i32( + vec![Some(2), None, None, None], // min + vec![Some(20), None, None, None], // max + ), + ) + .with( + "s3", + ContainerStats::new_utf8( + vec![Some("a"), None, None, None], // min + vec![Some("q"), None, Some("r"), None], // max + ), + ); + + let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let expected = vec![ + "+--------+--------+--------+--------+", + "| s1_min | s2_max | s3_max | s3_min |", + "+--------+--------+--------+--------+", + "| | 20 | q | a |", + "| | | | |", + "| 9 | | r | |", + "| | | | |", + "+--------+--------+--------+--------+", + ]; + + assert_batches_eq!(expected, &[batch]); } #[test] - fn build_statistics_array_utf8() { - // build row group metadata array - let s1 = ParquetStatistics::byte_array(None, Some("10".into()), None, 0, false); - let s2 = ParquetStatistics::byte_array( - Some("2".into()), - Some("20".into()), - None, - 0, - false, - ); - let s3 = ParquetStatistics::byte_array( - Some("3".into()), - Some("30".into()), - None, - 0, - false, - ); - let statistics = vec![Some(&s1), Some(&s2), Some(&s3)]; + fn test_build_statistics_casting() { + // Test requesting a Timestamp column, but getting statistics as Int64 + // which is what Parquet does + + // Request a record batch with of s1_min as a timestamp + let stat_column_req = vec![( + "s1".to_string(), + StatisticsType::Min, + Field::new( + "s1_min", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + )]; + + // Note the statistics pass back i64 (not timestamp) + let statistics = OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 1, + }; - let 
statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &DataType::Utf8); - let string_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let string_vec = string_array.into_iter().collect::>(); - assert_eq!(string_vec, vec![None, Some("2"), Some("3")]); - - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Max, &DataType::Utf8); - let string_array = statistics_array - .as_any() - .downcast_ref::() - .unwrap(); - let string_vec = string_array.into_iter().collect::>(); - // here the first max value is None and not the Some("10") value which was actually set - // because the min value is None - assert_eq!(string_vec, vec![None, Some("20"), Some("30")]); + let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let expected = vec![ + "+-------------------------------+", + "| s1_min |", + "+-------------------------------+", + "| 1970-01-01 00:00:00.000000010 |", + "+-------------------------------+", + ]; + + assert_batches_eq!(expected, &[batch]); } #[test] - fn build_statistics_array_empty_stats() { - let data_type = DataType::Int32; - let statistics = vec![]; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), 0); - - let statistics = vec![None, None]; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), statistics.len()); - assert_eq!(statistics_array.data_type(), &data_type); - for i in 0..statistics_array.len() { - assert_eq!(statistics_array.is_null(i), true); - assert_eq!(statistics_array.is_valid(i), false); - } + fn test_build_statistics_no_stats() { + let stat_column_req = vec![]; + + let statistics = OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 1, + }; + + let result = + build_statistics_record_batch(&statistics, &stat_column_req).unwrap_err(); + assert!( + result.to_string().contains("Invalid argument error"), + "{}", + result + ); } #[test] - fn build_statistics_array_unsupported_type() { - // boolean is not currently a supported type for statistics - let s1 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); - let s2 = ParquetStatistics::boolean(Some(false), Some(true), None, 0, false); - let statistics = vec![Some(&s1), Some(&s2)]; - let data_type = DataType::Boolean; - let statistics_array = - build_statistics_array(&statistics, StatisticsType::Min, &data_type); - assert_eq!(statistics_array.len(), statistics.len()); - assert_eq!(statistics_array.data_type(), &data_type); - for i in 0..statistics_array.len() { - assert_eq!(statistics_array.is_null(i), true); - assert_eq!(statistics_array.is_valid(i), false); - } + fn test_build_statistics_inconsistent_types() { + // Test requesting a Utf8 column when the stats return some other type + + // Request a record batch with of s1_min as a timestamp + let stat_column_req = vec![( + "s1".to_string(), + StatisticsType::Min, + Field::new("s1_min", DataType::Utf8, true), + )]; + + // Note the statistics return binary (which can't be cast to string) + let statistics = OneContainerStats { + min_values: Some(Arc::new(BinaryArray::from(vec![&[255u8] as &[u8]]))), + max_values: None, + num_containers: 1, + }; + + let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let expected = vec![ + "+--------+", + "| s1_min |", 
+ "+--------+", + "| |", + "+--------+", + ]; + + assert_batches_eq!(expected, &[batch]); + } + + #[test] + fn test_build_statistics_inconsistent_length() { + // return an inconsistent length to the actual statistics arrays + let stat_column_req = vec![( + "s1".to_string(), + StatisticsType::Min, + Field::new("s1_min", DataType::Int64, true), + )]; + + // Note the statistics pass back i64 (not timestamp) + let statistics = OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 3, + }; + + let result = + build_statistics_record_batch(&statistics, &stat_column_req).unwrap_err(); + assert!( + result + .to_string() + .contains("mismatched statistics length. Expected 3, got 1"), + "{}", + result + ); } #[test] fn row_group_predicate_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "#c1_min LtEq Int32(1) And Int32(1) LtEq #c1_max"; @@ -606,7 +787,6 @@ mod tests { #[test] fn row_group_predicate_gt() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "#c1_max Gt Int32(1)"; @@ -625,7 +805,6 @@ mod tests { #[test] fn row_group_predicate_gt_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "#c1_max GtEq Int32(1)"; @@ -643,7 +822,6 @@ mod tests { #[test] fn row_group_predicate_lt() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "#c1_min Lt Int32(1)"; @@ -662,7 +840,6 @@ mod tests { #[test] fn row_group_predicate_lt_eq() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "#c1_min LtEq Int32(1)"; @@ -680,7 +857,6 @@ mod tests { #[test] fn row_group_predicate_and() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![ Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), @@ -697,7 +873,6 @@ mod tests { #[test] fn row_group_predicate_or() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![ Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), @@ -713,7 +888,6 @@ mod tests { #[test] fn row_group_predicate_stat_column_req() -> Result<()> { - use crate::logical_plan::{col, lit}; let schema = Schema::new(vec![ Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), @@ -749,4 +923,34 @@ mod tests { Ok(()) } + + #[test] + fn prune_api() { + let schema = Arc::new(Schema::new(vec![ + Field::new("s1", DataType::Utf8, false), + Field::new("s2", DataType::Int32, false), + ])); + + // Prune using s2 > 5 + let expr = col("s2").gt(lit(5)); + + let statistics = TestStatistics::new().with( + "s2", + ContainerStats::new_i32( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), None, None], // max + ), + ); + + // s2 [0, 5] ==> no rows should pass + // s2 [4, 6] ==> some rows could pass + // No stats for s2 ==> some rows could pass + // s2 [3, None] (null max) ==> some rows could pass + + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + let expected = vec![false, true, true, true]; + + 
assert_eq!(result, expected); + } } diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index fee4b3e11e5d2..011db64aaf8a2 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -115,6 +115,7 @@ mod tests { #[test] fn added_repartition_to_single_partition() -> Result<()> { + let schema = Arc::new(Schema::empty()); let parquet_project = ProjectionExec::try_new( vec![], Arc::new(ParquetExec::new( @@ -122,7 +123,7 @@ mod tests { filenames: vec!["x".to_string()], statistics: Statistics::default(), }], - Schema::empty(), + schema, None, None, 2048, @@ -149,6 +150,7 @@ mod tests { #[test] fn repartition_deepest_node() -> Result<()> { + let schema = Arc::new(Schema::empty()); let parquet_project = ProjectionExec::try_new( vec![], Arc::new(ProjectionExec::try_new( @@ -158,7 +160,7 @@ mod tests { filenames: vec!["x".to_string()], statistics: Statistics::default(), }], - Schema::empty(), + schema, None, None, 2048, diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index f36171cdb73f4..55a6d96738cb4 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -17,23 +17,25 @@ //! Execution plan for reading Parquet files -use std::any::Any; use std::fmt; use std::fs::File; use std::sync::Arc; use std::task::{Context, Poll}; +use std::{any::Any, convert::TryInto}; use crate::{ error::{DataFusionError, Result}, logical_plan::Expr, - physical_optimizer::pruning::PruningPredicateBuilder, + physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, physical_plan::{ common, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, }, + scalar::ScalarValue, }; use arrow::{ + array::ArrayRef, datatypes::{Schema, SchemaRef}, error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, @@ -41,10 +43,12 @@ use arrow::{ use parquet::file::{ metadata::RowGroupMetaData, reader::{FileReader, SerializedFileReader}, + statistics::Statistics as ParquetStatistics, }; use fmt::Debug; use parquet::arrow::{ArrowReader, ParquetFileArrowReader}; + use tokio::{ sync::mpsc::{channel, Receiver, Sender}, task, @@ -69,7 +73,7 @@ pub struct ParquetExec { /// Statistics for the data set (sum of statistics for all partitions) statistics: Statistics, /// Optional predicate builder - predicate_builder: Option, + predicate_builder: Option, /// Optional limit of the number of rows limit: Option, } @@ -220,9 +224,9 @@ impl ParquetExec { schemas.len() ))); } - let schema = schemas[0].clone(); + let schema = Arc::new(schemas.pop().unwrap()); let predicate_builder = predicate.and_then(|predicate_expr| { - PruningPredicateBuilder::try_new(&predicate_expr, schema.clone()).ok() + PruningPredicate::try_new(&predicate_expr, schema.clone()).ok() }); Ok(Self::new( @@ -238,9 +242,9 @@ impl ParquetExec { /// Create a new Parquet reader execution plan with provided partitions and schema pub fn new( partitions: Vec, - schema: Schema, + schema: SchemaRef, projection: Option>, - predicate_builder: Option, + predicate_builder: Option, batch_size: usize, limit: Option, ) -> Self { @@ -457,11 +461,102 @@ fn send_result( Ok(()) } +/// Wraps parquet statistics in a way +/// that implements [`PruningStatistics`] +struct RowGroupPruningStatistics<'a> { + row_group_metadata: &'a [RowGroupMetaData], + parquet_schema: &'a Schema, +} + +/// Extract the min/max statistics from a `ParquetStatistics` 
object +macro_rules! get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))), + ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))), + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + Some(ScalarValue::Utf8(s)) + } + // type not supported yet + ParquetStatistics::FixedLenByteArray(_) => None, + } + }}; +} + +// Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate +macro_rules! get_min_max_values { + ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ + let (column_index, field) = if let Some((v, f)) = $self.parquet_schema.column_with_name($column) { + (v, f) + } else { + // Named column was not present + return None + }; + + let data_type = field.data_type(); + let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() { + v + } else { + // DataFusion doesn't have support for ScalarValues of the column type + return None + }; + + let scalar_values : Vec = $self.row_group_metadata + .iter() + .flat_map(|meta| { + meta.column(column_index).statistics() + }) + .map(|stats| { + get_statistic!(stats, $func, $bytes_func) + }) + .map(|maybe_scalar| { + // column either did't have statistics at all or didn't have min/max values + maybe_scalar.unwrap_or_else(|| null_scalar.clone()) + }) + .collect(); + + // ignore errors converting to arrays (e.g. 
different types) + ScalarValue::iter_to_array(scalar_values).ok() + }} +} + +impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { + fn min_values(&self, column: &str) -> Option { + get_min_max_values!(self, column, min, min_bytes) + } + + fn max_values(&self, column: &str) -> Option { + get_min_max_values!(self, column, max, max_bytes) + } + + fn num_containers(&self) -> usize { + self.row_group_metadata.len() + } +} + fn build_row_group_predicate( - predicate_builder: &PruningPredicateBuilder, + predicate_builder: &PruningPredicate, row_group_metadata: &[RowGroupMetaData], ) -> Box bool> { - let predicate_values = predicate_builder.build_pruning_predicate(row_group_metadata); + let parquet_schema = predicate_builder.schema().as_ref(); + + let pruning_stats = RowGroupPruningStatistics { + row_group_metadata, + parquet_schema, + }; + + let predicate_values = predicate_builder.prune(&pruning_stats); let predicate_values = match predicate_values { Ok(values) => values, @@ -476,7 +571,7 @@ fn build_row_group_predicate( fn read_files( filenames: &[String], projection: &[usize], - predicate_builder: &Option, + predicate_builder: &Option, batch_size: usize, response_tx: Sender>, limit: Option, @@ -651,7 +746,7 @@ mod tests { // int > 1 => c1_max > 1 let expr = col("c1").gt(lit(15)); let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicate::try_new(&expr, Arc::new(schema))?; let schema_descr = get_test_schema_descr(vec![("c1", PhysicalType::INT32)]); let rgm1 = get_row_group_meta_data( @@ -681,7 +776,7 @@ mod tests { // int > 1 => c1_max > 1 let expr = col("c1").gt(lit(15)); let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicate::try_new(&expr, Arc::new(schema))?; let schema_descr = get_test_schema_descr(vec![("c1", PhysicalType::INT32)]); let rgm1 = get_row_group_meta_data( @@ -713,11 +808,11 @@ mod tests { // test row group predicate with partially supported expression // int > 1 and int % 2 => c1_max > 1 and true let expr = col("c1").gt(lit(15)).and(col("c2").modulus(lit(2))); - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), - ]); - let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema.clone())?; + ])); + let predicate_builder = PruningPredicate::try_new(&expr, schema.clone())?; let schema_descr = get_test_schema_descr(vec![ ("c1", PhysicalType::INT32), @@ -752,7 +847,7 @@ mod tests { // if conditions in predicate are joined with OR and an unsupported expression is used // this bypasses the entire predicate expression and no row groups are filtered out let expr = col("c1").gt(lit(15)).or(col("c2").modulus(lit(2))); - let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; + let predicate_builder = PruningPredicate::try_new(&expr, schema)?; let row_group_predicate = build_row_group_predicate(&predicate_builder, &row_group_metadata); let row_group_filter = row_group_metadata @@ -772,11 +867,11 @@ mod tests { // where a null array is generated for some statistics columns // int > 1 and bool = true => c1_max > 1 and null let expr = col("c1").gt(lit(15)).and(col("c2").eq(lit(true))); - let schema = Schema::new(vec![ + let schema = Arc::new(Schema::new(vec![ Field::new("c1", 
DataType::Int32, false), Field::new("c2", DataType::Boolean, false), - ]); - let predicate_builder = PruningPredicateBuilder::try_new(&expr, schema)?; + ])); + let predicate_builder = PruningPredicate::try_new(&expr, schema)?; let schema_descr = get_test_schema_descr(vec![ ("c1", PhysicalType::INT32), From c9ed34c16e47bd5da58b98f64a017e38ab8c946f Mon Sep 17 00:00:00 2001 From: QP Hou Date: Fri, 28 May 2021 04:01:09 -0700 Subject: [PATCH 132/329] add output field name rfc (#422) * add output field name rfc * move to spec model * add link to developers docs & add ASF header --- DEVELOPERS.md | 13 ++ .../output-field-name-semantic.md | 220 ++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 docs/specification/output-field-name-semantic.md diff --git a/DEVELOPERS.md b/DEVELOPERS.md index c2daf3a72e5e0..9223d990e1d63 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -94,6 +94,19 @@ can be displayed. For example, the following command creates a dot -Tpdf < /tmp/plan.dot > /tmp/plan.pdf ``` +## Specification + +We formalize Datafusion semantics and behaviors through specification +documents. These specifications are useful to be used as references to help +resolve ambiguities during development or code reviews. + +You are also welcome to propose changes to existing specifications or create +new specifications as you see fit. + +Here is the list current active specifications: + +* [Output field name semantic](docs/specification/output-field-name-semantic.md) + ## How to format `.md` document We are using `prettier` to format `.md` files. diff --git a/docs/specification/output-field-name-semantic.md b/docs/specification/output-field-name-semantic.md new file mode 100644 index 0000000000000..fd28d118921b2 --- /dev/null +++ b/docs/specification/output-field-name-semantic.md @@ -0,0 +1,220 @@ + + +# Datafusion output field name semantic + +This specification documents how field names in output record batches should be +generated based on given user queries. The filed name rules apply to +Datafusion queries planned from both SQL queries and Dataframe APIs. + +## Field name rules + +* All field names MUST not contain relation/table qualifier. + * Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` +* Function names MUST be converted to lowercase. + * `SELECT AVG(c1)` SHOULD result in field name: `avg(c1)` +* Literal string MUST not be wrapped with quotes or double quotes. + * `SELECT 'foo'` SHOULD result in field name: `foo` +* Operator expressions MUST be wrapped with parentheses. + * `SELECT -2` SHOULD result in field name: `(- 2)` +* Operator and operand MUST be separated by spaces. + * `SELECT 1+2` SHOULD result in field name: `(1 + 2)` +* Function arguments MUST be separated by a comma `,` and a space. 
+ * `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(c1, c2)` + +## Appendices + +### Examples and comparison with other systems + +Data schema for test sample queries: + +``` +CREATE TABLE t1 (id INT, a VARCHAR(5)); +INSERT INTO t1 (id, a) VALUES (1, 'foo'); +INSERT INTO t1 (id, a) VALUES (2, 'bar'); + +CREATE TABLE t2 (id INT, b VARCHAR(5)); +INSERT INTO t2 (id, b) VALUES (1, 'hello'); +INSERT INTO t2 (id, b) VALUES (2, 'world'); +``` + +#### Projected columns + +Query: + +``` +SELECT t1.id, a, t2.id, b +FROM t1 +JOIN t2 ON t1.id = t2.id +``` + +Datafusion Arrow record batches output: + +| id | a | id | b | +|----|-----|----|-------| +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | + + +Spark, MySQL 8 and PostgreSQL 13 output: + +| id | a | id | b | +|----|-----|----|-------| +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | + +SQLite 3 output: + +| id | a | b | +|----|-----|-------| +| 1 | foo | hello | +| 2 | bar | world | + + +#### Function transformed columns + +Query: + +``` +SELECT ABS(t1.id), abs(-id) FROM t1; +``` + +Datafusion Arrow record batches output: + +| abs(id) | abs((- id)) | +|---------|-------------| +| 1 | 1 | +| 2 | 2 | + + +Spark output: + +| abs(id) | abs((- id)) | +|---------|-------------| +| 1 | 1 | +| 2 | 2 | + + +MySQL 8 output: + +| ABS(t1.id) | abs(-id) | +|------------|----------| +| 1 | 1 | +| 2 | 2 | + +PostgreSQL 13 output: + +| abs | abs | +|-----|-----| +| 1 | 1 | +| 2 | 2 | + +SQlite 3 output: + +| ABS(t1.id) | abs(-id) | +|------------|----------| +| 1 | 1 | +| 2 | 2 | + + +#### Function with operators + +Query: + +``` +SELECT t1.id + ABS(id), ABS(id * t1.id) FROM t1; +``` + +Datafusion Arrow record batches output: + +| id + abs(id) | abs(id * id) | +|--------------|--------------| +| 2 | 1 | +| 4 | 4 | + + +Spark output: + +| id + abs(id) | abs(id * id) | +|--------------|--------------| +| 2 | 1 | +| 4 | 4 | + +MySQL 8 output: + +| t1.id + ABS(id) | ABS(id * t1.id) | +|-----------------|-----------------| +| 2 | 1 | +| 4 | 4 | + +PostgreSQL output: + +| ?column? | abs | +|----------|-----| +| 2 | 1 | +| 4 | 4 | + +SQLite output: + +| t1.id + ABS(id) | ABS(id * t1.id) | +|-----------------|-----------------| +| 2 | 1 | +| 4 | 4 | + + +#### Project literals + +Query: + +``` +SELECT 1, 2+5, 'foo_bar'; +``` + +Datafusion Arrow record batches output: + +| 1 | (2 + 5) | foo_bar | +|---|---------|---------| +| 1 | 7 | foo_bar | + + +Spark output: + +| 1 | (2 + 5) | foo_bar | +|---|---------|---------| +| 1 | 7 | foo_bar | + +MySQL output: + +| 1 | 2+5 | foo_bar | +|---|-----|---------| +| 1 | 7 | foo_bar | + + +PostgreSQL output: + +| ?column? | ?column? | ?column? 
| +|----------|----------|----------| +| 1 | 7 | foo_bar | + + +SQLite 3 output: + +| 1 | 2+5 | 'foo_bar' | +|---|-----|-----------| +| 1 | 7 | foo_bar | From 321fda40a47bcc494c5d2511b6e8b02c9ea975b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=80=9D=E7=BB=B4?= Date: Sat, 29 May 2021 18:47:30 +0800 Subject: [PATCH 133/329] NdJson support (#404) --- datafusion/src/datasource/csv.rs | 10 +- datafusion/src/datasource/json.rs | 190 ++++++++++ datafusion/src/datasource/mod.rs | 9 + datafusion/src/physical_plan/csv.rs | 76 +--- datafusion/src/physical_plan/json.rs | 487 +++++++++++++++++++++++++ datafusion/src/physical_plan/mod.rs | 10 +- datafusion/src/physical_plan/source.rs | 90 +++++ datafusion/tests/jsons/1.json | 4 + datafusion/tests/jsons/2.json | 12 + 9 files changed, 803 insertions(+), 85 deletions(-) create mode 100644 datafusion/src/datasource/json.rs create mode 100644 datafusion/src/physical_plan/json.rs create mode 100644 datafusion/src/physical_plan/source.rs create mode 100644 datafusion/tests/jsons/1.json create mode 100644 datafusion/tests/jsons/2.json diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 33cbeb12ca6bd..10e6659089b47 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -40,21 +40,13 @@ use std::string::String; use std::sync::{Arc, Mutex}; use crate::datasource::datasource::Statistics; -use crate::datasource::TableProvider; +use crate::datasource::{Source, TableProvider}; use crate::error::{DataFusionError, Result}; use crate::logical_plan::Expr; use crate::physical_plan::csv::CsvExec; pub use crate::physical_plan::csv::CsvReadOptions; use crate::physical_plan::{common, ExecutionPlan}; -enum Source { - /// Path to a single CSV file or a directory containing one of more CSV files - Path(String), - - /// Read CSV data from a reader - Reader(Mutex>>), -} - /// Represents a CSV file with a provided schema pub struct CsvFile { source: Source, diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs new file mode 100644 index 0000000000000..f916f6c1e382c --- /dev/null +++ b/datafusion/src/datasource/json.rs @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Line-delimited JSON data source +//! +//! This data source allows Line-delimited JSON string or files to be used as input for queries. +//! 
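(Editorial aside, not part of the patch: a hedged end-to-end sketch of how the new line-delimited JSON source might be used, modeled directly on the `csv_file_from_reader` test below, which, despite its name, reads the JSON sample file `tests/jsons/2.json` added by this patch. The function name `sum_a_from_ndjson` is illustrative only.)

```rust
use std::sync::Arc;

use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::json::NdJsonFile;
use datafusion::error::Result;
use datafusion::prelude::*;

/// Registers a line-delimited JSON file as a table and aggregates over it.
async fn sum_a_from_ndjson(path: &str) -> Result<Vec<RecordBatch>> {
    let mut ctx = ExecutionContext::new();

    // With no schema given in the options, the schema is inferred from the file.
    let table = NdJsonFile::try_new(path, Default::default())?;
    ctx.register_table("ndjson", Arc::new(table))?;

    // Query it like any other registered table.
    ctx.sql("SELECT SUM(a) FROM ndjson")?.collect().await
}
```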
+ +use std::{ + any::Any, + io::{BufReader, Read, Seek}, + sync::{Arc, Mutex}, +}; + +use crate::{ + datasource::{Source, TableProvider}, + error::{DataFusionError, Result}, + physical_plan::{ + common, + json::{NdJsonExec, NdJsonReadOptions}, + ExecutionPlan, + }, +}; +use arrow::{datatypes::SchemaRef, json::reader::infer_json_schema_from_seekable}; + +use super::datasource::Statistics; + +trait SeekRead: Read + Seek {} + +impl SeekRead for T {} + +/// Represents a line-delimited JSON file with a provided schema +pub struct NdJsonFile { + source: Source>, + schema: SchemaRef, + file_extension: String, + statistics: Statistics, +} + +impl NdJsonFile { + /// Attempt to initialize a `NdJsonFile` from a path. The schema can be inferred automatically. + pub fn try_new(path: &str, options: NdJsonReadOptions) -> Result { + let schema = if let Some(schema) = options.schema { + schema + } else { + let filenames = common::build_file_list(path, options.file_extension)?; + if filenames.is_empty() { + return Err(DataFusionError::Plan(format!( + "No files found at {path} with file extension {file_extension}", + path = path, + file_extension = options.file_extension + ))); + } + + NdJsonExec::try_infer_schema( + filenames, + Some(options.schema_infer_max_records), + )? + .into() + }; + + Ok(Self { + source: Source::Path(path.to_string()), + schema, + file_extension: options.file_extension.to_string(), + statistics: Statistics::default(), + }) + } + + /// Attempt to initialize a `NdJsonFile` from a reader impls `Seek`. The schema can be inferred automatically. + pub fn try_new_from_reader( + mut reader: R, + options: NdJsonReadOptions, + ) -> Result { + let schema = if let Some(schema) = options.schema { + schema + } else { + let mut bufr = BufReader::new(reader); + let schema = infer_json_schema_from_seekable( + &mut bufr, + Some(options.schema_infer_max_records), + )? + .into(); + reader = bufr.into_inner(); + schema + }; + Ok(Self { + source: Source::Reader(Mutex::new(Some(Box::new(reader)))), + schema, + statistics: Statistics::default(), + file_extension: String::new(), + }) + } +} +impl TableProvider for NdJsonFile { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn scan( + &self, + projection: &Option>, + batch_size: usize, + _filters: &[crate::logical_plan::Expr], + limit: Option, + ) -> Result> { + let opts = NdJsonReadOptions { + schema: Some(self.schema.clone()), + schema_infer_max_records: 0, // schema will always be provided, so it's unnecessary to infer schema + file_extension: self.file_extension.as_str(), + }; + let batch_size = limit + .map(|l| std::cmp::min(l, batch_size)) + .unwrap_or(batch_size); + + let exec = match &self.source { + Source::Reader(maybe_reader) => { + if let Some(rdr) = maybe_reader.lock().unwrap().take() { + NdJsonExec::try_new_from_reader( + rdr, + opts, + projection.clone(), + batch_size, + limit, + )? + } else { + return Err(DataFusionError::Execution( + "You can only read once if the data comes from a reader" + .to_string(), + )); + } + } + Source::Path(p) => { + NdJsonExec::try_new(&p, opts, projection.clone(), batch_size, limit)? 
+ } + }; + Ok(Arc::new(exec)) + } + + fn statistics(&self) -> Statistics { + self.statistics.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::prelude::*; + const TEST_DATA_BASE: &str = "tests/jsons"; + + #[tokio::test] + async fn csv_file_from_reader() -> Result<()> { + let mut ctx = ExecutionContext::new(); + let path = format!("{}/2.json", TEST_DATA_BASE); + ctx.register_table( + "ndjson", + Arc::new(NdJsonFile::try_new(&path, Default::default())?), + )?; + let df = ctx.sql("select sum(a) from ndjson")?; + let batches = df.collect().await?; + assert_eq!( + batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0), + 100000000000011 + ); + Ok(()) + } +} diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index ac2f3d2dee1ee..b46b9cc4e8995 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -20,9 +20,18 @@ pub mod csv; pub mod datasource; pub mod empty; +pub mod json; pub mod memory; pub mod parquet; pub use self::csv::{CsvFile, CsvReadOptions}; pub use self::datasource::{TableProvider, TableType}; pub use self::memory::MemTable; + +pub(crate) enum Source> { + /// Path to a single file or a directory containing one of more files + Path(String), + + /// Read data from a reader + Reader(std::sync::Mutex>), +} diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 96b24cc33201f..9f88a53bc17cd 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -18,7 +18,8 @@ //! Execution plan for reading CSV files use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{common, DisplayFormatType, ExecutionPlan, Partitioning}; +use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{common, source::Source, Partitioning}; use arrow::csv; use arrow::datatypes::{Schema, SchemaRef}; use arrow::error::Result as ArrowResult; @@ -32,7 +33,7 @@ use std::sync::Arc; use std::sync::Mutex; use std::task::{Context, Poll}; -use super::{RecordBatchStream, SendableRecordBatchStream}; +use super::{DisplayFormatType, RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; /// CSV file read option @@ -106,77 +107,6 @@ impl<'a> CsvReadOptions<'a> { } } -/// Source represents where the data comes from. 
-enum Source { - /// The data comes from partitioned files - PartitionedFiles { - /// Path to directory containing partitioned files with the same schema - path: String, - /// The individual files under path - filenames: Vec, - }, - - /// The data comes from anything impl Read trait - Reader(Mutex>>), -} - -impl std::fmt::Debug for Source { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Source::PartitionedFiles { path, filenames } => f - .debug_struct("PartitionedFiles") - .field("path", path) - .field("filenames", filenames) - .finish()?, - Source::Reader(_) => f.write_str("Reader")?, - }; - Ok(()) - } -} - -impl std::fmt::Display for Source { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Source::PartitionedFiles { path, filenames } => { - write!(f, "Path({}: [{}])", path, filenames.join(",")) - } - Source::Reader(_) => { - write!(f, "Reader(...)") - } - } - } -} - -impl Clone for Source { - fn clone(&self) -> Self { - match self { - Source::PartitionedFiles { path, filenames } => Self::PartitionedFiles { - path: path.clone(), - filenames: filenames.clone(), - }, - Source::Reader(_) => Self::Reader(Mutex::new(None)), - } - } -} - -impl Source { - /// Path to directory containing partitioned files with the same schema - pub fn path(&self) -> &str { - match self { - Source::PartitionedFiles { path, .. } => path.as_str(), - Source::Reader(_) => "", - } - } - - /// The individual files under path - pub fn filenames(&self) -> &[String] { - match self { - Source::PartitionedFiles { filenames, .. } => filenames, - Source::Reader(_) => &[], - } - } -} - /// Execution plan for scanning a CSV file #[derive(Debug, Clone)] pub struct CsvExec { diff --git a/datafusion/src/physical_plan/json.rs b/datafusion/src/physical_plan/json.rs new file mode 100644 index 0000000000000..ed9b0b03a38ea --- /dev/null +++ b/datafusion/src/physical_plan/json.rs @@ -0,0 +1,487 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Execution plan for reading line-delimited JSON files +use async_trait::async_trait; +use futures::Stream; + +use super::{common, source::Source, ExecutionPlan, Partitioning, RecordBatchStream}; +use crate::error::{DataFusionError, Result}; +use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter}; +use arrow::{ + datatypes::{Schema, SchemaRef}, + error::Result as ArrowResult, + json, + record_batch::RecordBatch, +}; +use std::fs::File; +use std::{any::Any, io::Seek}; +use std::{ + io::{BufReader, Read}, + pin::Pin, + sync::{Arc, Mutex}, + task::{Context, Poll}, +}; + +/// Line-delimited JSON read options +#[derive(Clone)] +pub struct NdJsonReadOptions<'a> { + /// The data source schema. 
+ pub schema: Option, + + /// Max number of rows to read from CSV files for schema inference if needed. Defaults to 1000. + pub schema_infer_max_records: usize, + + /// File extension; only files with this extension are selected for data input. + /// Defaults to ".json". + pub file_extension: &'a str, +} + +impl<'a> Default for NdJsonReadOptions<'a> { + fn default() -> Self { + Self { + schema: None, + schema_infer_max_records: 1000, + file_extension: ".json", + } + } +} + +trait SeekRead: Read + Seek {} + +impl SeekRead for T {} +/// Execution plan for scanning NdJson data source +#[derive(Debug)] +pub struct NdJsonExec { + source: Source>, + schema: SchemaRef, + projection: Option>, + projected_schema: SchemaRef, + file_extension: String, + batch_size: usize, + limit: Option, +} + +impl NdJsonExec { + /// Create a new execution plan for reading from a path + pub fn try_new( + path: &str, + options: NdJsonReadOptions, + projection: Option>, + batch_size: usize, + limit: Option, + ) -> Result { + let file_extension = options.file_extension.to_string(); + + let filenames = common::build_file_list(path, &file_extension)?; + + if filenames.is_empty() { + return Err(DataFusionError::Execution(format!( + "No files found at {path} with file extension {file_extension}", + path = path, + file_extension = file_extension.as_str() + ))); + } + + let schema = match options.schema { + Some(s) => s, + None => Arc::new(NdJsonExec::try_infer_schema( + filenames.clone(), + Some(options.schema_infer_max_records), + )?), + }; + + let projected_schema = match &projection { + None => schema.clone(), + Some(p) => Arc::new(Schema::new( + p.iter().map(|i| schema.field(*i).clone()).collect(), + )), + }; + + Ok(Self { + source: Source::PartitionedFiles { + path: path.to_string(), + filenames, + }, + schema, + file_extension, + projection, + projected_schema, + batch_size, + limit, + }) + } + /// Create a new execution plan for reading from a reader + pub fn try_new_from_reader( + reader: impl Read + Seek + Send + Sync + 'static, + options: NdJsonReadOptions, + projection: Option>, + batch_size: usize, + limit: Option, + ) -> Result { + let schema = match options.schema { + Some(s) => s, + None => { + return Err(DataFusionError::Execution( + "The schema must be provided in options when reading from a reader" + .to_string(), + )); + } + }; + + let projected_schema = match &projection { + None => schema.clone(), + Some(p) => Arc::new(Schema::new( + p.iter().map(|i| schema.field(*i).clone()).collect(), + )), + }; + + Ok(Self { + source: Source::Reader(Mutex::new(Some(Box::new(reader)))), + schema, + file_extension: String::new(), + projection, + projected_schema, + batch_size, + limit, + }) + } + + /// Path to directory containing partitioned CSV files with the same schema + pub fn path(&self) -> &str { + self.source.path() + } + + /// The individual files under path + pub fn filenames(&self) -> &[String] { + self.source.filenames() + } + + /// File extension + pub fn file_extension(&self) -> &str { + &self.file_extension + } + + /// Get the schema of the CSV file + pub fn file_schema(&self) -> SchemaRef { + self.schema.clone() + } + + /// Optional projection for which columns to load + pub fn projection(&self) -> Option<&Vec> { + self.projection.as_ref() + } + + /// Batch size + pub fn batch_size(&self) -> usize { + self.batch_size + } + + /// Limit + pub fn limit(&self) -> Option { + self.limit + } + + /// Infer schema for given CSV dataset + pub fn try_infer_schema( + mut filenames: Vec, + max_records: Option, + ) 
-> Result { + let mut schemas = Vec::new(); + let mut records_to_read = max_records.unwrap_or(usize::MAX); + while records_to_read > 0 && !filenames.is_empty() { + let file = File::open(filenames.pop().unwrap())?; + let mut reader = BufReader::new(file); + let iter = ValueIter::new(&mut reader, None); + let schema = infer_json_schema_from_iterator(iter.take_while(|_| { + let should_take = records_to_read > 0; + records_to_read -= 1; + should_take + }))?; + schemas.push(schema); + } + + Ok(Schema::try_merge(schemas)?) + } +} + +#[async_trait] +impl ExecutionPlan for NdJsonExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.projected_schema.clone() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(match &self.source { + Source::PartitionedFiles { filenames, .. } => filenames.len(), + Source::Reader(_) => 1, + }) + } + + fn children(&self) -> Vec> { + Vec::new() + } + + fn with_new_children( + &self, + children: Vec>, + ) -> Result> { + if !children.is_empty() { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } else if let Source::PartitionedFiles { filenames, path } = &self.source { + Ok(Arc::new(Self { + source: Source::PartitionedFiles { + filenames: filenames.clone(), + path: path.clone(), + }, + schema: self.schema.clone(), + projection: self.projection.clone(), + projected_schema: self.projected_schema.clone(), + batch_size: self.batch_size, + limit: self.limit, + file_extension: self.file_extension.clone(), + })) + } else { + Err(DataFusionError::Internal( + "NdJsonExec with reader source cannot be used with `with_new_children`" + .to_string(), + )) + } + } + + async fn execute( + &self, + partition: usize, + ) -> Result { + let mut builder = json::ReaderBuilder::new() + .with_schema(self.schema.clone()) + .with_batch_size(self.batch_size); + if let Some(proj) = &self.projection { + builder = builder.with_projection( + proj.iter() + .map(|col_idx| self.schema.field(*col_idx).name()) + .cloned() + .collect(), + ); + } + match &self.source { + Source::PartitionedFiles { filenames, .. 
} => { + let file = File::open(&filenames[partition])?; + + Ok(Box::pin(NdJsonStream::new( + builder.build(file)?, + self.limit, + ))) + } + Source::Reader(rdr) => { + if partition != 0 { + Err(DataFusionError::Internal( + "Only partition 0 is valid when CSV comes from a reader" + .to_string(), + )) + } else if let Some(rdr) = rdr.lock().unwrap().take() { + Ok(Box::pin(NdJsonStream::new(builder.build(rdr)?, self.limit))) + } else { + Err(DataFusionError::Execution( + "Error reading CSV: Data can only be read a single time when the source is a reader" + .to_string(), + )) + } + } + } + } +} + +struct NdJsonStream { + reader: json::Reader, + remain: Option, +} + +impl NdJsonStream { + fn new(reader: json::Reader, limit: Option) -> Self { + Self { + reader, + remain: limit, + } + } +} + +impl Stream for NdJsonStream { + type Item = ArrowResult; + + fn poll_next( + mut self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll> { + if let Some(remain) = self.remain.as_mut() { + if *remain < 1 { + return Poll::Ready(None); + } + } + + Poll::Ready(match self.reader.next() { + Ok(Some(item)) => { + if let Some(remain) = self.remain.as_mut() { + if *remain >= item.num_rows() { + *remain -= item.num_rows(); + Some(Ok(item)) + } else { + let len = *remain; + *remain = 0; + Some(Ok(RecordBatch::try_new( + item.schema(), + item.columns() + .iter() + .map(|column| column.slice(0, len)) + .collect(), + )?)) + } + } else { + Some(Ok(item)) + } + } + Ok(None) => None, + Err(err) => Some(Err(err)), + }) + } +} + +impl RecordBatchStream for NdJsonStream { + fn schema(&self) -> SchemaRef { + self.reader.schema() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::StreamExt; + + const TEST_DATA_BASE: &str = "tests/jsons"; + + #[tokio::test] + async fn nd_json_exec_file_without_projection() -> Result<()> { + use arrow::datatypes::DataType; + let path = format!("{}/1.json", TEST_DATA_BASE); + let exec = NdJsonExec::try_new(&path, Default::default(), None, 1024, Some(3))?; + let inferred_schema = exec.schema(); + assert_eq!(inferred_schema.fields().len(), 4); + + // a,b,c,d should be inferred + inferred_schema.field_with_name("a").unwrap(); + inferred_schema.field_with_name("b").unwrap(); + inferred_schema.field_with_name("c").unwrap(); + inferred_schema.field_with_name("d").unwrap(); + + assert_eq!( + inferred_schema.field_with_name("a").unwrap().data_type(), + &DataType::Int64 + ); + assert!(matches!( + inferred_schema.field_with_name("b").unwrap().data_type(), + DataType::List(_) + )); + assert_eq!( + inferred_schema.field_with_name("d").unwrap().data_type(), + &DataType::Utf8 + ); + + let mut it = exec.execute(0).await?; + let batch = it.next().await.unwrap()?; + + assert_eq!(batch.num_rows(), 3); + let values = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(values.value(0), 1); + assert_eq!(values.value(1), -10); + assert_eq!(values.value(2), 2); + + Ok(()) + } + + #[tokio::test] + async fn nd_json_exec_file_projection() -> Result<()> { + let path = format!("{}/1.json", TEST_DATA_BASE); + let exec = + NdJsonExec::try_new(&path, Default::default(), Some(vec![0, 2]), 1024, None)?; + let inferred_schema = exec.schema(); + assert_eq!(inferred_schema.fields().len(), 2); + + inferred_schema.field_with_name("a").unwrap(); + inferred_schema.field_with_name("b").unwrap_err(); + inferred_schema.field_with_name("c").unwrap(); + inferred_schema.field_with_name("d").unwrap_err(); + + let mut it = exec.execute(0).await?; + let batch = it.next().await.unwrap()?; + + 
assert_eq!(batch.num_rows(), 4); + let values = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(values.value(0), 1); + assert_eq!(values.value(1), -10); + assert_eq!(values.value(2), 2); + Ok(()) + } + + #[tokio::test] + async fn nd_json_exec_from_reader() -> Result<()> { + let content = r#"{"a":"aaa", "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} +{"a":"bbb", "b":[2.0, 1.3, -6.1], "c":[true, true], "d":"4"}"#; + let cur = std::io::Cursor::new(content); + let mut bufrdr = std::io::BufReader::new(cur); + let schema = + arrow::json::reader::infer_json_schema_from_seekable(&mut bufrdr, None)?; + let exec = NdJsonExec::try_new_from_reader( + bufrdr, + NdJsonReadOptions { + schema: Some(Arc::new(schema)), + ..Default::default() + }, + None, + 1024, + Some(1), + )?; + + let mut it = exec.execute(0).await?; + let batch = it.next().await.unwrap()?; + + assert_eq!(batch.num_rows(), 1); + + let values = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(values.value(0), "aaa"); + + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index b1234a0314aa6..ae84b36b31a8e 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,6 +17,11 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. +use std::fmt; +use std::fmt::{Debug, Display}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + use crate::execution::context::ExecutionContextState; use crate::logical_plan::LogicalPlan; use crate::{ @@ -30,9 +35,6 @@ use arrow::{array::ArrayRef, datatypes::Field}; use async_trait::async_trait; pub use display::DisplayFormatType; use futures::stream::Stream; -use std::fmt::{self, Debug, Display}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; use std::{any::Any, pin::Pin}; use self::{display::DisplayableExecutionPlan, merge::MergeExec}; @@ -594,6 +596,7 @@ pub mod group_scalar; pub mod hash_aggregate; pub mod hash_join; pub mod hash_utils; +pub mod json; pub mod limit; pub mod math_expressions; pub mod memory; @@ -605,6 +608,7 @@ pub mod projection; pub mod regex_expressions; pub mod repartition; pub mod sort; +pub mod source; pub mod string_expressions; pub mod type_coercion; pub mod udaf; diff --git a/datafusion/src/physical_plan/source.rs b/datafusion/src/physical_plan/source.rs new file mode 100644 index 0000000000000..012405a38a1a3 --- /dev/null +++ b/datafusion/src/physical_plan/source.rs @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Contains a `Source` enum represents where the data comes from. + +use std::{io::Read, sync::Mutex}; + +/// Source represents where the data comes from. 
+pub(crate) enum Source<R = Box<dyn Read + Send + Sync>> {
+    /// The data comes from partitioned files
+    PartitionedFiles {
+        /// Path to directory containing partitioned files with the same schema
+        path: String,
+        /// The individual files under path
+        filenames: Vec<String>,
+    },
+
+    /// The data comes from anything impl Read trait
+    Reader(Mutex<Option<R>>),
+}
+
+impl<R: Read> std::fmt::Debug for Source<R> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Source::PartitionedFiles { path, filenames } => f
+                .debug_struct("PartitionedFiles")
+                .field("path", path)
+                .field("filenames", filenames)
+                .finish()?,
+            Source::Reader(_) => f.write_str("Reader")?,
+        };
+        Ok(())
+    }
+}
+impl<R: Read> std::fmt::Display for Source<R> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Source::PartitionedFiles { path, filenames } => {
+                write!(f, "Path({}: [{}])", path, filenames.join(","))
+            }
+            Source::Reader(_) => {
+                write!(f, "Reader(...)")
+            }
+        }
+    }
+}
+
+impl<R: Read> Clone for Source<R> {
+    fn clone(&self) -> Self {
+        match self {
+            Source::PartitionedFiles { path, filenames } => Self::PartitionedFiles {
+                path: path.clone(),
+                filenames: filenames.clone(),
+            },
+            // A reader cannot be cloned, so the clone starts out already consumed.
+            Source::Reader(_) => Self::Reader(Mutex::new(None)),
+        }
+    }
+}
+
+impl<R: Read> Source<R> {
+    /// Path to directory containing partitioned files with the same schema
+    pub fn path(&self) -> &str {
+        match self {
+            Source::PartitionedFiles { path, .. } => path.as_str(),
+            Source::Reader(_) => "",
+        }
+    }
+
+    /// The individual files under path
+    pub fn filenames(&self) -> &[String] {
+        match self {
+            Source::PartitionedFiles { filenames, .. } => filenames,
+            Source::Reader(_) => &[],
+        }
+    }
+}
diff --git a/datafusion/tests/jsons/1.json b/datafusion/tests/jsons/1.json
new file mode 100644
index 0000000000000..e6f360fe551bb
--- /dev/null
+++ b/datafusion/tests/jsons/1.json
@@ -0,0 +1,4 @@
+{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"}
+{"a":-10, "b":[2.0, 1.3, -6.1], "c":[true, true], "d":"4"}
+{"a":2, "b":[2.0, null, -6.1], "c":[false, null], "d":"text"}
+{}
\ No newline at end of file
diff --git a/datafusion/tests/jsons/2.json b/datafusion/tests/jsons/2.json
new file mode 100644
index 0000000000000..dafd2dd2e420d
--- /dev/null
+++ b/datafusion/tests/jsons/2.json
@@ -0,0 +1,12 @@
+{"a":1, "b":2.0, "c":false, "d":"4"}
+{"a":-10, "b":-3.5, "c":true, "d":"4"}
+{"a":2, "b":0.6, "c":false, "d":"text"}
+{"a":1, "b":2.0, "c":false, "d":"4"}
+{"a":7, "b":-3.5, "c":true, "d":"4"}
+{"a":1, "b":0.6, "c":false, "d":"text"}
+{"a":1, "b":2.0, "c":false, "d":"4"}
+{"a":5, "b":-3.5, "c":true, "d":"4"}
+{"a":1, "b":0.6, "c":false, "d":"text"}
+{"a":1, "b":2.0, "c":false, "d":"4"}
+{"a":1, "b":-3.5, "c":true, "d":"4"}
+{"a":100000000000000, "b":0.6, "c":false, "d":"text"}
\ No newline at end of file
From bdae93b9365ef5892e686915250d42e927d00620 Mon Sep 17 00:00:00 2001
From: Jorge Leitao
Date: Sun, 30 May 2021 20:51:52 +0200
Subject: [PATCH 134/329] Improved usage of use arrow in ballista.
(#447) --- ballista/rust/client/Cargo.toml | 1 - ballista/rust/client/src/columnar_batch.rs | 2 +- ballista/rust/client/src/context.rs | 2 +- ballista/rust/core/Cargo.toml | 1 - ballista/rust/core/src/client.rs | 12 +- ballista/rust/core/src/datasource.rs | 2 +- ballista/rust/core/src/error.rs | 2 +- .../core/src/execution_plans/query_stage.rs | 2 +- .../src/execution_plans/shuffle_reader.rs | 2 +- .../src/execution_plans/unresolved_shuffle.rs | 2 +- ballista/rust/core/src/memory_stream.rs | 2 +- .../core/src/serde/logical_plan/from_proto.rs | 63 ++++---- .../rust/core/src/serde/logical_plan/mod.rs | 47 +----- .../core/src/serde/logical_plan/to_proto.rs | 141 ++++++++---------- .../src/serde/physical_plan/from_proto.rs | 2 +- .../rust/core/src/serde/physical_plan/mod.rs | 46 +++--- ballista/rust/core/src/serde/scheduler/mod.rs | 4 +- ballista/rust/core/src/utils.rs | 15 +- ballista/rust/executor/src/collect.rs | 6 +- ballista/rust/executor/src/flight_service.rs | 14 +- ballista/rust/scheduler/src/test_utils.rs | 2 +- 21 files changed, 157 insertions(+), 213 deletions(-) diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 9ce0517ee2938..dd1a57ce14fbe 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -31,5 +31,4 @@ futures = "0.3" log = "0.4" tokio = "1.0" -arrow = { version = "4.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/client/src/columnar_batch.rs b/ballista/rust/client/src/columnar_batch.rs index d3ff8861faac8..a40b68ff3ebd7 100644 --- a/ballista/rust/client/src/columnar_batch.rs +++ b/ballista/rust/client/src/columnar_batch.rs @@ -19,7 +19,7 @@ use std::{collections::HashMap, sync::Arc}; use ballista_core::error::{ballista_error, Result}; -use arrow::{ +use datafusion::arrow::{ array::ArrayRef, datatypes::{DataType, Schema}, record_batch::RecordBatch, diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index df97e3a22984c..4c0ab4244be35 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -33,7 +33,7 @@ use ballista_core::{ utils::create_datafusion_context, }; -use arrow::datatypes::Schema; +use datafusion::arrow::datatypes::Schema; use datafusion::catalog::TableReference; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 2868b60f637f5..1fc0e0c78bf7e 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -40,7 +40,6 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { version = "4.0" } arrow-flight = { version = "4.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/core/src/client.rs b/ballista/rust/core/src/client.rs index 1d0fedca7b4ef..c8267c8194c20 100644 --- a/ballista/rust/core/src/client.rs +++ b/ballista/rust/core/src/client.rs @@ -31,15 +31,15 @@ use crate::serde::scheduler::{ Action, ExecutePartition, ExecutePartitionResult, PartitionId, PartitionStats, }; -use arrow::record_batch::RecordBatch; -use arrow::{ - array::{StringArray, StructArray}, - error::{ArrowError, Result as ArrowResult}, -}; -use arrow::{datatypes::Schema, datatypes::SchemaRef}; use arrow_flight::utils::flight_data_to_arrow_batch; use arrow_flight::Ticket; use arrow_flight::{flight_service_client::FlightServiceClient, FlightData}; +use datafusion::arrow::{ + array::{StringArray, StructArray}, + datatypes::{Schema, 
SchemaRef}, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, +}; use datafusion::physical_plan::common::collect; use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; use datafusion::{logical_plan::LogicalPlan, physical_plan::RecordBatchStream}; diff --git a/ballista/rust/core/src/datasource.rs b/ballista/rust/core/src/datasource.rs index 5b1540ac50373..b774b8d39b9db 100644 --- a/ballista/rust/core/src/datasource.rs +++ b/ballista/rust/core/src/datasource.rs @@ -17,7 +17,7 @@ use std::{any::Any, sync::Arc}; -use arrow::datatypes::SchemaRef; +use datafusion::arrow::datatypes::SchemaRef; use datafusion::error::Result as DFResult; use datafusion::{ datasource::{datasource::Statistics, TableProvider}, diff --git a/ballista/rust/core/src/error.rs b/ballista/rust/core/src/error.rs index e16920e047443..b2c8d99ae9f98 100644 --- a/ballista/rust/core/src/error.rs +++ b/ballista/rust/core/src/error.rs @@ -23,7 +23,7 @@ use std::{ io, result, }; -use arrow::error::ArrowError; +use datafusion::arrow::error::ArrowError; use datafusion::error::DataFusionError; use sqlparser::parser; diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index d8822ea3138a0..e95a5d8b51972 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use std::{any::Any, pin::Pin}; -use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef; use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use datafusion::{error::Result, physical_plan::RecordBatchStream}; use uuid::Uuid; diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index 107cc15bfa054..db29cf13b5fed 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -22,8 +22,8 @@ use crate::client::BallistaClient; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionLocation; -use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef; use datafusion::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use datafusion::{ error::{DataFusionError, Result}, diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index 7d147d53537c4..5c1b41798c5d3 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -21,8 +21,8 @@ use std::{any::Any, pin::Pin}; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionLocation; -use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef; use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use datafusion::{ error::{DataFusionError, Result}, diff --git a/ballista/rust/core/src/memory_stream.rs b/ballista/rust/core/src/memory_stream.rs index 8bf5e203f6d14..ab72bdc82aee1 100644 --- a/ballista/rust/core/src/memory_stream.rs +++ b/ballista/rust/core/src/memory_stream.rs @@ -20,7 +20,7 @@ use std::task::{Context, Poll}; -use arrow::{datatypes::SchemaRef, error::Result, record_batch::RecordBatch}; +use datafusion::arrow::{datatypes::SchemaRef, error::Result, record_batch::RecordBatch}; use 
datafusion::physical_plan::RecordBatchStream; use futures::Stream; diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 020858fbfc3fe..10c4670e809aa 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -26,7 +26,7 @@ use std::{ unimplemented, }; -use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, sqrt, tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, @@ -299,9 +299,9 @@ impl TryInto for protobuf::Schema { } } -impl TryInto for &protobuf::scalar_type::Datatype { +impl TryInto for &protobuf::scalar_type::Datatype { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { use protobuf::scalar_type::Datatype; Ok(match self { Datatype::Scalar(scalar_type) => { @@ -332,17 +332,18 @@ impl TryInto for &protobuf::scalar_type::Datatype { )) })?; //Because length is checked above it is safe to unwrap .last() - let mut scalar_type = - arrow::datatypes::DataType::List(Box::new(Field::new( - field_names.last().unwrap().as_str(), - pb_scalar_type.into(), - true, - ))); + let mut scalar_type = DataType::List(Box::new(Field::new( + field_names.last().unwrap().as_str(), + pb_scalar_type.into(), + true, + ))); //Iterate over field names in reverse order except for the last item in the vector for name in field_names.iter().rev().skip(1) { - let new_datatype = arrow::datatypes::DataType::List(Box::new( - Field::new(name.as_str(), scalar_type, true), - )); + let new_datatype = DataType::List(Box::new(Field::new( + name.as_str(), + scalar_type, + true, + ))); scalar_type = new_datatype; } scalar_type @@ -351,10 +352,9 @@ impl TryInto for &protobuf::scalar_type::Datatype { } } -impl TryInto for &protobuf::arrow_type::ArrowTypeEnum { +impl TryInto for &protobuf::arrow_type::ArrowTypeEnum { type Error = BallistaError; - fn try_into(self) -> Result { - use arrow::datatypes::DataType; + fn try_into(self) -> Result { use protobuf::arrow_type; Ok(match self { arrow_type::ArrowTypeEnum::None(_) => DataType::Null, @@ -467,9 +467,8 @@ impl TryInto for &protobuf::arrow_type::ArrowTypeEnu } #[allow(clippy::from_over_into)] -impl Into for protobuf::PrimitiveScalarType { - fn into(self) -> arrow::datatypes::DataType { - use arrow::datatypes::DataType; +impl Into for protobuf::PrimitiveScalarType { + fn into(self) -> DataType { match self { protobuf::PrimitiveScalarType::Bool => DataType::Boolean, protobuf::PrimitiveScalarType::Uint8 => DataType::UInt8, @@ -486,10 +485,10 @@ impl Into for protobuf::PrimitiveScalarType { protobuf::PrimitiveScalarType::LargeUtf8 => DataType::LargeUtf8, protobuf::PrimitiveScalarType::Date32 => DataType::Date32, protobuf::PrimitiveScalarType::TimeMicrosecond => { - DataType::Time64(arrow::datatypes::TimeUnit::Microsecond) + DataType::Time64(TimeUnit::Microsecond) } protobuf::PrimitiveScalarType::TimeNanosecond => { - DataType::Time64(arrow::datatypes::TimeUnit::Nanosecond) + DataType::Time64(TimeUnit::Nanosecond) } protobuf::PrimitiveScalarType::Null => DataType::Null, } @@ -746,9 +745,9 @@ impl TryInto for &protobuf::ScalarListValue { } } -impl TryInto for &protobuf::ScalarListType { +impl TryInto for &protobuf::ScalarListType { type Error = BallistaError; - fn try_into(self) -> Result { + fn 
try_into(self) -> Result { use protobuf::PrimitiveScalarType; let protobuf::ScalarListType { deepest_type, @@ -762,7 +761,7 @@ impl TryInto for &protobuf::ScalarListType { )); } - let mut curr_type = arrow::datatypes::DataType::List(Box::new(Field::new( + let mut curr_type = DataType::List(Box::new(Field::new( //Since checked vector is not empty above this is safe to unwrap field_names.last().unwrap(), PrimitiveScalarType::from_i32(*deepest_type) @@ -774,9 +773,8 @@ impl TryInto for &protobuf::ScalarListType { ))); //Iterates over field names in reverse order except for the last item in the vector for name in field_names.iter().rev().skip(1) { - let temp_curr_type = arrow::datatypes::DataType::List(Box::new(Field::new( - name, curr_type, true, - ))); + let temp_curr_type = + DataType::List(Box::new(Field::new(name, curr_type, true))); curr_type = temp_curr_type; } Ok(curr_type) @@ -876,8 +874,7 @@ impl TryInto for &protobuf::ScalarValue { .iter() .map(|val| val.try_into()) .collect::, _>>()?; - let scalar_type: arrow::datatypes::DataType = - pb_scalar_type.try_into()?; + let scalar_type: DataType = pb_scalar_type.try_into()?; ScalarValue::List(Some(typechecked_values), scalar_type) } protobuf::scalar_value::Value::NullListValue(v) => { @@ -1169,9 +1166,9 @@ fn from_proto_binary_op(op: &str) -> Result { } } -impl TryInto for &protobuf::ScalarType { +impl TryInto for &protobuf::ScalarType { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { let pb_scalartype = self.datatype.as_ref().ok_or_else(|| { proto_error("ScalarType message missing required field 'datatype'") })?; @@ -1202,16 +1199,16 @@ impl TryInto for &protobuf::Schema { } } -impl TryInto for &protobuf::Field { +impl TryInto for &protobuf::Field { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { let pb_datatype = self.arrow_type.as_ref().ok_or_else(|| { proto_error( "Protobuf deserialization error: Field message missing required field 'arrow_type'", ) })?; - Ok(arrow::datatypes::Field::new( + Ok(Field::new( self.name.as_str(), pb_datatype.as_ref().try_into()?, self.nullable, diff --git a/ballista/rust/core/src/serde/logical_plan/mod.rs b/ballista/rust/core/src/serde/logical_plan/mod.rs index 48dd96c4d3f31..d2792b09fa168 100644 --- a/ballista/rust/core/src/serde/logical_plan/mod.rs +++ b/ballista/rust/core/src/serde/logical_plan/mod.rs @@ -19,19 +19,18 @@ pub mod from_proto; pub mod to_proto; #[cfg(test)] - mod roundtrip_tests { use super::super::{super::error::Result, protobuf}; use crate::error::BallistaError; - use arrow::datatypes::{DataType, Field, Schema}; use core::panic; - use datafusion::physical_plan::functions::BuiltinScalarFunction::Sqrt; use datafusion::{ - logical_plan::{Expr, LogicalPlan, LogicalPlanBuilder}, - physical_plan::csv::CsvReadOptions, + arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}, + logical_plan::{Expr, LogicalPlan, LogicalPlanBuilder, Partitioning, ToDFSchema}, + physical_plan::{csv::CsvReadOptions, functions::BuiltinScalarFunction::Sqrt}, prelude::*, scalar::ScalarValue, + sql::parser::FileType, }; use protobuf::arrow_type; use std::convert::TryInto; @@ -57,7 +56,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_repartition() -> Result<()> { use datafusion::logical_plan::Partitioning; @@ -119,18 +117,12 @@ mod roundtrip_tests { Ok(()) } - fn new_box_field( - name: &str, - dt: DataType, - nullable: bool, - ) -> Box { - Box::new(arrow::datatypes::Field::new(name, dt, nullable)) + fn 
new_box_field(name: &str, dt: DataType, nullable: bool) -> Box { + Box::new(Field::new(name, dt, nullable)) } #[test] fn scalar_values_error_serialization() -> Result<()> { - use arrow::datatypes::DataType; - use datafusion::scalar::ScalarValue; let should_fail_on_seralize: Vec = vec![ //Should fail due to inconsistent types ScalarValue::List( @@ -194,8 +186,6 @@ mod roundtrip_tests { #[test] fn round_trip_scalar_values() -> Result<()> { - use arrow::datatypes::DataType; - use datafusion::scalar::ScalarValue; let should_pass: Vec = vec![ ScalarValue::Boolean(None), ScalarValue::Float32(None), @@ -302,8 +292,6 @@ mod roundtrip_tests { #[test] fn round_trip_scalar_types() -> Result<()> { - use arrow::datatypes::DataType; - use arrow::datatypes::{IntervalUnit, TimeUnit}; let should_pass: Vec = vec![ DataType::Boolean, DataType::Int8, @@ -459,8 +447,6 @@ mod roundtrip_tests { #[test] fn round_trip_datatype() -> Result<()> { - use arrow::datatypes::DataType; - use arrow::datatypes::{IntervalUnit, TimeUnit}; let test_cases: Vec = vec![ DataType::Null, DataType::Boolean, @@ -592,9 +578,6 @@ mod roundtrip_tests { #[test] fn roundtrip_null_scalar_values() -> Result<()> { - use arrow::datatypes::DataType; - use arrow::datatypes::Field; - use datafusion::scalar::ScalarValue; let test_types = vec![ ScalarValue::Boolean(None), ScalarValue::Float32(None), @@ -629,7 +612,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_create_external_table() -> Result<()> { let schema = Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -639,12 +621,8 @@ mod roundtrip_tests { Field::new("salary", DataType::Int32, false), ]); - use datafusion::logical_plan::ToDFSchema; - let df_schema_ref = schema.to_dfschema_ref()?; - use datafusion::sql::parser::FileType; - let filetypes: [FileType; 3] = [FileType::NdJson, FileType::Parquet, FileType::CSV]; @@ -664,7 +642,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_explain() -> Result<()> { let schema = Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -751,7 +728,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_empty_relation() -> Result<()> { let plan_false = LogicalPlanBuilder::empty(false) .build() @@ -769,7 +745,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_logical_plan() -> Result<()> { let schema = Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -794,7 +769,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_not() -> Result<()> { let test_expr = Expr::Not(Box::new(Expr::Literal((1.0).into()))); @@ -804,7 +778,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_is_null() -> Result<()> { let test_expr = Expr::IsNull(Box::new(Expr::Column("id".into()))); @@ -814,7 +787,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_is_not_null() -> Result<()> { let test_expr = Expr::IsNotNull(Box::new(Expr::Column("id".into()))); @@ -824,7 +796,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_between() -> Result<()> { let test_expr = Expr::Between { expr: Box::new(Expr::Literal((1.0).into())), @@ -839,7 +810,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_case() -> Result<()> { let test_expr = Expr::Case { expr: Some(Box::new(Expr::Literal((1.0).into()))), @@ -856,7 +826,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_cast() -> Result<()> { let test_expr = Expr::Cast { expr: Box::new(Expr::Literal((1.0).into())), @@ -869,7 +838,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_sort_expr() -> Result<()> { let test_expr = Expr::Sort { expr: Box::new(Expr::Literal((1.0).into())), @@ -883,7 +851,6 @@ mod roundtrip_tests { } 
#[test] - fn roundtrip_negative() -> Result<()> { let test_expr = Expr::Negative(Box::new(Expr::Literal((1.0).into()))); @@ -893,7 +860,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_inlist() -> Result<()> { let test_expr = Expr::InList { expr: Box::new(Expr::Literal((1.0).into())), @@ -907,7 +873,6 @@ mod roundtrip_tests { } #[test] - fn roundtrip_wildcard() -> Result<()> { let test_expr = Expr::Wildcard; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 47e27483ff307..b630dfcc0d1b4 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -26,7 +26,7 @@ use std::{ use crate::datasource::DfTableAdapter; use crate::serde::{protobuf, BallistaError}; -use arrow::datatypes::{DataType, Schema}; +use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use datafusion::datasource::CsvFile; use datafusion::logical_plan::{Expr, JoinType, LogicalPlan}; use datafusion::physical_plan::aggregates::AggregateFunction; @@ -35,8 +35,8 @@ use datafusion::physical_plan::window_functions::{ }; use datafusion::{datasource::parquet::ParquetTable, logical_plan::exprlist_to_fields}; use protobuf::{ - arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, Field, - PrimitiveScalarType, ScalarListValue, ScalarType, + arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, PrimitiveScalarType, + ScalarListValue, ScalarType, }; use sqlparser::ast::{WindowFrame, WindowFrameBound, WindowFrameUnits}; @@ -44,22 +44,17 @@ use super::super::proto_error; use datafusion::physical_plan::functions::BuiltinScalarFunction; impl protobuf::IntervalUnit { - pub fn from_arrow_interval_unit( - interval_unit: &arrow::datatypes::IntervalUnit, - ) -> Self { + pub fn from_arrow_interval_unit(interval_unit: &IntervalUnit) -> Self { match interval_unit { - arrow::datatypes::IntervalUnit::YearMonth => { - protobuf::IntervalUnit::YearMonth - } - arrow::datatypes::IntervalUnit::DayTime => protobuf::IntervalUnit::DayTime, + IntervalUnit::YearMonth => protobuf::IntervalUnit::YearMonth, + IntervalUnit::DayTime => protobuf::IntervalUnit::DayTime, } } pub fn from_i32_to_arrow( interval_unit_i32: i32, - ) -> Result { + ) -> Result { let pb_interval_unit = protobuf::IntervalUnit::from_i32(interval_unit_i32); - use arrow::datatypes::IntervalUnit; match pb_interval_unit { Some(interval_unit) => Ok(match interval_unit { protobuf::IntervalUnit::YearMonth => IntervalUnit::YearMonth, @@ -74,15 +69,15 @@ impl protobuf::IntervalUnit { /* Arrow changed dates to no longer have date unit impl protobuf::DateUnit { - pub fn from_arrow_date_unit(val: &arrow::datatypes::DateUnit) -> Self { + pub fn from_arrow_date_unit(val: &DateUnit) -> Self { match val { - arrow::datatypes::DateUnit::Day => protobuf::DateUnit::Day, - arrow::datatypes::DateUnit::Millisecond => protobuf::DateUnit::DateMillisecond, + DateUnit::Day => protobuf::DateUnit::Day, + DateUnit::Millisecond => protobuf::DateUnit::DateMillisecond, } } - pub fn from_i32_to_arrow(date_unit_i32: i32) -> Result { + pub fn from_i32_to_arrow(date_unit_i32: i32) -> Result { let pb_date_unit = protobuf::DateUnit::from_i32(date_unit_i32); - use arrow::datatypes::DateUnit; + use datafusion::DateUnit; match pb_date_unit { Some(date_unit) => Ok(match date_unit { protobuf::DateUnit::Day => DateUnit::Day, @@ -95,21 +90,16 @@ impl protobuf::DateUnit { }*/ impl protobuf::TimeUnit { - pub fn from_arrow_time_unit(val: 
&arrow::datatypes::TimeUnit) -> Self { + pub fn from_arrow_time_unit(val: &TimeUnit) -> Self { match val { - arrow::datatypes::TimeUnit::Second => protobuf::TimeUnit::Second, - arrow::datatypes::TimeUnit::Millisecond => { - protobuf::TimeUnit::TimeMillisecond - } - arrow::datatypes::TimeUnit::Microsecond => protobuf::TimeUnit::Microsecond, - arrow::datatypes::TimeUnit::Nanosecond => protobuf::TimeUnit::Nanosecond, + TimeUnit::Second => protobuf::TimeUnit::Second, + TimeUnit::Millisecond => protobuf::TimeUnit::TimeMillisecond, + TimeUnit::Microsecond => protobuf::TimeUnit::Microsecond, + TimeUnit::Nanosecond => protobuf::TimeUnit::Nanosecond, } } - pub fn from_i32_to_arrow( - time_unit_i32: i32, - ) -> Result { + pub fn from_i32_to_arrow(time_unit_i32: i32) -> Result { let pb_time_unit = protobuf::TimeUnit::from_i32(time_unit_i32); - use arrow::datatypes::TimeUnit; match pb_time_unit { Some(time_unit) => Ok(match time_unit { protobuf::TimeUnit::Second => TimeUnit::Second, @@ -124,8 +114,8 @@ impl protobuf::TimeUnit { } } -impl From<&arrow::datatypes::Field> for protobuf::Field { - fn from(field: &arrow::datatypes::Field) -> Self { +impl From<&Field> for protobuf::Field { + fn from(field: &Field) -> Self { protobuf::Field { name: field.name().to_owned(), arrow_type: Some(Box::new(field.data_type().into())), @@ -135,23 +125,22 @@ impl From<&arrow::datatypes::Field> for protobuf::Field { } } -impl From<&arrow::datatypes::DataType> for protobuf::ArrowType { - fn from(val: &arrow::datatypes::DataType) -> protobuf::ArrowType { +impl From<&DataType> for protobuf::ArrowType { + fn from(val: &DataType) -> protobuf::ArrowType { protobuf::ArrowType { arrow_type_enum: Some(val.into()), } } } -impl TryInto for &protobuf::ArrowType { +impl TryInto for &protobuf::ArrowType { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { let pb_arrow_type = self.arrow_type_enum.as_ref().ok_or_else(|| { proto_error( "Protobuf deserialization error: ArrowType missing required field 'data_type'", ) })?; - use arrow::datatypes::DataType; Ok(match pb_arrow_type { protobuf::arrow_type::ArrowTypeEnum::None(_) => DataType::Null, protobuf::arrow_type::ArrowTypeEnum::Bool(_) => DataType::Boolean, @@ -208,7 +197,7 @@ impl TryInto for &protobuf::ArrowType { .as_ref() .ok_or_else(|| proto_error("Protobuf deserialization error: List message was missing required field 'field_type'"))? .as_ref(); - arrow::datatypes::DataType::List(Box::new(field_ref.try_into()?)) + DataType::List(Box::new(field_ref.try_into()?)) } protobuf::arrow_type::ArrowTypeEnum::LargeList(boxed_list) => { let field_ref = boxed_list @@ -216,7 +205,7 @@ impl TryInto for &protobuf::ArrowType { .as_ref() .ok_or_else(|| proto_error("Protobuf deserialization error: List message was missing required field 'field_type'"))? 
.as_ref(); - arrow::datatypes::DataType::LargeList(Box::new(field_ref.try_into()?)) + DataType::LargeList(Box::new(field_ref.try_into()?)) } protobuf::arrow_type::ArrowTypeEnum::FixedSizeList(boxed_list) => { let fsl_ref = boxed_list.as_ref(); @@ -224,7 +213,7 @@ impl TryInto for &protobuf::ArrowType { .field_type .as_ref() .ok_or_else(|| proto_error("Protobuf deserialization error: FixedSizeList message was missing required field 'field_type'"))?; - arrow::datatypes::DataType::FixedSizeList( + DataType::FixedSizeList( Box::new(pb_fieldtype.as_ref().try_into()?), fsl_ref.list_size, ) @@ -235,7 +224,7 @@ impl TryInto for &protobuf::ArrowType { .iter() .map(|field| field.try_into()) .collect::, _>>()?; - arrow::datatypes::DataType::Struct(fields) + DataType::Struct(fields) } protobuf::arrow_type::ArrowTypeEnum::Union(union) => { let union_types = union @@ -243,7 +232,7 @@ impl TryInto for &protobuf::ArrowType { .iter() .map(|field| field.try_into()) .collect::, _>>()?; - arrow::datatypes::DataType::Union(union_types) + DataType::Union(union_types) } protobuf::arrow_type::ArrowTypeEnum::Dictionary(boxed_dict) => { let dict_ref = boxed_dict.as_ref(); @@ -255,7 +244,7 @@ impl TryInto for &protobuf::ArrowType { .value .as_ref() .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message was missing required field 'value'"))?; - arrow::datatypes::DataType::Dictionary( + DataType::Dictionary( Box::new(pb_key.as_ref().try_into()?), Box::new(pb_value.as_ref().try_into()?), ) @@ -264,15 +253,15 @@ impl TryInto for &protobuf::ArrowType { } } -impl TryInto for &Box { +impl TryInto for &Box { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { let list_ref = self.as_ref(); match &list_ref.field_type { Some(pb_field) => { let pb_field_ref = pb_field.as_ref(); - let arrow_field: arrow::datatypes::Field = pb_field_ref.try_into()?; - Ok(arrow::datatypes::DataType::List(Box::new(arrow_field))) + let arrow_field: Field = pb_field_ref.try_into()?; + Ok(DataType::List(Box::new(arrow_field))) } None => Err(proto_error( "List message missing required field 'field_type'", @@ -281,8 +270,8 @@ impl TryInto for &Box { } } -impl From<&arrow::datatypes::DataType> for protobuf::arrow_type::ArrowTypeEnum { - fn from(val: &arrow::datatypes::DataType) -> protobuf::arrow_type::ArrowTypeEnum { +impl From<&DataType> for protobuf::arrow_type::ArrowTypeEnum { + fn from(val: &DataType) -> protobuf::arrow_type::ArrowTypeEnum { use protobuf::arrow_type::ArrowTypeEnum; use protobuf::ArrowType; use protobuf::EmptyMessage; @@ -368,7 +357,7 @@ impl From<&arrow::datatypes::DataType> for protobuf::arrow_type::ArrowTypeEnum { } //Does not check if list subtypes are valid -fn is_valid_scalar_type_no_list_check(datatype: &arrow::datatypes::DataType) -> bool { +fn is_valid_scalar_type_no_list_check(datatype: &DataType) -> bool { match datatype { DataType::Boolean | DataType::Int8 @@ -384,22 +373,18 @@ fn is_valid_scalar_type_no_list_check(datatype: &arrow::datatypes::DataType) -> | DataType::LargeUtf8 | DataType::Utf8 | DataType::Date32 => true, - DataType::Time64(time_unit) => matches!( - time_unit, - arrow::datatypes::TimeUnit::Microsecond - | arrow::datatypes::TimeUnit::Nanosecond - ), + DataType::Time64(time_unit) => { + matches!(time_unit, TimeUnit::Microsecond | TimeUnit::Nanosecond) + } DataType::List(_) => true, _ => false, } } -impl TryFrom<&arrow::datatypes::DataType> for protobuf::scalar_type::Datatype { +impl TryFrom<&DataType> for protobuf::scalar_type::Datatype { type 
Error = BallistaError; - fn try_from(val: &arrow::datatypes::DataType) -> Result { - use protobuf::scalar_type; - use protobuf::Field; + fn try_from(val: &DataType) -> Result { use protobuf::{List, PrimitiveScalarType}; let scalar_value = match val { DataType::Boolean => scalar_type::Datatype::Scalar(PrimitiveScalarType::Bool as i32), @@ -415,8 +400,8 @@ impl TryFrom<&arrow::datatypes::DataType> for protobuf::scalar_type::Datatype { DataType::Float64 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Float64 as i32), DataType::Date32 => scalar_type::Datatype::Scalar(PrimitiveScalarType::Date32 as i32), DataType::Time64(time_unit) => match time_unit { - arrow::datatypes::TimeUnit::Microsecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeMicrosecond as i32), - arrow::datatypes::TimeUnit::Nanosecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeNanosecond as i32), + TimeUnit::Microsecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeMicrosecond as i32), + TimeUnit::Nanosecond => scalar_type::Datatype::Scalar(PrimitiveScalarType::TimeNanosecond as i32), _ => { return Err(proto_error(format!( "Found invalid time unit for scalar value, only TimeUnit::Microsecond and TimeUnit::Nanosecond are valid time units: {:?}", @@ -428,7 +413,7 @@ impl TryFrom<&arrow::datatypes::DataType> for protobuf::scalar_type::Datatype { DataType::LargeUtf8 => scalar_type::Datatype::Scalar(PrimitiveScalarType::LargeUtf8 as i32), DataType::List(field_type) => { let mut field_names: Vec = Vec::new(); - let mut curr_field: &arrow::datatypes::Field = field_type.as_ref(); + let mut curr_field = field_type.as_ref(); field_names.push(curr_field.name().to_owned()); //For each nested field check nested datatype, since datafusion scalars only support recursive lists with a leaf scalar type // any other compound types are errors. 
@@ -458,8 +443,8 @@ impl TryFrom<&arrow::datatypes::DataType> for protobuf::scalar_type::Datatype { DataType::Float64 => PrimitiveScalarType::Float64, DataType::Date32 => PrimitiveScalarType::Date32, DataType::Time64(time_unit) => match time_unit { - arrow::datatypes::TimeUnit::Microsecond => PrimitiveScalarType::TimeMicrosecond, - arrow::datatypes::TimeUnit::Nanosecond => PrimitiveScalarType::TimeNanosecond, + TimeUnit::Microsecond => PrimitiveScalarType::TimeMicrosecond, + TimeUnit::Nanosecond => PrimitiveScalarType::TimeNanosecond, _ => { return Err(proto_error(format!( "Found invalid time unit for scalar value, only TimeUnit::Microsecond and TimeUnit::Nanosecond are valid time units: {:?}", @@ -594,7 +579,7 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { let type_checked_values: Vec = values .iter() .map(|scalar| match (scalar, scalar_type) { - (scalar::ScalarValue::List(_, arrow::datatypes::DataType::List(list_field)), arrow::datatypes::DataType::List(field)) => { + (scalar::ScalarValue::List(_, DataType::List(list_field)), DataType::List(field)) => { let scalar_datatype = field.data_type(); let list_datatype = list_field.data_type(); if std::mem::discriminant(list_datatype) != std::mem::discriminant(scalar_datatype) { @@ -605,19 +590,19 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { } scalar.try_into() } - (scalar::ScalarValue::Boolean(_), arrow::datatypes::DataType::Boolean) => scalar.try_into(), - (scalar::ScalarValue::Float32(_), arrow::datatypes::DataType::Float32) => scalar.try_into(), - (scalar::ScalarValue::Float64(_), arrow::datatypes::DataType::Float64) => scalar.try_into(), - (scalar::ScalarValue::Int8(_), arrow::datatypes::DataType::Int8) => scalar.try_into(), - (scalar::ScalarValue::Int16(_), arrow::datatypes::DataType::Int16) => scalar.try_into(), - (scalar::ScalarValue::Int32(_), arrow::datatypes::DataType::Int32) => scalar.try_into(), - (scalar::ScalarValue::Int64(_), arrow::datatypes::DataType::Int64) => scalar.try_into(), - (scalar::ScalarValue::UInt8(_), arrow::datatypes::DataType::UInt8) => scalar.try_into(), - (scalar::ScalarValue::UInt16(_), arrow::datatypes::DataType::UInt16) => scalar.try_into(), - (scalar::ScalarValue::UInt32(_), arrow::datatypes::DataType::UInt32) => scalar.try_into(), - (scalar::ScalarValue::UInt64(_), arrow::datatypes::DataType::UInt64) => scalar.try_into(), - (scalar::ScalarValue::Utf8(_), arrow::datatypes::DataType::Utf8) => scalar.try_into(), - (scalar::ScalarValue::LargeUtf8(_), arrow::datatypes::DataType::LargeUtf8) => scalar.try_into(), + (scalar::ScalarValue::Boolean(_), DataType::Boolean) => scalar.try_into(), + (scalar::ScalarValue::Float32(_), DataType::Float32) => scalar.try_into(), + (scalar::ScalarValue::Float64(_), DataType::Float64) => scalar.try_into(), + (scalar::ScalarValue::Int8(_), DataType::Int8) => scalar.try_into(), + (scalar::ScalarValue::Int16(_), DataType::Int16) => scalar.try_into(), + (scalar::ScalarValue::Int32(_), DataType::Int32) => scalar.try_into(), + (scalar::ScalarValue::Int64(_), DataType::Int64) => scalar.try_into(), + (scalar::ScalarValue::UInt8(_), DataType::UInt8) => scalar.try_into(), + (scalar::ScalarValue::UInt16(_), DataType::UInt16) => scalar.try_into(), + (scalar::ScalarValue::UInt32(_), DataType::UInt32) => scalar.try_into(), + (scalar::ScalarValue::UInt64(_), DataType::UInt64) => scalar.try_into(), + (scalar::ScalarValue::Utf8(_), DataType::Utf8) => scalar.try_into(), + (scalar::ScalarValue::LargeUtf8(_), DataType::LargeUtf8) => 
scalar.try_into(), _ => Err(proto_error(format!( "Protobuf serialization error, {:?} was inconsistent with designated type {:?}", scalar, datatype @@ -1302,9 +1287,9 @@ impl TryFrom for protobuf::WindowFrame { } } -impl TryFrom<&arrow::datatypes::DataType> for protobuf::ScalarType { +impl TryFrom<&DataType> for protobuf::ScalarType { type Error = BallistaError; - fn try_from(value: &arrow::datatypes::DataType) -> Result { + fn try_from(value: &DataType) -> Result { let datatype = protobuf::scalar_type::Datatype::try_from(value)?; Ok(protobuf::ScalarType { datatype: Some(datatype), diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index d034f3ca3bfee..2039def908bc0 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -28,7 +28,7 @@ use crate::serde::protobuf::LogicalExprNode; use crate::serde::scheduler::PartitionLocation; use crate::serde::{proto_error, protobuf}; use crate::{convert_box_required, convert_required}; -use arrow::datatypes::{DataType, Schema, SchemaRef}; +use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, }; diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index e7985cc84a9a7..fdba2152b7f8d 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -20,27 +20,32 @@ pub mod to_proto; #[cfg(test)] mod roundtrip_tests { - use datafusion::physical_plan::hash_utils::JoinType; use std::{convert::TryInto, sync::Arc}; - use arrow::datatypes::{DataType, Schema}; - use datafusion::physical_plan::ColumnarValue; - use datafusion::physical_plan::{ - empty::EmptyExec, - expressions::{Avg, Column, PhysicalSortExpr}, - hash_aggregate::{AggregateMode, HashAggregateExec}, - hash_join::HashJoinExec, - limit::{GlobalLimitExec, LocalLimitExec}, - sort::SortExec, - ExecutionPlan, - }; - use datafusion::physical_plan::{ - AggregateExpr, Distribution, Partitioning, PhysicalExpr, + use datafusion::{ + arrow::{ + compute::kernels::sort::SortOptions, + datatypes::{DataType, Field, Schema}, + }, + logical_plan::Operator, + physical_plan::{ + empty::EmptyExec, + expressions::{binary, lit, InListExpr, NotExpr}, + expressions::{Avg, Column, PhysicalSortExpr}, + filter::FilterExec, + hash_aggregate::{AggregateMode, HashAggregateExec}, + hash_join::{HashJoinExec, PartitionMode}, + hash_utils::JoinType, + limit::{GlobalLimitExec, LocalLimitExec}, + sort::SortExec, + AggregateExpr, ColumnarValue, Distribution, ExecutionPlan, Partitioning, + PhysicalExpr, + }, + scalar::ScalarValue, }; use super::super::super::error::Result; use super::super::protobuf; - use datafusion::physical_plan::hash_join::PartitionMode; fn roundtrip_test(exec_plan: Arc) -> Result<()> { let proto: protobuf::PhysicalPlanNode = exec_plan.clone().try_into()?; @@ -75,7 +80,6 @@ mod roundtrip_tests { #[test] fn roundtrip_hash_join() -> Result<()> { - use arrow::datatypes::{DataType, Field, Schema}; let field_a = Field::new("col", DataType::Int64, false); let schema_left = Schema::new(vec![field_a.clone()]); let schema_right = Schema::new(vec![field_a]); @@ -95,7 +99,6 @@ mod roundtrip_tests { #[test] fn rountrip_hash_aggregate() -> Result<()> { - use arrow::datatypes::{DataType, Field, Schema}; let groups: Vec<(Arc, String)> = 
vec![(col("a"), "unused".to_string())]; @@ -120,13 +123,6 @@ mod roundtrip_tests { #[test] fn roundtrip_filter_with_not_and_in_list() -> Result<()> { - use arrow::datatypes::{DataType, Field, Schema}; - use datafusion::logical_plan::Operator; - use datafusion::physical_plan::{ - expressions::{binary, lit, InListExpr, NotExpr}, - filter::FilterExec, - }; - use datafusion::scalar::ScalarValue; let field_a = Field::new("a", DataType::Boolean, false); let field_b = Field::new("b", DataType::Int64, false); let field_c = Field::new("c", DataType::Int64, false); @@ -149,8 +145,6 @@ mod roundtrip_tests { #[test] fn roundtrip_sort() -> Result<()> { - use arrow::compute::kernels::sort::SortOptions; - use arrow::datatypes::{DataType, Field, Schema}; let field_a = Field::new("a", DataType::Boolean, false); let field_b = Field::new("b", DataType::Int64, false); let schema = Arc::new(Schema::new(vec![field_a, field_b])); diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index bbbd48b74a1f3..b502c325595ff 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -17,10 +17,10 @@ use std::{collections::HashMap, sync::Arc}; -use arrow::array::{ +use datafusion::arrow::array::{ ArrayBuilder, ArrayRef, StructArray, StructBuilder, UInt64Array, UInt64Builder, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::ExecutionPlan; use serde::Serialize; diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index dc570f81f2c7e..85d1161795ad3 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -26,13 +26,16 @@ use crate::error::{BallistaError, Result}; use crate::execution_plans::{QueryStageExec, UnresolvedShuffleExec}; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionStats; -use arrow::array::{ - ArrayBuilder, ArrayRef, StructArray, StructBuilder, UInt64Array, UInt64Builder, + +use datafusion::arrow::{ + array::{ + ArrayBuilder, ArrayRef, StructArray, StructBuilder, UInt64Array, UInt64Builder, + }, + datatypes::{DataType, Field}, + ipc::reader::FileReader, + ipc::writer::FileWriter, + record_batch::RecordBatch, }; -use arrow::datatypes::{DataType, Field}; -use arrow::ipc::reader::FileReader; -use arrow::ipc::writer::FileWriter; -use arrow::record_batch::RecordBatch; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use datafusion::logical_plan::Operator; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; diff --git a/ballista/rust/executor/src/collect.rs b/ballista/rust/executor/src/collect.rs index a2f9d4c636031..a4c544f8c47b4 100644 --- a/ballista/rust/executor/src/collect.rs +++ b/ballista/rust/executor/src/collect.rs @@ -22,10 +22,10 @@ use std::sync::Arc; use std::task::{Context, Poll}; use std::{any::Any, pin::Pin}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; use async_trait::async_trait; +use datafusion::arrow::{ + datatypes::SchemaRef, error::Result as ArrowResult, record_batch::RecordBatch, +}; use datafusion::error::DataFusionError; use datafusion::physical_plan::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use datafusion::{error::Result, physical_plan::RecordBatchStream}; diff --git a/ballista/rust/executor/src/flight_service.rs 
b/ballista/rust/executor/src/flight_service.rs index 62aaf7f93a44c..b35ac15ec2673 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -28,17 +28,19 @@ use ballista_core::serde::decode_protobuf; use ballista_core::serde::scheduler::{Action as BallistaAction, PartitionStats}; use ballista_core::utils; -use arrow::array::{ArrayRef, StringBuilder}; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::error::ArrowError; -use arrow::ipc::reader::FileReader; -use arrow::ipc::writer::IpcWriteOptions; -use arrow::record_batch::RecordBatch; use arrow_flight::{ flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, }; +use datafusion::arrow::{ + array::{ArrayRef, StringBuilder}, + datatypes::{DataType, Field, Schema}, + error::ArrowError, + ipc::reader::FileReader, + ipc::writer::IpcWriteOptions, + record_batch::RecordBatch, +}; use datafusion::error::DataFusionError; use datafusion::physical_plan::displayable; use futures::{Stream, StreamExt}; diff --git a/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs index 0989060503869..311f9a7a3de0c 100644 --- a/ballista/rust/scheduler/src/test_utils.rs +++ b/ballista/rust/scheduler/src/test_utils.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use ballista_core::error::Result; -use arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; use datafusion::physical_optimizer::merge_exec::AddMergeExec; From 2b5b0090d06ba49efdb3724943716857f16ad542 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 31 May 2021 18:15:25 +0800 Subject: [PATCH 135/329] Squashed commit of the following: (#403) commit 7fb3640e733bfbbdbf18d58000896f378ba9644c Author: Jiayu Liu Date: Fri May 21 16:38:25 2021 +0800 row number done commit 17239267cd2fbcbb676d5731beeffd0321bbd3ba Author: Jiayu Liu Date: Fri May 21 16:05:50 2021 +0800 add row number commit bf5b8a56f6f33d8eedf3e3009e7fcdb3c388ea5b Author: Jiayu Liu Date: Fri May 21 15:04:49 2021 +0800 save commit d2ce852ead5d8ae3d15962b4dd3062e24bce51de Author: Jiayu Liu Date: Fri May 21 14:53:05 2021 +0800 add streams commit 0a861a76bde0bb43e5561f1cf1ef14fd64e0c08b Author: Jiayu Liu Date: Thu May 20 22:28:34 2021 +0800 save stream commit a9121af7e2e9104d0e4b6ca3ef4f484aaf8baf42 Author: Jiayu Liu Date: Thu May 20 22:01:51 2021 +0800 update unit test commit 2af2a270262ff1bc759af39153d7cd681c32dc0a Author: Jiayu Liu Date: Fri May 21 14:25:12 2021 +0800 fix unit test commit bb57c762b0a1fabc35e207e681bca2bfff7fcf01 Author: Jiayu Liu Date: Fri May 21 14:23:34 2021 +0800 use upper case commit 5d96e525f587fbfaf3e5e9762c9bb10315fcbc3a Author: Jiayu Liu Date: Fri May 21 14:16:16 2021 +0800 fix unit test commit 1ecae8f6cbc6c1898ccf0b38b1e596b6c2e9bb46 Author: Jiayu Liu Date: Fri May 21 12:27:26 2021 +0800 fix unit test commit bc2271d58fd4a9a9cc96126f8abcd6e8f10272ca Author: Jiayu Liu Date: Fri May 21 10:04:29 2021 +0800 fix error commit 880b94f6e27df61b4d3877366f71a51b9b2f5d5d Author: Jiayu Liu Date: Fri May 21 08:24:00 2021 +0800 fix unit test commit 4e792e123a33fd0dcb5f701c679566b55589b0c0 Author: Jiayu Liu Date: Fri May 21 08:05:17 2021 +0800 fix test commit c36c04abf06c74d016597983bf3d3a2a5b5cbdd5 Author: Jiayu Liu Date: Fri May 
21 00:07:54 2021 +0800 add more tests commit f5e64de7192a1916df78a4c2fbab7d471c906720 Author: Jiayu Liu Date: Thu May 20 23:41:36 2021 +0800 update commit a1eae864926a6acfeeebe995a12de4ad725ea869 Author: Jiayu Liu Date: Thu May 20 23:36:15 2021 +0800 enrich unit test commit 0d2a214131fe69e19e22144c68fbb992228db6b3 Author: Jiayu Liu Date: Thu May 20 23:25:43 2021 +0800 adding filter by todo commit 8b486d53b09ff1c7a6b9cf4687796ba1c13d6160 Author: Jiayu Liu Date: Thu May 20 23:17:22 2021 +0800 adding more built-in functions commit abf08cd137a80c1381af7de9ae2b3dab05cb4512 Author: Jiayu Liu Date: Thu May 20 22:36:27 2021 +0800 Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb commit 0cbca53dac642233520f7d32289b1dfad77b882e Author: Jiayu Liu Date: Thu May 20 22:34:57 2021 +0800 Update datafusion/src/physical_plan/window_functions.rs Co-authored-by: Andrew Lamb commit 831c069f02236a953653b8f1ca25124e393ce20b Author: Jiayu Liu Date: Thu May 20 22:34:04 2021 +0800 Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb commit f70c739fd40e30c4b476253e58b24b9297b42859 Author: Jiayu Liu Date: Thu May 20 22:33:04 2021 +0800 Update datafusion/src/logical_plan/builder.rs Co-authored-by: Andrew Lamb commit 3ee87aa3477c160f17a86628d71a353e03d736b3 Author: Jiayu Liu Date: Wed May 19 22:55:08 2021 +0800 fix unit test commit 5c4d92dc9f570ba6919d84cb8ac70a736d73f40f Author: Jiayu Liu Date: Wed May 19 22:48:26 2021 +0800 fix clippy commit a0b7526c413abbdd4aadab4af8ca9ad8f323f03b Author: Jiayu Liu Date: Wed May 19 22:46:38 2021 +0800 fix unused imports commit 1d3b076acc1c0f248a19c6149c0634e63a5b836e Author: Jiayu Liu Date: Thu May 13 18:51:14 2021 +0800 add window expr --- .../src/physical_plan/expressions/mod.rs | 2 + .../physical_plan/expressions/nth_value.rs | 223 ++++++++++++++++++ datafusion/src/physical_plan/windows.rs | 180 ++++++++++---- datafusion/tests/sql.rs | 17 +- 4 files changed, 371 insertions(+), 51 deletions(-) create mode 100644 datafusion/src/physical_plan/expressions/nth_value.rs diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 803870f3f7840..77da95c3a04a3 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -40,6 +40,7 @@ mod literal; mod min_max; mod negative; mod not; +mod nth_value; mod nullif; mod row_number; mod sum; @@ -58,6 +59,7 @@ pub use literal::{lit, Literal}; pub use min_max::{Max, Min}; pub use negative::{negative, NegativeExpr}; pub use not::{not, NotExpr}; +pub use nth_value::{FirstValue, LastValue, NthValue}; pub use nullif::{nullif_func, SUPPORTED_NULLIF_TYPES}; pub use row_number::RowNumber; pub use sum::{sum_return_type, Sum}; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs new file mode 100644 index 0000000000000..e90ad322aae9d --- /dev/null +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines physical expressions that can evaluated at runtime during query execution + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ + window_functions::BuiltInWindowFunctionExpr, PhysicalExpr, WindowAccumulator, +}; +use crate::scalar::ScalarValue; +use arrow::datatypes::{DataType, Field}; +use std::any::Any; +use std::convert::TryFrom; +use std::sync::Arc; + +/// first_value expression +#[derive(Debug)] +pub struct FirstValue { + name: String, + data_type: DataType, + expr: Arc, +} + +impl FirstValue { + /// Create a new FIRST_VALUE window aggregate function + pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + Self { + name, + data_type, + expr, + } + } +} + +impl BuiltInWindowFunctionExpr for FirstValue { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = true; + Ok(Field::new(&self.name, self.data_type.clone(), nullable)) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn name(&self) -> &str { + &self.name + } + + fn create_accumulator(&self) -> Result> { + Ok(Box::new(NthValueAccumulator::try_new( + 1, + self.data_type.clone(), + )?)) + } +} + +// sql values start with 1, so we can use 0 to indicate the special last value behavior +const SPECIAL_SIZE_VALUE_FOR_LAST: u32 = 0; + +/// last_value expression +#[derive(Debug)] +pub struct LastValue { + name: String, + data_type: DataType, + expr: Arc, +} + +impl LastValue { + /// Create a new FIRST_VALUE window aggregate function + pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + Self { + name, + data_type, + expr, + } + } +} + +impl BuiltInWindowFunctionExpr for LastValue { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = true; + Ok(Field::new(&self.name, self.data_type.clone(), nullable)) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn name(&self) -> &str { + &self.name + } + + fn create_accumulator(&self) -> Result> { + Ok(Box::new(NthValueAccumulator::try_new( + SPECIAL_SIZE_VALUE_FOR_LAST, + self.data_type.clone(), + )?)) + } +} + +/// nth_value expression +#[derive(Debug)] +pub struct NthValue { + name: String, + n: u32, + data_type: DataType, + expr: Arc, +} + +impl NthValue { + /// Create a new NTH_VALUE window aggregate function + pub fn try_new( + expr: Arc, + name: String, + n: u32, + data_type: DataType, + ) -> Result { + if n == SPECIAL_SIZE_VALUE_FOR_LAST { + Err(DataFusionError::Execution( + "nth_value expect n to be > 0".to_owned(), + )) + } else { + Ok(Self { + name, + n, + data_type, + expr, + }) + } + } +} + +impl BuiltInWindowFunctionExpr for NthValue { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = true; + Ok(Field::new(&self.name, self.data_type.clone(), nullable)) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn name(&self) -> 
&str { + &self.name + } + + fn create_accumulator(&self) -> Result> { + Ok(Box::new(NthValueAccumulator::try_new( + self.n, + self.data_type.clone(), + )?)) + } +} + +#[derive(Debug)] +struct NthValueAccumulator { + // n the target nth_value, however we'll reuse it for last_value acc, so when n == 0 it specifically + // means last; also note that it is totally valid for n to be larger than the number of rows input + // in which case all the values shall be null + n: u32, + offset: u32, + value: ScalarValue, +} + +impl NthValueAccumulator { + /// new count accumulator + pub fn try_new(n: u32, data_type: DataType) -> Result { + Ok(Self { + n, + offset: 0, + // null value of that data_type by default + value: ScalarValue::try_from(&data_type)?, + }) + } +} + +impl WindowAccumulator for NthValueAccumulator { + fn scan(&mut self, values: &[ScalarValue]) -> Result> { + if self.n == SPECIAL_SIZE_VALUE_FOR_LAST { + // for last_value function + self.value = values[0].clone(); + } else if self.offset < self.n { + self.offset += 1; + if self.offset == self.n { + self.value = values[0].clone(); + } + } + Ok(None) + } + + fn evaluate(&self) -> Result> { + Ok(Some(self.value.clone())) + } +} diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 8ced3aec8ec11..e790eeaca749e 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -20,7 +20,9 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ aggregates, - expressions::RowNumber, + expressions::{FirstValue, LastValue, Literal, NthValue, RowNumber}, + type_coercion::coerce, + window_functions::signature_for_built_in, window_functions::BuiltInWindowFunctionExpr, window_functions::{BuiltInWindowFunction, WindowFunction}, Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, @@ -39,6 +41,7 @@ use futures::stream::{Stream, StreamExt}; use futures::Future; use pin_project_lite::pin_project; use std::any::Any; +use std::convert::TryInto; use std::iter; use std::pin::Pin; use std::sync::Arc; @@ -82,12 +85,40 @@ pub fn create_window_expr( fn create_built_in_window_expr( fun: &BuiltInWindowFunction, - _args: &[Arc], - _input_schema: &Schema, + args: &[Arc], + input_schema: &Schema, name: String, ) -> Result> { match fun { BuiltInWindowFunction::RowNumber => Ok(Arc::new(RowNumber::new(name))), + BuiltInWindowFunction::NthValue => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = coerced_args[0].clone(); + let n = coerced_args[1] + .as_any() + .downcast_ref::() + .unwrap() + .value(); + let n: i64 = n + .clone() + .try_into() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + let n: u32 = n as u32; + let data_type = args[0].data_type(input_schema)?; + Ok(Arc::new(NthValue::try_new(arg, name, n, data_type)?)) + } + BuiltInWindowFunction::FirstValue => { + let arg = + coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Ok(Arc::new(FirstValue::new(arg, name, data_type))) + } + BuiltInWindowFunction::LastValue => { + let arg = + coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Ok(Arc::new(LastValue::new(arg, name, data_type))) + } _ => Err(DataFusionError::NotImplemented(format!( "Window function with {:?} not yet implemented", fun @@ -484,45 +515,106 @@ impl RecordBatchStream for WindowAggStream { #[cfg(test)] mod tests { 
- // use super::*; - - // /// some mock data to test windows - // fn some_data() -> (Arc, Vec) { - // // define a schema. - // let schema = Arc::new(Schema::new(vec![ - // Field::new("a", DataType::UInt32, false), - // Field::new("b", DataType::Float64, false), - // ])); - - // // define data. - // ( - // schema.clone(), - // vec![ - // RecordBatch::try_new( - // schema.clone(), - // vec![ - // Arc::new(UInt32Array::from(vec![2, 3, 4, 4])), - // Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), - // ], - // ) - // .unwrap(), - // RecordBatch::try_new( - // schema, - // vec![ - // Arc::new(UInt32Array::from(vec![2, 3, 3, 4])), - // Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), - // ], - // ) - // .unwrap(), - // ], - // ) - // } - - // #[tokio::test] - // async fn window_function() -> Result<()> { - // let input: Arc = unimplemented!(); - // let input_schema = input.schema(); - // let window_expr = vec![]; - // WindowAggExec::try_new(window_expr, input, input_schema); - // } + use super::*; + use crate::physical_plan::aggregates::AggregateFunction; + use crate::physical_plan::collect; + use crate::physical_plan::csv::{CsvExec, CsvReadOptions}; + use crate::physical_plan::expressions::col; + use crate::test; + use arrow::array::*; + + fn create_test_schema(partitions: usize) -> Result<(Arc, SchemaRef)> { + let schema = test::aggr_test_schema(); + let path = test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; + let csv = CsvExec::try_new( + &path, + CsvReadOptions::new().schema(&schema), + None, + 1024, + None, + )?; + + let input = Arc::new(csv); + Ok((input, schema)) + } + + #[tokio::test] + async fn window_function_input_partition() -> Result<()> { + let (input, schema) = create_test_schema(4)?; + + let window_exec = Arc::new(WindowAggExec::try_new( + vec![create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Count), + &[col("c3")], + schema.as_ref(), + "count".to_owned(), + )?], + input, + schema.clone(), + )?); + + let result = collect(window_exec).await; + + assert!(result.is_err()); + if let Some(DataFusionError::Internal(msg)) = result.err() { + assert_eq!( + msg, + "WindowAggExec requires a single input partition".to_owned() + ); + } else { + unreachable!("Expect an internal error to happen"); + } + Ok(()) + } + + #[tokio::test] + async fn window_function() -> Result<()> { + let (input, schema) = create_test_schema(1)?; + + let window_exec = Arc::new(WindowAggExec::try_new( + vec![ + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Count), + &[col("c3")], + schema.as_ref(), + "count".to_owned(), + )?, + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Max), + &[col("c3")], + schema.as_ref(), + "max".to_owned(), + )?, + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Min), + &[col("c3")], + schema.as_ref(), + "min".to_owned(), + )?, + ], + input, + schema.clone(), + )?); + + let result: Vec = collect(window_exec).await?; + assert_eq!(result.len(), 1); + + let columns = result[0].columns(); + + // c3 is small int + + let count: &UInt64Array = as_primitive_array(&columns[0]); + assert_eq!(count.value(0), 100); + assert_eq!(count.value(99), 100); + + let max: &Int8Array = as_primitive_array(&columns[1]); + assert_eq!(max.value(0), 125); + assert_eq!(max.value(99), 125); + + let min: &Int8Array = as_primitive_array(&columns[2]); + assert_eq!(min.value(0), -117); + assert_eq!(min.value(99), -117); + + Ok(()) + } } diff --git a/datafusion/tests/sql.rs 
b/datafusion/tests/sql.rs index 55bc88eedf9ab..f5b416f789736 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -807,17 +807,20 @@ async fn csv_query_window_with_empty_over() -> Result<()> { avg(c3) over (), \ count(c3) over (), \ max(c3) over (), \ - min(c3) over () \ + min(c3) over (), \ + first_value(c3) over (), \ + last_value(c3) over (), \ + nth_value(c3, 2) over () from aggregate_test_100 \ - order by c2 \ + order by c2 limit 5"; let actual = execute(&mut ctx, sql).await; let expected = vec![ - vec!["1", "781", "7.81", "100", "125", "-117"], - vec!["1", "781", "7.81", "100", "125", "-117"], - vec!["1", "781", "7.81", "100", "125", "-117"], - vec!["1", "781", "7.81", "100", "125", "-117"], - vec!["1", "781", "7.81", "100", "125", "-117"], + vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], + vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], + vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], + vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], + vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], ]; assert_eq!(expected, actual); Ok(()) From 80abb09e4ee89f670980e02bb7e3fbd457a916d0 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 31 May 2021 18:16:44 +0800 Subject: [PATCH 136/329] include test data and add aggregation tests in integration test (#425) * include test data * bump --- .github/workflows/rust.yml | 19 ++++++++++- datafusion-cli/src/main.rs | 24 +++++++------ integration-tests/create_test_table.sql | 34 +++++++++++++++++++ integration-tests/sqls/simple_aggregation.sql | 24 +++++++++++++ integration-tests/sqls/simple_group_by.sql | 27 +++++++++++++++ integration-tests/test_psql_parity.py | 6 +++- 6 files changed, 122 insertions(+), 12 deletions(-) create mode 100644 integration-tests/create_test_table.sql create mode 100644 integration-tests/sqls/simple_aggregation.sql create mode 100644 integration-tests/sqls/simple_group_by.sql diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f492b2e31d0bc..933b51353d06b 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -155,6 +155,8 @@ jobs: --health-retries 5 steps: - uses: actions/checkout@v2 + with: + submodules: true - uses: actions/setup-python@v2 with: python-version: "3.8" @@ -167,7 +169,22 @@ jobs: # make sure psql can access the server echo "$POSTGRES_HOST:$POSTGRES_PORT:$POSTGRES_DB:$POSTGRES_USER:$POSTGRES_PASSWORD" | tee ~/.pgpass chmod 0600 ~/.pgpass - psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'select now() as now' + psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c 'CREATE TABLE IF NOT EXISTS test ( + c1 character varying NOT NULL, + c2 integer NOT NULL, + c3 smallint NOT NULL, + c4 smallint NOT NULL, + c5 integer NOT NULL, + c6 bigint NOT NULL, + c7 smallint NOT NULL, + c8 integer NOT NULL, + c9 bigint NOT NULL, + c10 character varying NOT NULL, + c11 double precision NOT NULL, + c12 double precision NOT NULL, + c13 character varying NOT NULL + );' + psql -d "$POSTGRES_DB" -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -c "\copy test FROM '$(pwd)/testing/data/csv/aggregate_test_100.csv' WITH (FORMAT csv, HEADER true);" env: POSTGRES_HOST: localhost POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }} diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 5b35880580b20..083710f6dd192 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -58,9 
+58,10 @@ pub async fn main() { ) .arg( Arg::with_name("file") - .help("Execute commands from file, then exit") + .help("Execute commands from file(s), then exit") .short("f") .long("file") + .multiple(true) .validator(is_valid_file) .takes_value(true), ) @@ -112,22 +113,25 @@ pub async fn main() { let quiet = matches.is_present("quiet"); let print_options = PrintOptions { format, quiet }; - if let Some(file_path) = matches.value_of("file") { - let file = File::open(file_path) - .unwrap_or_else(|err| panic!("cannot open file '{}': {}", file_path, err)); - let mut reader = BufReader::new(file); - exec_from_lines(&mut reader, execution_config, print_options).await; + if let Some(file_paths) = matches.values_of("file") { + let files = file_paths + .map(|file_path| File::open(file_path).unwrap()) + .collect::>(); + let mut ctx = ExecutionContext::with_config(execution_config); + for file in files { + let mut reader = BufReader::new(file); + exec_from_lines(&mut ctx, &mut reader, print_options.clone()).await; + } } else { exec_from_repl(execution_config, print_options).await; } } async fn exec_from_lines( + ctx: &mut ExecutionContext, reader: &mut BufReader, - execution_config: ExecutionConfig, print_options: PrintOptions, ) { - let mut ctx = ExecutionContext::with_config(execution_config); let mut query = "".to_owned(); for line in reader.lines() { @@ -139,7 +143,7 @@ async fn exec_from_lines( let line = line.trim_end(); query.push_str(line); if line.ends_with(';') { - match exec_and_print(&mut ctx, print_options.clone(), query).await { + match exec_and_print(ctx, print_options.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -156,7 +160,7 @@ async fn exec_from_lines( // run the left over query if the last statement doesn't contain ‘;’ if !query.is_empty() { - match exec_and_print(&mut ctx, print_options, query).await { + match exec_and_print(ctx, print_options, query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } diff --git a/integration-tests/create_test_table.sql b/integration-tests/create_test_table.sql new file mode 100644 index 0000000000000..89b08611d1c07 --- /dev/null +++ b/integration-tests/create_test_table.sql @@ -0,0 +1,34 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
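-- Invocation sketch: the argument order mirrors test_psql_parity.py further
-- below, and the binary path assumes a debug build.
--
--   ./target/debug/datafusion-cli \
--     -f integration-tests/create_test_table.sql \
--     -f integration-tests/sqls/simple_aggregation.sql \
--     --format csv
--
-- Passing this file first registers the shared CSV fixture so that every
-- query file under integration-tests/sqls/ can refer to the `test` table.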
+ +CREATE EXTERNAL TABLE test ( + c1 VARCHAR NOT NULL, + c2 INT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT NOT NULL, + c5 INT NOT NULL, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION 'testing/data/csv/aggregate_test_100.csv'; diff --git a/integration-tests/sqls/simple_aggregation.sql b/integration-tests/sqls/simple_aggregation.sql new file mode 100644 index 0000000000000..cbe37ed4ba31a --- /dev/null +++ b/integration-tests/sqls/simple_aggregation.sql @@ -0,0 +1,24 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + count(*) AS count_all, + count(c3) AS count_c3, + avg(c3) AS avg, + sum(c3) AS sum, + max(c3) AS max, + min(c3) AS min +FROM test; diff --git a/integration-tests/sqls/simple_group_by.sql b/integration-tests/sqls/simple_group_by.sql new file mode 100644 index 0000000000000..11fe1cce406f6 --- /dev/null +++ b/integration-tests/sqls/simple_group_by.sql @@ -0,0 +1,27 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
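-- The parity harness executes this file against both PostgreSQL and
-- datafusion-cli and compares the CSV output row by row, so the ORDER BY
-- below is what keeps that comparison deterministic.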
+ + +select + c2, + sum(c3) sum_c3, + avg(c3) avg_c3, + max(c3) max_c3, + min(c3) min_c3, + count(c3) count_c3 +from test +group by c2 +order by c2; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 204f9063297e9..f4967b8457e49 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -32,12 +32,16 @@ ) ] +CREATE_TABLE_SQL_FILE = "integration-tests/create_test_table.sql" + def generate_csv_from_datafusion(fname: str): return subprocess.check_output( [ "./target/debug/datafusion-cli", "-f", + CREATE_TABLE_SQL_FILE, + "-f", fname, "--format", "csv", @@ -70,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 2, msg="tests are missed") + self.assertEqual(len(files), 4, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From c8ab5a4f00fc8b362eed72d5feb43b03b8ad1fdd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 31 May 2021 04:18:00 -0600 Subject: [PATCH 137/329] Refactor Ballista executor so that FlightService delegates to an Executor struct (#450) * Refactor Ballista executor so that FlightService delegates to an Executor struct * Simplify code * Pass executor to execution_loop * clippy * use arrow via datafusion --- ballista/rust/executor/src/execution_loop.rs | 22 ++-- ballista/rust/executor/src/executor.rs | 100 +++++++++++++++++++ ballista/rust/executor/src/flight_service.rs | 75 +++----------- ballista/rust/executor/src/lib.rs | 1 + ballista/rust/executor/src/main.rs | 27 ++--- ballista/rust/scheduler/Cargo.toml | 1 - 6 files changed, 135 insertions(+), 91 deletions(-) create mode 100644 ballista/rust/executor/src/executor.rs diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index 5574a14a0915a..afc6f0089b921 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -25,18 +25,16 @@ use log::{debug, error, info, warn}; use tonic::transport::Channel; use ballista_core::serde::protobuf::ExecutorRegistration; -use ballista_core::{ - client::BallistaClient, - serde::protobuf::{ - self, scheduler_grpc_client::SchedulerGrpcClient, task_status, FailedTask, - PartitionId, PollWorkParams, PollWorkResult, TaskDefinition, TaskStatus, - }, +use ballista_core::serde::protobuf::{ + self, scheduler_grpc_client::SchedulerGrpcClient, task_status, FailedTask, + PartitionId, PollWorkParams, PollWorkResult, TaskDefinition, TaskStatus, }; +use ballista_executor::executor::Executor; use protobuf::CompletedTask; pub async fn poll_loop( mut scheduler: SchedulerGrpcClient, - executor_client: BallistaClient, + executor: Arc, executor_meta: ExecutorRegistration, concurrent_tasks: usize, ) { @@ -67,7 +65,7 @@ pub async fn poll_loop( Ok(result) => { if let Some(task) = result.into_inner().task { run_received_tasks( - executor_client.clone(), + executor.clone(), executor_meta.id.clone(), available_tasks_slots.clone(), task_status_sender, @@ -86,7 +84,7 @@ pub async fn poll_loop( } async fn run_received_tasks( - mut executor_client: BallistaClient, + executor: Arc, executor_id: String, available_tasks_slots: Arc, task_status_sender: Sender, @@ -96,15 +94,13 @@ async fn run_received_tasks( available_tasks_slots.fetch_sub(1, Ordering::SeqCst); let plan: Arc = (&task.plan.unwrap()).try_into().unwrap(); let task_id = 
task.task_id.unwrap(); - // TODO: This is a convoluted way of executing the task. We should move the task - // execution code outside of the FlightService (data plane) into the control plane. tokio::spawn(async move { - let execution_result = executor_client + let execution_result = executor .execute_partition( task_id.job_id.clone(), task_id.stage_id as usize, - vec![task_id.partition_id as usize], + task_id.partition_id as usize, plan, ) .await; diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs new file mode 100644 index 0000000000000..e2945bf8e8714 --- /dev/null +++ b/ballista/rust/executor/src/executor.rs @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Ballista executor logic + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +use ballista_core::error::BallistaError; +use ballista_core::utils; +use datafusion::arrow::array::{ArrayRef, StringBuilder}; +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::physical_plan::ExecutionPlan; +use log::info; + +/// Ballista executor +pub struct Executor { + /// Directory for storing partial results + work_dir: String, +} + +impl Executor { + /// Create a new executor instance + pub fn new(work_dir: &str) -> Self { + Self { + work_dir: work_dir.to_owned(), + } + } +} + +impl Executor { + /// Execute one partition of a query stage and persist the result to disk in IPC format. On + /// success, return a RecordBatch containing metadata about the results, including path + /// and statistics. + pub async fn execute_partition( + &self, + job_id: String, + stage_id: usize, + part: usize, + plan: Arc, + ) -> Result { + let mut path = PathBuf::from(&self.work_dir); + path.push(&job_id); + path.push(&format!("{}", stage_id)); + path.push(&format!("{}", part)); + std::fs::create_dir_all(&path)?; + + path.push("data.arrow"); + let path = path.to_str().unwrap(); + info!("Writing results to {}", path); + + let now = Instant::now(); + + // execute the query partition + let mut stream = plan.execute(part).await?; + + // stream results to disk + let stats = utils::write_stream_to_disk(&mut stream, &path).await?; + + info!( + "Executed partition {} in {} seconds. 
Statistics: {:?}", + part, + now.elapsed().as_secs(), + stats + ); + + let schema = Arc::new(Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + stats.arrow_struct_repr(), + ])); + + // build result set with summary of the partition execution status + let mut c0 = StringBuilder::new(1); + c0.append_value(&path).unwrap(); + let path: ArrayRef = Arc::new(c0.finish()); + + let stats: ArrayRef = stats.to_arrow_arrayref()?; + RecordBatch::try_new(schema, vec![path, stats]).map_err(BallistaError::ArrowError) + } + + pub fn work_dir(&self) -> &str { + &self.work_dir + } +} diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index b35ac15ec2673..d4eb1229c294d 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -21,12 +21,11 @@ use std::fs::File; use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; -use std::time::Instant; +use crate::executor::Executor; use ballista_core::error::BallistaError; use ballista_core::serde::decode_protobuf; use ballista_core::serde::scheduler::{Action as BallistaAction, PartitionStats}; -use ballista_core::utils; use arrow_flight::{ flight_service_server::FlightService, Action, ActionType, Criteria, Empty, @@ -34,14 +33,12 @@ use arrow_flight::{ PutResult, SchemaResult, Ticket, }; use datafusion::arrow::{ - array::{ArrayRef, StringBuilder}, datatypes::{DataType, Field, Schema}, error::ArrowError, ipc::reader::FileReader, ipc::writer::IpcWriteOptions, record_batch::RecordBatch, }; -use datafusion::error::DataFusionError; use datafusion::physical_plan::displayable; use futures::{Stream, StreamExt}; use log::{info, warn}; @@ -61,12 +58,13 @@ type FlightDataReceiver = Receiver>; /// Service implementing the Apache Arrow Flight Protocol #[derive(Clone)] pub struct BallistaFlightService { - work_dir: String, + /// Executor + executor: Arc, } impl BallistaFlightService { - pub fn new(work_dir: String) -> Self { - Self { work_dir } + pub fn new(executor: Arc) -> Self { + Self { executor } } } @@ -105,59 +103,22 @@ impl FlightService for BallistaFlightService { let mut tasks: Vec>> = vec![]; for &part in &partition.partition_id { - let mut path = PathBuf::from(&self.work_dir); let partition = partition.clone(); + let executor = self.executor.clone(); tasks.push(tokio::spawn(async move { - path.push(partition.job_id); - path.push(&format!("{}", partition.stage_id)); - path.push(&format!("{}", part)); - std::fs::create_dir_all(&path)?; - - path.push("data.arrow"); - let path = path.to_str().unwrap(); - info!("Writing results to {}", path); - - let now = Instant::now(); - - // execute the query partition - let mut stream = partition - .plan - .execute(part) - .await - .map_err(|e| from_datafusion_err(&e))?; - - // stream results to disk - let stats = utils::write_stream_to_disk(&mut stream, &path) - .await - .map_err(|e| from_ballista_err(&e))?; - - info!( - "Executed partition {} in {} seconds. 
Statistics: {:?}", - part, - now.elapsed().as_secs(), - stats - ); + let results = executor + .execute_partition( + partition.job_id.clone(), + partition.stage_id, + part, + partition.plan.clone(), + ) + .await?; + let results = vec![results]; let mut flights: Vec> = vec![]; let options = arrow::ipc::writer::IpcWriteOptions::default(); - let schema = Arc::new(Schema::new(vec![ - Field::new("path", DataType::Utf8, false), - stats.arrow_struct_repr(), - ])); - - // build result set with summary of the partition execution status - let mut c0 = StringBuilder::new(1); - c0.append_value(&path).unwrap(); - let path: ArrayRef = Arc::new(c0.finish()); - - let stats: ArrayRef = stats.to_arrow_arrayref()?; - let results = vec![RecordBatch::try_new( - schema, - vec![path, stats], - ) - .unwrap()]; - let mut batches: Vec> = results .iter() .flat_map(|batch| create_flight_iter(batch, &options)) @@ -208,7 +169,7 @@ impl FlightService for BallistaFlightService { // fetch a partition that was previously executed by this executor info!("FetchPartition {:?}", partition_id); - let mut path = PathBuf::from(&self.work_dir); + let mut path = PathBuf::from(self.executor.work_dir()); path.push(&partition_id.job_id); path.push(&format!("{}", partition_id.stage_id)); path.push(&format!("{}", partition_id.partition_id)); @@ -368,7 +329,3 @@ fn from_arrow_err(e: &ArrowError) -> Status { fn from_ballista_err(e: &ballista_core::error::BallistaError) -> Status { Status::internal(format!("Ballista Error: {:?}", e)) } - -fn from_datafusion_err(e: &DataFusionError) -> Status { - Status::internal(format!("DataFusion Error: {:?}", e)) -} diff --git a/ballista/rust/executor/src/lib.rs b/ballista/rust/executor/src/lib.rs index 08646ebda6b7f..188b9449db927 100644 --- a/ballista/rust/executor/src/lib.rs +++ b/ballista/rust/executor/src/lib.rs @@ -18,4 +18,5 @@ //! Core executor logic for executing queries and storing results in memory. 
pub mod collect; +pub mod executor; pub mod flight_service; diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index ad7c001e654af..aad53d7a96324 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -30,17 +30,15 @@ use tempfile::TempDir; use tonic::transport::Server; use uuid::Uuid; -use ballista_core::{ - client::BallistaClient, - serde::protobuf::{ - executor_registration, scheduler_grpc_client::SchedulerGrpcClient, - ExecutorRegistration, - }, +use ballista_core::serde::protobuf::{ + executor_registration, scheduler_grpc_client::SchedulerGrpcClient, + ExecutorRegistration, }; use ballista_core::{ print_version, serde::protobuf::scheduler_grpc_server::SchedulerGrpcServer, BALLISTA_VERSION, }; +use ballista_executor::executor::Executor; use ballista_executor::flight_service::BallistaFlightService; use ballista_scheduler::{state::StandaloneClient, SchedulerServer}; use config::prelude::*; @@ -166,7 +164,10 @@ async fn main() -> Result<()> { let scheduler = SchedulerGrpcClient::connect(scheduler_url) .await .context("Could not connect to scheduler")?; - let service = BallistaFlightService::new(work_dir); + + let executor = Arc::new(Executor::new(&work_dir)); + + let service = BallistaFlightService::new(executor.clone()); let server = FlightServiceServer::new(service); info!( @@ -174,19 +175,9 @@ async fn main() -> Result<()> { BALLISTA_VERSION, addr ); let server_future = tokio::spawn(Server::builder().add_service(server).serve(addr)); - let client_host = external_host.as_deref().unwrap_or_else(|| { - if bind_host == "0.0.0.0" { - // If the executor is being bound to "0.0.0.0" (which means use all ips in all eth devices) - // then use "localhost" to connect to itself through the BallistaClient - "localhost" - } else { - &bind_host - } - }); - let client = BallistaClient::try_new(client_host, port).await?; tokio::spawn(execution_loop::poll_loop( scheduler, - client, + executor, executor_meta, opt.concurrent_tasks, )); diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 19e2574fea598..c009cc6a12bef 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -52,7 +52,6 @@ tonic = "0.4" tower = { version = "0.4" } warp = "0.3" -arrow = { version = "4.0" } datafusion = { path = "../../../datafusion" } [dev-dependencies] From c794f2df539a10524566cb02b6158ee46cb1459a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 1 Jun 2021 16:15:32 +0100 Subject: [PATCH 138/329] Sort preserving merge (#362) (#379) * Add SortPreservingMergeExec (#362) * Size MutableArrayData based on in_progress length * make SortPreservingMergeStream::build_record_batch fallible * Test SortPreservingMerge with different RecordBatch sizes * fix logical merge conflict Co-authored-by: Andrew Lamb --- datafusion/src/physical_plan/common.rs | 39 +- datafusion/src/physical_plan/merge.rs | 29 +- datafusion/src/physical_plan/mod.rs | 1 + .../physical_plan/sort_preserving_merge.rs | 949 ++++++++++++++++++ 4 files changed, 988 insertions(+), 30 deletions(-) create mode 100644 datafusion/src/physical_plan/sort_preserving_merge.rs diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index f1ed3742340b0..e60963bbb5b75 100644 --- a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -22,13 +22,18 @@ use std::fs::metadata; use std::sync::Arc; use std::task::{Context, 
Poll}; -use super::{RecordBatchStream, SendableRecordBatchStream}; -use crate::error::{DataFusionError, Result}; - use arrow::datatypes::SchemaRef; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; -use futures::{Stream, TryStreamExt}; +use futures::channel::mpsc; +use futures::{SinkExt, Stream, StreamExt, TryStreamExt}; +use tokio::task::JoinHandle; + +use crate::arrow::error::ArrowError; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::ExecutionPlan; + +use super::{RecordBatchStream, SendableRecordBatchStream}; /// Stream of record batches pub struct SizedRecordBatchStream { @@ -113,3 +118,29 @@ fn build_file_list_recurse( } Ok(()) } + +/// Spawns a task to the tokio threadpool and writes its outputs to the provided mpsc sender +pub(crate) fn spawn_execution( + input: Arc, + mut output: mpsc::Sender>, + partition: usize, +) -> JoinHandle<()> { + tokio::spawn(async move { + let mut stream = match input.execute(partition).await { + Err(e) => { + // If send fails, plan being torn + // down, no place to send the error + let arrow_error = ArrowError::ExternalError(Box::new(e)); + output.send(Err(arrow_error)).await.ok(); + return; + } + Ok(stream) => stream, + }; + + while let Some(item) = stream.next().await { + // If send fails, plan being torn down, + // there is no place to send the error + output.send(item).await.ok(); + } + }) +} diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/merge.rs index c65227c161148..a25f5c7909fdb 100644 --- a/datafusion/src/physical_plan/merge.rs +++ b/datafusion/src/physical_plan/merge.rs @@ -22,23 +22,19 @@ use std::any::Any; use std::sync::Arc; use futures::channel::mpsc; -use futures::sink::SinkExt; -use futures::stream::StreamExt; use futures::Stream; use async_trait::async_trait; use arrow::record_batch::RecordBatch; -use arrow::{ - datatypes::SchemaRef, - error::{ArrowError, Result as ArrowResult}, -}; +use arrow::{datatypes::SchemaRef, error::Result as ArrowResult}; use super::RecordBatchStream; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use super::SendableRecordBatchStream; +use crate::physical_plan::common::spawn_execution; use pin_project_lite::pin_project; /// Merge execution plan executes partitions in parallel and combines them into a single @@ -121,26 +117,7 @@ impl ExecutionPlan for MergeExec { // spawn independent tasks whose resulting streams (of batches) // are sent to the channel for consumption. 
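        // Fan-in sketch: spawn_execution (added to common.rs above) runs one
        // tokio task per input partition; each task forwards whatever its
        // partition stream yields, batches or errors, into the shared mpsc
        // sender, and MergeStream simply drains the channel. The new
        // SortPreservingMergeExec reuses the same helper, but with a separate
        // channel per partition so each stream's sort order stays visible.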
for part_i in 0..input_partitions { - let input = self.input.clone(); - let mut sender = sender.clone(); - tokio::spawn(async move { - let mut stream = match input.execute(part_i).await { - Err(e) => { - // If send fails, plan being torn - // down, no place to send the error - let arrow_error = ArrowError::ExternalError(Box::new(e)); - sender.send(Err(arrow_error)).await.ok(); - return; - } - Ok(stream) => stream, - }; - - while let Some(item) = stream.next().await { - // If send fails, plan being torn down, - // there is no place to send the error - sender.send(item).await.ok(); - } - }); + spawn_execution(self.input.clone(), sender.clone(), part_i); } Ok(Box::pin(MergeStream { diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index ae84b36b31a8e..af6969c43cbd6 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -608,6 +608,7 @@ pub mod projection; pub mod regex_expressions; pub mod repartition; pub mod sort; +pub mod sort_preserving_merge; pub mod source; pub mod string_expressions; pub mod type_coercion; diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs new file mode 100644 index 0000000000000..283294a43ec75 --- /dev/null +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -0,0 +1,949 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines the sort preserving merge plan + +use std::any::Any; +use std::cmp::Ordering; +use std::collections::VecDeque; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::array::{ArrayRef, MutableArrayData}; +use arrow::compute::SortOptions; +use async_trait::async_trait; +use futures::channel::mpsc; +use futures::stream::FusedStream; +use futures::{Stream, StreamExt}; + +use crate::arrow::datatypes::SchemaRef; +use crate::arrow::error::ArrowError; +use crate::arrow::{error::Result as ArrowResult, record_batch::RecordBatch}; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::common::spawn_execution; +use crate::physical_plan::expressions::PhysicalSortExpr; +use crate::physical_plan::{ + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, + RecordBatchStream, SendableRecordBatchStream, +}; + +/// Sort preserving merge execution plan +/// +/// This takes an input execution plan and a list of sort expressions, and +/// provided each partition of the input plan is sorted with respect to +/// these sort expressions, this operator will yield a single partition +/// that is also sorted with respect to them +#[derive(Debug)] +pub struct SortPreservingMergeExec { + /// Input plan + input: Arc, + /// Sort expressions + expr: Vec, + /// The target size of yielded batches + target_batch_size: usize, +} + +impl SortPreservingMergeExec { + /// Create a new sort execution plan + pub fn new( + expr: Vec, + input: Arc, + target_batch_size: usize, + ) -> Self { + Self { + input, + expr, + target_batch_size, + } + } + + /// Input schema + pub fn input(&self) -> &Arc { + &self.input + } + + /// Sort expressions + pub fn expr(&self) -> &[PhysicalSortExpr] { + &self.expr + } +} + +#[async_trait] +impl ExecutionPlan for SortPreservingMergeExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn required_child_distribution(&self) -> Distribution { + Distribution::UnspecifiedDistribution + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + &self, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(SortPreservingMergeExec::new( + self.expr.clone(), + children[0].clone(), + self.target_batch_size, + ))), + _ => Err(DataFusionError::Internal( + "SortPreservingMergeExec wrong number of children".to_string(), + )), + } + } + + async fn execute(&self, partition: usize) -> Result { + if 0 != partition { + return Err(DataFusionError::Internal(format!( + "SortPreservingMergeExec invalid partition {}", + partition + ))); + } + + let input_partitions = self.input.output_partitioning().partition_count(); + match input_partitions { + 0 => Err(DataFusionError::Internal( + "SortPreservingMergeExec requires at least one input partition" + .to_owned(), + )), + 1 => { + // bypass if there is only one partition to merge + self.input.execute(0).await + } + _ => { + let streams = (0..input_partitions) + .into_iter() + .map(|part_i| { + let (sender, receiver) = mpsc::channel(1); + spawn_execution(self.input.clone(), sender, part_i); + receiver + }) + .collect(); + + Ok(Box::pin(SortPreservingMergeStream::new( + streams, + self.schema(), + &self.expr, + self.target_batch_size, + ))) + } + } + } + + fn fmt_as( + &self, + t: 
DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + let expr: Vec = self.expr.iter().map(|e| e.to_string()).collect(); + write!(f, "SortPreservingMergeExec: [{}]", expr.join(",")) + } + } + } +} + +/// A `SortKeyCursor` is created from a `RecordBatch`, and a set of `PhysicalExpr` that when +/// evaluated on the `RecordBatch` yield the sort keys. +/// +/// Additionally it maintains a row cursor that can be advanced through the rows +/// of the provided `RecordBatch` +/// +/// `SortKeyCursor::compare` can then be used to compare the sort key pointed to by this +/// row cursor, with that of another `SortKeyCursor` +#[derive(Debug, Clone)] +struct SortKeyCursor { + columns: Vec, + batch: RecordBatch, + cur_row: usize, + num_rows: usize, +} + +impl SortKeyCursor { + fn new(batch: RecordBatch, sort_key: &[Arc]) -> Result { + let columns = sort_key + .iter() + .map(|expr| Ok(expr.evaluate(&batch)?.into_array(batch.num_rows()))) + .collect::>()?; + + Ok(Self { + cur_row: 0, + num_rows: batch.num_rows(), + columns, + batch, + }) + } + + fn is_finished(&self) -> bool { + self.num_rows == self.cur_row + } + + fn advance(&mut self) -> usize { + assert!(!self.is_finished()); + let t = self.cur_row; + self.cur_row += 1; + t + } + + /// Compares the sort key pointed to by this instance's row cursor with that of another + fn compare( + &self, + other: &SortKeyCursor, + options: &[SortOptions], + ) -> Result { + if self.columns.len() != other.columns.len() { + return Err(DataFusionError::Internal(format!( + "SortKeyCursors had inconsistent column counts: {} vs {}", + self.columns.len(), + other.columns.len() + ))); + } + + if self.columns.len() != options.len() { + return Err(DataFusionError::Internal(format!( + "Incorrect number of SortOptions provided to SortKeyCursor::compare, expected {} got {}", + self.columns.len(), + options.len() + ))); + } + + let zipped = self + .columns + .iter() + .zip(other.columns.iter()) + .zip(options.iter()); + + for ((l, r), sort_options) in zipped { + match (l.is_valid(self.cur_row), r.is_valid(other.cur_row)) { + (false, true) if sort_options.nulls_first => return Ok(Ordering::Less), + (false, true) => return Ok(Ordering::Greater), + (true, false) if sort_options.nulls_first => { + return Ok(Ordering::Greater) + } + (true, false) => return Ok(Ordering::Less), + (false, false) => {} + (true, true) => { + // TODO: Building the predicate each time is sub-optimal + let c = arrow::array::build_compare(l.as_ref(), r.as_ref())?; + match c(self.cur_row, other.cur_row) { + Ordering::Equal => {} + o if sort_options.descending => return Ok(o.reverse()), + o => return Ok(o), + } + } + } + } + + Ok(Ordering::Equal) + } +} + +/// A `RowIndex` identifies a specific row from those buffered +/// by a `SortPreservingMergeStream` +#[derive(Debug, Clone)] +struct RowIndex { + /// The index of the stream + stream_idx: usize, + /// The index of the cursor within the stream's VecDequeue + cursor_idx: usize, + /// The row index + row_idx: usize, +} + +#[derive(Debug)] +struct SortPreservingMergeStream { + /// The schema of the RecordBatches yielded by this stream + schema: SchemaRef, + /// The sorted input streams to merge together + streams: Vec>>, + /// For each input stream maintain a dequeue of SortKeyCursor + /// + /// Exhausted cursors will be popped off the front once all + /// their rows have been yielded to the output + cursors: Vec>, + /// The accumulated row indexes for the next record batch + in_progress: Vec, 
+ /// The physical expressions to sort by + column_expressions: Vec>, + /// The sort options for each expression + sort_options: Vec, + /// The desired RecordBatch size to yield + target_batch_size: usize, + /// If the stream has encountered an error + aborted: bool, +} + +impl SortPreservingMergeStream { + fn new( + streams: Vec>>, + schema: SchemaRef, + expressions: &[PhysicalSortExpr], + target_batch_size: usize, + ) -> Self { + Self { + schema, + cursors: vec![Default::default(); streams.len()], + streams, + column_expressions: expressions.iter().map(|x| x.expr.clone()).collect(), + sort_options: expressions.iter().map(|x| x.options).collect(), + target_batch_size, + aborted: false, + in_progress: vec![], + } + } + + /// If the stream at the given index is not exhausted, and the last cursor for the + /// stream is finished, poll the stream for the next RecordBatch and create a new + /// cursor for the stream from the returned result + fn maybe_poll_stream( + &mut self, + cx: &mut Context<'_>, + idx: usize, + ) -> Poll> { + if let Some(cursor) = &self.cursors[idx].back() { + if !cursor.is_finished() { + // Cursor is not finished - don't need a new RecordBatch yet + return Poll::Ready(Ok(())); + } + } + + let stream = &mut self.streams[idx]; + if stream.is_terminated() { + return Poll::Ready(Ok(())); + } + + // Fetch a new record and create a cursor from it + match futures::ready!(stream.poll_next_unpin(cx)) { + None => return Poll::Ready(Ok(())), + Some(Err(e)) => { + return Poll::Ready(Err(e)); + } + Some(Ok(batch)) => { + let cursor = match SortKeyCursor::new(batch, &self.column_expressions) { + Ok(cursor) => cursor, + Err(e) => { + return Poll::Ready(Err(ArrowError::ExternalError(Box::new(e)))); + } + }; + self.cursors[idx].push_back(cursor) + } + } + + Poll::Ready(Ok(())) + } + + /// Returns the index of the next stream to pull a row from, or None + /// if all cursors for all streams are exhausted + fn next_stream_idx(&self) -> Result> { + let mut min_cursor: Option<(usize, &SortKeyCursor)> = None; + for (idx, candidate) in self.cursors.iter().enumerate() { + if let Some(candidate) = candidate.back() { + if candidate.is_finished() { + continue; + } + + match min_cursor { + None => min_cursor = Some((idx, candidate)), + Some((_, ref min)) => { + if min.compare(candidate, &self.sort_options)? 
+ == Ordering::Greater + { + min_cursor = Some((idx, candidate)) + } + } + } + } + } + + Ok(min_cursor.map(|(idx, _)| idx)) + } + + /// Drains the in_progress row indexes, and builds a new RecordBatch from them + /// + /// Will then drop any cursors for which all rows have been yielded to the output + fn build_record_batch(&mut self) -> ArrowResult { + // Mapping from stream index to the index of the first buffer from that stream + let mut buffer_idx = 0; + let mut stream_to_buffer_idx = Vec::with_capacity(self.cursors.len()); + + for cursors in &self.cursors { + stream_to_buffer_idx.push(buffer_idx); + buffer_idx += cursors.len(); + } + + let columns = self + .schema + .fields() + .iter() + .enumerate() + .map(|(column_idx, field)| { + let arrays = self + .cursors + .iter() + .flat_map(|cursor| { + cursor + .iter() + .map(|cursor| cursor.batch.column(column_idx).data()) + }) + .collect(); + + let mut array_data = MutableArrayData::new( + arrays, + field.is_nullable(), + self.in_progress.len(), + ); + + for row_index in &self.in_progress { + let buffer_idx = + stream_to_buffer_idx[row_index.stream_idx] + row_index.cursor_idx; + + // TODO: Coalesce contiguous writes + array_data.extend( + buffer_idx, + row_index.row_idx, + row_index.row_idx + 1, + ); + } + + arrow::array::make_array(array_data.freeze()) + }) + .collect(); + + self.in_progress.clear(); + + // New cursors are only created once the previous cursor for the stream + // is finished. This means all remaining rows from all but the last cursor + // for each stream have been yielded to the newly created record batch + // + // Additionally as `in_progress` has been drained, there are no longer + // any RowIndex's reliant on the cursor indexes + // + // We can therefore drop all but the last cursor for each stream + for cursors in &mut self.cursors { + if cursors.len() > 1 { + // Drain all but the last cursor + cursors.drain(0..(cursors.len() - 1)); + } + } + + RecordBatch::try_new(self.schema.clone(), columns) + } +} + +impl Stream for SortPreservingMergeStream { + type Item = ArrowResult; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + if self.aborted { + return Poll::Ready(None); + } + + // Ensure all non-exhausted streams have a cursor from which + // rows can be pulled + for i in 0..self.cursors.len() { + match futures::ready!(self.maybe_poll_stream(cx, i)) { + Ok(_) => {} + Err(e) => { + self.aborted = true; + return Poll::Ready(Some(Err(e))); + } + } + } + + loop { + let stream_idx = match self.next_stream_idx() { + Ok(Some(idx)) => idx, + Ok(None) if self.in_progress.is_empty() => return Poll::Ready(None), + Ok(None) => return Poll::Ready(Some(self.build_record_batch())), + Err(e) => { + self.aborted = true; + return Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new( + e, + ))))); + } + }; + + let cursors = &mut self.cursors[stream_idx]; + let cursor_idx = cursors.len() - 1; + let cursor = cursors.back_mut().unwrap(); + let row_idx = cursor.advance(); + let cursor_finished = cursor.is_finished(); + + self.in_progress.push(RowIndex { + stream_idx, + cursor_idx, + row_idx, + }); + + if self.in_progress.len() == self.target_batch_size { + return Poll::Ready(Some(self.build_record_batch())); + } + + // If removed the last row from the cursor, need to fetch a new record + // batch if possible, before looping round again + if cursor_finished { + match futures::ready!(self.maybe_poll_stream(cx, stream_idx)) { + Ok(_) => {} + Err(e) => { + self.aborted = true; + return 
Poll::Ready(Some(Err(e))); + } + } + } + } + } +} + +impl RecordBatchStream for SortPreservingMergeStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use crate::arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; + use crate::assert_batches_eq; + use crate::datasource::CsvReadOptions; + use crate::physical_plan::csv::CsvExec; + use crate::physical_plan::expressions::col; + use crate::physical_plan::memory::MemoryExec; + use crate::physical_plan::merge::MergeExec; + use crate::physical_plan::sort::SortExec; + use crate::physical_plan::{collect, common}; + use crate::test; + + use super::*; + use futures::SinkExt; + use tokio_stream::StreamExt; + + #[tokio::test] + async fn test_merge() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 4])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("d"), + Some("e"), + Some("g"), + Some("h"), + Some("i"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + let schema = b1.schema(); + + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let merge = Arc::new(SortPreservingMergeExec::new( + vec![ + PhysicalSortExpr { + expr: col("b"), + options: Default::default(), + }, + PhysicalSortExpr { + expr: col("c"), + options: Default::default(), + }, + ], + Arc::new(exec), + 1024, + )); + + let collected = collect(merge).await.unwrap(); + assert_eq!(collected.len(), 1); + + assert_batches_eq!( + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01 00:00:00.000000008 |", + "| 2 | b | 1970-01-01 00:00:00.000000007 |", + "| 7 | c | 1970-01-01 00:00:00.000000006 |", + "| 1 | d | 1970-01-01 00:00:00.000000004 |", + "| 9 | d | 1970-01-01 00:00:00.000000005 |", + "| 3 | e | 1970-01-01 00:00:00.000000004 |", + "| 2 | e | 1970-01-01 00:00:00.000000006 |", + "| 3 | g | 1970-01-01 00:00:00.000000002 |", + "| 4 | h | 1970-01-01 00:00:00.000000002 |", + "| 5 | i | 1970-01-01 00:00:00.000000006 |", + "+---+---+-------------------------------+", + ], + collected.as_slice() + ); + } + + async fn sorted_merge( + input: Arc, + sort: Vec, + ) -> RecordBatch { + let merge = Arc::new(SortPreservingMergeExec::new(sort, input, 1024)); + let mut result = collect(merge).await.unwrap(); + assert_eq!(result.len(), 1); + result.remove(0) + } + + async fn partition_sort( + input: Arc, + sort: Vec, + ) -> RecordBatch { + let sort_exec = + Arc::new(SortExec::new_with_partitioning(sort.clone(), input, true)); + sorted_merge(sort_exec, sort).await + } + + async fn basic_sort( + src: Arc, + sort: Vec, + ) -> RecordBatch { + let merge = Arc::new(MergeExec::new(src)); + let sort_exec = Arc::new(SortExec::try_new(sort, merge).unwrap()); + let mut result = collect(sort_exec).await.unwrap(); + assert_eq!(result.len(), 1); + result.remove(0) + } + + #[tokio::test] + async fn test_partition_sort() { + let schema = test::aggr_test_schema(); + let partitions = 4; + let path = + 
test::create_partitioned_csv("aggregate_test_100.csv", partitions).unwrap(); + let csv = Arc::new( + CsvExec::try_new( + &path, + CsvReadOptions::new().schema(&schema), + None, + 1024, + None, + ) + .unwrap(), + ); + + let sort = vec![ + PhysicalSortExpr { + expr: col("c1"), + options: SortOptions { + descending: true, + nulls_first: true, + }, + }, + PhysicalSortExpr { + expr: col("c2"), + options: Default::default(), + }, + PhysicalSortExpr { + expr: col("c7"), + options: SortOptions::default(), + }, + ]; + + let basic = basic_sort(csv.clone(), sort.clone()).await; + let partition = partition_sort(csv, sort).await; + + let basic = arrow::util::pretty::pretty_format_batches(&[basic]).unwrap(); + let partition = arrow::util::pretty::pretty_format_batches(&[partition]).unwrap(); + + assert_eq!(basic, partition); + } + + // Split the provided record batch into multiple batch_size record batches + fn split_batch(sorted: &RecordBatch, batch_size: usize) -> Vec { + let batches = (sorted.num_rows() + batch_size - 1) / batch_size; + + // Split the sorted RecordBatch into multiple + (0..batches) + .into_iter() + .map(|batch_idx| { + let columns = (0..sorted.num_columns()) + .map(|column_idx| { + let length = + batch_size.min(sorted.num_rows() - batch_idx * batch_size); + + sorted + .column(column_idx) + .slice(batch_idx * batch_size, length) + }) + .collect(); + + RecordBatch::try_new(sorted.schema(), columns).unwrap() + }) + .collect() + } + + async fn sorted_partitioned_input( + sort: Vec, + sizes: &[usize], + ) -> Arc { + let schema = test::aggr_test_schema(); + let partitions = 4; + let path = + test::create_partitioned_csv("aggregate_test_100.csv", partitions).unwrap(); + let csv = Arc::new( + CsvExec::try_new( + &path, + CsvReadOptions::new().schema(&schema), + None, + 1024, + None, + ) + .unwrap(), + ); + + let sorted = basic_sort(csv, sort).await; + let split: Vec<_> = sizes.iter().map(|x| split_batch(&sorted, *x)).collect(); + + Arc::new(MemoryExec::try_new(&split, sorted.schema(), None).unwrap()) + } + + #[tokio::test] + async fn test_partition_sort_streaming_input() { + let sort = vec![ + // uint8 + PhysicalSortExpr { + expr: col("c7"), + options: Default::default(), + }, + // int16 + PhysicalSortExpr { + expr: col("c4"), + options: Default::default(), + }, + // utf-8 + PhysicalSortExpr { + expr: col("c1"), + options: SortOptions::default(), + }, + // utf-8 + PhysicalSortExpr { + expr: col("c13"), + options: SortOptions::default(), + }, + ]; + + let input = sorted_partitioned_input(sort.clone(), &[10, 3, 11]).await; + let basic = basic_sort(input.clone(), sort.clone()).await; + let partition = sorted_merge(input, sort).await; + + assert_eq!(basic.num_rows(), 300); + assert_eq!(partition.num_rows(), 300); + + let basic = arrow::util::pretty::pretty_format_batches(&[basic]).unwrap(); + let partition = arrow::util::pretty::pretty_format_batches(&[partition]).unwrap(); + + assert_eq!(basic, partition); + } + + #[tokio::test] + async fn test_partition_sort_streaming_input_output() { + let sort = vec![ + // float64 + PhysicalSortExpr { + expr: col("c12"), + options: Default::default(), + }, + // utf-8 + PhysicalSortExpr { + expr: col("c13"), + options: Default::default(), + }, + ]; + + let input = sorted_partitioned_input(sort.clone(), &[10, 5, 13]).await; + let basic = basic_sort(input.clone(), sort.clone()).await; + + let merge = Arc::new(SortPreservingMergeExec::new(sort, input, 23)); + let merged = collect(merge).await.unwrap(); + + assert_eq!(merged.len(), 14); + + 
assert_eq!(basic.num_rows(), 300); + assert_eq!(merged.iter().map(|x| x.num_rows()).sum::(), 300); + + let basic = arrow::util::pretty::pretty_format_batches(&[basic]).unwrap(); + let partition = + arrow::util::pretty::pretty_format_batches(merged.as_slice()).unwrap(); + + assert_eq!(basic, partition); + } + + #[tokio::test] + async fn test_nulls() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + None, + Some("a"), + Some("b"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ + Some(8), + None, + Some(6), + None, + Some(4), + ])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + None, + Some("b"), + Some("g"), + Some("h"), + Some("i"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ + Some(8), + None, + Some(5), + None, + Some(4), + ])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + let schema = b1.schema(); + + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let merge = Arc::new(SortPreservingMergeExec::new( + vec![ + PhysicalSortExpr { + expr: col("b"), + options: SortOptions { + descending: false, + nulls_first: true, + }, + }, + PhysicalSortExpr { + expr: col("c"), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ], + Arc::new(exec), + 1024, + )); + + let collected = collect(merge).await.unwrap(); + assert_eq!(collected.len(), 1); + + assert_batches_eq!( + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | | 1970-01-01 00:00:00.000000008 |", + "| 1 | | 1970-01-01 00:00:00.000000008 |", + "| 2 | a | |", + "| 7 | b | 1970-01-01 00:00:00.000000006 |", + "| 2 | b | |", + "| 9 | d | |", + "| 3 | e | 1970-01-01 00:00:00.000000004 |", + "| 3 | g | 1970-01-01 00:00:00.000000005 |", + "| 4 | h | |", + "| 5 | i | 1970-01-01 00:00:00.000000004 |", + "+---+---+-------------------------------+", + ], + collected.as_slice() + ); + } + + #[tokio::test] + async fn test_async() { + let sort = vec![PhysicalSortExpr { + expr: col("c7"), + options: SortOptions::default(), + }]; + + let batches = sorted_partitioned_input(sort.clone(), &[5, 7, 3]).await; + + let partition_count = batches.output_partitioning().partition_count(); + let mut tasks = Vec::with_capacity(partition_count); + let mut streams = Vec::with_capacity(partition_count); + + for partition in 0..partition_count { + let (mut sender, receiver) = mpsc::channel(1); + let mut stream = batches.execute(partition).await.unwrap(); + let task = tokio::spawn(async move { + while let Some(batch) = stream.next().await { + sender.send(batch).await.unwrap(); + // This causes the MergeStream to wait for more input + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + } + }); + tasks.push(task); + streams.push(receiver); + } + + let merge_stream = SortPreservingMergeStream::new( + streams, + batches.schema(), + sort.as_slice(), + 1024, + ); + + let mut merged = common::collect(Box::pin(merge_stream)).await.unwrap(); + + // Propagate any errors + for task in tasks { + task.await.unwrap(); + } + + assert_eq!(merged.len(), 1); + let merged = merged.remove(0); + let basic = basic_sort(batches, sort.clone()).await; + + let basic = 
arrow::util::pretty::pretty_format_batches(&[basic]).unwrap(); + let partition = arrow::util::pretty::pretty_format_batches(&[merged]).unwrap(); + + assert_eq!(basic, partition); + } +} From e5264f60a461aa44d8253688950099b6c9c47a1c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 1 Jun 2021 12:08:33 -0600 Subject: [PATCH 139/329] Refactor QueryStageExec in preparation for implementing map-side shuffle (#459) * Refactor in preparation for shuffle * Refactor QueryStageExec in preparation for implementing map-side shuffle * unit test * use temp dir in test * Address PR feedback * Fix test faiure on Windows --- ballista/rust/core/Cargo.toml | 1 + .../core/src/execution_plans/query_stage.rs | 184 ++++++++++++++++-- ballista/rust/core/src/utils.rs | 12 +- ballista/rust/executor/src/executor.rs | 50 +---- ballista/rust/scheduler/src/lib.rs | 6 +- ballista/rust/scheduler/src/planner.rs | 33 ++-- 6 files changed, 206 insertions(+), 80 deletions(-) diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 1fc0e0c78bf7e..99822cfe2aee5 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -45,6 +45,7 @@ arrow-flight = { version = "4.0" } datafusion = { path = "../../../datafusion" } [dev-dependencies] +tempfile = "3" [build-dependencies] tonic-build = { version = "0.4" } diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index e95a5d8b51972..233dee5b9b529 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -15,13 +15,27 @@ // specific language governing permissions and limitations // under the License. +//! QueryStageExec represents a section of a query plan that has consistent partitioning and +//! can be executed as one unit with each partition being executed in parallel. The output of +//! a query stage either forms the input of another query stage or can be the final result of +//! a query. 
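// Construction sketch, assuming the API defined in this file; identifiers and
// the work directory are illustrative.
//
// let stage = QueryStageExec::try_new(
//     "job-1".to_owned(),              // job_id
//     1,                               // stage_id
//     input_plan,                      // Arc<dyn ExecutionPlan> for this stage
//     "/tmp/ballista-work".to_owned(), // where intermediate IPC files land
//     None,                            // no shuffle output partitioning yet
// )?;
// // Executing a partition streams its results to an IPC file under the work
// // dir and yields one summary RecordBatch containing the path and stats.
// let mut stream = stage.execute(0).await?;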
+ +use std::path::PathBuf; use std::sync::Arc; +use std::time::Instant; use std::{any::Any, pin::Pin}; +use crate::error::BallistaError; +use crate::memory_stream::MemoryStream; +use crate::utils; + use async_trait::async_trait; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; -use datafusion::{error::Result, physical_plan::RecordBatchStream}; +use datafusion::arrow::array::{ArrayRef, StringBuilder}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::error::{DataFusionError, Result}; +use datafusion::physical_plan::{ExecutionPlan, Partitioning, RecordBatchStream}; +use log::info; use uuid::Uuid; /// QueryStageExec represents a section of a query plan that has consistent partitioning and @@ -31,11 +45,15 @@ use uuid::Uuid; #[derive(Debug, Clone)] pub struct QueryStageExec { /// Unique ID for the job (query) that this stage is a part of - pub job_id: String, + job_id: String, /// Unique query stage ID within the job - pub stage_id: usize, + stage_id: usize, /// Physical execution plan for this query stage - pub child: Arc, + plan: Arc, + /// Path to write output streams to + work_dir: String, + /// Optional shuffle output partitioning + shuffle_output_partitioning: Option, } impl QueryStageExec { @@ -43,14 +61,28 @@ impl QueryStageExec { pub fn try_new( job_id: String, stage_id: usize, - child: Arc, + plan: Arc, + work_dir: String, + shuffle_output_partitioning: Option, ) -> Result { Ok(Self { job_id, stage_id, - child, + plan, + work_dir, + shuffle_output_partitioning, }) } + + /// Get the Job ID for this query stage + pub fn job_id(&self) -> &str { + &self.job_id + } + + /// Get the Stage ID for this query stage + pub fn stage_id(&self) -> usize { + self.stage_id + } } #[async_trait] @@ -60,15 +92,15 @@ impl ExecutionPlan for QueryStageExec { } fn schema(&self) -> SchemaRef { - self.child.schema() + self.plan.schema() } fn output_partitioning(&self) -> Partitioning { - self.child.output_partitioning() + self.plan.output_partitioning() } fn children(&self) -> Vec> { - vec![self.child.clone()] + vec![self.plan.clone()] } fn with_new_children( @@ -80,6 +112,8 @@ impl ExecutionPlan for QueryStageExec { self.job_id.clone(), self.stage_id, children[0].clone(), + self.work_dir.clone(), + None, )?)) } @@ -87,6 +121,132 @@ impl ExecutionPlan for QueryStageExec { &self, partition: usize, ) -> Result>> { - self.child.execute(partition).await + let now = Instant::now(); + + let mut stream = self.plan.execute(partition).await?; + + let mut path = PathBuf::from(&self.work_dir); + path.push(&self.job_id); + path.push(&format!("{}", self.stage_id)); + + match &self.shuffle_output_partitioning { + None => { + path.push(&format!("{}", partition)); + std::fs::create_dir_all(&path)?; + + path.push("data.arrow"); + let path = path.to_str().unwrap(); + info!("Writing results to {}", path); + + // stream results to disk + let stats = utils::write_stream_to_disk(&mut stream, &path) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + + info!( + "Executed partition {} in {} seconds. 
Statistics: {:?}", + partition, + now.elapsed().as_secs(), + stats + ); + + let schema = Arc::new(Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + stats.arrow_struct_repr(), + ])); + + // build result set with summary of the partition execution status + let mut c0 = StringBuilder::new(1); + c0.append_value(&path).unwrap(); + let path: ArrayRef = Arc::new(c0.finish()); + + let stats: ArrayRef = stats + .to_arrow_arrayref() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + let batch = RecordBatch::try_new(schema.clone(), vec![path, stats]) + .map_err(DataFusionError::ArrowError)?; + + Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) + } + + Some(Partitioning::Hash(_, _)) => { + //TODO re-use code from RepartitionExec to split each batch into + // partitions and write to one IPC file per partition + // See https://github.com/apache/arrow-datafusion/issues/456 + Err(DataFusionError::NotImplemented( + "Shuffle partitioning not implemented yet".to_owned(), + )) + } + + _ => Err(DataFusionError::Execution( + "Invalid shuffle partitioning scheme".to_owned(), + )), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::arrow::array::{StringArray, StructArray, UInt32Array, UInt64Array}; + use datafusion::physical_plan::memory::MemoryExec; + use tempfile::TempDir; + + #[tokio::test] + async fn test() -> Result<()> { + let input_plan = create_input_plan()?; + let work_dir = TempDir::new()?; + let query_stage = QueryStageExec::try_new( + "jobOne".to_owned(), + 1, + input_plan, + work_dir.into_path().to_str().unwrap().to_owned(), + None, + )?; + let mut stream = query_stage.execute(0).await?; + let batches = utils::collect_stream(&mut stream) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + assert!(batches.len() == 1); + let batch = &batches[0]; + assert_eq!(2, batch.num_columns()); + assert_eq!(1, batch.num_rows()); + let path = batch.columns()[0] + .as_any() + .downcast_ref::() + .unwrap(); + let file = path.value(0); + assert!(file.ends_with("data.arrow")); + let stats = batch.columns()[1] + .as_any() + .downcast_ref::() + .unwrap(); + let num_rows = stats + .column_by_name("num_rows") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(4, num_rows.value(0)); + Ok(()) + } + + fn create_input_plan() -> Result> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::UInt32, true), + Field::new("b", DataType::Utf8, true), + ])); + + // define data. 
+ let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt32Array::from(vec![Some(1), Some(2)])), + Arc::new(StringArray::from(vec![Some("hello"), Some("world")])), + ], + )?; + let partition = vec![batch.clone(), batch]; + let partitions = vec![partition.clone(), partition]; + Ok(Arc::new(MemoryExec::try_new(&partitions, schema, None)?)) } } diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 85d1161795ad3..4ba6ec40fec90 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -112,13 +112,13 @@ pub fn produce_diagram(filename: &str, stages: &[Arc]) -> Result // draw stages and entities for stage in stages { - writeln!(w, "\tsubgraph cluster{} {{", stage.stage_id)?; - writeln!(w, "\t\tlabel = \"Stage {}\";", stage.stage_id)?; + writeln!(w, "\tsubgraph cluster{} {{", stage.stage_id())?; + writeln!(w, "\t\tlabel = \"Stage {}\";", stage.stage_id())?; let mut id = AtomicUsize::new(0); build_exec_plan_diagram( &mut w, - stage.child.as_ref(), - stage.stage_id, + stage.children()[0].as_ref(), + stage.stage_id(), &mut id, true, )?; @@ -130,8 +130,8 @@ pub fn produce_diagram(filename: &str, stages: &[Arc]) -> Result let mut id = AtomicUsize::new(0); build_exec_plan_diagram( &mut w, - stage.child.as_ref(), - stage.stage_id, + stage.children()[0].as_ref(), + stage.stage_id(), &mut id, false, )?; diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs index e2945bf8e8714..90c39277e2fcf 100644 --- a/ballista/rust/executor/src/executor.rs +++ b/ballista/rust/executor/src/executor.rs @@ -17,17 +17,13 @@ //! Ballista executor logic -use std::path::PathBuf; use std::sync::Arc; -use std::time::Instant; use ballista_core::error::BallistaError; +use ballista_core::execution_plans::QueryStageExec; use ballista_core::utils; -use datafusion::arrow::array::{ArrayRef, StringBuilder}; -use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::physical_plan::ExecutionPlan; -use log::info; /// Ballista executor pub struct Executor { @@ -55,43 +51,13 @@ impl Executor { part: usize, plan: Arc, ) -> Result { - let mut path = PathBuf::from(&self.work_dir); - path.push(&job_id); - path.push(&format!("{}", stage_id)); - path.push(&format!("{}", part)); - std::fs::create_dir_all(&path)?; - - path.push("data.arrow"); - let path = path.to_str().unwrap(); - info!("Writing results to {}", path); - - let now = Instant::now(); - - // execute the query partition - let mut stream = plan.execute(part).await?; - - // stream results to disk - let stats = utils::write_stream_to_disk(&mut stream, &path).await?; - - info!( - "Executed partition {} in {} seconds. 
Statistics: {:?}", - part, - now.elapsed().as_secs(), - stats - ); - - let schema = Arc::new(Schema::new(vec![ - Field::new("path", DataType::Utf8, false), - stats.arrow_struct_repr(), - ])); - - // build result set with summary of the partition execution status - let mut c0 = StringBuilder::new(1); - c0.append_value(&path).unwrap(); - let path: ArrayRef = Arc::new(c0.finish()); - - let stats: ArrayRef = stats.to_arrow_arrayref()?; - RecordBatch::try_new(schema, vec![path, stats]).map_err(BallistaError::ArrowError) + let exec = + QueryStageExec::try_new(job_id, stage_id, plan, self.work_dir.clone(), None)?; + let mut stream = exec.execute(part).await?; + let batches = utils::collect_stream(&mut stream).await?; + // the output should be a single batch containing metadata (path and statistics) + assert!(batches.len() == 1); + Ok(batches[0].clone()) } pub fn work_dir(&self) -> &str { diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 3dc8df29bd038..99c7be66a646c 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -388,8 +388,8 @@ impl SchedulerGrpc for SchedulerServer { fail_job!(state .save_stage_plan( &job_id_spawn, - stage.stage_id, - stage.child.clone() + stage.stage_id(), + stage.children()[0].clone() ) .await .map_err(|e| { @@ -402,7 +402,7 @@ impl SchedulerGrpc for SchedulerServer { let pending_status = TaskStatus { partition_id: Some(PartitionId { job_id: job_id_spawn.clone(), - stage_id: stage.stage_id as u32, + stage_id: stage.stage_id() as u32, partition_id: partition_id as u32, }), status: None, diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index b1d999b733334..445ef9a07787b 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -71,11 +71,7 @@ impl DistributedPlanner { info!("planning query stages"); let (new_plan, mut stages) = self.plan_query_stages_internal(job_id, execution_plan)?; - stages.push(create_query_stage( - job_id.to_string(), - self.next_stage_id(), - new_plan, - )?); + stages.push(create_query_stage(job_id, self.next_stage_id(), new_plan)?); Ok(stages) } @@ -112,12 +108,12 @@ impl DistributedPlanner { Ok((ctx.create_physical_plan(&adapter.logical_plan)?, stages)) } else if let Some(merge) = execution_plan.as_any().downcast_ref::() { let query_stage = create_query_stage( - job_id.to_string(), + job_id, self.next_stage_id(), merge.children()[0].clone(), )?; let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( - vec![query_stage.stage_id], + vec![query_stage.stage_id()], query_stage.schema(), query_stage.output_partitioning().partition_count(), )); @@ -133,12 +129,12 @@ impl DistributedPlanner { let mut new_children: Vec> = vec![]; for child in &children { let new_stage = create_query_stage( - job_id.to_string(), + job_id, self.next_stage_id(), child.clone(), )?; new_children.push(Arc::new(UnresolvedShuffleExec::new( - vec![new_stage.stage_id], + vec![new_stage.stage_id()], new_stage.schema().clone(), new_stage.output_partitioning().partition_count(), ))); @@ -165,13 +161,10 @@ impl DistributedPlanner { { let mut new_children: Vec> = vec![]; for child in &children { - let new_stage = create_query_stage( - job_id.to_string(), - self.next_stage_id(), - child.clone(), - )?; + let new_stage = + create_query_stage(job_id, self.next_stage_id(), child.clone())?; new_children.push(Arc::new(UnresolvedShuffleExec::new( - vec![new_stage.stage_id], + vec![new_stage.stage_id()], 
new_stage.schema().clone(), new_stage.output_partitioning().partition_count(), ))); @@ -229,11 +222,17 @@ pub fn remove_unresolved_shuffles( } fn create_query_stage( - job_id: String, + job_id: &str, stage_id: usize, plan: Arc, ) -> Result> { - Ok(Arc::new(QueryStageExec::try_new(job_id, stage_id, plan)?)) + Ok(Arc::new(QueryStageExec::try_new( + job_id.to_owned(), + stage_id, + plan, + "".to_owned(), // executor will decide on the work_dir path + None, + )?)) } #[cfg(test)] From aab40f8affd86605ea75becc4fb242fa0ab3c937 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 02:13:54 +0800 Subject: [PATCH 140/329] Optimize `nth_value`, remove `first_value`, `last_value` structs and use idiomatic rust style (#452) * optimize nth_value * fix unit test case --- .../src/physical_plan/expressions/mod.rs | 2 +- .../physical_plan/expressions/nth_value.rs | 238 ++++++++++-------- datafusion/src/physical_plan/windows.rs | 8 +- 3 files changed, 132 insertions(+), 116 deletions(-) diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 77da95c3a04a3..d18365c47ed5e 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -59,7 +59,7 @@ pub use literal::{lit, Literal}; pub use min_max::{Max, Min}; pub use negative::{negative, NegativeExpr}; pub use not::{not, NotExpr}; -pub use nth_value::{FirstValue, LastValue, NthValue}; +pub use nth_value::NthValue; pub use nullif::{nullif_func, SUPPORTED_NULLIF_TYPES}; pub use row_number::RowNumber; pub use sum::{sum_return_type, Sum}; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index e90ad322aae9d..fb0e79f7ad3c6 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -27,129 +27,69 @@ use std::any::Any; use std::convert::TryFrom; use std::sync::Arc; -/// first_value expression +/// nth_value kind +#[derive(Debug, Copy, Clone)] +enum NthValueKind { + First, + Last, + Nth(u32), +} + +/// nth_value expression #[derive(Debug)] -pub struct FirstValue { +pub struct NthValue { name: String, - data_type: DataType, expr: Arc, + data_type: DataType, + kind: NthValueKind, } -impl FirstValue { +impl NthValue { /// Create a new FIRST_VALUE window aggregate function - pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn first_value( + name: String, + expr: Arc, + data_type: DataType, + ) -> Self { Self { name, - data_type, expr, + data_type, + kind: NthValueKind::First, } } -} - -impl BuiltInWindowFunctionExpr for FirstValue { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - fn field(&self) -> Result { - let nullable = true; - Ok(Field::new(&self.name, self.data_type.clone(), nullable)) - } - - fn expressions(&self) -> Vec> { - vec![self.expr.clone()] - } - - fn name(&self) -> &str { - &self.name - } - - fn create_accumulator(&self) -> Result> { - Ok(Box::new(NthValueAccumulator::try_new( - 1, - self.data_type.clone(), - )?)) - } -} - -// sql values start with 1, so we can use 0 to indicate the special last value behavior -const SPECIAL_SIZE_VALUE_FOR_LAST: u32 = 0; - -/// last_value expression -#[derive(Debug)] -pub struct LastValue { - name: String, - data_type: DataType, - expr: Arc, -} - -impl LastValue { - /// Create a new FIRST_VALUE window aggregate function - pub fn new(expr: Arc, name: String, data_type: 
DataType) -> Self { + /// Create a new LAST_VALUE window aggregate function + pub fn last_value( + name: String, + expr: Arc, + data_type: DataType, + ) -> Self { Self { name, - data_type, expr, + data_type, + kind: NthValueKind::Last, } } -} - -impl BuiltInWindowFunctionExpr for LastValue { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn field(&self) -> Result { - let nullable = true; - Ok(Field::new(&self.name, self.data_type.clone(), nullable)) - } - - fn expressions(&self) -> Vec> { - vec![self.expr.clone()] - } - - fn name(&self) -> &str { - &self.name - } - fn create_accumulator(&self) -> Result> { - Ok(Box::new(NthValueAccumulator::try_new( - SPECIAL_SIZE_VALUE_FOR_LAST, - self.data_type.clone(), - )?)) - } -} - -/// nth_value expression -#[derive(Debug)] -pub struct NthValue { - name: String, - n: u32, - data_type: DataType, - expr: Arc, -} - -impl NthValue { /// Create a new NTH_VALUE window aggregate function - pub fn try_new( - expr: Arc, + pub fn nth_value( name: String, - n: u32, + expr: Arc, data_type: DataType, + n: u32, ) -> Result { - if n == SPECIAL_SIZE_VALUE_FOR_LAST { - Err(DataFusionError::Execution( + match n { + 0 => Err(DataFusionError::Execution( "nth_value expect n to be > 0".to_owned(), - )) - } else { - Ok(Self { + )), + _ => Ok(Self { name, - n, - data_type, expr, - }) + data_type, + kind: NthValueKind::Nth(n), + }), } } } @@ -175,7 +115,7 @@ impl BuiltInWindowFunctionExpr for NthValue { fn create_accumulator(&self) -> Result> { Ok(Box::new(NthValueAccumulator::try_new( - self.n, + self.kind, self.data_type.clone(), )?)) } @@ -183,19 +123,16 @@ impl BuiltInWindowFunctionExpr for NthValue { #[derive(Debug)] struct NthValueAccumulator { - // n the target nth_value, however we'll reuse it for last_value acc, so when n == 0 it specifically - // means last; also note that it is totally valid for n to be larger than the number of rows input - // in which case all the values shall be null - n: u32, + kind: NthValueKind, offset: u32, value: ScalarValue, } impl NthValueAccumulator { /// new count accumulator - pub fn try_new(n: u32, data_type: DataType) -> Result { + pub fn try_new(kind: NthValueKind, data_type: DataType) -> Result { Ok(Self { - n, + kind, offset: 0, // null value of that data_type by default value: ScalarValue::try_from(&data_type)?, @@ -205,15 +142,20 @@ impl NthValueAccumulator { impl WindowAccumulator for NthValueAccumulator { fn scan(&mut self, values: &[ScalarValue]) -> Result> { - if self.n == SPECIAL_SIZE_VALUE_FOR_LAST { - // for last_value function - self.value = values[0].clone(); - } else if self.offset < self.n { - self.offset += 1; - if self.offset == self.n { + self.offset += 1; + match self.kind { + NthValueKind::Last => { + self.value = values[0].clone(); + } + NthValueKind::First if self.offset == 1 => { + self.value = values[0].clone(); + } + NthValueKind::Nth(n) if self.offset == n => { self.value = values[0].clone(); } + _ => {} } + Ok(None) } @@ -221,3 +163,77 @@ impl WindowAccumulator for NthValueAccumulator { Ok(Some(self.value.clone())) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Result; + use crate::physical_plan::expressions::col; + use arrow::record_batch::RecordBatch; + use arrow::{array::*, datatypes::*}; + + fn test_i32_result(expr: Arc, expected: i32) -> Result<()> { + let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, -2, 3, -4, 5, -6, 7, 8])); + let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); + let 
batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; + + let mut acc = expr.create_accumulator()?; + let expr = expr.expressions(); + let values = expr + .iter() + .map(|e| e.evaluate(&batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + let result = acc.scan_batch(batch.num_rows(), &values)?; + assert_eq!(false, result.is_some()); + let result = acc.evaluate()?; + assert_eq!(Some(ScalarValue::Int32(Some(expected))), result); + Ok(()) + } + + #[test] + fn first_value() -> Result<()> { + let first_value = Arc::new(NthValue::first_value( + "first_value".to_owned(), + col("arr"), + DataType::Int32, + )); + test_i32_result(first_value, 1)?; + Ok(()) + } + + #[test] + fn last_value() -> Result<()> { + let last_value = Arc::new(NthValue::last_value( + "last_value".to_owned(), + col("arr"), + DataType::Int32, + )); + test_i32_result(last_value, 8)?; + Ok(()) + } + + #[test] + fn nth_value_1() -> Result<()> { + let nth_value = Arc::new(NthValue::nth_value( + "nth_value".to_owned(), + col("arr"), + DataType::Int32, + 1, + )?); + test_i32_result(nth_value, 1)?; + Ok(()) + } + + #[test] + fn nth_value_2() -> Result<()> { + let nth_value = Arc::new(NthValue::nth_value( + "nth_value".to_owned(), + col("arr"), + DataType::Int32, + 2, + )?); + test_i32_result(nth_value, -2)?; + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index e790eeaca749e..659d2183819d3 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -20,7 +20,7 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ aggregates, - expressions::{FirstValue, LastValue, Literal, NthValue, RowNumber}, + expressions::{Literal, NthValue, RowNumber}, type_coercion::coerce, window_functions::signature_for_built_in, window_functions::BuiltInWindowFunctionExpr, @@ -105,19 +105,19 @@ fn create_built_in_window_expr( .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; let n: u32 = n as u32; let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(NthValue::try_new(arg, name, n, data_type)?)) + Ok(Arc::new(NthValue::nth_value(name, arg, data_type, n)?)) } BuiltInWindowFunction::FirstValue => { let arg = coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(FirstValue::new(arg, name, data_type))) + Ok(Arc::new(NthValue::first_value(name, arg, data_type))) } BuiltInWindowFunction::LastValue => { let arg = coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(LastValue::new(arg, name, data_type))) + Ok(Arc::new(NthValue::last_value(name, arg, data_type))) } _ => Err(DataFusionError::NotImplemented(format!( "Window function with {:?} not yet implemented", From 13c907cab8eb0fa7a9a121d9699a4805d5248adc Mon Sep 17 00:00:00 2001 From: Michael Lu Date: Tue, 1 Jun 2021 21:52:54 +0300 Subject: [PATCH 141/329] Examples section in datafusion crate. Instructions how to run them (#457) --- datafusion/src/lib.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index b6f64feb70d2a..5b8c9c13006ab 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -182,6 +182,35 @@ //! * declare and use user-defined aggregate functions ([`AggregateUDF`](physical_plan::udaf::AggregateUDF)) //! //! you can find examples of each of them in examples section. +//! 
+//! ## Examples +//! +//! Examples are located in [datafusion-examples directory](https://github.com/apache/arrow-datafusion/tree/master/datafusion-examples) +//! +//! Here's how to run them +//! +//! ```bash +//! git clone https://github.com/apache/arrow-datafusion +//! cd arrow-datafusion +//! # Download test data +//! git submodule update --init +//! export PARQUET_TEST_DATA=parquet-testing/data +//! export ARROW_TEST_DATA=testing/data +//! +//! cargo run --example csv_sql +//! +//! cargo run --example parquet_sql +//! +//! cargo run --example dataframe +//! +//! cargo run --example dataframe_in_memory +//! +//! cargo run --example parquet_sql +//! +//! cargo run --example simple_udaf +//! +//! cargo run --example simple_udf +//! ``` extern crate sqlparser; From d83bd170c379b5669e55e3faa41058bd8d0c6b91 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 02:54:27 +0800 Subject: [PATCH 142/329] update readme to reflect work on window functions (#471) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index f3ae412fde940..730bbc34d7038 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,12 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] FULL JOIN - [x] CROSS JOIN - [ ] Window + - [x] [Empty window](https://github.com/apache/arrow-datafusion/issues/298) + - [x] Common window functions + - [ ] [Window with ORDER BY clause](https://github.com/apache/arrow-datafusion/issues/360) + - [ ] [Window with PARTITION BY clause](https://github.com/apache/arrow-datafusion/issues/299) + - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) + - [ ] UDF and UDAF for window functions ## Data Sources From 16011120a1b73798049c5be49f9548b00f8a0a00 Mon Sep 17 00:00:00 2001 From: Nga Tran Date: Tue, 1 Jun 2021 15:53:50 -0400 Subject: [PATCH 143/329] fix: display the content of debug explain (#434) * test: display of each plan * fix: Fix debug display of explain and potential fix of predicate pushdown * refactor: adddress Andrew's comments * fix: avoid comparing file path that are test environmentally dependant --- datafusion/src/logical_plan/plan.rs | 8 +- datafusion/src/optimizer/filter_push_down.rs | 4 + datafusion/tests/sql.rs | 370 +++++++++++++++++++ 3 files changed, 378 insertions(+), 4 deletions(-) diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 4027916c8a7cd..2d85abb64bbbd 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -356,11 +356,11 @@ impl LogicalPlan { LogicalPlan::Limit { input, .. } => vec![input], LogicalPlan::Extension { node } => node.inputs(), LogicalPlan::Union { inputs, .. } => inputs.iter().collect(), + LogicalPlan::Explain { plan, .. } => vec![plan], // plans without inputs LogicalPlan::TableScan { .. } | LogicalPlan::EmptyRelation { .. } - | LogicalPlan::CreateExternalTable { .. } - | LogicalPlan::Explain { .. } => vec![], + | LogicalPlan::CreateExternalTable { .. } => vec![], } } } @@ -466,11 +466,11 @@ impl LogicalPlan { } true } + LogicalPlan::Explain { plan, .. } => plan.accept(visitor)?, // plans without inputs LogicalPlan::TableScan { .. } | LogicalPlan::EmptyRelation { .. } - | LogicalPlan::CreateExternalTable { .. } - | LogicalPlan::Explain { .. } => true, + | LogicalPlan::CreateExternalTable { .. 
} => true, }; if !recurse { return Ok(false); diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 2056e1972950a..4b1ae76927b4a 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -234,6 +234,10 @@ fn split_members<'a>(predicate: &'a Expr, predicates: &mut Vec<&'a Expr>) { fn optimize(plan: &LogicalPlan, mut state: State) -> Result { match plan { + LogicalPlan::Explain { .. } => { + // push the optimization to the plan of this explain + push_down(&state, plan) + } LogicalPlan::Filter { input, predicate } => { let mut predicates = vec![]; split_members(predicate, &mut predicates); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index f5b416f789736..029e9307e5f68 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1584,6 +1584,8 @@ fn create_join_context_qualified() -> Result { #[tokio::test] async fn csv_explain() { + // This test uses the execute function that create full plan cycle: logical, optimized logical, and physical, + // then execute the physical plan and return the final explain results let mut ctx = ExecutionContext::new(); register_aggregate_csv_by_sql(&mut ctx).await; let sql = "EXPLAIN SELECT c1 FROM aggregate_test_100 where c2 > 10"; @@ -1602,6 +1604,185 @@ async fn csv_explain() { assert_eq!(expected, actual); } +#[tokio::test] +async fn csv_explain_plans() { + // This test verify the look of each plan in its full cycle plan creation + + let mut ctx = ExecutionContext::new(); + register_aggregate_csv_by_sql(&mut ctx).await; + let sql = "EXPLAIN SELECT c1 FROM aggregate_test_100 where c2 > 10"; + + // Logical plan + // Create plan + let msg = format!("Creating logical plan for '{}'", sql); + let plan = ctx.create_logical_plan(&sql).expect(&msg); + let logical_schema = plan.schema(); + // + println!("SQL: {}", sql); + // + // Verify schema + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " Projection: #c1 [c1:Utf8]", + " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + " TableScan: aggregate_test_100 projection=None [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // Verify the text format of the plan + let expected = vec![ + "Explain", + " Projection: #c1", + " Filter: #c2 Gt Int64(10)", + " TableScan: aggregate_test_100 projection=None", + ]; + let formatted = plan.display_indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // verify the grahviz format of the plan + let expected = vec![ + "// Begin DataFusion GraphViz Plan (see https://graphviz.org)", + "digraph {", + " subgraph cluster_1", + " {", + " graph[label=\"LogicalPlan\"]", + " 2[shape=box label=\"Explain\"]", + " 3[shape=box label=\"Projection: #c1\"]", + " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", + " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", + " 5[shape=box label=\"TableScan: 
aggregate_test_100 projection=None\"]", + " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + " subgraph cluster_6", + " {", + " graph[label=\"Detailed LogicalPlan\"]", + " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", + " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", + " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", + " 10[shape=box label=\"TableScan: aggregate_test_100 projection=None\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + "}", + "// End DataFusion GraphViz Plan", + ]; + let formatted = plan.display_graphviz().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + + // Optimized logical plan + // + let msg = format!("Optimizing logical plan for '{}': {:?}", sql, plan); + let plan = ctx.optimize(&plan).expect(&msg); + let optimized_logical_schema = plan.schema(); + // Both schema has to be the same + assert_eq!(logical_schema.as_ref(), optimized_logical_schema.as_ref()); + // + // Verify schema + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " Projection: #c1 [c1:Utf8]", + " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32]", + " TableScan: aggregate_test_100 projection=Some([0, 1]) [c1:Utf8, c2:Int32]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // Verify the text format of the plan + let expected = vec![ + "Explain", + " Projection: #c1", + " Filter: #c2 Gt Int64(10)", + " TableScan: aggregate_test_100 projection=Some([0, 1])", + ]; + let formatted = plan.display_indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // verify the grahviz format of the plan + let expected = vec![ + "// Begin DataFusion GraphViz Plan (see https://graphviz.org)", + "digraph {", + " subgraph cluster_1", + " {", + " graph[label=\"LogicalPlan\"]", + " 2[shape=box label=\"Explain\"]", + " 3[shape=box label=\"Projection: #c1\"]", + " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", + " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", + " 5[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\"]", + " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + " subgraph cluster_6", + " {", + " graph[label=\"Detailed LogicalPlan\"]", + " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", + " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", + " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", + " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", + " 10[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\\nSchema: 
[c1:Utf8, c2:Int32]\"]", + " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + "}", + "// End DataFusion GraphViz Plan", + ]; + let formatted = plan.display_graphviz().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + + // Physical plan + // Create plan + let msg = format!("Creating physical plan for '{}': {:?}", sql, plan); + let plan = ctx.create_physical_plan(&plan).expect(&msg); + // + // Execute plan + let msg = format!("Executing physical plan for '{}': {:?}", sql, plan); + let results = collect(plan).await.expect(&msg); + let actual = result_vec(&results); + // flatten to a single string + let actual = actual.into_iter().map(|r| r.join("\t")).collect::(); + // Since the plan contains path that are environmentally dependant (e.g. full path of the test file), only verify important content + assert!(actual.contains("logical_plan"), "Actual: '{}'", actual); + assert!(actual.contains("Projection: #c1"), "Actual: '{}'", actual); + assert!( + actual.contains("Filter: #c2 Gt Int64(10)"), + "Actual: '{}'", + actual + ); +} + #[tokio::test] async fn csv_explain_verbose() { let mut ctx = ExecutionContext::new(); @@ -1620,6 +1801,195 @@ async fn csv_explain_verbose() { assert!(actual.contains("#c2 Gt Int64(10)"), "Actual: '{}'", actual); } +#[tokio::test] +async fn csv_explain_verbose_plans() { + // This test verify the look of each plan in its full cycle plan creation + + let mut ctx = ExecutionContext::new(); + register_aggregate_csv_by_sql(&mut ctx).await; + let sql = "EXPLAIN VERBOSE SELECT c1 FROM aggregate_test_100 where c2 > 10"; + + // Logical plan + // Create plan + let msg = format!("Creating logical plan for '{}'", sql); + let plan = ctx.create_logical_plan(&sql).expect(&msg); + let logical_schema = plan.schema(); + // + println!("SQL: {}", sql); + + // + // Verify schema + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " Projection: #c1 [c1:Utf8]", + " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + " TableScan: aggregate_test_100 projection=None [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // Verify the text format of the plan + let expected = vec![ + "Explain", + " Projection: #c1", + " Filter: #c2 Gt Int64(10)", + " TableScan: aggregate_test_100 projection=None", + ]; + let formatted = plan.display_indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // verify the grahviz format of the plan + let expected = vec![ + "// Begin DataFusion GraphViz Plan (see https://graphviz.org)", + "digraph {", + " subgraph cluster_1", + " {", + " graph[label=\"LogicalPlan\"]", + " 2[shape=box label=\"Explain\"]", + " 3[shape=box label=\"Projection: #c1\"]", + " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", + " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", + " 5[shape=box 
label=\"TableScan: aggregate_test_100 projection=None\"]", + " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + " subgraph cluster_6", + " {", + " graph[label=\"Detailed LogicalPlan\"]", + " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", + " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", + " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", + " 10[shape=box label=\"TableScan: aggregate_test_100 projection=None\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + "}", + "// End DataFusion GraphViz Plan", + ]; + let formatted = plan.display_graphviz().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + + // Optimized logical plan + // + let msg = format!("Optimizing logical plan for '{}': {:?}", sql, plan); + let plan = ctx.optimize(&plan).expect(&msg); + let optimized_logical_schema = plan.schema(); + // Both schema has to be the same + assert_eq!(logical_schema.as_ref(), optimized_logical_schema.as_ref()); + // + // Verify schema + let expected = vec![ + "Explain [plan_type:Utf8, plan:Utf8]", + " Projection: #c1 [c1:Utf8]", + " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32]", + " TableScan: aggregate_test_100 projection=Some([0, 1]) [c1:Utf8, c2:Int32]", + ]; + let formatted = plan.display_indent_schema().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // Verify the text format of the plan + let expected = vec![ + "Explain", + " Projection: #c1", + " Filter: #c2 Gt Int64(10)", + " TableScan: aggregate_test_100 projection=Some([0, 1])", + ]; + let formatted = plan.display_indent().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + // + // verify the grahviz format of the plan + let expected = vec![ + "// Begin DataFusion GraphViz Plan (see https://graphviz.org)", + "digraph {", + " subgraph cluster_1", + " {", + " graph[label=\"LogicalPlan\"]", + " 2[shape=box label=\"Explain\"]", + " 3[shape=box label=\"Projection: #c1\"]", + " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", + " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", + " 5[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\"]", + " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + " subgraph cluster_6", + " {", + " graph[label=\"Detailed LogicalPlan\"]", + " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", + " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", + " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", + " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", + " 10[shape=box label=\"TableScan: aggregate_test_100 
projection=Some([0, 1])\\nSchema: [c1:Utf8, c2:Int32]\"]", + " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", + " }", + "}", + "// End DataFusion GraphViz Plan", + ]; + let formatted = plan.display_graphviz().to_string(); + let actual: Vec<&str> = formatted.trim().lines().collect(); + assert_eq!( + expected, actual, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected, actual + ); + + // Physical plan + // Create plan + let msg = format!("Creating physical plan for '{}': {:?}", sql, plan); + let plan = ctx.create_physical_plan(&plan).expect(&msg); + // + // Execute plan + let msg = format!("Executing physical plan for '{}': {:?}", sql, plan); + let results = collect(plan).await.expect(&msg); + let actual = result_vec(&results); + // flatten to a single string + let actual = actual.into_iter().map(|r| r.join("\t")).collect::(); + // Since the plan contains path that are environmentally dependant(e.g. full path of the test file), only verify important content + assert!( + actual.contains("logical_plan after projection_push_down"), + "Actual: '{}'", + actual + ); + assert!(actual.contains("physical_plan"), "Actual: '{}'", actual); + assert!( + actual.contains("FilterExec: CAST(c2 AS Int64) > 10"), + "Actual: '{}'", + actual + ); + assert!( + actual.contains("ProjectionExec: expr=[c1]"), + "Actual: '{}'", + actual + ); +} + fn aggr_test_schema() -> SchemaRef { Arc::new(Schema::new(vec![ Field::new("c1", DataType::Utf8, false), From c3fc0c75af5ff2ebb99dba197d9d2ccd83eb5952 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 2 Jun 2021 15:23:48 +0800 Subject: [PATCH 144/329] fix window aggregation with alias and add integration test case (#454) * fix window expression with alias * add integration test --- datafusion/src/logical_plan/builder.rs | 2 +- datafusion/src/sql/planner.rs | 25 ++++++++++++------- .../sqls/simple_window_full_aggregation.sql | 25 +++++++++++++++++++ integration-tests/test_psql_parity.py | 2 +- 4 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 integration-tests/sqls/simple_window_full_aggregation.sql diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 9515ac2ff3739..c02555d63314a 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -309,7 +309,7 @@ impl LogicalPlanBuilder { // FIXME: implement next // window_frame: Option, ) -> Result { - let window_expr = window_expr.into_iter().collect::>(); + let window_expr = window_expr.into_iter().collect::>(); // FIXME: implement next // let partition_by_expr = partition_by_expr.into_iter().collect::>(); // FIXME: implement next diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a3027e589985e..63499aa1abe22 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -17,10 +17,6 @@ //! 
SQL Query Planner (produces logical plan from SQL AST) -use std::str::FromStr; -use std::sync::Arc; -use std::{convert::TryInto, vec}; - use crate::catalog::TableReference; use crate::datasource::TableProvider; use crate::logical_plan::Expr::Alias; @@ -28,6 +24,7 @@ use crate::logical_plan::{ and, lit, DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, StringifiedPlan, ToDFSchema, }; +use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::{ error::{DataFusionError, Result}, @@ -38,11 +35,8 @@ use crate::{ physical_plan::{aggregates, functions, window_functions}, sql::parser::{CreateExternalTable, FileType, Statement as DFStatement}, }; - use arrow::datatypes::*; use hashbrown::HashMap; - -use crate::prelude::JoinType; use sqlparser::ast::{ BinaryOperator, DataType as SQLDataType, DateTimeField, Expr as SQLExpr, FunctionArg, Ident, Join, JoinConstraint, JoinOperator, ObjectName, Query, Select, SelectItem, @@ -52,6 +46,9 @@ use sqlparser::ast::{ use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{OrderByExpr, Statement}; use sqlparser::parser::ParserError::ParserError; +use std::str::FromStr; +use std::sync::Arc; +use std::{convert::TryInto, vec}; use super::{ parser::DFParser, @@ -678,11 +675,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { select_exprs: &[Expr], ) -> Result<(LogicalPlan, Vec)> { let plan = LogicalPlanBuilder::from(input) - .window(window_exprs)? + .window(window_exprs.clone())? .build()?; let select_exprs = select_exprs .iter() - .map(|expr| expr_as_column_expr(&expr, &plan)) + .map(|expr| rebase_expr(expr, &window_exprs, &plan)) .into_iter() .collect::>>()?; Ok((plan, select_exprs)) @@ -2710,6 +2707,16 @@ mod tests { quick_test(sql, expected); } + #[test] + fn empty_over_with_alias() { + let sql = "SELECT order_id oid, MAX(order_id) OVER () max_oid from orders"; + let expected = "\ + Projection: #order_id AS oid, #MAX(order_id) AS max_oid\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[], orderBy=[]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn empty_over_plus() { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; diff --git a/integration-tests/sqls/simple_window_full_aggregation.sql b/integration-tests/sqls/simple_window_full_aggregation.sql new file mode 100644 index 0000000000000..94860bc3b1835 --- /dev/null +++ b/integration-tests/sqls/simple_window_full_aggregation.sql @@ -0,0 +1,25 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language gOVERning permissions and +-- limitations under the License. 
+ +SELECT + row_number() OVER () AS row_number, + count(c3) OVER () AS count_c3, + avg(c3) OVER () AS avg, + sum(c3) OVER () AS sum, + max(c3) OVER () AS max, + min(c3) OVER () AS min +FROM test +ORDER BY row_number; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index f4967b8457e49..5bd308180e598 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 4, msg="tests are missed") + self.assertEqual(len(files), 5, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From 01b57f70241e158d471a1396c0b6461eccbd6e82 Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Wed, 2 Jun 2021 16:13:13 +0200 Subject: [PATCH 145/329] Update k8s user guide to use deployments (#474) --- .../rust/executor/executor_config_spec.toml | 2 +- ballista/rust/executor/src/main.rs | 2 +- .../rust/scheduler/scheduler_config_spec.toml | 2 +- ballista/rust/scheduler/src/main.rs | 2 +- benchmarks/README.md | 6 +- benchmarks/docker-compose.yaml | 4 +- .../src/distributed/docker-compose.md | 2 +- docs/user-guide/src/distributed/kubernetes.md | 70 +++++++++---------- .../user-guide/src/distributed/raspberrypi.md | 2 +- docs/user-guide/src/distributed/standalone.md | 12 ++-- 10 files changed, 50 insertions(+), 54 deletions(-) diff --git a/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml index 8d817fee9cc5c..3cb168e772416 100644 --- a/ballista/rust/executor/executor_config_spec.toml +++ b/ballista/rust/executor/executor_config_spec.toml @@ -53,7 +53,7 @@ doc = "Host name or IP address to register with scheduler so that other executor [[param]] abbr = "p" -name = "port" +name = "bind_port" type = "u16" default = "50051" doc = "bind port" diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index aad53d7a96324..4c63ba89680a1 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -75,7 +75,7 @@ async fn main() -> Result<()> { let external_host = opt.external_host; let bind_host = opt.bind_host; - let port = opt.port; + let port = opt.bind_port; let addr = format!("{}:{}", bind_host, port); let addr = addr diff --git a/ballista/rust/scheduler/scheduler_config_spec.toml b/ballista/rust/scheduler/scheduler_config_spec.toml index 560e9a2599bd5..81e77d31b0a00 100644 --- a/ballista/rust/scheduler/scheduler_config_spec.toml +++ b/ballista/rust/scheduler/scheduler_config_spec.toml @@ -54,7 +54,7 @@ doc = "Local host name or IP address to bind to. Default: 0.0.0.0" [[param]] abbr = "p" -name = "port" +name = "bind_port" type = "u16" default = "50050" doc = "bind port. 
Default: 50050" \ No newline at end of file diff --git a/ballista/rust/scheduler/src/main.rs b/ballista/rust/scheduler/src/main.rs index 713103fcf0439..34386ca6c5617 100644 --- a/ballista/rust/scheduler/src/main.rs +++ b/ballista/rust/scheduler/src/main.rs @@ -116,7 +116,7 @@ async fn main() -> Result<()> { let namespace = opt.namespace; let bind_host = opt.bind_host; - let port = opt.port; + let port = opt.bind_port; let addr = format!("{}:{}", bind_host, port); let addr = addr.parse()?; diff --git a/benchmarks/README.md b/benchmarks/README.md index e347130689b3d..0b5ccfc16e466 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -122,7 +122,7 @@ RUST_LOG=info RUSTFLAGS='-C target-cpu=native -C lto -C codegen-units=1 -C embed To run the benchmarks: ```bash -cd $ARROW_HOME/ballista/rust/benchmarks/tpch +cd $ARROW_HOME/benchmarks cargo run --release benchmark ballista --host localhost --port 50050 --query 1 --path $(pwd)/data --format tbl ``` @@ -131,9 +131,9 @@ cargo run --release benchmark ballista --host localhost --port 50050 --query 1 - To start a Rust scheduler and executor using Docker Compose: ```bash -cd $BALLISTA_HOME +cd $ARROW_HOME ./dev/build-rust.sh -cd $BALLISTA_HOME/rust/benchmarks/tpch +cd $ARROW_HOME/benchmarks docker-compose up ``` diff --git a/benchmarks/docker-compose.yaml b/benchmarks/docker-compose.yaml index c13e9eb48c884..74c6703f30b1c 100644 --- a/benchmarks/docker-compose.yaml +++ b/benchmarks/docker-compose.yaml @@ -21,7 +21,7 @@ services: command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379" ballista-scheduler: image: ballista:0.5.0-SNAPSHOT - command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --port 50050" + command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --bind-port 50050" environment: - RUST_LOG=ballista=debug volumes: @@ -30,7 +30,7 @@ services: - etcd ballista-executor: image: ballista:0.5.0-SNAPSHOT - command: "/executor --bind-host 0.0.0.0 --port 50051 --scheduler-host ballista-scheduler" + command: "/executor --bind-host 0.0.0.0 --bind-port 50051 --scheduler-host ballista-scheduler" scale: 2 environment: - RUST_LOG=info diff --git a/docs/user-guide/src/distributed/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md index de27364fc2528..5ea86b5caea4b 100644 --- a/docs/user-guide/src/distributed/docker-compose.md +++ b/docs/user-guide/src/distributed/docker-compose.md @@ -33,7 +33,7 @@ services: - "2379:2379" ballista-executor: image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT - command: "/executor --bind-host 0.0.0.0 --port 50051 --local" + command: "/executor --bind-host 0.0.0.0 --bind-port 50051 --local" environment: - RUST_LOG=info ports: diff --git a/docs/user-guide/src/distributed/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md index 7b9b356dfa428..07b51f7871b6c 100644 --- a/docs/user-guide/src/distributed/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -24,8 +24,8 @@ you are already comfortable with managing Kubernetes deployments. 
The k8s deployment consists of: -- k8s stateful set for one or more scheduler processes -- k8s stateful set for one or more executor processes +- k8s deployment for one or more scheduler processes +- k8s deployment for one or more executor processes - k8s service to route traffic to the schedulers - k8s persistent volume and persistent volume claims to make local data accessible to Ballista @@ -38,6 +38,14 @@ Ballista is at an early stage of development and therefore has some significant - Only a single scheduler instance is currently supported unless the scheduler is configured to use `etcd` as a backing store. +## Publishing your images + +Currently there are no official Ballista images that work with the instructions in this guide. For the time being, +you will need to build and publish your own images. You can do that by invoking the `dev/build-ballista-docker.sh`. + +Once the images have been built, you can retag them with `docker tag ballista:0.5.0-SNAPSHOT ` so you +can push them to your favourite docker registry. + ## Create Persistent Volume and Persistent Volume Claim Copy the following yaml to a `pv.yaml` file and apply to the cluster to create a persistent volume and a persistent @@ -88,7 +96,7 @@ persistentvolumeclaim/data-pv-claim created ## Deploying Ballista Scheduler and Executors -Copy the following yaml to a `cluster.yaml` file. +Copy the following yaml to a `cluster.yaml` file and change `` with the name of your Ballista Docker image. ```yaml apiVersion: v1 @@ -101,16 +109,14 @@ spec: ports: - port: 50050 name: scheduler - clusterIP: None selector: app: ballista-scheduler --- apiVersion: apps/v1 -kind: StatefulSet +kind: Deployment metadata: name: ballista-scheduler spec: - serviceName: "ballista-scheduler" replicas: 1 selector: matchLabels: @@ -122,27 +128,26 @@ spec: ballista-cluster: ballista spec: containers: - - name: ballista-scheduler - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT - command: ["/scheduler"] - args: ["--port=50050"] - ports: - - containerPort: 50050 - name: flight - volumeMounts: - - mountPath: /mnt - name: data + - name: ballista-scheduler + image: + command: ["/scheduler"] + args: ["--bind-port=50050"] + ports: + - containerPort: 50050 + name: flight + volumeMounts: + - mountPath: /mnt + name: data volumes: - name: data persistentVolumeClaim: claimName: data-pv-claim --- apiVersion: apps/v1 -kind: StatefulSet +kind: Deployment metadata: name: ballista-executor spec: - serviceName: "ballista-scheduler" replicas: 2 selector: matchLabels: @@ -155,20 +160,12 @@ spec: spec: containers: - name: ballista-executor - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT + image: command: ["/executor"] args: - [ - "--port=50051", - "--scheduler-host=ballista-scheduler", - "--scheduler-port=50050", - "--external-host=$(MY_POD_IP)", - ] - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP + - "--bind-port=50051", + - "--scheduler-host=ballista-scheduler", + - "--scheduler-port=50050" ports: - containerPort: 50051 name: flight @@ -189,19 +186,18 @@ This should show the following output: ``` service/ballista-scheduler created -statefulset.apps/ballista-scheduler created -statefulset.apps/ballista-executor created +deployment.apps/ballista-scheduler created +deployment.apps/ballista-executor created ``` You can also check status by running `kubectl get pods`: ```bash $ kubectl get pods -NAME READY STATUS RESTARTS AGE -busybox 1/1 Running 0 16m -ballista-scheduler-0 1/1 Running 0 42s -ballista-executor-0 1/1 Running 2 42s 
-ballista-executor-1 1/1 Running 0 26s +NAME READY STATUS RESTARTS AGE +ballista-executor-78cc5b6486-4rkn4 0/1 Pending 0 42s +ballista-executor-78cc5b6486-7crdm 0/1 Pending 0 42s +ballista-scheduler-879f874c5-rnbd6 0/1 Pending 0 42s ``` You can view the scheduler logs with `kubectl logs ballista-scheduler-0`: diff --git a/docs/user-guide/src/distributed/raspberrypi.md b/docs/user-guide/src/distributed/raspberrypi.md index 0083d191770b6..3bf36c7227a75 100644 --- a/docs/user-guide/src/distributed/raspberrypi.md +++ b/docs/user-guide/src/distributed/raspberrypi.md @@ -116,7 +116,7 @@ Run the benchmarks: ```bash docker run -it myrepo/ballista-arm64 \ /tpch benchmark datafusion --query=1 --path=/path/to/data --format=parquet \ - --concurrency=24 --iterations=1 --debug --host=ballista-scheduler --port=50050 + --concurrency=24 --iterations=1 --debug --host=ballista-scheduler --bind-port=50050 ``` Note that it will be necessary to mount appropriate volumes into the containers and also configure networking diff --git a/docs/user-guide/src/distributed/standalone.md b/docs/user-guide/src/distributed/standalone.md index e9db425dc1119..66b6bc8356957 100644 --- a/docs/user-guide/src/distributed/standalone.md +++ b/docs/user-guide/src/distributed/standalone.md @@ -26,7 +26,7 @@ Start a scheduler using the following syntax: ```bash docker run --network=host \ -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \ - /scheduler --port 50050 + /scheduler --bind-port 50050 ``` Run `docker ps` to check that the process is running: @@ -34,7 +34,7 @@ Run `docker ps` to check that the process is running: ``` $ docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --port 5…" 6 seconds ago Up 5 seconds affectionate_hofstadter +59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --bind-p…" 6 seconds ago Up 5 seconds affectionate_hofstadter ``` Run `docker logs CONTAINER_ID` to check the output from the process: @@ -51,7 +51,7 @@ Start one or more executor processes. 
Each executor process will need to listen ```bash docker run --network=host \ -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \ - /executor --external-host localhost --port 50051 + /executor --external-host localhost --bind-port 50051 ``` Use `docker ps` to check that both the scheduer and executor(s) are now running: @@ -60,14 +60,14 @@ Use `docker ps` to check that both the scheduer and executor(s) are now running: $ docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 0746ce262a19 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/executor --externa…" 2 seconds ago Up 1 second naughty_mclean -59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --port 5…" 4 minutes ago Up 4 minutes affectionate_hofstadter +59452ce72138 ballistacompute/ballista-rust:0.4.2-SNAPSHOT "/scheduler --bind-p…" 4 minutes ago Up 4 minutes affectionate_hofstadter ``` Use `docker logs CONTAINER_ID` to check the output from the executor(s): ``` $ docker logs 0746ce262a19 -[2021-02-14T18:36:25Z INFO executor] Running with config: ExecutorConfig { host: "localhost", port: 50051, work_dir: "/tmp/.tmpVRFSvn", concurrent_tasks: 4 } +[2021-02-14T18:36:25Z INFO executor] Running with config: ExecutorConfig { host: "localhost", bind_port: 50051, work_dir: "/tmp/.tmpVRFSvn", concurrent_tasks: 4 } [2021-02-14T18:36:25Z INFO executor] Ballista v0.4.2-SNAPSHOT Rust Executor listening on 0.0.0.0:50051 [2021-02-14T18:36:25Z INFO executor] Starting registration with scheduler ``` @@ -84,7 +84,7 @@ Ballista can optionally use [etcd](https://etcd.io/) as a backing store for the ```bash docker run --network=host \ -d ballistacompute/ballista-rust:0.4.2-SNAPSHOT \ - /scheduler --port 50050 \ + /scheduler --bind-port 50050 \ --config-backend etcd \ --etcd-urls etcd:2379 ``` From ce95d3e8aa25ca7fd6bbfdb7eba795fe31278206 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Wed, 2 Jun 2021 11:45:32 -0700 Subject: [PATCH 146/329] add invariants spec (#443) * add invariants spec * Update docs/specification/invariants.md Co-authored-by: Andrew Lamb * Update docs/specification/invariants.md Co-authored-by: Andrew Lamb * Update docs/specification/invariants.md Co-authored-by: Andrew Lamb * Update docs/specification/invariants.md Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- DEVELOPERS.md | 1 + docs/specification/invariants.md | 327 +++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 docs/specification/invariants.md diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 9223d990e1d63..b915e2912c24e 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -106,6 +106,7 @@ new specifications as you see fit. Here is the list current active specifications: * [Output field name semantic](docs/specification/output-field-name-semantic.md) +* [Invariants](docs/specification/invariants.md) ## How to format `.md` document diff --git a/docs/specification/invariants.md b/docs/specification/invariants.md new file mode 100644 index 0000000000000..628bb7710b1f5 --- /dev/null +++ b/docs/specification/invariants.md @@ -0,0 +1,327 @@ + + +# DataFusion's Invariants + +This document enumerates invariants of DataFusion's logical and physical planes +(functions, and nodes). Some of these invariants are currently not enforced. +This document assumes that the reader is familiar with some of the codebase, +including rust arrow's RecordBatch and Array. 
+
+## Rationale
+
+DataFusion's computational model is built on top of a dynamically typed arrow
+object, Array, that offers the interface `Array::as_any` to downcast itself to
+its statically typed versions (e.g. `Int32Array`). DataFusion uses
+`Array::data_type` to perform the respective downcasting on its physical
+operations. DataFusion uses a dynamic type system because the queries being
+executed are not always known at compile time: they are only known during the
+runtime (or query time) of programs built with DataFusion. This document is
+built on top of this principle.
+
+In dynamically typed interfaces, it is up to developers to enforce type
+invariants. This document declares some of these invariants, so that users
+know what they can expect from a query in DataFusion, and DataFusion developers
+know what they need to enforce at the coding level.
+
+## Notation
+
+* Field or physical field: the tuple name, `arrow::DataType` and nullability flag (a bool whether values can be null), represented in this document by `PF(name, type, nullable)`
+* Logical field: Field with a relation name. Represented in this document by `LF(relation, name, type, nullable)`
+* Projected plan: plan with projection as the root node.
+* Logical schema: a vector of logical fields, used by logical plan.
+* Physical schema: a vector of physical fields, used by both physical plan and Arrow record batch.
+
+### Logical
+
+#### Function
+
+An object that knows its valid incoming logical fields and how to derive its
+output logical field from its arguments' logical fields. A function's output
+field is itself a function of its input fields:
+
+```
+logical_field(lf1: LF, lf2: LF, ...) -> LF
+```
+
+Examples:
+
+* `plus(a,b) -> LF(None, "{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is the function mapping input types to output type (`get_supertype` in our current implementation).
+* `length(a) -> LF(None, "length({a})", u32, a.nullable)`
+
+#### Plan
+
+A tree composed of other plans and functions (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`)
+that knows how to derive its schema.
+
+Certain plans have a frozen schema (e.g. Scan), while others derive their
+schema from their child nodes.
+
+#### Column
+
+An identifier in a logical plan consists of a field name and a relation name.
+
+### Physical
+
+#### Function
+
+An object that knows how to derive its physical field from its arguments'
+physical fields, and also how to actually perform the computation on data. A
+function's output physical field is a function of its input physical fields:
+
+```
+physical_field(PF1, PF2, ...) -> PF
+```
+
+Examples:
+
+* `plus(a,b) -> PF("{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is a complex function (`get_supertype` in our current implementation) whose computation is for each element in the columns, sum the two entries together and return it in the same type as the smallest type of both columns.
+* `length(&str) -> PF("length({a})", u32, a.nullable)` whose computation is "count number of bytes in the string".
+
+#### Plan
+
+A tree (e.g. `Projection c1 + c2, c1 - c2 AS sum12; Scan c1 as u32, c2 as u64`)
+that knows how to derive its metadata and compute itself.
+
+Note how the physical plane does not know how to derive field names: field
+names are solely a property of the logical plane, as they are not needed in the
+physical plane.
+
+#### Column
+
+A column is a type of physical node in a physical plan; it consists of a field name and a unique index.
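+
+To make the physical `Function` notation above concrete, here is a small Rust
+sketch (illustrative only, not DataFusion's actual code) that derives the
+output field of `plus(a, b)` following the naming and nullability rules above.
+The `common_type` helper is a hypothetical stand-in for `get_supertype`.
+
+```rust
+use arrow::datatypes::{DataType, Field};
+
+/// Hypothetical stand-in for DataFusion's `get_supertype`.
+fn common_type(l: &DataType, r: &DataType) -> DataType {
+    if l == r {
+        l.clone()
+    } else {
+        DataType::Int64 // placeholder fallback, for this sketch only
+    }
+}
+
+/// Derive the output physical field of `plus(a, b)`: the name is
+/// "{a} Plus {b}", the type is the common type of the inputs, and the
+/// result is nullable if either input is nullable.
+fn plus_output_field(a: &Field, b: &Field) -> Field {
+    Field::new(
+        &format!("{} Plus {}", a.name(), b.name()),
+        common_type(a.data_type(), b.data_type()),
+        a.is_nullable() || b.is_nullable(),
+    )
+}
+```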
+ +### Data Sources' registry + +A map of source name/relation -> Schema plus associated properties necessary to read data from it (e.g. file path). + +### Functions' registry + +A map of function name -> logical + physical function. + +### Physical Planner + +A function that knows how to derive a physical plan from a logical plan: + +``` +plan(LogicalPlan) -> PhysicalPlan +``` + +### Logical Optimizer + +A function that accepts a logical plan and returns an (optimized) logical plan +which computes the same results, but in a more efficient manner: + +``` +optimize(LogicalPlan) -> LogicalPlan +``` + +### Physical Optimizer + +A function that accepts a physical plan and returns an (optimized) physical +plan which computes the same results, but may differ based on the actual +hardware or execution environment being run: + +``` +optimize(PhysicalPlan) -> PhysicalPlan +``` + +### Builder + +A function that knows how to build a new logical plan from an existing logical +plan and some extra parameters. + +``` +build(logical_plan, params...) -> logical_plan +``` + +## Invariants + +The following subsections describe invariants. Since functions' output schema +depends on its arguments' schema (e.g. min, plus), the resulting schema can +only be derived based on a known set of input schemas (TableProvider). +Likewise, schemas of functions depend on the specific registry of functions +registered (e.g. does `my_op` return u32 or u64?). Thus, in this section, the +wording "same schema" is understood to mean "same schema under a given registry +of data sources and functions". + +### (relation, name) tuples in logical fields and logical columns are unique + +Every logical field's (relation, name) tuple in a logical schema MUST be unique. +Every logical column's (relation, name) tuple in a logical plan MUST be unique. + +This invariant guarantees that `SELECT t1.id, t2.id FROM t1 JOIN t2...` +unambiguously selects the field `t1.id` and `t2.id` in a logical schema in the +logical plane. + +#### Responsibility + +It is the logical builder and optimizer's responsibility to guarantee this +invariant. + +#### Validation + +Builder and optimizer MUST error if this invariant is violated on any logical +node that creates a new schema (e.g. scan, projection, aggregation, join, etc.). + +### Physical schema is consistent with data + +The contents of every Array in every RecordBatch in every partition returned by +a physical plan MUST be consistent with RecordBatch's schema, in that every +Array in the RecordBatch must be downcastable to its corresponding type +declared in the RecordBatch. + +#### Responsibility + +Physical functions MUST guarantee this invariant. This is particularly +important in aggregate functions, whose aggregating type may be different from +the intermediary types during calculations (e.g. sum(i32) -> i64). + +#### Validation + +Since the validation of this invariant is computationally expensive, execution +contexts CAN validate this invariant. It is acceptable for physical nodes to +`panic!` if their input does not satisfy this invariant. + +### Physical schema is consistent in physical functions + +The schema of every Array returned by a physical function MUST match the +DataType reported by the physical function itself. + +This ensures that when a physical function claims that it returns a type +(e.g. Int32), users can safely downcast its resulting Array to the +corresponding type (e.g. Int32Array), as well as to write data to formats that +have a schema with nullability flag (e.g. parquet). 
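+
+As an illustration of why this matters, a consumer that trusts the declared
+`Int32` type will downcast without re-checking the data. The function below is
+a minimal sketch (not taken from the codebase):
+
+```rust
+use arrow::array::{ArrayRef, Int32Array};
+
+/// Sketch: a consumer that relies on a column's declared Int32 type.
+/// If the invariant is violated, the downcast below fails at runtime.
+fn sum_declared_int32(array: &ArrayRef) -> i64 {
+    let ints = array
+        .as_any()
+        .downcast_ref::<Int32Array>()
+        .expect("a field declared as Int32 must be backed by an Int32Array");
+    ints.iter().flatten().map(|v| v as i64).sum()
+}
+```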
+
+#### Responsibility
+
+It is the responsibility of the developer that writes a physical function to
+guarantee this invariant.
+
+In particular:
+
+* The derived DataType matches the code it uses to build the array for every branch of valid input type combinations.
+* The nullability flag matches how the values are built.
+
+#### Validation
+
+Since the validation of this invariant is computationally expensive, execution
+contexts CAN validate this invariant.
+
+### The physical schema is invariant under planning
+
+The physical schema derived by a physical plan returned by the planner MUST be
+equivalent to the physical schema derived by the logical plan passed to the
+planner. Specifically:
+
+```
+plan(logical_plan).schema === logical_plan.physical_schema
+```
+
+A logical plan's physical schema is defined as its logical schema with relation
+qualifiers stripped from all logical fields:
+
+```
+logical_plan.physical_schema = vector[ strip_relation(f) for f in logical_plan.logical_fields ]
+```
+
+This is used to ensure that the physical schema derived from a (logical) plan is
+what users get in record batches, so that they can rely on the optimized logical
+plan to know the resulting physical schema.
+
+Note that since a logical plan can be as simple as a single projection with a
+single function, `Projection f(c1,c2)`, a corollary of this is that the
+physical schema of every `logical function -> physical function` pair must be
+invariant under planning.
+
+#### Responsibility
+
+Developers of physical and logical plans and planners MUST guarantee this
+invariant for every triplet (logical plan, physical plan, conversion rule).
+
+#### Validation
+
+Planners MUST validate this invariant. In particular, they MUST return an error
+when, during planning, a physical function's derived schema does not match the
+logical function's derived schema.
+
+### The output schema equals the physical plan schema
+
+The schema of every RecordBatch in every partition outputted by a physical plan
+MUST be equal to the schema of the physical plan. Specifically:
+
+```
+physical_plan.evaluate(batch).schema = physical_plan.schema
+```
+
+Together with other invariants, this ensures that the consumers of record
+batches do not need to know the output schema of the physical plan; they can
+safely rely on the record batch's schema to perform downcasting and naming.
+
+#### Responsibility
+
+Physical nodes MUST guarantee this invariant.
+
+#### Validation
+
+Execution Contexts CAN validate this invariant.
+
+### Logical schema is invariant under logical optimization
+
+The logical schema derived by a projected logical plan returned by the logical
+optimizer MUST be equivalent to the logical schema derived by the logical plan
+passed to the optimizer:
+
+```
+optimize(logical_plan).schema === logical_plan.schema
+```
+
+This is used to ensure that plans can be optimized without jeopardizing future
+references to logical columns (name and index) or assumptions about their
+schemas.
+
+#### Responsibility
+
+Logical optimizers MUST guarantee this invariant.
+
+#### Validation
+
+Users of logical optimizers SHOULD validate this invariant.
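+
+As a sketch of such a validation, a test that exercises an optimizer rule could
+perform a check along these lines (`check_schema_invariant` is a hypothetical
+helper, but the `OptimizerRule::optimize` call mirrors how DataFusion's own
+optimizer tests invoke rules):
+
+```rust
+use datafusion::error::Result;
+use datafusion::execution::context::ExecutionProps;
+use datafusion::logical_plan::LogicalPlan;
+use datafusion::optimizer::optimizer::OptimizerRule;
+
+/// Sketch: optimizing a plan must not change its logical schema.
+fn check_schema_invariant(rule: &dyn OptimizerRule, plan: &LogicalPlan) -> Result<()> {
+    let optimized = rule.optimize(plan, &ExecutionProps::new())?;
+    assert_eq!(optimized.schema(), plan.schema());
+    Ok(())
+}
+```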
+ +### Physical schema is invariant under physical optimization + +The physical schema derived by a projected physical plan returned by the +physical optimizer MUST match the physical schema derived by the physical plan +passed to the planner: + +``` +optimize(physical_plan).schema === physical_plan.schema +``` + +This is used to ensure that plans can be optimized without jeopardizing future +referencs of logical columns (name and index) or assumptions about their +schemas. + +#### Responsibility + +Optimizers MUST guarantee this invariant. + +#### Validation + +Users of optimizers SHOULD validate this invariant. From d0d677bb574cf9e1d21e8f3c594c0c2f09ab8e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 2 Jun 2021 22:41:45 +0200 Subject: [PATCH 147/329] Expose DataFrame::sort (#469) --- python/Cargo.toml | 3 ++- python/src/dataframe.rs | 12 +++++++++ python/src/expression.rs | 8 ++++++ python/src/functions.rs | 14 +++++++--- python/src/to_py.rs | 3 ++- python/src/to_rust.rs | 3 ++- python/tests/test_df.py | 55 ++++++++++++++++++++++------------------ 7 files changed, 67 insertions(+), 31 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 070720554f0ed..117190714e59d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -27,9 +27,10 @@ license = "Apache-2.0" edition = "2018" [dependencies] +libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" -pyo3 = { version = "0.12.1", features = ["extension-module"] } +pyo3 = { version = "0.13.2", features = ["extension-module"] } datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" } [lib] diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs index f90a7cf2f0dcf..66e6916b68156 100644 --- a/python/src/dataframe.rs +++ b/python/src/dataframe.rs @@ -93,6 +93,18 @@ impl DataFrame { }) } + /// Sort by specified sorting expressions + fn sort(&self, exprs: Vec) -> PyResult { + let exprs = exprs.into_iter().map(|e| e.expr); + let builder = LogicalPlanBuilder::from(&self.plan); + let builder = errors::wrap(builder.sort(exprs))?; + let plan = errors::wrap(builder.build())?; + Ok(DataFrame { + ctx_state: self.ctx_state.clone(), + plan, + }) + } + /// Limits the plan to return at most `count` rows fn limit(&self, count: usize) -> PyResult { let builder = LogicalPlanBuilder::from(&self.plan); diff --git a/python/src/expression.rs b/python/src/expression.rs index 78ca6d7e598ec..4320b1d14c8b7 100644 --- a/python/src/expression.rs +++ b/python/src/expression.rs @@ -117,6 +117,14 @@ impl Expression { expr: self.expr.clone().alias(name), }) } + + /// Create a sort expression from an existing expression. 
+ #[args(ascending = true, nulls_first = true)] + pub fn sort(&self, ascending: bool, nulls_first: bool) -> PyResult { + Ok(Expression { + expr: self.expr.clone().sort(ascending, nulls_first), + }) + } } /// Represents a ScalarUDF diff --git a/python/src/functions.rs b/python/src/functions.rs index b51c76ba4b66e..f46dd3e0e5f7b 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -101,9 +101,17 @@ fn concat_ws(value: expression::Expression) -> expression::Expression { } #[pyfunction] -fn in_list(expr: expression::Expression, value: Vec, negated: bool) -> expression::Expression { - expression::Expression { - expr: logical_plan::in_list(expr.expr, value.into_iter().map(|x| x.expr).collect::>(), negated), +fn in_list( + expr: expression::Expression, + value: Vec, + negated: bool, +) -> expression::Expression { + expression::Expression { + expr: logical_plan::in_list( + expr.expr, + value.into_iter().map(|x| x.expr).collect::>(), + negated, + ), } } diff --git a/python/src/to_py.rs b/python/src/to_py.rs index deeb9719891a3..ff03e03325258 100644 --- a/python/src/to_py.rs +++ b/python/src/to_py.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. +use libc::uintptr_t; use pyo3::prelude::*; -use pyo3::{libc::uintptr_t, PyErr}; +use pyo3::PyErr; use std::convert::From; diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs index d8f2307a49823..2e3f7f05ec588 100644 --- a/python/src/to_rust.rs +++ b/python/src/to_rust.rs @@ -25,7 +25,8 @@ use datafusion::arrow::{ record_batch::RecordBatch, }; use datafusion::scalar::ScalarValue; -use pyo3::{libc::uintptr_t, prelude::*}; +use libc::uintptr_t; +use pyo3::prelude::*; use crate::{errors, types::PyDataType}; diff --git a/python/tests/test_df.py b/python/tests/test_df.py index 520d4e6a54723..2da23f908dc73 100644 --- a/python/tests/test_df.py +++ b/python/tests/test_df.py @@ -17,7 +17,7 @@ import unittest -import pyarrow +import pyarrow as pa import datafusion f = datafusion.functions @@ -28,8 +28,8 @@ def _prepare(self): ctx = datafusion.ExecutionContext() # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) return ctx.create_dataframe([[batch]]) @@ -45,8 +45,8 @@ def test_select(self): # execute and collect the first (and only) batch result = df.collect()[0] - self.assertEqual(result.column(0), pyarrow.array([5, 7, 9])) - self.assertEqual(result.column(1), pyarrow.array([-3, -3, -3])) + self.assertEqual(result.column(0), pa.array([5, 7, 9])) + self.assertEqual(result.column(1), pa.array([-3, -3, -3])) def test_filter(self): df = self._prepare() @@ -61,8 +61,18 @@ def test_filter(self): # execute and collect the first (and only) batch result = df.collect()[0] - self.assertEqual(result.column(0), pyarrow.array([9])) - self.assertEqual(result.column(1), pyarrow.array([-3])) + self.assertEqual(result.column(0), pa.array([9])) + self.assertEqual(result.column(1), pa.array([-3])) + + def test_sort(self): + df = self._prepare() + df = df.sort([ + f.col("b").sort(ascending=False) + ]) + + table = pa.Table.from_batches(df.collect()) + expected = {'a': [3, 2, 1], 'b': [6, 5, 4]} + self.assertEqual(table.to_pydict(), expected) def test_limit(self): df = self._prepare() @@ -78,38 +88,33 @@ def test_limit(self): def test_udf(self): df = self._prepare() - # is_null is a pyarrow function over arrays - 
udf = f.udf(lambda x: x.is_null(), [pyarrow.int64()], pyarrow.bool_()) + # is_null is a pa function over arrays + udf = f.udf(lambda x: x.is_null(), [pa.int64()], pa.bool_()) df = df.select(udf(f.col("a"))) - self.assertEqual(df.collect()[0].column(0), pyarrow.array([False, False, False])) + self.assertEqual(df.collect()[0].column(0), pa.array([False, False, False])) def test_join(self): ctx = datafusion.ExecutionContext() - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2]), pyarrow.array([8, 10])], + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([8, 10])], names=["a", "c"], ) df1 = ctx.create_dataframe([[batch]]) df = df.join(df1, on="a", how="inner") + df = df.sort([ + f.col("a").sort(ascending=True) + ]) + table = pa.Table.from_batches(df.collect()) - # execute and collect the first (and only) batch - batch = df.collect()[0] - - if batch.column(0) == pyarrow.array([1, 2]): - self.assertEqual(batch.column(0), pyarrow.array([1, 2])) - self.assertEqual(batch.column(1), pyarrow.array([8, 10])) - self.assertEqual(batch.column(2), pyarrow.array([4, 5])) - else: - self.assertEqual(batch.column(0), pyarrow.array([2, 1])) - self.assertEqual(batch.column(1), pyarrow.array([10, 8])) - self.assertEqual(batch.column(2), pyarrow.array([5, 4])) + expected = {'a': [1, 2], 'c': [8, 10], 'b': [4, 5]} + self.assertEqual(table.to_pydict(), expected) From 265699f8c6c92fa53e06acd8b70acde108a8cd20 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 2 Jun 2021 22:42:11 +0200 Subject: [PATCH 148/329] make `VOLUME` declaration in docker absolute (#466) Otherwise some docker versions complain about: ```text docker: Error response from daemon: failed to create shim: OCI runtime create failed: invalid mount {Destination:data Type:bind Source:/var/lib/docker/volumes/8c57860badfdf66bd32f64fe4b22970bcbb1f0f13a5d134ec451458a823dec6f/_data Options:[rbind]}: mount destination data not absolute: unknown. ``` Also see docs for `VOLUME` which suggest this path should be absolute: https://docs.docker.com/engine/reference/builder/#volume Fixes #465. 
--- benchmarks/tpchgen.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/tpchgen.dockerfile b/benchmarks/tpchgen.dockerfile index 7fc2e5005a52e..69434708bf9a7 100644 --- a/benchmarks/tpchgen.dockerfile +++ b/benchmarks/tpchgen.dockerfile @@ -27,6 +27,6 @@ RUN git clone https://github.com/databricks/tpch-dbgen.git && \ WORKDIR /tpch-dbgen ADD entrypoint.sh /tpch-dbgen/ -VOLUME data +VOLUME /data ENTRYPOINT [ "bash", "./entrypoint.sh" ] From 33ff660318bf60d0c9aa45ceba2c4c943bfe9438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 3 Jun 2021 00:14:41 +0200 Subject: [PATCH 149/329] Support semi join (#470) * Support semi join * Fmt * Match on Semi * Simplify * Fmt * Undo match * Update datafusion/src/physical_plan/hash_join.rs Co-authored-by: Andrew Lamb * Add item on the left for semi join * Simplify pattern match Co-authored-by: Andrew Lamb --- ballista/rust/core/proto/ballista.proto | 1 + .../core/src/serde/logical_plan/from_proto.rs | 1 + .../core/src/serde/logical_plan/to_proto.rs | 1 + .../src/serde/physical_plan/from_proto.rs | 1 + .../core/src/serde/physical_plan/to_proto.rs | 1 + datafusion/src/logical_plan/builder.rs | 4 + datafusion/src/logical_plan/plan.rs | 4 +- .../src/optimizer/hash_build_probe_order.rs | 10 ++- datafusion/src/physical_plan/hash_join.rs | 89 +++++++++++++++---- datafusion/src/physical_plan/hash_utils.rs | 3 + datafusion/src/physical_plan/planner.rs | 1 + 11 files changed, 99 insertions(+), 17 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index da0c615e3b23e..03872147b797b 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -363,6 +363,7 @@ enum JoinType { LEFT = 1; RIGHT = 2; FULL = 3; + SEMI = 4; } message JoinNode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 10c4670e809aa..48471263885f6 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -265,6 +265,7 @@ impl TryInto for &protobuf::LogicalPlanNode { protobuf::JoinType::Left => JoinType::Left, protobuf::JoinType::Right => JoinType::Right, protobuf::JoinType::Full => JoinType::Full, + protobuf::JoinType::Semi => JoinType::Semi, }; LogicalPlanBuilder::from(&convert_box_required!(join.left)?) 
.join( diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index b630dfcc0d1b4..e1c0c5e44df64 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -834,6 +834,7 @@ impl TryInto for &LogicalPlan { JoinType::Left => protobuf::JoinType::Left, JoinType::Right => protobuf::JoinType::Right, JoinType::Full => protobuf::JoinType::Full, + JoinType::Semi => protobuf::JoinType::Semi, }; let left_join_column = on.iter().map(|on| on.0.to_owned()).collect(); let right_join_column = on.iter().map(|on| on.1.to_owned()).collect(); diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 2039def908bc0..7f98a8378b0b2 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -379,6 +379,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { protobuf::JoinType::Left => JoinType::Left, protobuf::JoinType::Right => JoinType::Right, protobuf::JoinType::Full => JoinType::Full, + protobuf::JoinType::Semi => JoinType::Semi, }; Ok(Arc::new(HashJoinExec::try_new( left, diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 9571f3de2e76b..c409f94749518 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -133,6 +133,7 @@ impl TryInto for Arc { JoinType::Left => protobuf::JoinType::Left, JoinType::Right => protobuf::JoinType::Right, JoinType::Full => protobuf::JoinType::Full, + JoinType::Semi => protobuf::JoinType::Semi, }; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index c02555d63314a..71de48cdb8f8f 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -410,6 +410,10 @@ fn build_join_schema( // left then right left_fields.chain(right_fields).cloned().collect() } + JoinType::Semi => { + // Only use the left side for the schema + left.fields().clone() + } JoinType::Right => { // remove left-side join keys if they have the same names as the right-side let duplicate_keys = &on diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 2d85abb64bbbd..5cb94be405e7b 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -34,7 +34,7 @@ use std::{ }; /// Join type -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum JoinType { /// Inner Join Inner, @@ -44,6 +44,8 @@ pub enum JoinType { Right, /// Full Join Full, + /// Semi Join + Semi, } /// A LogicalPlan represents the different types of relational diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 100ae4fb09b73..86d38ef313ce8 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -106,6 +106,13 @@ fn should_swap_join_order(left: &LogicalPlan, right: &LogicalPlan) -> bool { } } +fn supports_swap(join_type: JoinType) -> bool { + match join_type { + JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => true, + JoinType::Semi => false, + } +} + impl OptimizerRule for HashBuildProbeOrder 
{ fn name(&self) -> &str { "hash_build_probe_order" @@ -128,7 +135,7 @@ impl OptimizerRule for HashBuildProbeOrder { } => { let left = self.optimize(left, execution_props)?; let right = self.optimize(right, execution_props)?; - if should_swap_join_order(&left, &right) { + if should_swap_join_order(&left, &right) && supports_swap(*join_type) { // Swap left and right, change join type and (equi-)join key order Ok(LogicalPlan::Join { left: Arc::new(right), @@ -216,6 +223,7 @@ fn swap_join_type(join_type: JoinType) -> JoinType { JoinType::Full => JoinType::Full, JoinType::Left => JoinType::Right, JoinType::Right => JoinType::Left, + _ => unreachable!(), } } diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 01551cd4daf4c..6653b9a356a45 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -184,7 +184,7 @@ impl HashJoinExec { /// Calculates column indices and left/right placement on input / output schemas and jointype fn column_indices_from_schema(&self) -> ArrowResult> { let (primary_is_left, primary_schema, secondary_schema) = match self.join_type { - JoinType::Inner | JoinType::Left | JoinType::Full => { + JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Semi => { (true, self.left.schema(), self.right.schema()) } JoinType::Right => (false, self.right.schema(), self.left.schema()), @@ -376,7 +376,7 @@ impl ExecutionPlan for HashJoinExec { let column_indices = self.column_indices_from_schema()?; let num_rows = left_data.1.num_rows(); let visited_left_side = match self.join_type { - JoinType::Left | JoinType::Full => vec![false; num_rows], + JoinType::Left | JoinType::Full | JoinType::Semi => vec![false; num_rows], JoinType::Inner | JoinType::Right => vec![], }; Ok(Box::pin(HashJoinStream { @@ -544,6 +544,13 @@ fn build_batch( ) .unwrap(); + if join_type == JoinType::Semi { + return Ok(( + RecordBatch::new_empty(Arc::new(schema.clone())), + left_indices, + )); + } + build_batch_from_indices( schema, &left_data.1, @@ -606,7 +613,7 @@ fn build_join_indexes( let left = &left_data.0; match join_type { - JoinType::Inner => { + JoinType::Inner | JoinType::Semi => { // Using a buffer builder to avoid slower normal builder let mut left_indices = UInt64BufferBuilder::new(0); let mut right_indices = UInt32BufferBuilder::new(0); @@ -1108,23 +1115,35 @@ pub fn create_hashes<'a>( Ok(hashes_buffer) } -// Produces a batch for left-side rows that are not marked as being visited during the whole join -fn produce_unmatched( +// Produces a batch for left-side rows that have/have not been matched during the whole join +fn produce_from_matched( visited_left_side: &[bool], schema: &SchemaRef, column_indices: &[ColumnIndex], left_data: &JoinLeftData, + unmatched: bool, ) -> ArrowResult { // Find indices which didn't match any right row (are false) - let unmatched_indices: Vec = visited_left_side - .iter() - .enumerate() - .filter(|&(_, &value)| !value) - .map(|(index, _)| index as u64) - .collect(); + let indices = if unmatched { + UInt64Array::from_iter_values( + visited_left_side + .iter() + .enumerate() + .filter(|&(_, &value)| !value) + .map(|(index, _)| index as u64), + ) + } else { + // produce those that did match + UInt64Array::from_iter_values( + visited_left_side + .iter() + .enumerate() + .filter(|&(_, &value)| value) + .map(|(index, _)| index as u64), + ) + }; // generate batches by taking values from the left side and generating columns filled with null on the right side - let indices = 
UInt64Array::from_iter_values(unmatched_indices); let num_rows = indices.len(); let mut columns: Vec> = Vec::with_capacity(schema.fields().len()); for (idx, column_index) in column_indices.iter().enumerate() { @@ -1171,7 +1190,7 @@ impl Stream for HashJoinStream { self.num_output_rows += batch.num_rows(); match self.join_type { - JoinType::Left | JoinType::Full => { + JoinType::Left | JoinType::Full | JoinType::Semi => { left_side.iter().flatten().for_each(|x| { self.visited_left_side[x as usize] = true; }); @@ -1185,12 +1204,15 @@ impl Stream for HashJoinStream { let start = Instant::now(); // For the left join, produce rows for unmatched rows match self.join_type { - JoinType::Left | JoinType::Full if !self.is_exhausted => { - let result = produce_unmatched( + JoinType::Left | JoinType::Full | JoinType::Semi + if !self.is_exhausted => + { + let result = produce_from_matched( &self.visited_left_side, &self.schema, &self.column_indices, &self.left_data, + self.join_type != JoinType::Semi, ); if let Ok(ref batch) = result { self.num_input_batches += 1; @@ -1207,6 +1229,7 @@ impl Stream for HashJoinStream { } JoinType::Left | JoinType::Full + | JoinType::Semi | JoinType::Inner | JoinType::Right => {} } @@ -1666,6 +1689,42 @@ mod tests { Ok(()) } + #[tokio::test] + async fn join_semi() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 2, 3]), + ("b1", &vec![4, 5, 5, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 8, 9]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30, 40]), + ("b1", &vec![4, 5, 6, 5]), // 5 is double on the right + ("c2", &vec![70, 80, 90, 100]), + ); + let on = &[("b1", "b1")]; + + let join = join(left, right, on, &JoinType::Semi)?; + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1"]); + + let stream = join.execute(0).await?; + let batches = common::collect(stream).await?; + + let expected = vec![ + "+----+----+----+", + "| a1 | b1 | c1 |", + "+----+----+----+", + "| 1 | 4 | 7 |", + "| 2 | 5 | 8 |", + "| 2 | 5 | 8 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + + Ok(()) + } + #[tokio::test] async fn join_right_one() -> Result<()> { let left = build_table( diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 7e030af3a124c..110319e4bb6b8 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -32,6 +32,8 @@ pub enum JoinType { Right, /// Full Join Full, + /// Semi Join + Semi, } /// The on clause of the join, as vector of (left, right) columns. 
@@ -130,6 +132,7 @@ pub fn build_join_schema( // left then right left_fields.chain(right_fields).cloned().collect() } + JoinType::Semi => left.fields().clone(), }; Schema::new(fields) } diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 7ddfaf8f68972..4971a027ef1e4 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -367,6 +367,7 @@ impl DefaultPhysicalPlanner { JoinType::Left => hash_utils::JoinType::Left, JoinType::Right => hash_utils::JoinType::Right, JoinType::Full => hash_utils::JoinType::Full, + JoinType::Semi => hash_utils::JoinType::Semi, }; if ctx_state.config.concurrency > 1 && ctx_state.config.repartition_joins { From a1b8305cec4a9d2cd16797a876f98a5de410f605 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 3 Jun 2021 20:33:39 +0800 Subject: [PATCH 150/329] use prettier check in CI (#453) * use prettier check in CI * format two more files * fix docs/specification/invariants.md --- .github/workflows/dev.yml | 15 +- DEVELOPERS.md | 4 +- docs/specification/invariants.md | 22 +-- .../output-field-name-semantic.md | 132 ++++++++---------- 4 files changed, 88 insertions(+), 85 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 6c6dd830e1610..3f52ccd344505 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -23,7 +23,6 @@ on: pull_request: jobs: - lint: name: Lint C++, Python, R, Rust, Docker, RAT runs-on: ubuntu-latest @@ -37,3 +36,17 @@ jobs: run: pip install -e dev/archery[docker] - name: Lint run: archery lint --rat + + prettier: + name: Use prettier to check formatting of documents + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + node-version: "14" + - name: Prettier check + run: | + # if you encounter error, try rerun the command below with --write instead of --check + # and commit the changes + npx prettier@2.3.0 --check {ballista,datafusion,datafusion-examples,dev,docs,python}/**/*.md README.md DEVELOPERS.md diff --git a/DEVELOPERS.md b/DEVELOPERS.md index b915e2912c24e..cd0792f7fa2ca 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -105,8 +105,8 @@ new specifications as you see fit. Here is the list current active specifications: -* [Output field name semantic](docs/specification/output-field-name-semantic.md) -* [Invariants](docs/specification/invariants.md) +- [Output field name semantic](docs/specification/output-field-name-semantic.md) +- [Invariants](docs/specification/invariants.md) ## How to format `.md` document diff --git a/docs/specification/invariants.md b/docs/specification/invariants.md index 628bb7710b1f5..17b7c1dbbabac 100644 --- a/docs/specification/invariants.md +++ b/docs/specification/invariants.md @@ -42,11 +42,11 @@ know what they need to enforce at the coding level. ## Notation -* Field or physical field: the tuple name, `arrow::DataType` and nullability flag (a bool whether values can be null), represented in this document by `PF(name, type, nullable)` -* Logical field: Field with a relation name. Represented in this document by `LF(relation, name, type, nullable)` -* Projected plan: plan with projection as the root node. -* Logical schema: a vector of logical fields, used by logical plan. -* Physical schema: a vector of physical fields, used by both physical plan and Arrow record batch. 
+- Field or physical field: the tuple name, `arrow::DataType` and nullability flag (a bool whether values can be null), represented in this document by `PF(name, type, nullable)` +- Logical field: Field with a relation name. Represented in this document by `LF(relation, name, type, nullable)` +- Projected plan: plan with projection as the root node. +- Logical schema: a vector of logical fields, used by logical plan. +- Physical schema: a vector of physical fields, used by both physical plan and Arrow record batch. ### Logical @@ -62,8 +62,8 @@ logical_field(lf1: LF, lf2: LF, ...) -> LF Examples: -* `plus(a,b) -> LF(None, "{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is the function mapping input types to output type (`get_supertype` in our current implementation). -* `length(a) -> LF(None, "length({a})", u32, a.nullable)` +- `plus(a,b) -> LF(None, "{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is the function mapping input types to output type (`get_supertype` in our current implementation). +- `length(a) -> LF(None, "length({a})", u32, a.nullable)` #### Plan @@ -91,8 +91,8 @@ physical_field(PF1, PF2, ...) -> PF Examples: -* `plus(a,b) -> PF("{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is a complex function (`get_supertype` in our current implementation) whose computation is for each element in the columns, sum the two entries together and return it in the same type as the smallest type of both columns. -* `length(&str) -> PF("length({a})", u32, a.nullable)` whose computation is "count number of bytes in the string". +- `plus(a,b) -> PF("{a} Plus {b}", d(a.type,b.type), a.nullable | b.nullable)` where d is a complex function (`get_supertype` in our current implementation) whose computation is for each element in the columns, sum the two entries together and return it in the same type as the smallest type of both columns. +- `length(&str) -> PF("length({a})", u32, a.nullable)` whose computation is "count number of bytes in the string". #### Plan @@ -216,8 +216,8 @@ guarantee this invariant. In particular: -* The derived DataType matches the code it uses to build the array for every branch of valid input type combinations. -* The nullability flag matches how the values are built. +- The derived DataType matches the code it uses to build the array for every branch of valid input type combinations. +- The nullability flag matches how the values are built. #### Validation diff --git a/docs/specification/output-field-name-semantic.md b/docs/specification/output-field-name-semantic.md index fd28d118921b2..0407a17617e95 100644 --- a/docs/specification/output-field-name-semantic.md +++ b/docs/specification/output-field-name-semantic.md @@ -23,20 +23,20 @@ This specification documents how field names in output record batches should be generated based on given user queries. The filed name rules apply to Datafusion queries planned from both SQL queries and Dataframe APIs. -## Field name rules - -* All field names MUST not contain relation/table qualifier. - * Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` -* Function names MUST be converted to lowercase. - * `SELECT AVG(c1)` SHOULD result in field name: `avg(c1)` -* Literal string MUST not be wrapped with quotes or double quotes. - * `SELECT 'foo'` SHOULD result in field name: `foo` -* Operator expressions MUST be wrapped with parentheses. 
- * `SELECT -2` SHOULD result in field name: `(- 2)` -* Operator and operand MUST be separated by spaces. - * `SELECT 1+2` SHOULD result in field name: `(1 + 2)` -* Function arguments MUST be separated by a comma `,` and a space. - * `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(c1, c2)` +## Field name rules + +- All field names MUST not contain relation/table qualifier. + - Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` +- Function names MUST be converted to lowercase. + - `SELECT AVG(c1)` SHOULD result in field name: `avg(c1)` +- Literal string MUST not be wrapped with quotes or double quotes. + - `SELECT 'foo'` SHOULD result in field name: `foo` +- Operator expressions MUST be wrapped with parentheses. + - `SELECT -2` SHOULD result in field name: `(- 2)` +- Operator and operand MUST be separated by spaces. + - `SELECT 1+2` SHOULD result in field name: `(1 + 2)` +- Function arguments MUST be separated by a comma `,` and a space. + - `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(c1, c2)` ## Appendices @@ -66,26 +66,24 @@ JOIN t2 ON t1.id = t2.id Datafusion Arrow record batches output: -| id | a | id | b | -|----|-----|----|-------| -| 1 | foo | 1 | hello | -| 2 | bar | 2 | world | - +| id | a | id | b | +| --- | --- | --- | ----- | +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | Spark, MySQL 8 and PostgreSQL 13 output: -| id | a | id | b | -|----|-----|----|-------| -| 1 | foo | 1 | hello | -| 2 | bar | 2 | world | +| id | a | id | b | +| --- | --- | --- | ----- | +| 1 | foo | 1 | hello | +| 2 | bar | 2 | world | SQLite 3 output: -| id | a | b | -|----|-----|-------| -| 1 | foo | hello | -| 2 | bar | world | - +| id | a | b | +| --- | --- | ----- | +| 1 | foo | hello | +| 2 | bar | world | #### Function transformed columns @@ -98,41 +96,38 @@ SELECT ABS(t1.id), abs(-id) FROM t1; Datafusion Arrow record batches output: | abs(id) | abs((- id)) | -|---------|-------------| +| ------- | ----------- | | 1 | 1 | | 2 | 2 | - Spark output: | abs(id) | abs((- id)) | -|---------|-------------| +| ------- | ----------- | | 1 | 1 | | 2 | 2 | - MySQL 8 output: | ABS(t1.id) | abs(-id) | -|------------|----------| +| ---------- | -------- | | 1 | 1 | | 2 | 2 | PostgreSQL 13 output: | abs | abs | -|-----|-----| +| --- | --- | | 1 | 1 | | 2 | 2 | SQlite 3 output: | ABS(t1.id) | abs(-id) | -|------------|----------| +| ---------- | -------- | | 1 | 1 | | 2 | 2 | - #### Function with operators Query: @@ -143,40 +138,38 @@ SELECT t1.id + ABS(id), ABS(id * t1.id) FROM t1; Datafusion Arrow record batches output: -| id + abs(id) | abs(id * id) | -|--------------|--------------| -| 2 | 1 | -| 4 | 4 | - +| id + abs(id) | abs(id \* id) | +| ------------ | ------------- | +| 2 | 1 | +| 4 | 4 | Spark output: -| id + abs(id) | abs(id * id) | -|--------------|--------------| -| 2 | 1 | -| 4 | 4 | +| id + abs(id) | abs(id \* id) | +| ------------ | ------------- | +| 2 | 1 | +| 4 | 4 | MySQL 8 output: -| t1.id + ABS(id) | ABS(id * t1.id) | -|-----------------|-----------------| -| 2 | 1 | -| 4 | 4 | +| t1.id + ABS(id) | ABS(id \* t1.id) | +| --------------- | ---------------- | +| 2 | 1 | +| 4 | 4 | PostgreSQL output: | ?column? 
| abs | -|----------|-----| +| -------- | --- | | 2 | 1 | | 4 | 4 | SQLite output: -| t1.id + ABS(id) | ABS(id * t1.id) | -|-----------------|-----------------| -| 2 | 1 | -| 4 | 4 | - +| t1.id + ABS(id) | ABS(id \* t1.id) | +| --------------- | ---------------- | +| 2 | 1 | +| 4 | 4 | #### Project literals @@ -188,33 +181,30 @@ SELECT 1, 2+5, 'foo_bar'; Datafusion Arrow record batches output: -| 1 | (2 + 5) | foo_bar | -|---|---------|---------| -| 1 | 7 | foo_bar | - +| 1 | (2 + 5) | foo_bar | +| --- | ------- | ------- | +| 1 | 7 | foo_bar | Spark output: -| 1 | (2 + 5) | foo_bar | -|---|---------|---------| -| 1 | 7 | foo_bar | +| 1 | (2 + 5) | foo_bar | +| --- | ------- | ------- | +| 1 | 7 | foo_bar | MySQL output: -| 1 | 2+5 | foo_bar | -|---|-----|---------| -| 1 | 7 | foo_bar | - +| 1 | 2+5 | foo_bar | +| --- | --- | ------- | +| 1 | 7 | foo_bar | PostgreSQL output: | ?column? | ?column? | ?column? | -|----------|----------|----------| +| -------- | -------- | -------- | | 1 | 7 | foo_bar | - SQLite 3 output: -| 1 | 2+5 | 'foo_bar' | -|---|-----|-----------| -| 1 | 7 | foo_bar | +| 1 | 2+5 | 'foo_bar' | +| --- | --- | --------- | +| 1 | 7 | foo_bar | From 4392eea63e78a2c0afc68f6380fc85c4874b1245 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Thu, 3 Jun 2021 17:53:45 +0200 Subject: [PATCH 151/329] Remove reundant filters (e.g. c> 5 AND c>5 --> c>5) (#436) * #410: Remove reundant filters (e.g. c> 5 AND c>5 --> c>5) * RemoveDuplicateFilters: fix unit tests * fix lint * fix erroneous simplifications of arithmetic expressions * RemoveDuplicateFilters: add more simplification rules (@dandandan) * Remove unnecessary lifetime specifiers and other fixes ... * RemoveDuplicateFilter: fix is_one matches * Change remove_duplicate_filter to simplify_expressions * fix simplify expressions * fix lint and clippy * Update datafusion/src/optimizer/simplify_expressions.rs Co-authored-by: Andrew Lamb * Update datafusion/src/optimizer/simplify_expressions.rs Co-authored-by: Andrew Lamb * Update datafusion/src/optimizer/simplify_expressions.rs Co-authored-by: Andrew Lamb * Update datafusion/src/optimizer/simplify_expressions.rs Co-authored-by: Andrew Lamb * simplify test expressions Co-authored-by: Andrew Lamb --- datafusion/src/execution/context.rs | 2 + datafusion/src/optimizer/mod.rs | 1 + .../src/optimizer/simplify_expressions.rs | 541 ++++++++++++++++++ 3 files changed, 544 insertions(+) create mode 100644 datafusion/src/optimizer/simplify_expressions.rs diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index cfd3b7194429e..950ba2b88691c 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -59,6 +59,7 @@ use crate::optimizer::filter_push_down::FilterPushDown; use crate::optimizer::limit_push_down::LimitPushDown; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::projection_push_down::ProjectionPushDown; +use crate::optimizer::simplify_expressions::SimplifyExpressions; use crate::physical_optimizer::coalesce_batches::CoalesceBatches; use crate::physical_optimizer::merge_exec::AddMergeExec; use crate::physical_optimizer::repartition::Repartition; @@ -652,6 +653,7 @@ impl ExecutionConfig { Arc::new(EliminateLimit::new()), Arc::new(ProjectionPushDown::new()), Arc::new(FilterPushDown::new()), + Arc::new(SimplifyExpressions::new()), Arc::new(HashBuildProbeOrder::new()), Arc::new(LimitPushDown::new()), ], diff --git a/datafusion/src/optimizer/mod.rs b/datafusion/src/optimizer/mod.rs index 
2fb8a3d629509..e360a54f2a965 100644 --- a/datafusion/src/optimizer/mod.rs +++ b/datafusion/src/optimizer/mod.rs @@ -25,4 +25,5 @@ pub mod hash_build_probe_order; pub mod limit_push_down; pub mod optimizer; pub mod projection_push_down; +pub mod simplify_expressions; pub mod utils; diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs new file mode 100644 index 0000000000000..0697d689c4019 --- /dev/null +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -0,0 +1,541 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Simplify expressions optimizer rule + +use crate::execution::context::ExecutionProps; +use crate::logical_plan::LogicalPlan; +use crate::logical_plan::{lit, Expr}; +use crate::optimizer::optimizer::OptimizerRule; +use crate::optimizer::utils; +use crate::optimizer::utils::optimize_explain; +use crate::scalar::ScalarValue; +use crate::{error::Result, logical_plan::Operator}; + +/// Simplify expressions optimizer. +/// # Introduction +/// It uses boolean algebra laws to simplify or reduce the number of terms in expressions. +/// +/// Filter: b > 2 AND b > 2 +/// is optimized to +/// Filter: b > 2 +pub struct SimplifyExpressions {} + +fn expr_contains(expr: &Expr, needle: &Expr) -> bool { + match expr { + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } => expr_contains(left, needle) || expr_contains(right, needle), + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } => expr_contains(left, needle) || expr_contains(right, needle), + _ => expr == needle, + } +} + +fn as_binary_expr(expr: &Expr) -> Option<&Expr> { + match expr { + Expr::BinaryExpr { .. } => Some(expr), + _ => None, + } +} + +fn operator_is_boolean(op: Operator) -> bool { + op == Operator::And || op == Operator::Or +} + +fn is_one(s: &Expr) -> bool { + match s { + Expr::Literal(ScalarValue::Int8(Some(1))) + | Expr::Literal(ScalarValue::Int16(Some(1))) + | Expr::Literal(ScalarValue::Int32(Some(1))) + | Expr::Literal(ScalarValue::Int64(Some(1))) + | Expr::Literal(ScalarValue::UInt8(Some(1))) + | Expr::Literal(ScalarValue::UInt16(Some(1))) + | Expr::Literal(ScalarValue::UInt32(Some(1))) + | Expr::Literal(ScalarValue::UInt64(Some(1))) => true, + Expr::Literal(ScalarValue::Float32(Some(v))) if *v == 1. => true, + Expr::Literal(ScalarValue::Float64(Some(v))) if *v == 1. 
=> true, + _ => false, + } +} + +fn is_true(expr: &Expr) -> bool { + match expr { + Expr::Literal(ScalarValue::Boolean(Some(v))) => *v, + _ => false, + } +} + +fn is_null(expr: &Expr) -> bool { + match expr { + Expr::Literal(v) => v.is_null(), + _ => false, + } +} + +fn is_false(expr: &Expr) -> bool { + match expr { + Expr::Literal(ScalarValue::Boolean(Some(v))) => !(*v), + _ => false, + } +} + +fn simplify(expr: &Expr) -> Expr { + match expr { + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if is_true(left) || is_true(right) => lit(true), + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if is_false(left) => simplify(right), + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if is_false(right) => simplify(left), + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if left == right => simplify(left), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if is_false(left) || is_false(right) => lit(false), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if is_true(right) => simplify(left), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if is_true(left) => simplify(right), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if left == right => simplify(right), + Expr::BinaryExpr { + left, + op: Operator::Multiply, + right, + } if is_one(left) => simplify(right), + Expr::BinaryExpr { + left, + op: Operator::Multiply, + right, + } if is_one(right) => simplify(left), + Expr::BinaryExpr { + left, + op: Operator::Divide, + right, + } if is_one(right) => simplify(left), + Expr::BinaryExpr { + left, + op: Operator::Divide, + right, + } if left == right && is_null(left) => *left.clone(), + Expr::BinaryExpr { + left, + op: Operator::Divide, + right, + } if left == right => lit(1), + Expr::BinaryExpr { left, op, right } + if left == right && operator_is_boolean(*op) => + { + simplify(left) + } + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if expr_contains(left, right) => as_binary_expr(left) + .map(|x| match x { + Expr::BinaryExpr { + left: _, + op: Operator::Or, + right: _, + } => simplify(&x.clone()), + Expr::BinaryExpr { + left: _, + op: Operator::And, + right: _, + } => simplify(&*right.clone()), + _ => expr.clone(), + }) + .unwrap_or_else(|| expr.clone()), + Expr::BinaryExpr { + left, + op: Operator::Or, + right, + } if expr_contains(right, left) => as_binary_expr(right) + .map(|x| match x { + Expr::BinaryExpr { + left: _, + op: Operator::Or, + right: _, + } => simplify(&*right.clone()), + Expr::BinaryExpr { + left: _, + op: Operator::And, + right: _, + } => simplify(&*left.clone()), + _ => expr.clone(), + }) + .unwrap_or_else(|| expr.clone()), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if expr_contains(left, right) => as_binary_expr(left) + .map(|x| match x { + Expr::BinaryExpr { + left: _, + op: Operator::Or, + right: _, + } => simplify(&*right.clone()), + Expr::BinaryExpr { + left: _, + op: Operator::And, + right: _, + } => simplify(&x.clone()), + _ => expr.clone(), + }) + .unwrap_or_else(|| expr.clone()), + Expr::BinaryExpr { + left, + op: Operator::And, + right, + } if expr_contains(right, left) => as_binary_expr(right) + .map(|x| match x { + Expr::BinaryExpr { + left: _, + op: Operator::Or, + right: _, + } => simplify(&*left.clone()), + Expr::BinaryExpr { + left: _, + op: Operator::And, + right: _, + } => simplify(&x.clone()), + _ => expr.clone(), + }) + .unwrap_or_else(|| expr.clone()), + Expr::BinaryExpr { left, op, right } => Expr::BinaryExpr { + left: 
Box::new(simplify(&left)), + op: *op, + right: Box::new(simplify(right)), + }, + _ => expr.clone(), + } +} + +fn optimize(plan: &LogicalPlan) -> Result { + let new_inputs = plan + .inputs() + .iter() + .map(|input| optimize(input)) + .collect::>>()?; + let expr = plan + .expressions() + .into_iter() + .map(|x| simplify(&x)) + .collect::>(); + utils::from_plan(&plan, &expr, &new_inputs) +} + +impl OptimizerRule for SimplifyExpressions { + fn name(&self) -> &str { + "simplify_expressions" + } + + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { + match plan { + LogicalPlan::Explain { + verbose, + plan, + stringified_plans, + schema, + } => { + let schema = schema.as_ref().to_owned().into(); + optimize_explain( + self, + *verbose, + &*plan, + stringified_plans, + &schema, + execution_props, + ) + } + _ => optimize(plan), + } + } +} + +impl SimplifyExpressions { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::logical_plan::{and, binary_expr, col, lit, Expr, LogicalPlanBuilder}; + use crate::test::*; + + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { + let rule = SimplifyExpressions::new(); + let optimized_plan = rule + .optimize(plan, &ExecutionProps::new()) + .expect("failed to optimize plan"); + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + } + + #[test] + fn test_simplify_or_true() -> Result<()> { + let expr_a = col("c").or(lit(true)); + let expr_b = lit(true).or(col("c")); + let expected = lit(true); + + assert_eq!(simplify(&expr_a), expected); + assert_eq!(simplify(&expr_b), expected); + Ok(()) + } + + #[test] + fn test_simplify_or_false() -> Result<()> { + let expr_a = lit(false).or(col("c")); + let expr_b = col("c").or(lit(false)); + let expected = col("c"); + + assert_eq!(simplify(&expr_a), expected); + assert_eq!(simplify(&expr_b), expected); + Ok(()) + } + + #[test] + fn test_simplify_or_same() -> Result<()> { + let expr = col("c").or(col("c")); + let expected = col("c"); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_and_false() -> Result<()> { + let expr_a = lit(false).and(col("c")); + let expr_b = col("c").and(lit(false)); + let expected = lit(false); + + assert_eq!(simplify(&expr_a), expected); + assert_eq!(simplify(&expr_b), expected); + Ok(()) + } + + #[test] + fn test_simplify_and_same() -> Result<()> { + let expr = col("c").and(col("c")); + let expected = col("c"); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_and_true() -> Result<()> { + let expr_a = lit(true).and(col("c")); + let expr_b = col("c").and(lit(true)); + let expected = col("c"); + + assert_eq!(simplify(&expr_a), expected); + assert_eq!(simplify(&expr_b), expected); + Ok(()) + } + + #[test] + fn test_simplify_multiply_by_one() -> Result<()> { + let expr_a = binary_expr(col("c"), Operator::Multiply, lit(1)); + let expr_b = binary_expr(lit(1), Operator::Multiply, col("c")); + let expected = col("c"); + + assert_eq!(simplify(&expr_a), expected); + assert_eq!(simplify(&expr_b), expected); + Ok(()) + } + + #[test] + fn test_simplify_divide_by_one() -> Result<()> { + let expr = binary_expr(col("c"), Operator::Divide, lit(1)); + let expected = col("c"); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_divide_by_same() -> Result<()> { + let expr = binary_expr(col("c"), Operator::Divide, col("c")); + let expected = 
lit(1); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_simple_and() -> Result<()> { + // (c > 5) AND (c > 5) + let expr = (col("c").gt(lit(5))).and(col("c").gt(lit(5))); + let expected = col("c").gt(lit(5)); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_composed_and() -> Result<()> { + // ((c > 5) AND (d < 6)) AND (c > 5) + let expr = binary_expr( + binary_expr(col("c").gt(lit(5)), Operator::And, col("d").lt(lit(6))), + Operator::And, + col("c").gt(lit(5)), + ); + let expected = + binary_expr(col("c").gt(lit(5)), Operator::And, col("d").lt(lit(6))); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_negated_and() -> Result<()> { + // (c > 5) AND !(c > 5) -- can't remove + let expr = binary_expr( + col("c").gt(lit(5)), + Operator::And, + Expr::not(col("c").gt(lit(5))), + ); + let expected = expr.clone(); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_or_and() -> Result<()> { + // (c > 5) OR ((d < 6) AND (c > 5) -- can remove + let expr = binary_expr( + col("c").gt(lit(5)), + Operator::Or, + binary_expr(col("d").lt(lit(6)), Operator::And, col("c").gt(lit(5))), + ); + let expected = col("c").gt(lit(5)); + + assert_eq!(simplify(&expr), expected); + Ok(()) + } + + #[test] + fn test_simplify_and_and_false() -> Result<()> { + let expr = + binary_expr(lit(ScalarValue::Boolean(None)), Operator::And, lit(false)); + let expr_eq = lit(false); + + assert_eq!(simplify(&expr), expr_eq); + Ok(()) + } + + #[test] + fn test_simplify_divide_null_by_null() -> Result<()> { + let null = Expr::Literal(ScalarValue::Int32(None)); + let expr_plus = binary_expr(null.clone(), Operator::Divide, null.clone()); + let expr_eq = null; + + assert_eq!(simplify(&expr_plus), expr_eq); + Ok(()) + } + + #[test] + fn test_simplify_do_not_simplify_arithmetic_expr() -> Result<()> { + let expr_plus = binary_expr(lit(1), Operator::Plus, lit(1)); + let expr_eq = binary_expr(lit(1), Operator::Eq, lit(1)); + + assert_eq!(simplify(&expr_plus), expr_plus); + assert_eq!(simplify(&expr_eq), expr_eq); + + Ok(()) + } + + #[test] + fn test_simplify_optimized_plan() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(vec![col("a")])? + .filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))? + .build()?; + + assert_optimized_plan_eq( + &plan, + "\ + Filter: #b Gt Int32(1)\ + \n Projection: #a\ + \n TableScan: test projection=None", + ); + Ok(()) + } + + // ((c > 5) AND (d < 6)) AND (c > 5) --> (c > 5) AND (d < 6) + #[test] + fn test_simplify_optimized_plan_with_composed_and() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(vec![col("a")])? + .filter(and( + and(col("a").gt(lit(5)), col("b").lt(lit(6))), + col("a").gt(lit(5)), + ))? 
+ .build()?; + + assert_optimized_plan_eq( + &plan, + "\ + Filter: #a Gt Int32(5) And #b Lt Int32(6)\ + \n Projection: #a\ + \n TableScan: test projection=None", + ); + Ok(()) + } +} From 28b0dad82be302fea240bb6b177ff60abbd0f090 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Jun 2021 12:14:15 -0400 Subject: [PATCH 152/329] Avoid warnings when compiling without default features (#489) --- datafusion/src/physical_plan/functions.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 367e594f6e977..eb312cabd7f0f 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -1275,7 +1275,7 @@ mod tests { use arrow::{ array::{ Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeListArray, Float64Array, - Int32Array, ListArray, StringArray, UInt32Array, UInt64Array, + Int32Array, StringArray, UInt32Array, UInt64Array, }, datatypes::Field, record_batch::RecordBatch, @@ -3555,6 +3555,7 @@ mod tests { #[test] #[cfg(feature = "regex_expressions")] fn test_regexp_match() -> Result<()> { + use arrow::array::ListArray; let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); let ctx_state = ExecutionContextState::new(); @@ -3594,6 +3595,7 @@ mod tests { #[test] #[cfg(feature = "regex_expressions")] fn test_regexp_match_all_literals() -> Result<()> { + use arrow::array::ListArray; let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let ctx_state = ExecutionContextState::new(); From e82d053b526d669e9c845e3fda70147aaf7d3488 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 4 Jun 2021 05:18:53 +0800 Subject: [PATCH 153/329] fix window expression with alias (#463) --- ballista/rust/core/proto/ballista.proto | 14 +- .../core/src/serde/logical_plan/from_proto.rs | 12 +- .../core/src/serde/logical_plan/to_proto.rs | 39 ++-- .../src/serde/physical_plan/from_proto.rs | 14 +- datafusion/src/logical_plan/builder.rs | 24 +- datafusion/src/logical_plan/expr.rs | 23 +- datafusion/src/logical_plan/plan.rs | 28 +-- .../src/optimizer/projection_push_down.rs | 45 ++-- datafusion/src/optimizer/utils.rs | 39 ++-- datafusion/src/physical_plan/planner.rs | 7 +- datafusion/src/sql/mod.rs | 2 +- datafusion/src/sql/planner.rs | 220 +++++++++++++++--- datafusion/src/sql/utils.rs | 211 ++++++++++++++++- 13 files changed, 508 insertions(+), 170 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 03872147b797b..d21cbf694b9d4 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -174,6 +174,12 @@ message WindowExprNode { // udaf = 3 } LogicalExprNode expr = 4; + // repeated LogicalExprNode partition_by = 5; + repeated LogicalExprNode order_by = 6; + // repeated LogicalExprNode filter = 7; + // oneof window_frame { + // WindowFrame frame = 8; + // } } message BetweenNode { @@ -317,14 +323,6 @@ message AggregateNode { message WindowNode { LogicalPlanNode input = 1; repeated LogicalExprNode window_expr = 2; - repeated LogicalExprNode partition_by_expr = 3; - repeated LogicalExprNode order_by_expr = 4; - // "optional" keyword is stable in protoc 3.15 but prost is still on 3.14 (see https://github.com/danburkert/prost/issues/430) - // this syntax is ugly but is binary compatible with the "optional" keyword (see https://stackoverflow.com/questions/42622015/how-to-define-an-optional-field-in-protobuf-3) - oneof window_frame { - 
WindowFrame frame = 5; - } - // TODO add filter by expr } enum WindowFrameUnits { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 48471263885f6..522d60cb8a054 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -98,9 +98,7 @@ impl TryInto for &protobuf::LogicalPlanNode { // // FIXME: parse the window_frame data // let window_frame = None; LogicalPlanBuilder::from(&input) - .window( - window_expr, /* filter_by_expr, partition_by_expr, order_by_expr, window_frame*/ - )? + .window(window_expr)? .build() .map_err(|e| e.into()) } @@ -924,6 +922,12 @@ impl TryInto for &protobuf::LogicalExprNode { .window_function .as_ref() .ok_or_else(|| proto_error("Received empty window function"))?; + let order_by = expr + .order_by + .iter() + .map(|e| e.try_into()) + .into_iter() + .collect::, _>>()?; match window_function { window_expr_node::WindowFunction::AggrFunction(i) => { let aggr_function = protobuf::AggregateFunction::from_i32(*i) @@ -939,6 +943,7 @@ impl TryInto for &protobuf::LogicalExprNode { AggregateFunction::from(aggr_function), ), args: vec![parse_required_expr(&expr.expr)?], + order_by, }) } window_expr_node::WindowFunction::BuiltInFunction(i) => { @@ -957,6 +962,7 @@ impl TryInto for &protobuf::LogicalExprNode { BuiltInWindowFunction::from(built_in_function), ), args: vec![parse_required_expr(&expr.expr)?], + order_by, }) } } diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index e1c0c5e44df64..088e93120e4f4 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -761,27 +761,9 @@ impl TryInto for &LogicalPlan { }) } LogicalPlan::Window { - input, - window_expr, - // FIXME implement next - // filter_by_expr, - // FIXME implement next - // partition_by_expr, - // FIXME implement next - // order_by_expr, - // FIXME implement next - // window_frame, - .. + input, window_expr, .. } => { let input: protobuf::LogicalPlanNode = input.as_ref().try_into()?; - // FIXME: implement - // let filter_by_expr = vec![]; - // FIXME: implement - let partition_by_expr = vec![]; - // FIXME: implement - let order_by_expr = vec![]; - // FIXME: implement - let window_frame = None; Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Window(Box::new( protobuf::WindowNode { @@ -789,10 +771,7 @@ impl TryInto for &LogicalPlan { window_expr: window_expr .iter() .map(|expr| expr.try_into()) - .collect::, BallistaError>>()?, - partition_by_expr, - order_by_expr, - window_frame, + .collect::, _>>()?, }, ))), }) @@ -811,11 +790,11 @@ impl TryInto for &LogicalPlan { group_expr: group_expr .iter() .map(|expr| expr.try_into()) - .collect::, BallistaError>>()?, + .collect::, _>>()?, aggr_expr: aggr_expr .iter() .map(|expr| expr.try_into()) - .collect::, BallistaError>>()?, + .collect::, _>>()?, }, ))), }) @@ -1024,7 +1003,10 @@ impl TryInto for &Expr { }) } Expr::WindowFunction { - ref fun, ref args, .. + ref fun, + ref args, + ref order_by, + .. 
} => { let window_function = match fun { WindowFunction::AggregateFunction(fun) => { @@ -1039,9 +1021,14 @@ impl TryInto for &Expr { } }; let arg = &args[0]; + let order_by = order_by + .iter() + .map(|e| e.try_into()) + .collect::, _>>()?; let window_expr = Box::new(protobuf::WindowExprNode { expr: Some(Box::new(arg.try_into()?)), window_function: Some(window_function), + order_by, }); Ok(protobuf::LogicalExprNode { expr_type: Some(ExprType::WindowExpr(window_expr)), diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 7f98a8378b0b2..c19739a6b061b 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -233,7 +233,11 @@ impl TryInto> for &protobuf::PhysicalPlanNode { for (expr, name) in &window_agg_expr { match expr { - Expr::WindowFunction { fun, args } => { + Expr::WindowFunction { + fun, + args, + order_by, + } => { let arg = df_planner .create_physical_expr( &args[0], @@ -243,12 +247,16 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .map_err(|e| { BallistaError::General(format!("{:?}", e)) })?; - physical_window_expr.push(create_window_expr( + if !order_by.is_empty() { + return Err(BallistaError::NotImplemented("Window function with order by is not yet implemented".to_owned())); + } + let window_expr = create_window_expr( &fun, &[arg], &physical_schema, name.to_owned(), - )?); + )?; + physical_window_expr.push(window_expr); } _ => { return Err(BallistaError::General( diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 71de48cdb8f8f..dc80a41c0c01b 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -297,23 +297,7 @@ impl LogicalPlanBuilder { /// - https://github.com/apache/arrow-datafusion/issues/299 with partition clause /// - https://github.com/apache/arrow-datafusion/issues/360 with order by /// - https://github.com/apache/arrow-datafusion/issues/361 with window frame - pub fn window( - &self, - window_expr: impl IntoIterator, - // FIXME: implement next - // filter_by_expr: impl IntoIterator, - // FIXME: implement next - // partition_by_expr: impl IntoIterator, - // FIXME: implement next - // order_by_expr: impl IntoIterator, - // FIXME: implement next - // window_frame: Option, - ) -> Result { - let window_expr = window_expr.into_iter().collect::>(); - // FIXME: implement next - // let partition_by_expr = partition_by_expr.into_iter().collect::>(); - // FIXME: implement next - // let order_by_expr = order_by_expr.into_iter().collect::>(); + pub fn window(&self, window_expr: Vec) -> Result { let all_expr = window_expr.iter(); validate_unique_names("Windows", all_expr.clone(), self.plan.schema())?; @@ -323,12 +307,6 @@ impl LogicalPlanBuilder { Ok(Self::from(&LogicalPlan::Window { input: Arc::new(self.plan.clone()), - // FIXME implement next - // partition_by_expr, - // FIXME implement next - // order_by_expr, - // FIXME implement next - // window_frame, window_expr, schema: Arc::new(DFSchema::new(window_fields)?), })) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 29723e73d25ca..5103d5dc5051c 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -197,6 +197,8 @@ pub enum Expr { fun: window_functions::WindowFunction, /// List of expressions to feed to the functions as arguments args: Vec, + /// List of order by expressions + order_by: Vec, }, 
/// aggregate function AggregateUDF { @@ -587,9 +589,15 @@ impl Expr { Expr::ScalarUDF { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), - Expr::WindowFunction { args, .. } => args - .iter() - .try_fold(visitor, |visitor, arg| arg.accept(visitor)), + Expr::WindowFunction { args, order_by, .. } => { + let visitor = args + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor))?; + let visitor = order_by + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor))?; + Ok(visitor) + } Expr::AggregateFunction { args, .. } => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), @@ -723,9 +731,14 @@ impl Expr { args: rewrite_vec(args, rewriter)?, fun, }, - Expr::WindowFunction { args, fun } => Expr::WindowFunction { + Expr::WindowFunction { + args, + fun, + order_by, + } => Expr::WindowFunction { args: rewrite_vec(args, rewriter)?, fun, + order_by: rewrite_vec(order_by, rewriter)?, }, Expr::AggregateFunction { args, @@ -1388,7 +1401,7 @@ fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { Expr::ScalarUDF { fun, args, .. } => { create_function_name(&fun.name, false, args, input_schema) } - Expr::WindowFunction { fun, args } => { + Expr::WindowFunction { fun, args, .. } => { create_function_name(&fun.to_string(), false, args, input_schema) } Expr::AggregateFunction { diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 5cb94be405e7b..fe1dfb6de990f 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -92,8 +92,6 @@ pub enum LogicalPlan { // filter_by_expr: Vec, /// Partition by expressions // partition_by_expr: Vec, - /// Order by expressions - // order_by_expr: Vec, /// Window Frame // window_frame: Option, /// The schema description of the window output @@ -306,25 +304,12 @@ impl LogicalPlan { Partitioning::Hash(expr, _) => expr.clone(), _ => vec![], }, - LogicalPlan::Window { - window_expr, - // FIXME implement next - // filter_by_expr, - // FIXME implement next - // partition_by_expr, - // FIXME implement next - // order_by_expr, - .. - } => window_expr.clone(), + LogicalPlan::Window { window_expr, .. } => window_expr.clone(), LogicalPlan::Aggregate { group_expr, aggr_expr, .. - } => { - let mut result = group_expr.clone(); - result.extend(aggr_expr.clone()); - result - } + } => group_expr.iter().chain(aggr_expr.iter()).cloned().collect(), LogicalPlan::Join { on, .. } => { on.iter().flat_map(|(l, r)| vec![col(l), col(r)]).collect() } @@ -698,16 +683,11 @@ impl LogicalPlan { .. } => write!(f, "Filter: {:?}", expr), LogicalPlan::Window { - ref window_expr, - // FIXME implement next - // ref partition_by_expr, - // FIXME implement next - // ref order_by_expr, - .. + ref window_expr, .. 
} => { write!( f, - "WindowAggr: windowExpr=[{:?}] partitionBy=[], orderBy=[]", + "WindowAggr: windowExpr=[{:?}] partitionBy=[]", window_expr ) } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index e47832b07f921..f0b364ab9852a 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -23,6 +23,7 @@ use crate::execution::context::ExecutionProps; use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, LogicalPlan, ToDFSchema}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; +use crate::sql::utils::find_sort_exprs; use arrow::datatypes::Schema; use arrow::error::Result as ArrowResult; use std::{collections::HashSet, sync::Arc}; @@ -197,29 +198,29 @@ fn optimize_plan( schema, window_expr, input, - // FIXME implement next - // filter_by_expr, - // FIXME implement next - // partition_by_expr, - // FIXME implement next - // order_by_expr, - // FIXME implement next - // window_frame, .. } => { // Gather all columns needed for expressions in this Window let mut new_window_expr = Vec::new(); - window_expr.iter().try_for_each(|expr| { - let name = &expr.name(&schema)?; - if required_columns.contains(name) { - new_window_expr.push(expr.clone()); - new_required_columns.insert(name.clone()); - // add to the new set of required columns - utils::expr_to_column_names(expr, &mut new_required_columns) - } else { - Ok(()) - } - })?; + { + window_expr.iter().try_for_each(|expr| { + let name = &expr.name(&schema)?; + if required_columns.contains(name) { + new_window_expr.push(expr.clone()); + new_required_columns.insert(name.clone()); + // add to the new set of required columns + utils::expr_to_column_names(expr, &mut new_required_columns) + } else { + Ok(()) + } + })?; + } + + // for all the retained window expr, find their sort expressions if any, and retain these + utils::exprlist_to_column_names( + &find_sort_exprs(&new_window_expr), + &mut new_required_columns, + )?; let new_schema = DFSchema::new( schema @@ -232,12 +233,6 @@ fn optimize_plan( Ok(LogicalPlan::Window { window_expr: new_window_expr, - // FIXME implement next - // partition_by_expr: partition_by_expr.clone(), - // FIXME implement next - // order_by_expr: order_by_expr.clone(), - // FIXME implement next - // window_frame: window_frame.clone(), input: Arc::new(optimize_plan( optimizer, &input, diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 284ead252ac67..2cb65066feb93 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -36,6 +36,7 @@ use crate::{ const CASE_EXPR_MARKER: &str = "__DATAFUSION_CASE_EXPR__"; const CASE_ELSE_MARKER: &str = "__DATAFUSION_CASE_ELSE__"; +const WINDOW_SORT_MARKER: &str = "__DATAFUSION_WINDOW_SORT__"; /// Recursively walk a list of expression trees, collecting the unique set of column /// names referenced in the expression @@ -190,14 +191,6 @@ pub fn from_plan( }), }, LogicalPlan::Window { - // FIXME implement next - // filter_by_expr, - // FIXME implement next - // partition_by_expr, - // FIXME implement next - // order_by_expr, - // FIXME implement next - // window_frame, window_expr, schema, .. @@ -265,7 +258,13 @@ pub fn expr_sub_expressions(expr: &Expr) -> Result> { Expr::IsNotNull(e) => Ok(vec![e.as_ref().to_owned()]), Expr::ScalarFunction { args, .. } => Ok(args.clone()), Expr::ScalarUDF { args, .. } => Ok(args.clone()), - Expr::WindowFunction { args, .. 
} => Ok(args.clone()), + Expr::WindowFunction { args, order_by, .. } => { + let mut expr_list: Vec = vec![]; + expr_list.extend(args.clone()); + expr_list.push(lit(WINDOW_SORT_MARKER)); + expr_list.extend(order_by.clone()); + Ok(expr_list) + } Expr::AggregateFunction { args, .. } => Ok(args.clone()), Expr::AggregateUDF { args, .. } => Ok(args.clone()), Expr::Case { @@ -338,10 +337,24 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { fun: fun.clone(), args: expressions.to_vec(), }), - Expr::WindowFunction { fun, .. } => Ok(Expr::WindowFunction { - fun: fun.clone(), - args: expressions.to_vec(), - }), + Expr::WindowFunction { fun, .. } => { + let index = expressions + .iter() + .position(|expr| { + matches!(expr, Expr::Literal(ScalarValue::Utf8(Some(str))) + if str == WINDOW_SORT_MARKER) + }) + .ok_or_else(|| { + DataFusionError::Internal( + "Ill-formed window function expressions".to_owned(), + ) + })?; + Ok(Expr::WindowFunction { + fun: fun.clone(), + args: expressions[..index].to_vec(), + order_by: expressions[index + 1..].to_vec(), + }) + } Expr::AggregateFunction { fun, distinct, .. } => Ok(Expr::AggregateFunction { fun: fun.clone(), args: expressions.to_vec(), diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 4971a027ef1e4..b77850f9d67fe 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -746,13 +746,18 @@ impl DefaultPhysicalPlanner { }; match e { - Expr::WindowFunction { fun, args } => { + Expr::WindowFunction { fun, args, .. } => { let args = args .iter() .map(|e| { self.create_physical_expr(e, physical_input_schema, ctx_state) }) .collect::>>()?; + // if !order_by.is_empty() { + // return Err(DataFusionError::NotImplemented( + // "Window function with order by is not yet implemented".to_owned(), + // )); + // } windows::create_window_expr(fun, &args, physical_input_schema, name) } other => Err(DataFusionError::Internal(format!( diff --git a/datafusion/src/sql/mod.rs b/datafusion/src/sql/mod.rs index 456ad4c2e3611..cc8b004505fbc 100644 --- a/datafusion/src/sql/mod.rs +++ b/datafusion/src/sql/mod.rs @@ -20,4 +20,4 @@ pub mod parser; pub mod planner; -mod utils; +pub(crate) mod utils; diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 63499aa1abe22..3b8acc67ccb23 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -54,8 +54,8 @@ use super::{ parser::DFParser, utils::{ can_columns_satisfy_exprs, expand_wildcard, expr_as_column_expr, extract_aliases, - find_aggregate_exprs, find_column_exprs, find_window_exprs, rebase_expr, - resolve_aliases_to_exprs, + find_aggregate_exprs, find_column_exprs, find_window_exprs, + group_window_expr_by_sort_keys, rebase_expr, resolve_aliases_to_exprs, }, }; @@ -628,7 +628,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let (plan, exprs) = if window_func_exprs.is_empty() { (plan, select_exprs_post_aggr) } else { - self.window(&plan, window_func_exprs, &select_exprs_post_aggr)? + self.window(plan, window_func_exprs, &select_exprs_post_aggr)? }; let plan = if select.distinct { @@ -670,13 +670,28 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in a window fn window( &self, - input: &LogicalPlan, + input: LogicalPlan, window_exprs: Vec, select_exprs: &[Expr], ) -> Result<(LogicalPlan, Vec)> { - let plan = LogicalPlanBuilder::from(input) - .window(window_exprs.clone())? 
- .build()?; + let mut plan = input; + let mut groups = group_window_expr_by_sort_keys(&window_exprs)?; + // sort by sort_key len descending, so that more deeply sorted plans gets nested further + // down as children; to further minic the behavior of PostgreSQL, we want stable sort + // and a reverse so that tieing sort keys are reversed in order; note that by this rule + // if there's an empty over, it'll be at the top level + groups.sort_by(|(key_a, _), (key_b, _)| key_a.len().cmp(&key_b.len())); + groups.reverse(); + for (sort_keys, exprs) in groups { + if !sort_keys.is_empty() { + let sort_keys: Vec = sort_keys.to_vec(); + plan = LogicalPlanBuilder::from(&plan).sort(sort_keys)?.build()?; + } + let window_exprs: Vec = exprs.into_iter().cloned().collect(); + plan = LogicalPlanBuilder::from(&plan) + .window(window_exprs)? + .build()?; + } let select_exprs = select_exprs .iter() .map(|expr| rebase_expr(expr, &window_exprs, &plan)) @@ -779,21 +794,24 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return Ok(plan.clone()); } - let input_schema = plan.schema(); - let order_by_rex: Result> = order_by + let order_by_rex = order_by .iter() - .map(|e| { - Ok(Expr::Sort { - expr: Box::new(self.sql_to_rex(&e.expr, &input_schema)?), - // by default asc - asc: e.asc.unwrap_or(true), - // by default nulls first to be consistent with spark - nulls_first: e.nulls_first.unwrap_or(true), - }) - }) - .collect(); + .map(|e| self.order_by_to_sort_expr(e)) + .into_iter() + .collect::>>()?; - LogicalPlanBuilder::from(&plan).sort(order_by_rex?)?.build() + LogicalPlanBuilder::from(&plan).sort(order_by_rex)?.build() + } + + /// convert sql OrderByExpr to Expr::Sort + fn order_by_to_sort_expr(&self, e: &OrderByExpr) -> Result { + Ok(Expr::Sort { + expr: Box::new(self.sql_expr_to_logical_expr(&e.expr)?), + // by default asc + asc: e.asc.unwrap_or(true), + // by default nulls first to be consistent with spark + nulls_first: e.nulls_first.unwrap_or(true), + }) } /// Validate the schema provides all of the columns referenced in the expressions. @@ -982,7 +1000,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { UnaryOperator::Plus => Ok(self.sql_expr_to_logical_expr(expr)?), UnaryOperator::Minus => { match expr.as_ref() { - // optimization: if it's a number literal, we applly the negative operator + // optimization: if it's a number literal, we apply the negative operator // here directly to calculate the new literal. 
SQLExpr::Value(Value::Number(n,_)) => match n.parse::() { Ok(n) => Ok(lit(-n)), @@ -1091,10 +1109,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // then, window function if let Some(window) = &function.over { - if window.partition_by.is_empty() - && window.order_by.is_empty() - && window.window_frame.is_none() - { + if window.partition_by.is_empty() && window.window_frame.is_none() { + let order_by = window + .order_by + .iter() + .map(|e| self.order_by_to_sort_expr(e)) + .into_iter() + .collect::>>()?; let fun = window_functions::WindowFunction::from_str(&name); if let Ok(window_functions::WindowFunction::AggregateFunction( aggregate_fun, @@ -1106,6 +1127,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ), args: self .aggregate_fn_to_expr(&aggregate_fun, function)?, + order_by, }); } else if let Ok( window_functions::WindowFunction::BuiltInWindowFunction( @@ -1118,6 +1140,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { window_fun, ), args:self.function_args_to_expr(function)?, + order_by }); } } @@ -2702,7 +2725,7 @@ mod tests { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(order_id)\ - \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[], orderBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2712,7 +2735,7 @@ mod tests { let sql = "SELECT order_id oid, MAX(order_id) OVER () max_oid from orders"; let expected = "\ Projection: #order_id AS oid, #MAX(order_id) AS max_oid\ - \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[], orderBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2722,7 +2745,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(qty Multiply Float64(1.1))\ - \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]] partitionBy=[], orderBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]] partitionBy=[]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2733,7 +2756,7 @@ mod tests { "SELECT order_id, MAX(qty) OVER (), min(qty) over (), aVg(qty) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #MIN(qty), #AVG(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]] partitionBy=[], orderBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]] partitionBy=[]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2749,14 +2772,139 @@ mod tests { ); } + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------------------- + /// WindowAgg (cost=137.16..154.66 rows=1000 width=12) + /// -> Sort (cost=137.16..139.66 rows=1000 width=12) + /// Sort Key: order_id + /// -> WindowAgg (cost=69.83..87.33 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id DESC + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + #[test] + fn over_order_by() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id DESC NULLS 
FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ----------------------------------------------------------------------------------- + /// WindowAgg (cost=142.16..162.16 rows=1000 width=16) + /// -> Sort (cost=142.16..144.66 rows=1000 width=16) + /// Sort Key: order_id + /// -> WindowAgg (cost=72.33..92.33 rows=1000 width=16) + /// -> Sort (cost=72.33..74.83 rows=1000 width=12) + /// Sort Key: ((order_id + 1)) + /// -> Seq Scan on orders (cost=0.00..22.50 rows=1000 width=12) + /// ``` + #[test] + fn over_order_by_two_sort_keys() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY (order_id + 1)) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id Plus Int64(1) ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------------------------- + /// WindowAgg (cost=139.66..172.16 rows=1000 width=24) + /// -> WindowAgg (cost=139.66..159.66 rows=1000 width=16) + /// -> Sort (cost=139.66..142.16 rows=1000 width=12) + /// Sort Key: qty, order_id + /// -> WindowAgg (cost=69.83..89.83 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id, qty + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + #[test] + fn over_order_by_sort_keys_sorting() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------------------- + /// WindowAgg (cost=69.83..117.33 rows=1000 width=24) + /// -> WindowAgg (cost=69.83..104.83 rows=1000 width=16) + /// -> WindowAgg (cost=69.83..89.83 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id, qty + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + /// + /// FIXME: for now we are not detecting prefix of sorting keys in order to save one sort exec phase #[test] - fn over_order_by_not_supported() { - let sql = "SELECT order_id, MAX(delivered) OVER (order BY order_id) from orders"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "NotImplemented(\"Unsupported OVER clause (ORDER BY order_id)\")", - format!("{:?}", err) - ); + fn over_order_by_sort_keys_sorting_prefix_compacting() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]] 
partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------------------------- + /// WindowAgg (cost=139.66..172.16 rows=1000 width=24) + /// -> WindowAgg (cost=139.66..159.66 rows=1000 width=16) + /// -> Sort (cost=139.66..142.16 rows=1000 width=12) + /// Sort Key: order_id, qty + /// -> WindowAgg (cost=69.83..89.83 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: qty, order_id + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + /// + /// FIXME: for now we are not detecting prefix of sorting keys in order to re-arrange with global + /// sort + #[test] + fn over_order_by_sort_keys_sorting_global_order_compacting() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders ORDER BY order_id"; + let expected = "\ + Sort: #order_id ASC NULLS FIRST\ + \n Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); } #[test] diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 70b9df0608397..80a25d04468fb 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! SQL Utility Functions + use crate::logical_plan::{DFSchema, Expr, LogicalPlan}; use crate::{ error::{DataFusionError, Result}, @@ -46,6 +48,14 @@ pub(crate) fn find_aggregate_exprs(exprs: &[Expr]) -> Vec { }) } +/// Collect all deeply nested `Expr::Sort`. They are returned in order of occurrence +/// (depth first), with duplicates omitted. +pub(crate) fn find_sort_exprs(exprs: &[Expr]) -> Vec { + find_exprs_in_exprs(exprs, &|nested_expr| { + matches!(nested_expr, Expr::Sort { .. }) + }) +} + /// Collect all deeply nested `Expr::WindowFunction`. They are returned in order of occurrence /// (depth first), with duplicates omitted. pub(crate) fn find_window_exprs(exprs: &[Expr]) -> Vec { @@ -225,12 +235,20 @@ where .collect::>>()?, distinct: *distinct, }), - Expr::WindowFunction { fun, args } => Ok(Expr::WindowFunction { + Expr::WindowFunction { + fun, + args, + order_by, + } => Ok(Expr::WindowFunction { fun: fun.clone(), args: args .iter() .map(|e| clone_with_replacement(e, replacement_fn)) - .collect::>>()?, + .collect::>>()?, + order_by: order_by + .iter() + .map(|e| clone_with_replacement(e, replacement_fn)) + .collect::>>()?, }), Expr::AggregateUDF { fun, args } => Ok(Expr::AggregateUDF { fun: fun.clone(), @@ -389,3 +407,192 @@ pub(crate) fn resolve_aliases_to_exprs( _ => Ok(None), }) } + +/// group a slice of window expression expr by their order by expressions +pub(crate) fn group_window_expr_by_sort_keys( + window_expr: &[Expr], +) -> Result)>> { + let mut result = vec![]; + window_expr.iter().try_for_each(|expr| match expr { + Expr::WindowFunction { order_by, .. 
} => { + if let Some((_, values)) = result.iter_mut().find( + |group: &&mut (&[Expr], Vec<&Expr>)| matches!(group, (key, _) if key == order_by), + ) { + values.push(expr); + } else { + result.push((order_by, vec![expr])) + } + Ok(()) + } + other => Err(DataFusionError::Internal(format!( + "Impossibly got non-window expr {:?}", + other, + ))), + })?; + Ok(result) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::logical_plan::col; + use crate::physical_plan::aggregates::AggregateFunction; + use crate::physical_plan::window_functions::WindowFunction; + + #[test] + fn test_group_window_expr_by_sort_keys_empty_case() -> Result<()> { + let result = group_window_expr_by_sort_keys(&[])?; + let expected: Vec<(&[Expr], Vec<&Expr>)> = vec![]; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_group_window_expr_by_sort_keys_empty_window() -> Result<()> { + let max1 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Max), + args: vec![col("name")], + order_by: vec![], + }; + let max2 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Max), + args: vec![col("name")], + order_by: vec![], + }; + let min3 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Min), + args: vec![col("name")], + order_by: vec![], + }; + let sum4 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), + args: vec![col("age")], + order_by: vec![], + }; + // FIXME use as_ref + let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; + let result = group_window_expr_by_sort_keys(exprs)?; + let key = &[]; + let expected: Vec<(&[Expr], Vec<&Expr>)> = + vec![(key, vec![&max1, &max2, &min3, &sum4])]; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_group_window_expr_by_sort_keys() -> Result<()> { + let age_asc = Expr::Sort { + expr: Box::new(col("age")), + asc: true, + nulls_first: true, + }; + let name_desc = Expr::Sort { + expr: Box::new(col("name")), + asc: false, + nulls_first: true, + }; + let created_at_desc = Expr::Sort { + expr: Box::new(col("created_at")), + asc: false, + nulls_first: true, + }; + let max1 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Max), + args: vec![col("name")], + order_by: vec![age_asc.clone(), name_desc.clone()], + }; + let max2 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Max), + args: vec![col("name")], + order_by: vec![], + }; + let min3 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Min), + args: vec![col("name")], + order_by: vec![age_asc.clone(), name_desc.clone()], + }; + let sum4 = Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), + args: vec![col("age")], + order_by: vec![name_desc.clone(), age_asc.clone(), created_at_desc.clone()], + }; + // FIXME use as_ref + let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; + let result = group_window_expr_by_sort_keys(exprs)?; + + let key1 = &[age_asc.clone(), name_desc.clone()]; + let key2 = &[]; + let key3 = &[name_desc, age_asc, created_at_desc]; + + let expected: Vec<(&[Expr], Vec<&Expr>)> = vec![ + (key1, vec![&max1, &min3]), + (key2, vec![&max2]), + (key3, vec![&sum4]), + ]; + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_find_sort_exprs() -> Result<()> { + let exprs = &[ + Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Max), + 
args: vec![col("name")], + order_by: vec![ + Expr::Sort { + expr: Box::new(col("age")), + asc: true, + nulls_first: true, + }, + Expr::Sort { + expr: Box::new(col("name")), + asc: false, + nulls_first: true, + }, + ], + }, + Expr::WindowFunction { + fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), + args: vec![col("age")], + order_by: vec![ + Expr::Sort { + expr: Box::new(col("name")), + asc: false, + nulls_first: true, + }, + Expr::Sort { + expr: Box::new(col("age")), + asc: true, + nulls_first: true, + }, + Expr::Sort { + expr: Box::new(col("created_at")), + asc: false, + nulls_first: true, + }, + ], + }, + ]; + let expected = vec![ + Expr::Sort { + expr: Box::new(col("age")), + asc: true, + nulls_first: true, + }, + Expr::Sort { + expr: Box::new(col("name")), + asc: false, + nulls_first: true, + }, + Expr::Sort { + expr: Box::new(col("created_at")), + asc: false, + nulls_first: true, + }, + ]; + let result = find_sort_exprs(exprs); + assert_eq!(expected, result); + Ok(()) + } +} From e713bc3b33fc4483eaf799b1c4cca355ca421f3c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 4 Jun 2021 05:45:52 +0800 Subject: [PATCH 154/329] update cargo.toml in python crate and fix unit test due to hash joins (#483) * update cargo.toml * fix group by * remove unused imports --- python/Cargo.toml | 2 +- python/tests/generic.py | 6 ------ python/tests/test_df.py | 24 +++++++++--------------- python/tests/test_sql.py | 12 +++++++++--- python/tests/test_udaf.py | 8 ++++---- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 117190714e59d..859cf350ca51f 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,7 +31,7 @@ libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.13.2", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "c3fc0c75af5ff2ebb99dba197d9d2ccd83eb5952" } [lib] name = "datafusion" diff --git a/python/tests/generic.py b/python/tests/generic.py index 7362f0bb29569..267d6f656ce01 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -15,15 +15,9 @@ # specific language governing permissions and limitations # under the License. 
-import unittest -import tempfile import datetime -import os.path -import shutil - import numpy import pyarrow -import datafusion # used to write parquet files import pyarrow.parquet diff --git a/python/tests/test_df.py b/python/tests/test_df.py index 2da23f908dc73..fdafdfa7f509c 100644 --- a/python/tests/test_df.py +++ b/python/tests/test_df.py @@ -19,11 +19,11 @@ import pyarrow as pa import datafusion + f = datafusion.functions class TestCase(unittest.TestCase): - def _prepare(self): ctx = datafusion.ExecutionContext() @@ -51,12 +51,10 @@ def test_select(self): def test_filter(self): df = self._prepare() - df = df \ - .select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), - ) \ - .filter(f.col("a") > f.lit(2)) + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ).filter(f.col("a") > f.lit(2)) # execute and collect the first (and only) batch result = df.collect()[0] @@ -66,12 +64,10 @@ def test_filter(self): def test_sort(self): df = self._prepare() - df = df.sort([ - f.col("b").sort(ascending=False) - ]) + df = df.sort([f.col("b").sort(ascending=False)]) table = pa.Table.from_batches(df.collect()) - expected = {'a': [3, 2, 1], 'b': [6, 5, 4]} + expected = {"a": [3, 2, 1], "b": [6, 5, 4]} self.assertEqual(table.to_pydict(), expected) def test_limit(self): @@ -111,10 +107,8 @@ def test_join(self): df1 = ctx.create_dataframe([[batch]]) df = df.join(df1, on="a", how="inner") - df = df.sort([ - f.col("a").sort(ascending=True) - ]) + df = df.sort([f.col("a").sort(ascending=True)]) table = pa.Table.from_batches(df.collect()) - expected = {'a': [1, 2], 'c': [8, 10], 'b': [4, 5]} + expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} self.assertEqual(table.to_pydict(), expected) diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index e9047ea6e70c3..117284973fb77 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -82,12 +82,18 @@ def test_execute(self): ) # group by - result = ctx.sql( + results = ctx.sql( "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)" ).collect() - result_keys = result[0].to_pydict()["CAST(a AS Int32)"] - result_values = result[0].to_pydict()["COUNT(a)"] + # group by returns batches + result_keys = [] + result_values = [] + for result in results: + pydict = result.to_pydict() + result_keys.extend(pydict["CAST(a AS Int32)"]) + result_values.extend(pydict["COUNT(a)"]) + result_keys, result_values = ( list(t) for t in zip(*sorted(zip(result_keys, result_values))) ) diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index ffd235e285f80..e1e4f933a9b47 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -16,7 +16,6 @@ # under the License. 
import unittest - import pyarrow import pyarrow.compute import datafusion @@ -86,6 +85,7 @@ def test_group_by(self): df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) # execute and collect the first (and only) batch - result = df.collect()[0] - - self.assertEqual(result.column(1), pyarrow.array([1.0 + 2.0, 3.0])) + batches = df.collect() + arrays = [batch.column(1) for batch in batches] + joined = pyarrow.concat_arrays(arrays) + self.assertEqual(joined, pyarrow.array([1.0 + 2.0, 3.0])) From 53792ecf0bcaca6f15d36ca6a7e7f2b591c45831 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 4 Jun 2021 11:37:03 +0800 Subject: [PATCH 155/329] simplify python function definitions (#477) --- python/README.md | 16 +- python/src/functions.rs | 352 ++++++++++------------------------------ 2 files changed, 98 insertions(+), 270 deletions(-) diff --git a/python/README.md b/python/README.md index 1859fca9811c0..50143aef42c54 100644 --- a/python/README.md +++ b/python/README.md @@ -115,7 +115,7 @@ df = df.aggregate( ) ``` -## How to install +## How to install (from pip) ```bash pip install datafusion @@ -135,12 +135,18 @@ cd arrow-datafusion/python # prepare development environment (used to build wheel / install in development) python3 -m venv venv -pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0 + +# activate the venv +source venv/bin/activate + +# install dependencies +pip install maturin==0.10.6 toml==0.10.1 pyarrow==4.0.0 ``` -Whenever rust code changes (your changes or via git pull): +Whenever rust code changes (your changes or via `git pull`): ```bash -venv/bin/maturin develop -venv/bin/python -m unittest discover tests +# make sure you activate the venv using "source venv/bin/activate" first +maturin develop +python -m unittest discover tests ``` diff --git a/python/src/functions.rs b/python/src/functions.rs index f46dd3e0e5f7b..b03004fae4312 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; - -use datafusion::arrow::datatypes::DataType; -use pyo3::{prelude::*, wrap_pyfunction}; - -use datafusion::logical_plan; - use crate::udaf; use crate::udf; use crate::{expression, types::PyDataType}; +use datafusion::arrow::datatypes::DataType; +use datafusion::logical_plan; +use pyo3::{prelude::*, wrap_pyfunction}; +use std::sync::Arc; /// Expression representing a column on the existing plan. 
#[pyfunction] @@ -51,55 +48,6 @@ fn array(value: Vec) -> expression::Expression { } } -#[pyfunction] -fn ascii(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::ascii(value.expr), - } -} - -#[pyfunction] -fn sum(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sum(value.expr), - } -} - -#[pyfunction] -fn bit_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::bit_length(value.expr), - } -} - -#[pyfunction] -fn btrim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::btrim(value.expr), - } -} - -#[pyfunction] -fn character_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::character_length(value.expr), - } -} - -#[pyfunction] -fn chr(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::chr(value.expr), - } -} - -#[pyfunction] -fn concat_ws(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::concat_ws(value.expr), - } -} - #[pyfunction] fn in_list( expr: expression::Expression, @@ -115,215 +63,87 @@ fn in_list( } } -#[pyfunction] -fn initcap(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::initcap(value.expr), - } -} - -#[pyfunction] -fn left(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::left(value.expr), - } -} - -#[pyfunction] -fn lower(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::lower(value.expr), - } -} - -#[pyfunction] -fn lpad(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::lpad(value.expr), - } -} - -#[pyfunction] -fn ltrim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::ltrim(value.expr), - } -} - -#[pyfunction] -fn md5(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::md5(value.expr), - } -} - -#[pyfunction] -fn octet_length(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::octet_length(value.expr), - } -} - -#[pyfunction] -fn regexp_replace(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::regexp_replace(value.expr), - } -} - -#[pyfunction] -fn repeat(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::repeat(value.expr), - } -} - -#[pyfunction] -fn replace(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::replace(value.expr), - } -} - -#[pyfunction] -fn reverse(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::reverse(value.expr), - } -} - -#[pyfunction] -fn right(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::right(value.expr), - } -} - -#[pyfunction] -fn rpad(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::rpad(value.expr), - } -} - -#[pyfunction] -fn rtrim(value: expression::Expression) -> expression::Expression { - 
expression::Expression { - expr: logical_plan::rtrim(value.expr), - } -} - -#[pyfunction] -fn sha224(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha224(value.expr), - } -} - -#[pyfunction] -fn sha256(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha256(value.expr), - } -} - -#[pyfunction] -fn sha384(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha384(value.expr), - } -} - -#[pyfunction] -fn sha512(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::sha512(value.expr), - } -} - -#[pyfunction] -fn split_part(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::split_part(value.expr), - } -} - -#[pyfunction] -fn starts_with(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::starts_with(value.expr), - } -} - -#[pyfunction] -fn strpos(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::strpos(value.expr), - } -} - -#[pyfunction] -fn substr(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::substr(value.expr), - } -} - -#[pyfunction] -fn to_hex(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::to_hex(value.expr), - } -} - -#[pyfunction] -fn translate(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::translate(value.expr), - } -} - -#[pyfunction] -fn trim(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::trim(value.expr), - } -} - -#[pyfunction] -fn upper(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::upper(value.expr), - } -} - -#[pyfunction] -fn avg(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::avg(value.expr), - } -} - -#[pyfunction] -fn min(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::min(value.expr), - } -} - -#[pyfunction] -fn max(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::max(value.expr), - } -} - -#[pyfunction] -fn count(value: expression::Expression) -> expression::Expression { - expression::Expression { - expr: logical_plan::count(value.expr), - } -} +macro_rules! define_function { + ($NAME: ident) => { + #[doc = "This function is not documented yet"] + #[pyfunction] + fn $NAME(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::$NAME(value.expr), + } + } + }; + ($NAME: ident, $DOC: expr) => { + #[doc = $DOC] + #[pyfunction] + fn $NAME(value: expression::Expression) -> expression::Expression { + expression::Expression { + expr: logical_plan::$NAME(value.expr), + } + } + }; +} + +define_function!(ascii, "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); +define_function!(sum); +define_function!( + bit_length, + "Returns number of bits in the string (8 times the octet_length)." 
+); +define_function!(btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); +define_function!( + character_length, + "Returns number of characters in the string." +); +define_function!(chr, "Returns the character with the given code."); +define_function!(concat_ws, "Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored."); +define_function!(initcap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); +define_function!(left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); +define_function!(lower, "Converts the string to all lower case"); +define_function!(lpad, "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)."); +define_function!(ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); +define_function!( + md5, + "Computes the MD5 hash of the argument, with the result written in hexadecimal." +); +define_function!(now); +define_function!(octet_length, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); +define_function!(random, "Returns a random value in the range 0.0 <= x < 1.0"); +define_function!( + replace, + "Replaces all occurrences in string of substring from with substring to." +); +define_function!(repeat, "Repeats string the specified number of times."); +define_function!( + regexp_replace, + "Replaces substring(s) matching a POSIX regular expression" +); +define_function!( + reverse, + "Reverses the order of the characters in the string." +); +define_function!(right, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); +define_function!(rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); +define_function!(rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); +define_function!(sha224); +define_function!(sha256); +define_function!(sha384); +define_function!(sha512); +define_function!(split_part, "Splits string at occurrences of delimiter and returns the n'th field (counting from one)."); +define_function!(starts_with, "Returns true if string starts with prefix."); +define_function!(strpos,"Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)"); +define_function!(substr); +define_function!( + to_hex, + "Converts the number to its equivalent hexadecimal representation." +); +define_function!(translate, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. 
If from is longer than to, occurrences of the extra characters in from are deleted."); +define_function!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); +define_function!(upper, "Converts the string to all upper case."); +define_function!(avg); +define_function!(min); +define_function!(max); +define_function!(count); /* #[pyfunction] @@ -414,8 +234,10 @@ pub fn init(module: &PyModule) -> PyResult<()> { module.add_function(wrap_pyfunction!(lower, module)?)?; module.add_function(wrap_pyfunction!(lpad, module)?)?; module.add_function(wrap_pyfunction!(md5, module)?)?; + module.add_function(wrap_pyfunction!(now, module)?)?; module.add_function(wrap_pyfunction!(ltrim, module)?)?; module.add_function(wrap_pyfunction!(octet_length, module)?)?; + module.add_function(wrap_pyfunction!(random, module)?)?; module.add_function(wrap_pyfunction!(regexp_replace, module)?)?; module.add_function(wrap_pyfunction!(repeat, module)?)?; module.add_function(wrap_pyfunction!(replace, module)?)?; From ac9d4ae7c6ecde354fe95a9ecafcb715e943f988 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 4 Jun 2021 12:27:46 +0200 Subject: [PATCH 156/329] Support anti join (#482) * Support semi join * Fmt * Match on Semi * Simplify * Fmt * Undo match * Support anti join * Remove check * Revert doc change * Add back test * Fix * Linting * Make test case more complete --- ballista/rust/core/proto/ballista.proto | 1 + .../core/src/serde/logical_plan/from_proto.rs | 1 + .../core/src/serde/logical_plan/to_proto.rs | 1 + .../src/serde/physical_plan/from_proto.rs | 1 + .../core/src/serde/physical_plan/to_proto.rs | 1 + datafusion/src/logical_plan/builder.rs | 2 +- datafusion/src/logical_plan/plan.rs | 2 + .../src/optimizer/hash_build_probe_order.rs | 2 +- datafusion/src/physical_plan/hash_join.rs | 61 ++++++++++++++++--- datafusion/src/physical_plan/hash_utils.rs | 4 +- datafusion/src/physical_plan/planner.rs | 1 + 11 files changed, 66 insertions(+), 11 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index d21cbf694b9d4..0ed9f243fd0ad 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -362,6 +362,7 @@ enum JoinType { RIGHT = 2; FULL = 3; SEMI = 4; + ANTI = 5; } message JoinNode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 522d60cb8a054..662d9d0a929a8 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -264,6 +264,7 @@ impl TryInto for &protobuf::LogicalPlanNode { protobuf::JoinType::Right => JoinType::Right, protobuf::JoinType::Full => JoinType::Full, protobuf::JoinType::Semi => JoinType::Semi, + protobuf::JoinType::Anti => JoinType::Anti, }; LogicalPlanBuilder::from(&convert_box_required!(join.left)?) 
.join( diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 088e93120e4f4..d7734f05da56c 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -814,6 +814,7 @@ impl TryInto for &LogicalPlan { JoinType::Right => protobuf::JoinType::Right, JoinType::Full => protobuf::JoinType::Full, JoinType::Semi => protobuf::JoinType::Semi, + JoinType::Anti => protobuf::JoinType::Anti, }; let left_join_column = on.iter().map(|on| on.0.to_owned()).collect(); let right_join_column = on.iter().map(|on| on.1.to_owned()).collect(); diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index c19739a6b061b..22944313666f5 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -388,6 +388,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { protobuf::JoinType::Right => JoinType::Right, protobuf::JoinType::Full => JoinType::Full, protobuf::JoinType::Semi => JoinType::Semi, + protobuf::JoinType::Anti => JoinType::Anti, }; Ok(Arc::new(HashJoinExec::try_new( left, diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index c409f94749518..26092e74a096a 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -134,6 +134,7 @@ impl TryInto for Arc { JoinType::Right => protobuf::JoinType::Right, JoinType::Full => protobuf::JoinType::Full, JoinType::Semi => protobuf::JoinType::Semi, + JoinType::Anti => protobuf::JoinType::Anti, }; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index dc80a41c0c01b..6bd5181050fd6 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -388,7 +388,7 @@ fn build_join_schema( // left then right left_fields.chain(right_fields).cloned().collect() } - JoinType::Semi => { + JoinType::Semi | JoinType::Anti => { // Only use the left side for the schema left.fields().clone() } diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index fe1dfb6de990f..25cf9e33d2ca7 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -46,6 +46,8 @@ pub enum JoinType { Full, /// Semi Join Semi, + /// Anti Join + Anti, } /// A LogicalPlan represents the different types of relational diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 86d38ef313ce8..74d2b00901942 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -109,7 +109,7 @@ fn should_swap_join_order(left: &LogicalPlan, right: &LogicalPlan) -> bool { fn supports_swap(join_type: JoinType) -> bool { match join_type { JoinType::Inner | JoinType::Left | JoinType::Right | JoinType::Full => true, - JoinType::Semi => false, + JoinType::Semi | JoinType::Anti => false, } } diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 6653b9a356a45..d12e249cbe347 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -184,9 +184,11 @@ impl 
HashJoinExec { /// Calculates column indices and left/right placement on input / output schemas and jointype fn column_indices_from_schema(&self) -> ArrowResult> { let (primary_is_left, primary_schema, secondary_schema) = match self.join_type { - JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Semi => { - (true, self.left.schema(), self.right.schema()) - } + JoinType::Inner + | JoinType::Left + | JoinType::Full + | JoinType::Semi + | JoinType::Anti => (true, self.left.schema(), self.right.schema()), JoinType::Right => (false, self.right.schema(), self.left.schema()), }; let mut column_indices = Vec::with_capacity(self.schema.fields().len()); @@ -376,7 +378,9 @@ impl ExecutionPlan for HashJoinExec { let column_indices = self.column_indices_from_schema()?; let num_rows = left_data.1.num_rows(); let visited_left_side = match self.join_type { - JoinType::Left | JoinType::Full | JoinType::Semi => vec![false; num_rows], + JoinType::Left | JoinType::Full | JoinType::Semi | JoinType::Anti => { + vec![false; num_rows] + } JoinType::Inner | JoinType::Right => vec![], }; Ok(Box::pin(HashJoinStream { @@ -544,7 +548,7 @@ fn build_batch( ) .unwrap(); - if join_type == JoinType::Semi { + if matches!(join_type, JoinType::Semi | JoinType::Anti) { return Ok(( RecordBatch::new_empty(Arc::new(schema.clone())), left_indices, @@ -613,7 +617,7 @@ fn build_join_indexes( let left = &left_data.0; match join_type { - JoinType::Inner | JoinType::Semi => { + JoinType::Inner | JoinType::Semi | JoinType::Anti => { // Using a buffer builder to avoid slower normal builder let mut left_indices = UInt64BufferBuilder::new(0); let mut right_indices = UInt32BufferBuilder::new(0); @@ -1190,7 +1194,10 @@ impl Stream for HashJoinStream { self.num_output_rows += batch.num_rows(); match self.join_type { - JoinType::Left | JoinType::Full | JoinType::Semi => { + JoinType::Left + | JoinType::Full + | JoinType::Semi + | JoinType::Anti => { left_side.iter().flatten().for_each(|x| { self.visited_left_side[x as usize] = true; }); @@ -1204,7 +1211,10 @@ impl Stream for HashJoinStream { let start = Instant::now(); // For the left join, produce rows for unmatched rows match self.join_type { - JoinType::Left | JoinType::Full | JoinType::Semi + JoinType::Left + | JoinType::Full + | JoinType::Semi + | JoinType::Anti if !self.is_exhausted => { let result = produce_from_matched( @@ -1230,6 +1240,7 @@ impl Stream for HashJoinStream { JoinType::Left | JoinType::Full | JoinType::Semi + | JoinType::Anti | JoinType::Inner | JoinType::Right => {} } @@ -1725,6 +1736,40 @@ mod tests { Ok(()) } + #[tokio::test] + async fn join_anti() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 2, 3, 5]), + ("b1", &vec![4, 5, 5, 7, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 8, 9, 11]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30, 40]), + ("b1", &vec![4, 5, 6, 5]), // 5 is double on the right + ("c2", &vec![70, 80, 90, 100]), + ); + let on = &[("b1", "b1")]; + + let join = join(left, right, on, &JoinType::Anti)?; + + let columns = columns(&join.schema()); + assert_eq!(columns, vec!["a1", "b1", "c1"]); + + let stream = join.execute(0).await?; + let batches = common::collect(stream).await?; + + let expected = vec![ + "+----+----+----+", + "| a1 | b1 | c1 |", + "+----+----+----+", + "| 3 | 7 | 9 |", + "| 5 | 7 | 11 |", + "+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + Ok(()) + } + #[tokio::test] async fn join_right_one() -> Result<()> { let left = build_table( diff --git 
a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 110319e4bb6b8..a48710bfbfc35 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -34,6 +34,8 @@ pub enum JoinType { Full, /// Semi Join Semi, + /// Anti Join + Anti, } /// The on clause of the join, as vector of (left, right) columns. @@ -132,7 +134,7 @@ pub fn build_join_schema( // left then right left_fields.chain(right_fields).cloned().collect() } - JoinType::Semi => left.fields().clone(), + JoinType::Semi | JoinType::Anti => left.fields().clone(), }; Schema::new(fields) } diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index b77850f9d67fe..53bff62f4f813 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -368,6 +368,7 @@ impl DefaultPhysicalPlanner { JoinType::Right => hash_utils::JoinType::Right, JoinType::Full => hash_utils::JoinType::Full, JoinType::Semi => hash_utils::JoinType::Semi, + JoinType::Anti => hash_utils::JoinType::Anti, }; if ctx_state.config.concurrency > 1 && ctx_state.config.repartition_joins { From c92079dfb3045a9a46d12c3bc22361a44d11b8bc Mon Sep 17 00:00:00 2001 From: Michael Lu Date: Fri, 4 Jun 2021 18:18:40 +0300 Subject: [PATCH 157/329] Add datafusion::test_util, resolve test data paths without env vars (#498) --- datafusion-examples/examples/csv_sql.rs | 2 +- datafusion/benches/sort_limit_query_sql.rs | 2 +- datafusion/src/datasource/csv.rs | 4 +- datafusion/src/datasource/parquet.rs | 2 +- datafusion/src/execution/dataframe_impl.rs | 2 +- datafusion/src/lib.rs | 3 +- datafusion/src/physical_plan/csv.rs | 6 +- datafusion/src/physical_plan/parquet.rs | 2 +- datafusion/src/physical_plan/planner.rs | 12 +- datafusion/src/test/mod.rs | 2 +- datafusion/src/test_util.rs | 166 +++++++++++++++++++++ datafusion/tests/sql.rs | 14 +- 12 files changed, 191 insertions(+), 26 deletions(-) create mode 100644 datafusion/src/test_util.rs diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index 76c87960d71d3..a06b42ad4cb09 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -27,7 +27,7 @@ async fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); - let testdata = datafusion::arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::test_util::arrow_test_data(); // register csv file with the execution context ctx.register_csv( diff --git a/datafusion/benches/sort_limit_query_sql.rs b/datafusion/benches/sort_limit_query_sql.rs index be065f32e0090..1e8339ea31eb1 100644 --- a/datafusion/benches/sort_limit_query_sql.rs +++ b/datafusion/benches/sort_limit_query_sql.rs @@ -57,7 +57,7 @@ fn create_context() -> Arc> { Field::new("c13", DataType::Utf8, false), ])); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::test_util::arrow_test_data(); // create CSV data source let csv = CsvFile::try_new( diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 10e6659089b47..e1a61595f2eeb 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -25,7 +25,7 @@ //! use datafusion::datasource::TableProvider; //! use datafusion::datasource::csv::{CsvFile, CsvReadOptions}; //! -//! let testdata = arrow::util::test_util::arrow_test_data(); +//! let testdata = datafusion::test_util::arrow_test_data(); //! 
let csvdata = CsvFile::try_new( //! &format!("{}/csv/aggregate_test_100.csv", testdata), //! CsvReadOptions::new().delimiter(b'|'), @@ -222,7 +222,7 @@ mod tests { #[tokio::test] async fn csv_file_from_reader() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let buf = std::fs::read(path).unwrap(); diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index 30e47df5f6491..abfb81d99887d 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -328,7 +328,7 @@ mod tests { } fn load_table(name: &str) -> Result> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/{}", testdata, name); let table = ParquetTable::try_new(&filename, 2)?; Ok(Arc::new(table)) diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index fdc75f92f2e75..19f71eb79268f 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -369,7 +369,7 @@ mod tests { fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { let schema = test::aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); ctx.register_csv( "aggregate_test_100", &format!("{}/csv/aggregate_test_100.csv", testdata), diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 5b8c9c13006ab..e4501a78ada41 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -194,8 +194,6 @@ //! cd arrow-datafusion //! # Download test data //! git submodule update --init -//! export PARQUET_TEST_DATA=parquet-testing/data -//! export ARROW_TEST_DATA=testing/data //! //! cargo run --example csv_sql //! 
@@ -234,6 +232,7 @@ pub use parquet; #[cfg(test)] pub mod test; +pub mod test_util; #[macro_use] #[cfg(feature = "regex_expressions")] diff --git a/datafusion/src/physical_plan/csv.rs b/datafusion/src/physical_plan/csv.rs index 9f88a53bc17cd..544f98cba0c6f 100644 --- a/datafusion/src/physical_plan/csv.rs +++ b/datafusion/src/physical_plan/csv.rs @@ -442,7 +442,7 @@ mod tests { #[tokio::test] async fn csv_exec_with_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( @@ -470,7 +470,7 @@ mod tests { #[tokio::test] async fn csv_exec_without_projection() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let csv = CsvExec::try_new( @@ -498,7 +498,7 @@ mod tests { #[tokio::test] async fn csv_exec_with_reader() -> Result<()> { let schema = aggr_test_schema(); - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let filename = "aggregate_test_100.csv"; let path = format!("{}/csv/{}", testdata, filename); let buf = std::fs::read(path).unwrap(); diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 55a6d96738cb4..2bea94aee1e5b 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -705,7 +705,7 @@ mod tests { #[tokio::test] async fn test() -> Result<()> { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = crate::test_util::parquet_test_data(); let filename = format!("{}/alltypes_plain.parquet", testdata); let parquet_exec = ParquetExec::try_from_path( &filename, diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 53bff62f4f813..754ace08de6a8 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -872,7 +872,7 @@ mod tests { #[test] fn test_all_operators() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -912,7 +912,7 @@ mod tests { #[test] fn test_with_csv_plan() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -931,7 +931,7 @@ mod tests { #[test] fn errors() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1033,7 +1033,7 @@ mod tests { #[test] fn in_list_types() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1081,7 +1081,7 @@ mod tests { 
#[test] fn hash_agg_input_schema() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); @@ -1104,7 +1104,7 @@ mod tests { #[test] fn hash_agg_group_by_partitioned() -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index 926a692261691..51dfe7f3a0993 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -52,7 +52,7 @@ pub fn create_table_dual() -> Arc { /// Generated partitioned copy of a CSV file pub fn create_partitioned_csv(filename: &str, partitions: usize) -> Result { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = crate::test_util::arrow_test_data(); let path = format!("{}/csv/{}", testdata, filename); let tmp_dir = TempDir::new()?; diff --git a/datafusion/src/test_util.rs b/datafusion/src/test_util.rs new file mode 100644 index 0000000000000..e96e8e0c209f7 --- /dev/null +++ b/datafusion/src/test_util.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utils to make testing easier + +use std::{env, error::Error, path::PathBuf}; + +/// Returns the arrow test data directory, which is by default stored +/// in a git submodule rooted at `testing/data`. +/// +/// The default can be overridden by the optional environment +/// variable `ARROW_TEST_DATA` +/// +/// panics when the directory can not be found. +/// +/// Example: +/// ``` +/// let testdata = datafusion::test_util::arrow_test_data(); +/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); +/// assert!(std::path::PathBuf::from(csvdata).exists()); +/// ``` +pub fn arrow_test_data() -> String { + match get_data_dir("ARROW_TEST_DATA", "../testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get arrow data dir: {}", err), + } +} + +/// Returns the parquest test data directory, which is by default +/// stored in a git submodule rooted at +/// `parquest-testing/data`. +/// +/// The default can be overridden by the optional environment variable +/// `PARQUET_TEST_DATA` +/// +/// panics when the directory can not be found. 
+/// +/// Example: +/// ``` +/// let testdata = datafusion::test_util::parquet_test_data(); +/// let filename = format!("{}/binary.parquet", testdata); +/// assert!(std::path::PathBuf::from(filename).exists()); +/// ``` +pub fn parquet_test_data() -> String { + match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get parquet data dir: {}", err), + } +} + +/// Returns a directory path for finding test data. +/// +/// udf_env: name of an environment variable +/// +/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR) +/// +/// Returns either: +/// The path referred to in `udf_env` if that variable is set and refers to a directory +/// The submodule_data directory relative to CARGO_MANIFEST_PATH +fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result> { + // Try user defined env. + if let Ok(dir) = env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display().to_string(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. + + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package", + // set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display().to_string(), + ).into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + + #[test] + fn test_data_dir() { + let udf_env = "get_data_dir"; + let cwd = env::current_dir().unwrap(); + + let existing_pb = cwd.join(".."); + let existing = existing_pb.display().to_string(); + let existing_str = existing.as_str(); + + let non_existing = cwd.join("non-existing-dir").display().to_string(); + let non_existing_str = non_existing.as_str(); + + env::set_var(udf_env, non_existing_str); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_err()); + + env::set_var(udf_env, ""); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::set_var(udf_env, " "); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::set_var(udf_env, existing_str); + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + + env::remove_var(udf_env); + let res = get_data_dir(udf_env, non_existing_str); + assert!(res.is_err()); + + let res = get_data_dir(udf_env, existing_str); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), existing_pb); + } + + #[test] + fn test_happy() { + let res = arrow_test_data(); + assert!(PathBuf::from(res).is_dir()); + + let res = parquet_test_data(); + assert!(PathBuf::from(res).is_dir()); + } +} diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 029e9307e5f68..d77671e7f4ffd 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -126,7 +126,7 @@ async fn parquet_query() { #[tokio::test] async fn parquet_single_nan_schema() { let mut ctx = 
ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::test_util::parquet_test_data(); ctx.register_parquet("single_nan", &format!("{}/single_nan.parquet", testdata)) .unwrap(); let sql = "SELECT mycol FROM single_nan"; @@ -144,7 +144,7 @@ async fn parquet_single_nan_schema() { #[ignore = "Test ignored, will be enabled as part of the nested Parquet reader"] async fn parquet_list_columns() { let mut ctx = ExecutionContext::new(); - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::test_util::parquet_test_data(); ctx.register_parquet( "list_columns", &format!("{}/list_columns.parquet", testdata), @@ -2009,7 +2009,7 @@ fn aggr_test_schema() -> SchemaRef { } async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::test_util::arrow_test_data(); // TODO: The following c9 should be migrated to UInt32 and c10 should be UInt64 once // unsigned is supported. @@ -2049,7 +2049,7 @@ async fn register_aggregate_csv_by_sql(ctx: &mut ExecutionContext) { } fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { - let testdata = arrow::util::test_util::arrow_test_data(); + let testdata = datafusion::test_util::arrow_test_data(); let schema = aggr_test_schema(); ctx.register_csv( "aggregate_test_100", @@ -2076,7 +2076,7 @@ fn register_aggregate_simple_csv(ctx: &mut ExecutionContext) -> Result<()> { } fn register_alltypes_parquet(ctx: &mut ExecutionContext) { - let testdata = arrow::util::test_util::parquet_test_data(); + let testdata = datafusion::test_util::parquet_test_data(); ctx.register_parquet( "alltypes_plain", &format!("{}/alltypes_plain.parquet", testdata), @@ -3374,7 +3374,7 @@ async fn test_physical_plan_display_indent() { " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", ]; - let data_path = arrow::util::test_util::arrow_test_data(); + let data_path = datafusion::test_util::arrow_test_data(); let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) .trim() .lines() @@ -3423,7 +3423,7 @@ async fn test_physical_plan_display_indent_multi_children() { " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", ]; - let data_path = arrow::util::test_util::arrow_test_data(); + let data_path = datafusion::test_util::arrow_test_data(); let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) .trim() .lines() From a0370b273c6dd4c972eadbee2c5465ac46a58fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 4 Jun 2021 18:37:15 +0200 Subject: [PATCH 158/329] Implement missing join types for Python dataframe (#503) * Implement missing join types for Python dataframe * Fix mapping * Use commit hash instead * undo some changes * Remove imports * Undo removed part * Undo removed part * minimize changes --- python/Cargo.toml | 2 +- python/src/dataframe.rs | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index 859cf350ca51f..8f1480deedbc9 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,7 +31,7 @@ libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.13.2", features = ["extension-module"] } -datafusion = { git = 
"https://github.com/apache/arrow-datafusion.git", rev = "c3fc0c75af5ff2ebb99dba197d9d2ccd83eb5952" } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "c92079dfb3045a9a46d12c3bc22361a44d11b8bc" } [lib] name = "datafusion" diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs index 66e6916b68156..8ceac64741e9e 100644 --- a/python/src/dataframe.rs +++ b/python/src/dataframe.rs @@ -147,6 +147,9 @@ impl DataFrame { "inner" => JoinType::Inner, "left" => JoinType::Left, "right" => JoinType::Right, + "full" => JoinType::Full, + "semi" => JoinType::Semi, + "anti" => JoinType::Anti, how => { return Err(DataFusionError::Common(format!( "The join type {} does not exist or is not implemented", From e1a03a417e5ad7c9d30426e61b56ef06a2ff19ac Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 5 Jun 2021 01:01:39 +0800 Subject: [PATCH 159/329] use requirements.txt to formalize python deps (#484) * use requirements.txt to formalize python deps * fix header * update readme --- .github/workflows/python_build.yml | 6 +- .github/workflows/python_test.yaml | 63 ++++++++++---------- python/README.md | 25 ++++++-- python/requirements.in | 19 ++++++ python/requirements.txt | 94 ++++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 38 deletions(-) create mode 100644 python/requirements.in create mode 100644 python/requirements.txt diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index c86bb81581a71..eba11b8e3a41f 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -28,12 +28,12 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8] + python-version: ["3.6", "3.7", "3.8", "3.9"] os: [macos-latest, windows-latest] steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} @@ -54,7 +54,7 @@ jobs: run: dir python/target\wheels\ - name: List wheels - if: matrix.os != 'windows-latest' + if: matrix.os != 'windows-latest' run: find ./python/target/wheels/ - name: Archive wheels diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index 3b2111b59d49d..13516ff699dac 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -22,37 +22,38 @@ jobs: test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Setup Rust toolchain - run: | - rustup toolchain install nightly-2021-01-06 - rustup default nightly-2021-01-06 - rustup component add rustfmt - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - key: cargo-maturin-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - key: target-maturin-cache- - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests - run: | - cd python/ - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" + - uses: actions/checkout@v2 + - name: Setup Rust toolchain + run: | + rustup toolchain install nightly-2021-01-06 + rustup default nightly-2021-01-06 + rustup component add rustfmt + - name: Cache Cargo + uses: actions/cache@v2 + with: + path: /home/runner/.cargo + key: cargo-maturin-cache- + - name: Cache Rust dependencies + uses: actions/cache@v2 + with: + path: /home/runner/target + key: target-maturin-cache- + - uses: 
actions/setup-python@v2 + with: + python-version: "3.9" + - name: Install Python dependencies + run: python -m pip install --upgrade pip setuptools wheel + - name: Run tests + run: | + cd python/ - python -m venv venv - source venv/bin/activate + python -m venv venv + source venv/bin/activate - pip install maturin==0.10.4 toml==0.10.1 pyarrow==4.0.0 - maturin develop + pip install -r requirements.txt + maturin develop - python -m unittest discover tests + python -m unittest discover tests + env: + CARGO_HOME: "/home/runner/.cargo" + CARGO_TARGET_DIR: "/home/runner/target" diff --git a/python/README.md b/python/README.md index 50143aef42c54..05561f712cae4 100644 --- a/python/README.md +++ b/python/README.md @@ -135,12 +135,9 @@ cd arrow-datafusion/python # prepare development environment (used to build wheel / install in development) python3 -m venv venv - # activate the venv source venv/bin/activate - -# install dependencies -pip install maturin==0.10.6 toml==0.10.1 pyarrow==4.0.0 +pip install -r requirements.txt ``` Whenever rust code changes (your changes or via `git pull`): @@ -150,3 +147,23 @@ Whenever rust code changes (your changes or via `git pull`): maturin develop python -m unittest discover tests ``` + +## How to update dependencies + +To change test dependencies, change the `requirements.in` and run + +```bash +# install pip-tools (this can be done only once), also consider running in venv +pip install pip-tools + +# change requirements.in and then run +pip-compile --generate-hashes +``` + +To update dependencies, run + +```bash +pip-compile update +``` + +More details [here](https://github.com/jazzband/pip-tools) diff --git a/python/requirements.in b/python/requirements.in new file mode 100644 index 0000000000000..3ef9f18966d4b --- /dev/null +++ b/python/requirements.in @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +maturin +toml +pyarrow diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000000000..ff02b80cf6fc3 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile --generate-hashes +# +maturin==0.10.6 \ + --hash=sha256:0e81496f70a4805e6ea7dda7b0425246c111ccb119a2e22c64abeff131f4dd21 \ + --hash=sha256:3b5d5429bc05a816824420d99973f0cab39d8e274f6c3647bfd9afd95a030304 \ + --hash=sha256:4177a223727a0ad57bc3f69ca4c3bc04bb3cc4da787cc59a8e25808c85685c67 \ + --hash=sha256:4eb4481b6c7d6cac043b969d2eb993c982523e91bb2709f0b09e231cf4846731 \ + --hash=sha256:532625f312185b06ec196fdb0fc79efafc0e98768153d226fb9417c0ca85e410 \ + --hash=sha256:53ef64a147f8a5241a3e932f2db22b5ae7dc5892dae994da319446c5db89dc94 \ + --hash=sha256:a04589da42f62b1d515f35c81274a56fe0d29216894525e8a37fd1e3c69d87b1 \ + --hash=sha256:b58e9e2ba5a3f651d8885c41370a00bb1d3e4d7313cbb63354077153be7650f4 \ + --hash=sha256:bd39f7e08eb9908d4fe1cd9b3c953fad5b1fb4fec9c82d14c2973a65751e1899 \ + --hash=sha256:d63f2a15f0b8db4e70d9a59766ca240b2c2ee2146ed5e4385a6118d941d68b25 \ + --hash=sha256:fa7e1cea2a768257a33aeb556fdec5fc36011bfe82d96730117433c635629dd8 + # via -r requirements.in +numpy==1.20.3 \ + --hash=sha256:1676b0a292dd3c99e49305a16d7a9f42a4ab60ec522eac0d3dd20cdf362ac010 \ + --hash=sha256:16f221035e8bd19b9dc9a57159e38d2dd060b48e93e1d843c49cb370b0f415fd \ + --hash=sha256:43909c8bb289c382170e0282158a38cf306a8ad2ff6dfadc447e90f9961bef43 \ + --hash=sha256:4e465afc3b96dbc80cf4a5273e5e2b1e3451286361b4af70ce1adb2984d392f9 \ + --hash=sha256:55b745fca0a5ab738647d0e4db099bd0a23279c32b31a783ad2ccea729e632df \ + --hash=sha256:5d050e1e4bc9ddb8656d7b4f414557720ddcca23a5b88dd7cff65e847864c400 \ + --hash=sha256:637d827248f447e63585ca3f4a7d2dfaa882e094df6cfa177cc9cf9cd6cdf6d2 \ + --hash=sha256:6690080810f77485667bfbff4f69d717c3be25e5b11bb2073e76bb3f578d99b4 \ + --hash=sha256:66fbc6fed94a13b9801fb70b96ff30605ab0a123e775a5e7a26938b717c5d71a \ + --hash=sha256:67d44acb72c31a97a3d5d33d103ab06d8ac20770e1c5ad81bdb3f0c086a56cf6 \ + --hash=sha256:6ca2b85a5997dabc38301a22ee43c82adcb53ff660b89ee88dded6b33687e1d8 \ + --hash=sha256:6e51534e78d14b4a009a062641f465cfaba4fdcb046c3ac0b1f61dd97c861b1b \ + --hash=sha256:70eb5808127284c4e5c9e836208e09d685a7978b6a216db85960b1a112eeace8 \ + --hash=sha256:830b044f4e64a76ba71448fce6e604c0fc47a0e54d8f6467be23749ac2cbd2fb \ + --hash=sha256:8b7bb4b9280da3b2856cb1fc425932f46fba609819ee1c62256f61799e6a51d2 \ + --hash=sha256:a9c65473ebc342715cb2d7926ff1e202c26376c0dcaaee85a1fd4b8d8c1d3b2f \ + --hash=sha256:c1c09247ccea742525bdb5f4b5ceeacb34f95731647fe55774aa36557dbb5fa4 \ + --hash=sha256:c5bf0e132acf7557fc9bb8ded8b53bbbbea8892f3c9a1738205878ca9434206a \ + --hash=sha256:db250fd3e90117e0312b611574cd1b3f78bec046783195075cbd7ba9c3d73f16 \ + --hash=sha256:e515c9a93aebe27166ec9593411c58494fa98e5fcc219e47260d9ab8a1cc7f9f \ + --hash=sha256:e55185e51b18d788e49fe8305fd73ef4470596b33fc2c1ceb304566b99c71a69 \ + --hash=sha256:ea9cff01e75a956dbee133fa8e5b68f2f92175233de2f88de3a682dd94deda65 \ + --hash=sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17 \ + --hash=sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48 + # via pyarrow +pyarrow==4.0.1 \ + --hash=sha256:04be0f7cb9090bd029b5b53bed628548fef569e5d0b5c6cd7f6d0106dbbc782d \ + --hash=sha256:0fde9c7a3d5d37f3fe5d18c4ed015e8f585b68b26d72a10d7012cad61afe43ff \ + --hash=sha256:11517f0b4f4acbab0c37c674b4d1aad3c3dfea0f6b1bb322e921555258101ab3 \ + --hash=sha256:150db335143edd00d3ec669c7c8167d401c4aa0a290749351c80bbf146892b2e \ + 
--hash=sha256:24040a20208e9b16ba7b284624ebfe67e40f5c40b5dc8d874da322ac0053f9d3 \ + --hash=sha256:33c457728a1ce825b80aa8c8ed573709f1efe72003d45fa6fdbb444de9cc0b74 \ + --hash=sha256:423cd6a14810f4e40cb76e13d4240040fc1594d69fe1c4f2c70be00ad512ade5 \ + --hash=sha256:5387db80c6a7b5598884bf4df3fc546b3373771ad614548b782e840b71704877 \ + --hash=sha256:5a76ec44af838862b23fb5cfc48765bc7978f7b58a181c96ad92856280de548b \ + --hash=sha256:5f2660f59dfcfd34adac7c08dc7f615920de703f191066ed6277628975f06878 \ + --hash=sha256:6b7bd8f5aa327cc32a1b9b02a76502851575f5edb110f93c59a45c70211a5618 \ + --hash=sha256:72cf3477538bd8504f14d6299a387cc335444f7a188f548096dfea9533551f02 \ + --hash=sha256:76b75a9cfc572e890a1e000fd532bdd2084ec3f1ee94ee51802a477913a21072 \ + --hash=sha256:a81adbfbe2f6528d4593b5a8962b2751838517401d14e9d4cab6787478802693 \ + --hash=sha256:a968375c66e505f72b421f5864a37f51aad5da61b6396fa283f956e9f2b2b923 \ + --hash=sha256:afd4f7c0a225a326d2c0039cdc8631b5e8be30f78f6b7a3e5ce741cf5dd81c72 \ + --hash=sha256:b05bdd513f045d43228247ef4d9269c88139788e2d566f4cb3e855e282ad0330 \ + --hash=sha256:c2733c9bcd00074ce5497dd0a7b8a10c91d3395ddce322d7021c7fdc4ea6f610 \ + --hash=sha256:d0f080b2d9720bec42624cb0df66f60ae66b84a2ccd1fe2c291322df915ac9db \ + --hash=sha256:dcd20ee0240a88772eeb5691102c276f5cdec79527fb3a0679af7f93f93cb4bd \ + --hash=sha256:e1351576877764fb4d5690e4721ce902e987c85f4ab081c70a34e1d24646586e \ + --hash=sha256:e44dfd7e61c9eb6dda59bc49ad69e77945f6d049185a517c130417e3ca0494d8 \ + --hash=sha256:ee3d87615876550fee9a523307dd4b00f0f44cf47a94a32a07793da307df31a0 \ + --hash=sha256:fa7b165cfa97158c1e6d15c68428317b4f4ae786d1dc2dbab43f1328c1eb43aa \ + --hash=sha256:fe976695318560a97c6d31bba828eeca28c44c6f6401005e54ba476a28ac0a10 + # via -r requirements.in +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f + # via + # -r requirements.in + # maturin From ae15a5186369815ce17180e83b93d3003c1ed7b6 Mon Sep 17 00:00:00 2001 From: Rich Date: Sat, 5 Jun 2021 01:02:05 +0800 Subject: [PATCH 160/329] #215 resolve aliases for group by exprs (#485) * #215 resolve aliases for group by exprs * cargo fmt --all --- datafusion/src/sql/planner.rs | 97 +++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 3b8acc67ccb23..aa6b5a93f4837 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -582,37 +582,54 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // All of the aggregate expressions (deduplicated). let aggr_exprs = find_aggregate_exprs(&aggr_expr_haystack); - let (plan, select_exprs_post_aggr, having_expr_post_aggr_opt) = - if !select.group_by.is_empty() || !aggr_exprs.is_empty() { - self.aggregate( - &plan, - &select_exprs, - &having_expr_opt, - &select.group_by, - aggr_exprs, - )? - } else { - if let Some(having_expr) = &having_expr_opt { - let available_columns = select_exprs - .iter() - .map(|expr| expr_as_column_expr(expr, &plan)) - .collect::>>()?; - - // Ensure the HAVING expression is using only columns - // provided by the SELECT. - if !can_columns_satisfy_exprs( - &available_columns, - &[having_expr.clone()], - )? 
{ - return Err(DataFusionError::Plan( - "Having references column(s) not provided by the select" - .to_owned(), - )); - } + let group_by_exprs = select + .group_by + .iter() + .map(|e| { + let group_by_expr = self.sql_expr_to_logical_expr(e)?; + let group_by_expr = resolve_aliases_to_exprs( + &group_by_expr, + &extract_aliases(&select_exprs), + )?; + self.validate_schema_satisfies_exprs( + plan.schema(), + &[group_by_expr.clone()], + )?; + Ok(group_by_expr) + }) + .collect::>>()?; + + let (plan, select_exprs_post_aggr, having_expr_post_aggr_opt) = if !group_by_exprs + .is_empty() + || !aggr_exprs.is_empty() + { + self.aggregate( + &plan, + &select_exprs, + &having_expr_opt, + group_by_exprs, + aggr_exprs, + )? + } else { + if let Some(having_expr) = &having_expr_opt { + let available_columns = select_exprs + .iter() + .map(|expr| expr_as_column_expr(expr, &plan)) + .collect::>>()?; + + // Ensure the HAVING expression is using only columns + // provided by the SELECT. + if !can_columns_satisfy_exprs(&available_columns, &[having_expr.clone()])? + { + return Err(DataFusionError::Plan( + "Having references column(s) not provided by the select" + .to_owned(), + )); } + } - (plan, select_exprs, having_expr_opt) - }; + (plan, select_exprs, having_expr_opt) + }; let plan = if let Some(having_expr_post_aggr) = having_expr_post_aggr_opt { LogicalPlanBuilder::from(&plan) @@ -706,14 +723,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { input: &LogicalPlan, select_exprs: &[Expr], having_expr_opt: &Option, - group_by: &[SQLExpr], + group_by_exprs: Vec, aggr_exprs: Vec, ) -> Result<(LogicalPlan, Vec, Option)> { - let group_by_exprs = group_by - .iter() - .map(|e| self.sql_to_rex(e, &input.schema())) - .collect::>>()?; - let aggr_projection_exprs = group_by_exprs .iter() .chain(aggr_exprs.iter()) @@ -2308,15 +2320,12 @@ mod tests { } #[test] - fn select_simple_aggregate_with_groupby_cannot_use_alias() { - let sql = "SELECT state AS x, MAX(age) FROM person GROUP BY x"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - "Plan(\"Invalid identifier \\\'x\\\' for schema {}\")", - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) + fn select_simple_aggregate_with_groupby_can_use_alias() { + quick_test( + "SELECT state AS a, MIN(age) AS b FROM person GROUP BY a", + "Projection: #state AS a, #MIN(age) AS b\ + \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age)]]\ + \n TableScan: person projection=None", ); } From 964f49449ae7b338999fec133ae0174f01a931ae Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 4 Jun 2021 13:19:54 -0400 Subject: [PATCH 161/329] Add support for boolean columns in pruning logic (#500) * Add support for boolean columns in pruning logic * fix clippy * Use more functional style * Add a few more tests --- datafusion/src/physical_optimizer/pruning.rs | 483 +++++++++++++++---- 1 file changed, 390 insertions(+), 93 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 3a5a64c6f6689..c65733bd75267 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -32,7 +32,7 @@ use std::{collections::HashSet, sync::Arc}; use arrow::{ array::{new_null_array, ArrayRef, BooleanArray}, - datatypes::{Field, Schema, SchemaRef}, + datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; @@ -86,12 +86,8 @@ pub struct PruningPredicate { schema: SchemaRef, /// Actual pruning predicate (rewritten in terms of column min/max 
statistics) predicate_expr: Arc, - /// The statistics required to evaluate this predicate: - /// * The column name in the input schema - /// * Statistics type (e.g. Min or Max) - /// * The field the statistics value should be placed in for - /// pruning predicate evaluation - stat_column_req: Vec<(String, StatisticsType, Field)>, + /// The statistics required to evaluate this predicate + required_columns: RequiredStatColumns, } impl PruningPredicate { @@ -116,10 +112,10 @@ impl PruningPredicate { /// `(column_min / 2) <= 4 && 4 <= (column_max / 2))` pub fn try_new(expr: &Expr, schema: SchemaRef) -> Result { // build predicate expression once - let mut stat_column_req = Vec::<(String, StatisticsType, Field)>::new(); + let mut required_columns = RequiredStatColumns::new(); let logical_predicate_expr = - build_predicate_expression(expr, schema.as_ref(), &mut stat_column_req)?; - let stat_fields = stat_column_req + build_predicate_expression(expr, schema.as_ref(), &mut required_columns)?; + let stat_fields = required_columns .iter() .map(|(_, _, f)| f.clone()) .collect::>(); @@ -133,7 +129,7 @@ impl PruningPredicate { Ok(Self { schema, predicate_expr, - stat_column_req, + required_columns, }) } @@ -148,10 +144,16 @@ impl PruningPredicate { /// Note this function takes a slice of statistics as a parameter /// to amortize the cost of the evaluation of the predicate /// against a single record batch. + /// + /// Note: the predicate passed to `prune` should be simplified as + /// much as possible (e.g. this pass doesn't handle some + /// expressions like `b = false`, but it does handle the + /// simplified version `b`. The predicates are simplified via the + /// ConstantFolding optimizer pass pub fn prune(&self, statistics: &S) -> Result> { // build statistics record batch let predicate_array = - build_statistics_record_batch(statistics, &self.stat_column_req) + build_statistics_record_batch(statistics, &self.required_columns) .and_then(|statistics_batch| { // execute predicate expression self.predicate_expr.evaluate(&statistics_batch) @@ -189,9 +191,100 @@ impl PruningPredicate { } } +/// Handles creating references to the min/max statistics +/// for columns as well as recording which statistics are needed +#[derive(Debug, Default, Clone)] +struct RequiredStatColumns { + /// The statistics required to evaluate this predicate: + /// * The column name in the input schema + /// * Statistics type (e.g. 
Min or Max) + /// * The field the statistics value should be placed in for + /// pruning predicate evaluation + columns: Vec<(String, StatisticsType, Field)>, +} + +impl RequiredStatColumns { + fn new() -> Self { + Self::default() + } + + /// Retur an iterator over items in columns (see doc on + /// `self.columns` for details) + fn iter(&self) -> impl Iterator { + self.columns.iter() + } + + fn is_stat_column_missing( + &self, + column_name: &str, + statistics_type: StatisticsType, + ) -> bool { + !self + .columns + .iter() + .any(|(c, t, _f)| c == column_name && t == &statistics_type) + } + + /// Rewrites column_expr so that all appearances of column_name + /// are replaced with a reference to either the min or max + /// statistics column, while keeping track that a reference to the statistics + /// column is required + /// + /// for example, an expression like `col("foo") > 5`, when called + /// with Max would result in an expression like `col("foo_max") > + /// 5` with the approprate entry noted in self.columns + fn stat_column_expr( + &mut self, + column_name: &str, + column_expr: &Expr, + field: &Field, + stat_type: StatisticsType, + suffix: &str, + ) -> Result { + let stat_column_name = format!("{}_{}", column_name, suffix); + let stat_field = Field::new( + stat_column_name.as_str(), + field.data_type().clone(), + field.is_nullable(), + ); + if self.is_stat_column_missing(column_name, stat_type) { + // only add statistics column if not previously added + self.columns + .push((column_name.to_string(), stat_type, stat_field)); + } + rewrite_column_expr(column_expr, column_name, stat_column_name.as_str()) + } + + /// rewrite col --> col_min + fn min_column_expr( + &mut self, + column_name: &str, + column_expr: &Expr, + field: &Field, + ) -> Result { + self.stat_column_expr(column_name, column_expr, field, StatisticsType::Min, "min") + } + + /// rewrite col --> col_max + fn max_column_expr( + &mut self, + column_name: &str, + column_expr: &Expr, + field: &Field, + ) -> Result { + self.stat_column_expr(column_name, column_expr, field, StatisticsType::Max, "max") + } +} + +impl From> for RequiredStatColumns { + fn from(columns: Vec<(String, StatisticsType, Field)>) -> Self { + Self { columns } + } +} + /// Build a RecordBatch from a list of statistics, creating arrays, /// with one row for each PruningStatistics and columns specified in -/// in the stat_column_req parameter. +/// in the required_columns parameter. 
/// /// For example, if the requested columns are /// ```text @@ -216,12 +309,12 @@ impl PruningPredicate { /// ``` fn build_statistics_record_batch( statistics: &S, - stat_column_req: &[(String, StatisticsType, Field)], + required_columns: &RequiredStatColumns, ) -> Result { let mut fields = Vec::::new(); let mut arrays = Vec::::new(); // For each needed statistics column: - for (column_name, statistics_type, stat_field) in stat_column_req { + for (column_name, statistics_type, stat_field) in required_columns.iter() { let data_type = stat_field.data_type(); let num_containers = statistics.num_containers(); @@ -258,7 +351,7 @@ struct PruningExpressionBuilder<'a> { column_expr: &'a Expr, scalar_expr: &'a Expr, field: &'a Field, - stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, + required_columns: &'a mut RequiredStatColumns, reverse_operator: bool, } @@ -267,7 +360,7 @@ impl<'a> PruningExpressionBuilder<'a> { left: &'a Expr, right: &'a Expr, schema: &'a Schema, - stat_column_req: &'a mut Vec<(String, StatisticsType, Field)>, + required_columns: &'a mut RequiredStatColumns, ) -> Result { // find column name; input could be a more complicated expression let mut left_columns = HashSet::::new(); @@ -301,7 +394,7 @@ impl<'a> PruningExpressionBuilder<'a> { column_expr, scalar_expr, field, - stat_column_req, + required_columns, reverse_operator, }) } @@ -324,42 +417,20 @@ impl<'a> PruningExpressionBuilder<'a> { self.scalar_expr } - fn is_stat_column_missing(&self, statistics_type: StatisticsType) -> bool { - !self - .stat_column_req - .iter() - .any(|(c, t, _f)| c == &self.column_name && t == &statistics_type) - } - - fn stat_column_expr( - &mut self, - stat_type: StatisticsType, - suffix: &str, - ) -> Result { - let stat_column_name = format!("{}_{}", self.column_name, suffix); - let stat_field = Field::new( - stat_column_name.as_str(), - self.field.data_type().clone(), - self.field.is_nullable(), - ); - if self.is_stat_column_missing(stat_type) { - // only add statistics column if not previously added - self.stat_column_req - .push((self.column_name.clone(), stat_type, stat_field)); - } - rewrite_column_expr( - self.column_expr, - self.column_name.as_str(), - stat_column_name.as_str(), - ) - } - fn min_column_expr(&mut self) -> Result { - self.stat_column_expr(StatisticsType::Min, "min") + self.required_columns.min_column_expr( + &self.column_name, + &self.column_expr, + self.field, + ) } fn max_column_expr(&mut self) -> Result { - self.stat_column_expr(StatisticsType::Max, "max") + self.required_columns.max_column_expr( + &self.column_name, + &self.column_expr, + self.field, + ) } } @@ -383,6 +454,46 @@ fn rewrite_column_expr( utils::rewrite_expression(&expr, &expressions) } +/// Given a column reference to `column_name`, returns a pruning +/// expression in terms of the min and max that will evaluate to true +/// if the column may contain values, and false if definitely does not +/// contain values +fn build_single_column_expr( + column_name: &str, + schema: &Schema, + required_columns: &mut RequiredStatColumns, + is_not: bool, // if true, treat as !col +) -> Option { + use crate::logical_plan; + let field = schema.field_with_name(column_name).ok()?; + + if matches!(field.data_type(), &DataType::Boolean) { + let col_ref = logical_plan::col(column_name); + + let min = required_columns + .min_column_expr(column_name, &col_ref, field) + .ok()?; + let max = required_columns + .max_column_expr(column_name, &col_ref, field) + .ok()?; + + // remember -- we want an expression that is: + 
// TRUE: if there may be rows that match + // FALSE: if there are no rows that match + if is_not { + // The only way we know a column couldn't match is if both the min and max are true + // !(min && max) + Some((min.and(max)).not()) + } else { + // the only way we know a column couldn't match is if both the min and max are false + // !(!min && !max) --> min || max + Some(min.or(max)) + } + } else { + None + } +} + /// Translate logical filter expression into pruning predicate /// expression that will evaluate to FALSE if it can be determined no /// rows between the min/max values could pass the predicates. @@ -391,28 +502,47 @@ fn rewrite_column_expr( fn build_predicate_expression( expr: &Expr, schema: &Schema, - stat_column_req: &mut Vec<(String, StatisticsType, Field)>, + required_columns: &mut RequiredStatColumns, ) -> Result { use crate::logical_plan; + + // Returned for unsupported expressions. Such expressions are + // converted to TRUE. This can still be useful when multiple + // conditions are joined using AND such as: column > 10 AND TRUE + let unhandled = logical_plan::lit(true); + // predicate expression can only be a binary expression let (left, op, right) = match expr { Expr::BinaryExpr { left, op, right } => (left, *op, right), + Expr::Column(name) => { + let expr = build_single_column_expr(&name, schema, required_columns, false) + .unwrap_or(unhandled); + return Ok(expr); + } + // match !col (don't do so recursively) + Expr::Not(input) => { + if let Expr::Column(name) = input.as_ref() { + let expr = + build_single_column_expr(&name, schema, required_columns, true) + .unwrap_or(unhandled); + return Ok(expr); + } else { + return Ok(unhandled); + } + } _ => { - // unsupported expression - replace with TRUE - // this can still be useful when multiple conditions are joined using AND - // such as: column > 10 AND TRUE - return Ok(logical_plan::lit(true)); + return Ok(unhandled); } }; if op == Operator::And || op == Operator::Or { - let left_expr = build_predicate_expression(left, schema, stat_column_req)?; - let right_expr = build_predicate_expression(right, schema, stat_column_req)?; + let left_expr = build_predicate_expression(left, schema, required_columns)?; + let right_expr = build_predicate_expression(right, schema, required_columns)?; return Ok(logical_plan::binary_expr(left_expr, op, right_expr)); } let expr_builder = - PruningExpressionBuilder::try_new(left, right, schema, stat_column_req); + PruningExpressionBuilder::try_new(left, right, schema, required_columns); let mut expr_builder = match expr_builder { Ok(builder) => builder, // allow partial failure in predicate expression generation @@ -508,6 +638,16 @@ mod tests { } } + fn new_bool( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self { + min: Arc::new(min.into_iter().collect::()), + max: Arc::new(max.into_iter().collect::()), + } + } + fn min(&self) -> Option { Some(self.min.clone()) } @@ -591,7 +731,7 @@ mod tests { #[test] fn test_build_statistics_record_batch() { // Request a record batch with of s1_min, s2_max, s3_max, s3_min - let stat_column_req = vec![ + let required_columns = RequiredStatColumns::from(vec![ // min of original column s1, named s1_min ( "s1".to_string(), @@ -616,7 +756,7 @@ mod tests { StatisticsType::Min, Field::new("s3_min", DataType::Utf8, true), ), - ]; + ]); let statistics = TestStatistics::new() .with( @@ -641,7 +781,8 @@ mod tests { ), ); - let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let batch = + 
build_statistics_record_batch(&statistics, &required_columns).unwrap(); let expected = vec![ "+--------+--------+--------+--------+", "| s1_min | s2_max | s3_max | s3_min |", @@ -662,7 +803,7 @@ mod tests { // which is what Parquet does // Request a record batch with of s1_min as a timestamp - let stat_column_req = vec![( + let required_columns = RequiredStatColumns::from(vec![( "s1".to_string(), StatisticsType::Min, Field::new( @@ -670,7 +811,7 @@ mod tests { DataType::Timestamp(TimeUnit::Nanosecond, None), true, ), - )]; + )]); // Note the statistics pass back i64 (not timestamp) let statistics = OneContainerStats { @@ -679,7 +820,8 @@ mod tests { num_containers: 1, }; - let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); let expected = vec![ "+-------------------------------+", "| s1_min |", @@ -693,7 +835,7 @@ mod tests { #[test] fn test_build_statistics_no_stats() { - let stat_column_req = vec![]; + let required_columns = RequiredStatColumns::new(); let statistics = OneContainerStats { min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), @@ -702,7 +844,7 @@ mod tests { }; let result = - build_statistics_record_batch(&statistics, &stat_column_req).unwrap_err(); + build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); assert!( result.to_string().contains("Invalid argument error"), "{}", @@ -715,11 +857,11 @@ mod tests { // Test requesting a Utf8 column when the stats return some other type // Request a record batch with of s1_min as a timestamp - let stat_column_req = vec![( + let required_columns = RequiredStatColumns::from(vec![( "s1".to_string(), StatisticsType::Min, Field::new("s1_min", DataType::Utf8, true), - )]; + )]); // Note the statistics return binary (which can't be cast to string) let statistics = OneContainerStats { @@ -728,7 +870,8 @@ mod tests { num_containers: 1, }; - let batch = build_statistics_record_batch(&statistics, &stat_column_req).unwrap(); + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); let expected = vec![ "+--------+", "| s1_min |", @@ -743,11 +886,11 @@ mod tests { #[test] fn test_build_statistics_inconsistent_length() { // return an inconsistent length to the actual statistics arrays - let stat_column_req = vec![( + let required_columns = RequiredStatColumns::from(vec![( "s1".to_string(), StatisticsType::Min, Field::new("s1_min", DataType::Int64, true), - )]; + )]); // Note the statistics pass back i64 (not timestamp) let statistics = OneContainerStats { @@ -757,7 +900,7 @@ mod tests { }; let result = - build_statistics_record_batch(&statistics, &stat_column_req).unwrap_err(); + build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); assert!( result .to_string() @@ -774,12 +917,14 @@ mod tests { // test column on the left let expr = col("c1").eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // test column on the right let expr = lit(1).eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -792,12 +937,14 @@ mod tests { // test column on 
the left let expr = col("c1").gt(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // test column on the right let expr = lit(1).lt(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -810,11 +957,13 @@ mod tests { // test column on the left let expr = col("c1").gt_eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // test column on the right let expr = lit(1).lt_eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -827,12 +976,14 @@ mod tests { // test column on the left let expr = col("c1").lt(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // test column on the right let expr = lit(1).gt(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -845,11 +996,13 @@ mod tests { // test column on the left let expr = col("c1").lt_eq(lit(1)); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // test column on the right let expr = lit(1).gt_eq(col("c1")); - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -865,7 +1018,8 @@ mod tests { // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); let expected_expr = "#c1_min Lt Int32(1) And Boolean(true)"; - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); Ok(()) @@ -880,46 +1034,101 @@ mod tests { // test OR operator joining supported c1 < 1 expression and unsupported c2 % 2 expression let expr = col("c1").lt(lit(1)).or(col("c2").modulus(lit(2))); let expected_expr = "#c1_min Lt Int32(1) Or Boolean(true)"; - let predicate_expr = build_predicate_expression(&expr, &schema, &mut vec![])?; + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); 
Ok(()) } #[test] - fn row_group_predicate_stat_column_req() -> Result<()> { + fn row_group_predicate_not() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "Boolean(true)"; + + let expr = col("c1").not(); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "NOT #c1_min And #c1_max"; + + let expr = col("c1").not(); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "#c1_min Or #c1_max"; + + let expr = col("c1"); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "#c1_min Lt Boolean(true)"; + + // DF doesn't support arithmetic on boolean columns so + // this predicate will error when evaluated + let expr = col("c1").lt(lit(true)); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_required_columns() -> Result<()> { let schema = Schema::new(vec![ Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Int32, false), ]); - let mut stat_column_req = vec![]; + let mut required_columns = RequiredStatColumns::new(); // c1 < 1 and (c2 = 2 or c2 = 3) let expr = col("c1") .lt(lit(1)) .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); let expected_expr = "#c1_min Lt Int32(1) And #c2_min LtEq Int32(2) And Int32(2) LtEq #c2_max Or #c2_min LtEq Int32(3) And Int32(3) LtEq #c2_max"; let predicate_expr = - build_predicate_expression(&expr, &schema, &mut stat_column_req)?; + build_predicate_expression(&expr, &schema, &mut required_columns)?; assert_eq!(format!("{:?}", predicate_expr), expected_expr); // c1 < 1 should add c1_min let c1_min_field = Field::new("c1_min", DataType::Int32, false); assert_eq!( - stat_column_req[0], + required_columns.columns[0], ("c1".to_owned(), StatisticsType::Min, c1_min_field) ); // c2 = 2 should add c2_min and c2_max let c2_min_field = Field::new("c2_min", DataType::Int32, false); assert_eq!( - stat_column_req[1], + required_columns.columns[1], ("c2".to_owned(), StatisticsType::Min, c2_min_field) ); let c2_max_field = Field::new("c2_max", DataType::Int32, false); assert_eq!( - stat_column_req[2], + required_columns.columns[2], ("c2".to_owned(), StatisticsType::Max, c2_max_field) ); // c2 = 3 shouldn't add any new statistics fields - assert_eq!(stat_column_req.len(), 3); + assert_eq!(required_columns.columns.len(), 3); Ok(()) } @@ -927,8 +1136,8 @@ mod tests { #[test] fn prune_api() { let schema = Arc::new(Schema::new(vec![ - Field::new("s1", DataType::Utf8, false), - Field::new("s2", DataType::Int32, false), + Field::new("s1", DataType::Utf8, true), + Field::new("s2", 
DataType::Int32, true), ])); // Prune using s2 > 5 @@ -953,4 +1162,92 @@ mod tests { assert_eq!(result, expected); } + + /// Creates setup for boolean chunk pruning + /// + /// For predicate "b1" (boolean expr) + /// b1 [false, false] ==> no rows can pass (not keep) + /// b1 [false, true] ==> some rows could pass (must keep) + /// b1 [true, true] ==> all rows must pass (must keep) + /// b1 [NULL, NULL] ==> unknown (must keep) + /// b1 [false, NULL] ==> unknown (must keep) + /// + /// For predicate "!b1" (boolean expr) + /// b1 [false, false] ==> all rows pass (must keep) + /// b1 [false, true] ==> some rows could pass (must keep) + /// b1 [true, true] ==> no rows can pass (not keep) + /// b1 [NULL, NULL] ==> unknown (must keep) + /// b1 [false, NULL] ==> unknown (must keep) + fn bool_setup() -> (SchemaRef, TestStatistics, Vec, Vec) { + let schema = + Arc::new(Schema::new(vec![Field::new("b1", DataType::Boolean, true)])); + + let statistics = TestStatistics::new().with( + "b1", + ContainerStats::new_bool( + vec![Some(false), Some(false), Some(true), None, Some(false)], // min + vec![Some(false), Some(true), Some(true), None, None], // max + ), + ); + let expected_true = vec![false, true, true, true, true]; + let expected_false = vec![true, true, false, true, true]; + + (schema, statistics, expected_true, expected_false) + } + + #[test] + fn prune_bool_column() { + let (schema, statistics, expected_true, _) = bool_setup(); + + // b1 + let expr = col("b1"); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_true); + } + + #[test] + fn prune_bool_not_column() { + let (schema, statistics, _, expected_false) = bool_setup(); + + // !b1 + let expr = col("b1").not(); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_false); + } + + #[test] + fn prune_bool_column_eq_true() { + let (schema, statistics, _, _) = bool_setup(); + + // b1 = true + let expr = col("b1").eq(lit(true)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap_err(); + assert!( + result.to_string().contains( + "Data type Boolean not supported for scalar operation on dyn array" + ), + "{}", + result + ) + } + + #[test] + fn prune_bool_not_column_eq_true() { + let (schema, statistics, _, _) = bool_setup(); + + // !b1 = true + let expr = col("b1").not().eq(lit(true)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap_err(); + assert!( + result.to_string().contains( + "Data type Boolean not supported for scalar operation on dyn array" + ), + "{}", + result + ) + } } From a9d04ca570015be6c94b767fef079783efdd877c Mon Sep 17 00:00:00 2001 From: Terry Corley Date: Fri, 4 Jun 2021 15:56:10 -0500 Subject: [PATCH 162/329] Update ballista.proto link in architecture doc (#502) --- ballista/docs/architecture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ballista/docs/architecture.md b/ballista/docs/architecture.md index 04e1dc26bac1e..2868d52b943eb 100644 --- a/ballista/docs/architecture.md +++ b/ballista/docs/architecture.md @@ -44,7 +44,7 @@ processes. ## Scheduler Process The scheduler process implements a gRPC interface (defined in -[ballista.proto](../rust/ballista/proto/ballista.proto)). The interface provides the following methods: +[ballista.proto](../rust/core/proto/ballista.proto)). 
The interface provides the following methods: | Method | Description | | -------------------- | -------------------------------------------------------------------- | From b84789afc5a67e3f70cd8903bf96993b13414aaf Mon Sep 17 00:00:00 2001 From: sathis Date: Sun, 6 Jun 2021 03:04:24 +0530 Subject: [PATCH 163/329] Optimize cast function during planning stage (#513) Co-authored-by: Sathis Kumar --- datafusion/src/optimizer/constant_folding.rs | 58 ++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 97cc23264bda1..d2ac5ce2f3837 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -30,6 +30,7 @@ use crate::optimizer::utils; use crate::physical_plan::functions::BuiltinScalarFunction; use crate::scalar::ScalarValue; use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; +use arrow::compute::{kernels, DEFAULT_CAST_OPTIONS}; /// Optimizer that simplifies comparison expressions involving boolean literals. /// @@ -247,6 +248,25 @@ impl<'a> ExprRewriter for ConstantRewriter<'a> { } } } + Expr::Cast { + expr: inner, + data_type, + } => match inner.as_ref() { + Expr::Literal(val) => { + let scalar_array = val.to_array(); + let cast_array = kernels::cast::cast_with_options( + &scalar_array, + &data_type, + &DEFAULT_CAST_OPTIONS, + )?; + let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; + Expr::Literal(cast_scalar) + } + _ => Expr::Cast { + expr: inner, + data_type, + }, + }, expr => { // no rewrite possible expr @@ -724,6 +744,44 @@ mod tests { assert_eq!(expected, actual); } + #[test] + fn cast_expr() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::Cast { + expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some("0".to_string())))), + data_type: DataType::Int32, + }]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = "Projection: Int32(0)\ + \n TableScan: test projection=None"; + let actual = get_optimized_plan_formatted(&plan, &chrono::Utc::now()); + assert_eq!(expected, actual); + } + + #[test] + fn cast_expr_wrong_arg() { + let table_scan = test_table_scan().unwrap(); + let proj = vec![Expr::Cast { + expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some("".to_string())))), + data_type: DataType::Int32, + }]; + let plan = LogicalPlanBuilder::from(&table_scan) + .project(proj) + .unwrap() + .build() + .unwrap(); + + let expected = "Projection: Int32(NULL)\ + \n TableScan: test projection=None"; + let actual = get_optimized_plan_formatted(&plan, &chrono::Utc::now()); + assert_eq!(expected, actual); + } + #[test] fn single_now_expr() { let table_scan = test_table_scan().unwrap(); From ee2b9ef049954173231b987f86b4d8eace0d3e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 6 Jun 2021 09:56:43 +0200 Subject: [PATCH 164/329] Fix display of execution time (#514) --- datafusion-cli/src/lib.rs | 7 +++---- datafusion-cli/src/main.rs | 5 ++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 5bd16e333030c..5b110d315364f 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -29,17 +29,16 @@ pub struct PrintOptions { fn print_timing_info(row_count: usize, now: Instant) { println!( - "{} {} in set. Query took {} seconds.", + "{} {} in set. 
Query took {:.3} seconds.", row_count, if row_count == 1 { "row" } else { "rows" }, - now.elapsed().as_secs() + now.elapsed().as_secs_f64() ); } impl PrintOptions { /// print the batches to stdout using the specified format - pub fn print_batches(&self, batches: &[RecordBatch]) -> Result<()> { - let now = Instant::now(); + pub fn print_batches(&self, batches: &[RecordBatch], now: Instant) -> Result<()> { if batches.is_empty() { if !self.quiet { print_timing_info(0, now); diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 083710f6dd192..39ce02ffbfd82 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -30,6 +30,7 @@ use std::fs::File; use std::io::prelude::*; use std::io::BufReader; use std::path::Path; +use std::time::Instant; #[tokio::main] pub async fn main() { @@ -238,7 +239,9 @@ async fn exec_and_print( sql: String, ) -> Result<()> { let df = ctx.sql(&sql)?; + let now = Instant::now(); let results = df.collect().await?; - print_options.print_batches(&results)?; + + print_options.print_batches(&results, now)?; Ok(()) } From 767eeb0a8bf17916aafb9a88abd52e7350acb596 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 7 Jun 2021 18:14:25 +0800 Subject: [PATCH 165/329] closing up type checks (#506) --- ballista/rust/core/Cargo.toml | 2 +- ballista/rust/core/proto/ballista.proto | 6 +- .../core/src/serde/logical_plan/from_proto.rs | 49 +-- .../core/src/serde/logical_plan/to_proto.rs | 56 ++- .../src/serde/physical_plan/from_proto.rs | 1 + datafusion/src/logical_plan/expr.rs | 50 ++- datafusion/src/optimizer/utils.rs | 5 +- datafusion/src/physical_plan/mod.rs | 1 + datafusion/src/physical_plan/planner.rs | 3 +- datafusion/src/physical_plan/window_frames.rs | 337 ++++++++++++++++++ datafusion/src/sql/planner.rs | 52 ++- datafusion/src/sql/utils.rs | 12 + 12 files changed, 512 insertions(+), 62 deletions(-) create mode 100644 datafusion/src/physical_plan/window_frames.rs diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 99822cfe2aee5..1f23a2a42e2a0 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -35,7 +35,7 @@ futures = "0.3" log = "0.4" prost = "0.7" serde = {version = "1", features = ["derive"]} -sqlparser = "0.8" +sqlparser = "0.9.0" tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 0ed9f243fd0ad..38d87e934e5fa 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -177,9 +177,9 @@ message WindowExprNode { // repeated LogicalExprNode partition_by = 5; repeated LogicalExprNode order_by = 6; // repeated LogicalExprNode filter = 7; - // oneof window_frame { - // WindowFrame frame = 8; - // } + oneof window_frame { + WindowFrame frame = 8; + } } message BetweenNode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 662d9d0a929a8..4a198174a2baa 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -20,12 +20,6 @@ use crate::error::BallistaError; use crate::serde::{proto_error, protobuf}; use crate::{convert_box_required, convert_required}; -use sqlparser::ast::{WindowFrame, WindowFrameBound, WindowFrameUnits}; -use std::{ - convert::{From, TryInto}, - unimplemented, -}; - use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use 
datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, @@ -33,10 +27,17 @@ use datafusion::logical_plan::{ }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; +use datafusion::physical_plan::window_frames::{ + WindowFrame, WindowFrameBound, WindowFrameUnits, +}; use datafusion::physical_plan::window_functions::BuiltInWindowFunction; use datafusion::scalar::ScalarValue; use protobuf::logical_plan_node::LogicalPlanType; use protobuf::{logical_expr_node::ExprType, scalar_type}; +use std::{ + convert::{From, TryInto}, + unimplemented, +}; // use uuid::Uuid; @@ -83,20 +84,6 @@ impl TryInto for &protobuf::LogicalPlanNode { .iter() .map(|expr| expr.try_into()) .collect::, _>>()?; - - // let partition_by_expr = window - // .partition_by_expr - // .iter() - // .map(|expr| expr.try_into()) - // .collect::, _>>()?; - // let order_by_expr = window - // .order_by_expr - // .iter() - // .map(|expr| expr.try_into()) - // .collect::, _>>()?; - // // FIXME: add filter by expr - // // FIXME: parse the window_frame data - // let window_frame = None; LogicalPlanBuilder::from(&input) .window(window_expr)? .build() @@ -929,6 +916,15 @@ impl TryInto for &protobuf::LogicalExprNode { .map(|e| e.try_into()) .into_iter() .collect::, _>>()?; + let window_frame = expr + .window_frame + .as_ref() + .map::, _>(|e| match e { + window_expr_node::WindowFrame::Frame(frame) => { + frame.clone().try_into() + } + }) + .transpose()?; match window_function { window_expr_node::WindowFunction::AggrFunction(i) => { let aggr_function = protobuf::AggregateFunction::from_i32(*i) @@ -945,6 +941,7 @@ impl TryInto for &protobuf::LogicalExprNode { ), args: vec![parse_required_expr(&expr.expr)?], order_by, + window_frame, }) } window_expr_node::WindowFunction::BuiltInFunction(i) => { @@ -964,6 +961,7 @@ impl TryInto for &protobuf::LogicalExprNode { ), args: vec![parse_required_expr(&expr.expr)?], order_by, + window_frame, }) } } @@ -1333,8 +1331,15 @@ impl TryFrom for WindowFrame { ) })? .try_into()?; - // FIXME parse end bound - let end_bound = None; + let end_bound = window + .end_bound + .map(|end_bound| match end_bound { + protobuf::window_frame::EndBound::Bound(end_bound) => { + end_bound.try_into() + } + }) + .transpose()? 
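+            // an omitted end bound defaults to CURRENT ROW, matching the SQL frame-spec default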
+ .unwrap_or(WindowFrameBound::CurrentRow); Ok(WindowFrame { units, start_bound, diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index d7734f05da56c..56270030b59f3 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -24,12 +24,17 @@ use std::{ convert::{TryFrom, TryInto}, }; +use super::super::proto_error; use crate::datasource::DfTableAdapter; use crate::serde::{protobuf, BallistaError}; use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use datafusion::datasource::CsvFile; use datafusion::logical_plan::{Expr, JoinType, LogicalPlan}; use datafusion::physical_plan::aggregates::AggregateFunction; +use datafusion::physical_plan::functions::BuiltinScalarFunction; +use datafusion::physical_plan::window_frames::{ + WindowFrame, WindowFrameBound, WindowFrameUnits, +}; use datafusion::physical_plan::window_functions::{ BuiltInWindowFunction, WindowFunction, }; @@ -38,10 +43,6 @@ use protobuf::{ arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, PrimitiveScalarType, ScalarListValue, ScalarType, }; -use sqlparser::ast::{WindowFrame, WindowFrameBound, WindowFrameUnits}; - -use super::super::proto_error; -use datafusion::physical_plan::functions::BuiltinScalarFunction; impl protobuf::IntervalUnit { pub fn from_arrow_interval_unit(interval_unit: &IntervalUnit) -> Self { @@ -1007,6 +1008,7 @@ impl TryInto for &Expr { ref fun, ref args, ref order_by, + ref window_frame, .. } => { let window_function = match fun { @@ -1026,10 +1028,16 @@ impl TryInto for &Expr { .iter() .map(|e| e.try_into()) .collect::, _>>()?; + let window_frame = window_frame.map(|window_frame| { + protobuf::window_expr_node::WindowFrame::Frame( + window_frame.clone().into(), + ) + }); let window_expr = Box::new(protobuf::WindowExprNode { expr: Some(Box::new(arg.try_into()?)), window_function: Some(window_function), order_by, + window_frame, }); Ok(protobuf::LogicalExprNode { expr_type: Some(ExprType::WindowExpr(window_expr)), @@ -1256,23 +1264,35 @@ impl From for protobuf::WindowFrameUnits { } } -impl TryFrom for protobuf::WindowFrameBound { - type Error = BallistaError; - - fn try_from(_bound: WindowFrameBound) -> Result { - Err(BallistaError::NotImplemented( - "WindowFrameBound => protobuf::WindowFrameBound".to_owned(), - )) +impl From for protobuf::WindowFrameBound { + fn from(bound: WindowFrameBound) -> Self { + match bound { + WindowFrameBound::CurrentRow => protobuf::WindowFrameBound { + window_frame_bound_type: protobuf::WindowFrameBoundType::CurrentRow + .into(), + bound_value: None, + }, + WindowFrameBound::Preceding(v) => protobuf::WindowFrameBound { + window_frame_bound_type: protobuf::WindowFrameBoundType::Preceding.into(), + bound_value: v.map(protobuf::window_frame_bound::BoundValue::Value), + }, + WindowFrameBound::Following(v) => protobuf::WindowFrameBound { + window_frame_bound_type: protobuf::WindowFrameBoundType::Following.into(), + bound_value: v.map(protobuf::window_frame_bound::BoundValue::Value), + }, + } } } -impl TryFrom for protobuf::WindowFrame { - type Error = BallistaError; - - fn try_from(_window: WindowFrame) -> Result { - Err(BallistaError::NotImplemented( - "WindowFrame => protobuf::WindowFrame".to_owned(), - )) +impl From for protobuf::WindowFrame { + fn from(window: WindowFrame) -> Self { + protobuf::WindowFrame { + window_frame_units: protobuf::WindowFrameUnits::from(window.units).into(), + 
start_bound: Some(window.start_bound.into()), + end_bound: Some(protobuf::window_frame::EndBound::Bound( + window.end_bound.into(), + )), + } } } diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 22944313666f5..5fcc971527c67 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -237,6 +237,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { fun, args, order_by, + .. } => { let arg = df_planner .create_physical_expr( diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 5103d5dc5051c..bbc6ffabe9289 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -19,22 +19,19 @@ //! such as `col = 5` or `SUM(col)`. See examples on the [`Expr`] struct. pub use super::Operator; - -use std::fmt; -use std::sync::Arc; - -use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; -use arrow::{compute::can_cast_types, datatypes::DataType}; - use crate::error::{DataFusionError, Result}; use crate::logical_plan::{DFField, DFSchema}; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, - window_functions, + window_frames, window_functions, }; use crate::{physical_plan::udaf::AggregateUDF, scalar::ScalarValue}; +use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; +use arrow::{compute::can_cast_types, datatypes::DataType}; use functions::{ReturnTypeFunction, ScalarFunctionImplementation, Signature}; use std::collections::HashSet; +use std::fmt; +use std::sync::Arc; /// `Expr` is a central struct of DataFusion's query API, and /// represent logical expressions such as `A + 1`, or `CAST(c1 AS @@ -199,6 +196,8 @@ pub enum Expr { args: Vec, /// List of order by expressions order_by: Vec, + /// Window frame + window_frame: Option, }, /// aggregate function AggregateUDF { @@ -735,10 +734,12 @@ impl Expr { args, fun, order_by, + window_frame, } => Expr::WindowFunction { args: rewrite_vec(args, rewriter)?, fun, order_by: rewrite_vec(order_by, rewriter)?, + window_frame, }, Expr::AggregateFunction { args, @@ -1283,8 +1284,23 @@ impl fmt::Debug for Expr { Expr::ScalarUDF { fun, ref args, .. } => { fmt_function(f, &fun.name, false, args) } - Expr::WindowFunction { fun, ref args, .. } => { - fmt_function(f, &fun.to_string(), false, args) + Expr::WindowFunction { + fun, + ref args, + window_frame, + .. + } => { + fmt_function(f, &fun.to_string(), false, args)?; + if let Some(window_frame) = window_frame { + write!( + f, + " {} BETWEEN {} AND {}", + window_frame.units, + window_frame.start_bound, + window_frame.end_bound + )?; + } + Ok(()) } Expr::AggregateFunction { fun, @@ -1401,8 +1417,18 @@ fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { Expr::ScalarUDF { fun, args, .. } => { create_function_name(&fun.name, false, args, input_schema) } - Expr::WindowFunction { fun, args, .. } => { - create_function_name(&fun.to_string(), false, args, input_schema) + Expr::WindowFunction { + fun, + args, + window_frame, + .. 
+ } => { + let fun_name = + create_function_name(&fun.to_string(), false, args, input_schema)?; + Ok(match window_frame { + Some(window_frame) => format!("{} {}", fun_name, window_frame), + None => fun_name, + }) } Expr::AggregateFunction { fun, diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 2cb65066feb93..65c95bee20d46 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -337,7 +337,9 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { fun: fun.clone(), args: expressions.to_vec(), }), - Expr::WindowFunction { fun, .. } => { + Expr::WindowFunction { + fun, window_frame, .. + } => { let index = expressions .iter() .position(|expr| { @@ -353,6 +355,7 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { fun: fun.clone(), args: expressions[..index].to_vec(), order_by: expressions[index + 1..].to_vec(), + window_frame: *window_frame, }) } Expr::AggregateFunction { fun, distinct, .. } => Ok(Expr::AggregateFunction { diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index af6969c43cbd6..490e02875c428 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -617,5 +617,6 @@ pub mod udf; #[cfg(feature = "unicode_expressions")] pub mod unicode_expressions; pub mod union; +pub mod window_frames; pub mod window_functions; pub mod windows; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 754ace08de6a8..d7451c7870961 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -17,8 +17,6 @@ //! Physical query planner -use std::sync::Arc; - use super::{ aggregates, cross_join::CrossJoinExec, empty::EmptyExec, expressions::binary, functions, hash_join::PartitionMode, udaf, union::UnionExec, windows, @@ -56,6 +54,7 @@ use arrow::datatypes::{Schema, SchemaRef}; use arrow::{compute::can_cast_types, datatypes::DataType}; use expressions::col; use log::debug; +use std::sync::Arc; /// This trait exposes the ability to plan an [`ExecutionPlan`] out of a [`LogicalPlan`]. pub trait ExtensionPlanner { diff --git a/datafusion/src/physical_plan/window_frames.rs b/datafusion/src/physical_plan/window_frames.rs new file mode 100644 index 0000000000000..f0be5a221fbf7 --- /dev/null +++ b/datafusion/src/physical_plan/window_frames.rs @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Window frame +//! +//! The frame-spec determines which output rows are read by an aggregate window function. The frame-spec consists of four parts: +//! - A frame type - either ROWS, RANGE or GROUPS, +//! - A starting frame boundary, +//! - An ending frame boundary, +//! 
- An EXCLUDE clause. + +use crate::error::{DataFusionError, Result}; +use sqlparser::ast; +use std::cmp::Ordering; +use std::convert::{From, TryFrom}; +use std::fmt; + +/// The frame-spec determines which output rows are read by an aggregate window function. +/// +/// The ending frame boundary can be omitted (if the BETWEEN and AND keywords that surround the +/// starting frame boundary are also omitted), in which case the ending frame boundary defaults to +/// CURRENT ROW. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct WindowFrame { + /// A frame type - either ROWS, RANGE or GROUPS + pub units: WindowFrameUnits, + /// A starting frame boundary + pub start_bound: WindowFrameBound, + /// An ending frame boundary + pub end_bound: WindowFrameBound, +} + +impl fmt::Display for WindowFrame { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{} BETWEEN {} AND {}", + self.units, self.start_bound, self.end_bound + )?; + Ok(()) + } +} + +impl TryFrom for WindowFrame { + type Error = DataFusionError; + + fn try_from(value: ast::WindowFrame) -> Result { + let start_bound = value.start_bound.into(); + let end_bound = value + .end_bound + .map(WindowFrameBound::from) + .unwrap_or(WindowFrameBound::CurrentRow); + + if let WindowFrameBound::Following(None) = start_bound { + Err(DataFusionError::Execution( + "Invalid window frame: start bound cannot be unbounded following" + .to_owned(), + )) + } else if let WindowFrameBound::Preceding(None) = end_bound { + Err(DataFusionError::Execution( + "Invalid window frame: end bound cannot be unbounded preceding" + .to_owned(), + )) + } else if start_bound > end_bound { + Err(DataFusionError::Execution(format!( + "Invalid window frame: start bound ({}) cannot be larger than end bound ({})", + start_bound, end_bound + ))) + } else { + let units = value.units.into(); + Ok(Self { + units, + start_bound, + end_bound, + }) + } + } +} + +impl Default for WindowFrame { + fn default() -> Self { + WindowFrame { + units: WindowFrameUnits::Range, + start_bound: WindowFrameBound::Preceding(None), + end_bound: WindowFrameBound::CurrentRow, + } + } +} + +/// There are five ways to describe starting and ending frame boundaries: +/// +/// 1. UNBOUNDED PRECEDING +/// 2. PRECEDING +/// 3. CURRENT ROW +/// 4. FOLLOWING +/// 5. UNBOUNDED FOLLOWING +/// +/// in this implementation we'll only allow to be u64 (i.e. no dynamic boundary) +#[derive(Debug, Clone, Copy, Eq)] +pub enum WindowFrameBound { + /// 1. UNBOUNDED PRECEDING + /// The frame boundary is the first row in the partition. + /// + /// 2. PRECEDING + /// must be a non-negative constant numeric expression. The boundary is a row that + /// is "units" prior to the current row. + Preceding(Option), + /// 3. The current row. + /// + /// For RANGE and GROUPS frame types, peers of the current row are also + /// included in the frame, unless specifically excluded by the EXCLUDE clause. + /// This is true regardless of whether CURRENT ROW is used as the starting or ending frame + /// boundary. + CurrentRow, + /// 4. This is the same as " PRECEDING" except that the boundary is units after the + /// current rather than before the current row. + /// + /// 5. UNBOUNDED FOLLOWING + /// The frame boundary is the last row in the partition. 
+ Following(Option), +} + +impl From for WindowFrameBound { + fn from(value: ast::WindowFrameBound) -> Self { + match value { + ast::WindowFrameBound::Preceding(v) => Self::Preceding(v), + ast::WindowFrameBound::Following(v) => Self::Following(v), + ast::WindowFrameBound::CurrentRow => Self::CurrentRow, + } + } +} + +impl fmt::Display for WindowFrameBound { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + WindowFrameBound::CurrentRow => f.write_str("CURRENT ROW"), + WindowFrameBound::Preceding(None) => f.write_str("UNBOUNDED PRECEDING"), + WindowFrameBound::Following(None) => f.write_str("UNBOUNDED FOLLOWING"), + WindowFrameBound::Preceding(Some(n)) => write!(f, "{} PRECEDING", n), + WindowFrameBound::Following(Some(n)) => write!(f, "{} FOLLOWING", n), + } + } +} + +impl PartialEq for WindowFrameBound { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl PartialOrd for WindowFrameBound { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for WindowFrameBound { + fn cmp(&self, other: &Self) -> Ordering { + self.get_rank().cmp(&other.get_rank()) + } +} + +impl WindowFrameBound { + /// get the rank of this window frame bound. + /// + /// the rank is a tuple of (u8, u64) because we'll firstly compare the kind and then the value + /// which requires special handling e.g. with preceding the larger the value the smaller the + /// rank and also for 0 preceding / following it is the same as current row + fn get_rank(&self) -> (u8, u64) { + match self { + WindowFrameBound::Preceding(None) => (0, 0), + WindowFrameBound::Following(None) => (4, 0), + WindowFrameBound::Preceding(Some(0)) + | WindowFrameBound::CurrentRow + | WindowFrameBound::Following(Some(0)) => (2, 0), + WindowFrameBound::Preceding(Some(v)) => (1, u64::MAX - *v), + WindowFrameBound::Following(Some(v)) => (3, *v), + } + } +} + +/// There are three frame types: ROWS, GROUPS, and RANGE. The frame type determines how the +/// starting and ending boundaries of the frame are measured. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WindowFrameUnits { + /// The ROWS frame type means that the starting and ending boundaries for the frame are + /// determined by counting individual rows relative to the current row. + Rows, + /// The RANGE frame type requires that the ORDER BY clause of the window have exactly one + /// term. Call that term "X". With the RANGE frame type, the elements of the frame are + /// determined by computing the value of expression X for all rows in the partition and framing + /// those rows for which the value of X is within a certain range of the value of X for the + /// current row. + Range, + /// The GROUPS frame type means that the starting and ending boundaries are determine + /// by counting "groups" relative to the current group. A "group" is a set of rows that all have + /// equivalent values for all all terms of the window ORDER BY clause. 
+ Groups, +} + +impl fmt::Display for WindowFrameUnits { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + WindowFrameUnits::Rows => "ROWS", + WindowFrameUnits::Range => "RANGE", + WindowFrameUnits::Groups => "GROUPS", + }) + } +} + +impl From for WindowFrameUnits { + fn from(value: ast::WindowFrameUnits) -> Self { + match value { + ast::WindowFrameUnits::Range => Self::Range, + ast::WindowFrameUnits::Groups => Self::Groups, + ast::WindowFrameUnits::Rows => Self::Rows, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_window_frame_creation() -> Result<()> { + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Following(None), + end_bound: None, + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: start bound cannot be unbounded following".to_owned() + ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Preceding(None), + end_bound: Some(ast::WindowFrameBound::Preceding(None)), + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: end bound cannot be unbounded preceding".to_owned() + ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Preceding(Some(1)), + end_bound: Some(ast::WindowFrameBound::Preceding(Some(2))), + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: start bound (1 PRECEDING) cannot be larger than end bound (2 PRECEDING)".to_owned() + ); + Ok(()) + } + + #[test] + fn test_eq() { + assert_eq!( + WindowFrameBound::Preceding(Some(0)), + WindowFrameBound::CurrentRow + ); + assert_eq!( + WindowFrameBound::CurrentRow, + WindowFrameBound::Following(Some(0)) + ); + assert_eq!( + WindowFrameBound::Following(Some(2)), + WindowFrameBound::Following(Some(2)) + ); + assert_eq!( + WindowFrameBound::Following(None), + WindowFrameBound::Following(None) + ); + assert_eq!( + WindowFrameBound::Preceding(Some(2)), + WindowFrameBound::Preceding(Some(2)) + ); + assert_eq!( + WindowFrameBound::Preceding(None), + WindowFrameBound::Preceding(None) + ); + } + + #[test] + fn test_ord() { + assert!(WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::CurrentRow); + // ! yes this is correct! 
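+        // (a larger PRECEDING offset refers to an earlier row; bounds are ordered from
+        // earliest to latest via the (kind, value) rank, so 2 PRECEDING ranks below 1 PRECEDING)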
+ assert!( + WindowFrameBound::Preceding(Some(2)) < WindowFrameBound::Preceding(Some(1)) + ); + assert!( + WindowFrameBound::Preceding(Some(u64::MAX)) + < WindowFrameBound::Preceding(Some(u64::MAX - 1)) + ); + assert!( + WindowFrameBound::Preceding(None) + < WindowFrameBound::Preceding(Some(1000000)) + ); + assert!( + WindowFrameBound::Preceding(None) + < WindowFrameBound::Preceding(Some(u64::MAX)) + ); + assert!(WindowFrameBound::Preceding(None) < WindowFrameBound::Following(Some(0))); + assert!( + WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::Following(Some(1)) + ); + assert!(WindowFrameBound::CurrentRow < WindowFrameBound::Following(Some(1))); + assert!( + WindowFrameBound::Following(Some(1)) < WindowFrameBound::Following(Some(2)) + ); + assert!(WindowFrameBound::Following(Some(2)) < WindowFrameBound::Following(None)); + assert!( + WindowFrameBound::Following(Some(u64::MAX)) + < WindowFrameBound::Following(None) + ); + } +} diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index aa6b5a93f4837..6bf7b776c8db7 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1121,13 +1121,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // then, window function if let Some(window) = &function.over { - if window.partition_by.is_empty() && window.window_frame.is_none() { + if window.partition_by.is_empty() { let order_by = window .order_by .iter() .map(|e| self.order_by_to_sort_expr(e)) .into_iter() .collect::>>()?; + let window_frame = window + .window_frame + .as_ref() + .map(|window_frame| window_frame.clone().try_into()) + .transpose()?; let fun = window_functions::WindowFunction::from_str(&name); if let Ok(window_functions::WindowFunction::AggregateFunction( aggregate_fun, @@ -1140,6 +1145,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { args: self .aggregate_fn_to_expr(&aggregate_fun, function)?, order_by, + window_frame, }); } else if let Ok( window_functions::WindowFunction::BuiltInWindowFunction( @@ -1151,8 +1157,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun: window_functions::WindowFunction::BuiltInWindowFunction( window_fun, ), - args:self.function_args_to_expr(function)?, - order_by + args: self.function_args_to_expr(function)?, + order_by, + window_frame, }); } } @@ -2806,6 +2813,45 @@ mod tests { quick_test(sql, expected); } + #[test] + fn over_order_by_with_window_frame_double_end() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id DESC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + #[test] + fn over_order_by_with_window_frame_single_end() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id DESC NULLS FIRST\ + \n TableScan: orders 
projection=None"; + quick_test(sql, expected); + } + + #[test] + fn over_order_by_with_window_frame_single_end_groups() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]] partitionBy=[]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n Sort: #order_id DESC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + /// psql result /// ``` /// QUERY PLAN diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 80a25d04468fb..7a5dc0da1b535 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -239,6 +239,7 @@ where fun, args, order_by, + window_frame, } => Ok(Expr::WindowFunction { fun: fun.clone(), args: args @@ -249,6 +250,7 @@ where .iter() .map(|e| clone_with_replacement(e, replacement_fn)) .collect::>>()?, + window_frame: *window_frame, }), Expr::AggregateUDF { fun, args } => Ok(Expr::AggregateUDF { fun: fun.clone(), @@ -453,21 +455,25 @@ mod tests { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], order_by: vec![], + window_frame: None, }; let max2 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], order_by: vec![], + window_frame: None, }; let min3 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Min), args: vec![col("name")], order_by: vec![], + window_frame: None, }; let sum4 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), args: vec![col("age")], order_by: vec![], + window_frame: None, }; // FIXME use as_ref let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; @@ -500,21 +506,25 @@ mod tests { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], order_by: vec![age_asc.clone(), name_desc.clone()], + window_frame: None, }; let max2 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], order_by: vec![], + window_frame: None, }; let min3 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Min), args: vec![col("name")], order_by: vec![age_asc.clone(), name_desc.clone()], + window_frame: None, }; let sum4 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), args: vec![col("age")], order_by: vec![name_desc.clone(), age_asc.clone(), created_at_desc.clone()], + window_frame: None, }; // FIXME use as_ref let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; @@ -551,6 +561,7 @@ mod tests { nulls_first: true, }, ], + window_frame: None, }, Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), @@ -572,6 +583,7 @@ mod tests { nulls_first: true, }, ], + window_frame: None, }, ]; let expected = vec![ From 5773a03fe6f03f00d5aa78b219cc46009611cca7 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 7 Jun 2021 22:51:04 +0800 Subject: [PATCH 166/329] refactor sort exec stream and combine batches (#515) --- datafusion/src/physical_plan/common.rs | 96 +++++++++++++++++++++++--- datafusion/src/physical_plan/sort.rs | 86 +++++++++-------------- integration-tests/sqls/simple_sort.sql | 22 ++++++ 
integration-tests/test_psql_parity.py | 2 +- 4 files changed, 140 insertions(+), 66 deletions(-) create mode 100644 integration-tests/sqls/simple_sort.sql diff --git a/datafusion/src/physical_plan/common.rs b/datafusion/src/physical_plan/common.rs index e60963bbb5b75..2482bfc0872c8 100644 --- a/datafusion/src/physical_plan/common.rs +++ b/datafusion/src/physical_plan/common.rs @@ -17,24 +17,22 @@ //! Defines common code used in execution plans -use std::fs; -use std::fs::metadata; -use std::sync::Arc; -use std::task::{Context, Poll}; - +use super::{RecordBatchStream, SendableRecordBatchStream}; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::ExecutionPlan; +use arrow::compute::concat; use arrow::datatypes::SchemaRef; +use arrow::error::ArrowError; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; use futures::channel::mpsc; use futures::{SinkExt, Stream, StreamExt, TryStreamExt}; +use std::fs; +use std::fs::metadata; +use std::sync::Arc; +use std::task::{Context, Poll}; use tokio::task::JoinHandle; -use crate::arrow::error::ArrowError; -use crate::error::{DataFusionError, Result}; -use crate::physical_plan::ExecutionPlan; - -use super::{RecordBatchStream, SendableRecordBatchStream}; - /// Stream of record batches pub struct SizedRecordBatchStream { schema: SchemaRef, @@ -83,6 +81,32 @@ pub async fn collect(stream: SendableRecordBatchStream) -> Result ArrowResult> { + if batches.is_empty() { + Ok(None) + } else { + let columns = schema + .fields() + .iter() + .enumerate() + .map(|(i, _)| { + concat( + &batches + .iter() + .map(|batch| batch.column(i).as_ref()) + .collect::>(), + ) + }) + .collect::>>()?; + Ok(Some(RecordBatch::try_new(schema.clone(), columns)?)) + } +} + /// Recursively builds a list of files in a directory with a given extension pub fn build_file_list(dir: &str, ext: &str) -> Result> { let mut filenames: Vec = Vec::new(); @@ -144,3 +168,53 @@ pub(crate) fn spawn_execution( } }) } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::{ + array::{Float32Array, Float64Array}, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, + }; + + #[test] + fn test_combine_batches_empty() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("f32", DataType::Float32, false), + Field::new("f64", DataType::Float64, false), + ])); + let result = combine_batches(&[], schema)?; + assert!(result.is_none()); + Ok(()) + } + + #[test] + fn test_combine_batches() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("f32", DataType::Float32, false), + Field::new("f64", DataType::Float64, false), + ])); + + let batch_count = 1000; + let batch_size = 10; + let batches = (0..batch_count) + .map(|i| { + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Float32Array::from(vec![i as f32; batch_size])), + Arc::new(Float64Array::from(vec![i as f64; batch_size])), + ], + ) + .unwrap() + }) + .collect::>(); + + let result = combine_batches(&batches, schema)?; + assert!(result.is_some()); + let result = result.unwrap(); + assert_eq!(batch_count * batch_size, result.num_rows()); + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index c5b838c6e84bb..7747030d8a93e 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -17,32 +17,28 @@ //! 
Defines the SORT plan -use std::any::Any; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Instant; - -use async_trait::async_trait; -use futures::stream::Stream; -use futures::Future; -use hashbrown::HashMap; - -use pin_project_lite::pin_project; - -pub use arrow::compute::SortOptions; -use arrow::compute::{concat, lexsort_to_indices, take, SortColumn, TakeOptions}; -use arrow::datatypes::SchemaRef; -use arrow::error::Result as ArrowResult; -use arrow::record_batch::RecordBatch; -use arrow::{array::ArrayRef, error::ArrowError}; - use super::{RecordBatchStream, SendableRecordBatchStream}; use crate::error::{DataFusionError, Result}; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::{ common, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SQLMetric, }; +pub use arrow::compute::SortOptions; +use arrow::compute::{lexsort_to_indices, take, SortColumn, TakeOptions}; +use arrow::datatypes::SchemaRef; +use arrow::error::Result as ArrowResult; +use arrow::record_batch::RecordBatch; +use arrow::{array::ArrayRef, error::ArrowError}; +use async_trait::async_trait; +use futures::stream::Stream; +use futures::Future; +use hashbrown::HashMap; +use pin_project_lite::pin_project; +use std::any::Any; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::Instant; /// Sort execution plan #[derive(Debug)] @@ -190,47 +186,25 @@ impl ExecutionPlan for SortExec { } } -fn sort_batches( - batches: &[RecordBatch], - schema: &SchemaRef, +fn sort_batch( + batch: RecordBatch, + schema: SchemaRef, expr: &[PhysicalSortExpr], -) -> ArrowResult> { - if batches.is_empty() { - return Ok(None); - } - // combine all record batches into one for each column - let combined_batch = RecordBatch::try_new( - schema.clone(), - schema - .fields() - .iter() - .enumerate() - .map(|(i, _)| { - concat( - &batches - .iter() - .map(|batch| batch.column(i).as_ref()) - .collect::>(), - ) - }) - .collect::>>()?, - )?; - - // sort combined record batch +) -> ArrowResult { // TODO: pushup the limit expression to sort let indices = lexsort_to_indices( &expr .iter() - .map(|e| e.evaluate_to_sort_column(&combined_batch)) + .map(|e| e.evaluate_to_sort_column(&batch)) .collect::>>() .map_err(DataFusionError::into_arrow_external_error)?, None, )?; // reorder all rows based on sorted indices - let sorted_batch = RecordBatch::try_new( - schema.clone(), - combined_batch + RecordBatch::try_new( + schema, + batch .columns() .iter() .map(|column| { @@ -245,8 +219,7 @@ fn sort_batches( ) }) .collect::>>()?, - ); - sorted_batch.map(Some) + ) } pin_project! { @@ -277,9 +250,14 @@ impl SortStream { .map_err(DataFusionError::into_arrow_external_error) .and_then(move |batches| { let now = Instant::now(); - let result = sort_batches(&batches, &schema, &expr); + // combine all record batches into one for each column + let combined = common::combine_batches(&batches, schema.clone())?; + // sort combined record batch + let result = combined + .map(|batch| sort_batch(batch, schema, &expr)) + .transpose()?; sort_time.add(now.elapsed().as_nanos() as usize); - result + Ok(result) }); tx.send(sorted_batch) diff --git a/integration-tests/sqls/simple_sort.sql b/integration-tests/sqls/simple_sort.sql new file mode 100644 index 0000000000000..50fb12dfdc707 --- /dev/null +++ b/integration-tests/sqls/simple_sort.sql @@ -0,0 +1,22 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + c2, + c3, + c10 +FROM test +ORDER BY c2 ASC, c3 DESC, c10; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 5bd308180e598..51861c583f8a6 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 5, msg="tests are missed") + self.assertEqual(len(files), 6, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From 63accf8630e734cd96ba11baa9a89b437703acc5 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 7 Jun 2021 22:56:39 +0800 Subject: [PATCH 167/329] closing up type checks (#518) --- .../core/src/serde/logical_plan/from_proto.rs | 6 +- .../core/src/serde/logical_plan/to_proto.rs | 17 +- datafusion/src/logical_plan/expr.rs | 4 +- datafusion/src/logical_plan/mod.rs | 1 + datafusion/src/logical_plan/window_frames.rs | 337 ++++++++++++++++++ 5 files changed, 351 insertions(+), 14 deletions(-) create mode 100644 datafusion/src/logical_plan/window_frames.rs diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 4a198174a2baa..36a37a1e472c0 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -21,15 +21,15 @@ use crate::error::BallistaError; use crate::serde::{proto_error, protobuf}; use crate::{convert_box_required, convert_required}; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +use datafusion::logical_plan::window_frames::{ + WindowFrame, WindowFrameBound, WindowFrameUnits, +}; use datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, sqrt, tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; -use datafusion::physical_plan::window_frames::{ - WindowFrame, WindowFrameBound, WindowFrameUnits, -}; use datafusion::physical_plan::window_functions::BuiltInWindowFunction; use datafusion::scalar::ScalarValue; use protobuf::logical_plan_node::LogicalPlanType; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 56270030b59f3..fb1383daab3a6 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -19,22 +19,17 @@ //! buffer format, allowing DataFusion logical plans to be serialized and transmitted between //! processes. 
-use std::{ - boxed, - convert::{TryFrom, TryInto}, -}; - use super::super::proto_error; use crate::datasource::DfTableAdapter; use crate::serde::{protobuf, BallistaError}; use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use datafusion::datasource::CsvFile; -use datafusion::logical_plan::{Expr, JoinType, LogicalPlan}; +use datafusion::logical_plan::{ + window_frames::{WindowFrame, WindowFrameBound, WindowFrameUnits}, + Expr, JoinType, LogicalPlan, +}; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::functions::BuiltinScalarFunction; -use datafusion::physical_plan::window_frames::{ - WindowFrame, WindowFrameBound, WindowFrameUnits, -}; use datafusion::physical_plan::window_functions::{ BuiltInWindowFunction, WindowFunction, }; @@ -43,6 +38,10 @@ use protobuf::{ arrow_type, logical_expr_node::ExprType, scalar_type, DateUnit, PrimitiveScalarType, ScalarListValue, ScalarType, }; +use std::{ + boxed, + convert::{TryFrom, TryInto}, +}; impl protobuf::IntervalUnit { pub fn from_arrow_interval_unit(interval_unit: &IntervalUnit) -> Self { diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index bbc6ffabe9289..d5c92dbd21438 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -20,10 +20,10 @@ pub use super::Operator; use crate::error::{DataFusionError, Result}; -use crate::logical_plan::{DFField, DFSchema}; +use crate::logical_plan::{window_frames, DFField, DFSchema}; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, - window_frames, window_functions, + window_functions, }; use crate::{physical_plan::udaf::AggregateUDF, scalar::ScalarValue}; use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index f948770e6437b..4a39e114d53f1 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -29,6 +29,7 @@ mod extension; mod operators; mod plan; mod registry; +pub mod window_frames; pub use builder::LogicalPlanBuilder; pub use dfschema::{DFField, DFSchema, DFSchemaRef, ToDFSchema}; pub use display::display_schema; diff --git a/datafusion/src/logical_plan/window_frames.rs b/datafusion/src/logical_plan/window_frames.rs new file mode 100644 index 0000000000000..f0be5a221fbf7 --- /dev/null +++ b/datafusion/src/logical_plan/window_frames.rs @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Window frame +//! +//! The frame-spec determines which output rows are read by an aggregate window function. The frame-spec consists of four parts: +//! - A frame type - either ROWS, RANGE or GROUPS, +//! 
- A starting frame boundary, +//! - An ending frame boundary, +//! - An EXCLUDE clause. + +use crate::error::{DataFusionError, Result}; +use sqlparser::ast; +use std::cmp::Ordering; +use std::convert::{From, TryFrom}; +use std::fmt; + +/// The frame-spec determines which output rows are read by an aggregate window function. +/// +/// The ending frame boundary can be omitted (if the BETWEEN and AND keywords that surround the +/// starting frame boundary are also omitted), in which case the ending frame boundary defaults to +/// CURRENT ROW. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct WindowFrame { + /// A frame type - either ROWS, RANGE or GROUPS + pub units: WindowFrameUnits, + /// A starting frame boundary + pub start_bound: WindowFrameBound, + /// An ending frame boundary + pub end_bound: WindowFrameBound, +} + +impl fmt::Display for WindowFrame { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{} BETWEEN {} AND {}", + self.units, self.start_bound, self.end_bound + )?; + Ok(()) + } +} + +impl TryFrom for WindowFrame { + type Error = DataFusionError; + + fn try_from(value: ast::WindowFrame) -> Result { + let start_bound = value.start_bound.into(); + let end_bound = value + .end_bound + .map(WindowFrameBound::from) + .unwrap_or(WindowFrameBound::CurrentRow); + + if let WindowFrameBound::Following(None) = start_bound { + Err(DataFusionError::Execution( + "Invalid window frame: start bound cannot be unbounded following" + .to_owned(), + )) + } else if let WindowFrameBound::Preceding(None) = end_bound { + Err(DataFusionError::Execution( + "Invalid window frame: end bound cannot be unbounded preceding" + .to_owned(), + )) + } else if start_bound > end_bound { + Err(DataFusionError::Execution(format!( + "Invalid window frame: start bound ({}) cannot be larger than end bound ({})", + start_bound, end_bound + ))) + } else { + let units = value.units.into(); + Ok(Self { + units, + start_bound, + end_bound, + }) + } + } +} + +impl Default for WindowFrame { + fn default() -> Self { + WindowFrame { + units: WindowFrameUnits::Range, + start_bound: WindowFrameBound::Preceding(None), + end_bound: WindowFrameBound::CurrentRow, + } + } +} + +/// There are five ways to describe starting and ending frame boundaries: +/// +/// 1. UNBOUNDED PRECEDING +/// 2. PRECEDING +/// 3. CURRENT ROW +/// 4. FOLLOWING +/// 5. UNBOUNDED FOLLOWING +/// +/// in this implementation we'll only allow to be u64 (i.e. no dynamic boundary) +#[derive(Debug, Clone, Copy, Eq)] +pub enum WindowFrameBound { + /// 1. UNBOUNDED PRECEDING + /// The frame boundary is the first row in the partition. + /// + /// 2. PRECEDING + /// must be a non-negative constant numeric expression. The boundary is a row that + /// is "units" prior to the current row. + Preceding(Option), + /// 3. The current row. + /// + /// For RANGE and GROUPS frame types, peers of the current row are also + /// included in the frame, unless specifically excluded by the EXCLUDE clause. + /// This is true regardless of whether CURRENT ROW is used as the starting or ending frame + /// boundary. + CurrentRow, + /// 4. This is the same as " PRECEDING" except that the boundary is units after the + /// current rather than before the current row. + /// + /// 5. UNBOUNDED FOLLOWING + /// The frame boundary is the last row in the partition. 
+ Following(Option), +} + +impl From for WindowFrameBound { + fn from(value: ast::WindowFrameBound) -> Self { + match value { + ast::WindowFrameBound::Preceding(v) => Self::Preceding(v), + ast::WindowFrameBound::Following(v) => Self::Following(v), + ast::WindowFrameBound::CurrentRow => Self::CurrentRow, + } + } +} + +impl fmt::Display for WindowFrameBound { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + WindowFrameBound::CurrentRow => f.write_str("CURRENT ROW"), + WindowFrameBound::Preceding(None) => f.write_str("UNBOUNDED PRECEDING"), + WindowFrameBound::Following(None) => f.write_str("UNBOUNDED FOLLOWING"), + WindowFrameBound::Preceding(Some(n)) => write!(f, "{} PRECEDING", n), + WindowFrameBound::Following(Some(n)) => write!(f, "{} FOLLOWING", n), + } + } +} + +impl PartialEq for WindowFrameBound { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl PartialOrd for WindowFrameBound { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for WindowFrameBound { + fn cmp(&self, other: &Self) -> Ordering { + self.get_rank().cmp(&other.get_rank()) + } +} + +impl WindowFrameBound { + /// get the rank of this window frame bound. + /// + /// the rank is a tuple of (u8, u64) because we'll firstly compare the kind and then the value + /// which requires special handling e.g. with preceding the larger the value the smaller the + /// rank and also for 0 preceding / following it is the same as current row + fn get_rank(&self) -> (u8, u64) { + match self { + WindowFrameBound::Preceding(None) => (0, 0), + WindowFrameBound::Following(None) => (4, 0), + WindowFrameBound::Preceding(Some(0)) + | WindowFrameBound::CurrentRow + | WindowFrameBound::Following(Some(0)) => (2, 0), + WindowFrameBound::Preceding(Some(v)) => (1, u64::MAX - *v), + WindowFrameBound::Following(Some(v)) => (3, *v), + } + } +} + +/// There are three frame types: ROWS, GROUPS, and RANGE. The frame type determines how the +/// starting and ending boundaries of the frame are measured. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WindowFrameUnits { + /// The ROWS frame type means that the starting and ending boundaries for the frame are + /// determined by counting individual rows relative to the current row. + Rows, + /// The RANGE frame type requires that the ORDER BY clause of the window have exactly one + /// term. Call that term "X". With the RANGE frame type, the elements of the frame are + /// determined by computing the value of expression X for all rows in the partition and framing + /// those rows for which the value of X is within a certain range of the value of X for the + /// current row. + Range, + /// The GROUPS frame type means that the starting and ending boundaries are determine + /// by counting "groups" relative to the current group. A "group" is a set of rows that all have + /// equivalent values for all all terms of the window ORDER BY clause. 
+ Groups, +} + +impl fmt::Display for WindowFrameUnits { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(match self { + WindowFrameUnits::Rows => "ROWS", + WindowFrameUnits::Range => "RANGE", + WindowFrameUnits::Groups => "GROUPS", + }) + } +} + +impl From for WindowFrameUnits { + fn from(value: ast::WindowFrameUnits) -> Self { + match value { + ast::WindowFrameUnits::Range => Self::Range, + ast::WindowFrameUnits::Groups => Self::Groups, + ast::WindowFrameUnits::Rows => Self::Rows, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_window_frame_creation() -> Result<()> { + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Following(None), + end_bound: None, + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: start bound cannot be unbounded following".to_owned() + ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Preceding(None), + end_bound: Some(ast::WindowFrameBound::Preceding(None)), + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: end bound cannot be unbounded preceding".to_owned() + ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Preceding(Some(1)), + end_bound: Some(ast::WindowFrameBound::Preceding(Some(2))), + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "Execution error: Invalid window frame: start bound (1 PRECEDING) cannot be larger than end bound (2 PRECEDING)".to_owned() + ); + Ok(()) + } + + #[test] + fn test_eq() { + assert_eq!( + WindowFrameBound::Preceding(Some(0)), + WindowFrameBound::CurrentRow + ); + assert_eq!( + WindowFrameBound::CurrentRow, + WindowFrameBound::Following(Some(0)) + ); + assert_eq!( + WindowFrameBound::Following(Some(2)), + WindowFrameBound::Following(Some(2)) + ); + assert_eq!( + WindowFrameBound::Following(None), + WindowFrameBound::Following(None) + ); + assert_eq!( + WindowFrameBound::Preceding(Some(2)), + WindowFrameBound::Preceding(Some(2)) + ); + assert_eq!( + WindowFrameBound::Preceding(None), + WindowFrameBound::Preceding(None) + ); + } + + #[test] + fn test_ord() { + assert!(WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::CurrentRow); + // ! yes this is correct! 
+ assert!( + WindowFrameBound::Preceding(Some(2)) < WindowFrameBound::Preceding(Some(1)) + ); + assert!( + WindowFrameBound::Preceding(Some(u64::MAX)) + < WindowFrameBound::Preceding(Some(u64::MAX - 1)) + ); + assert!( + WindowFrameBound::Preceding(None) + < WindowFrameBound::Preceding(Some(1000000)) + ); + assert!( + WindowFrameBound::Preceding(None) + < WindowFrameBound::Preceding(Some(u64::MAX)) + ); + assert!(WindowFrameBound::Preceding(None) < WindowFrameBound::Following(Some(0))); + assert!( + WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::Following(Some(1)) + ); + assert!(WindowFrameBound::CurrentRow < WindowFrameBound::Following(Some(1))); + assert!( + WindowFrameBound::Following(Some(1)) < WindowFrameBound::Following(Some(2)) + ); + assert!(WindowFrameBound::Following(Some(2)) < WindowFrameBound::Following(None)); + assert!( + WindowFrameBound::Following(Some(u64::MAX)) + < WindowFrameBound::Following(None) + ); + } +} From 2f73e795d3ae68638d6509bfa02388bfa3727381 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 8 Jun 2021 00:43:09 +0800 Subject: [PATCH 168/329] Refactor window aggregation, simplify batch processing logic (#516) * refactor sort exec stream and combine batches * refactor async function --- datafusion/src/physical_plan/sort.rs | 1 - datafusion/src/physical_plan/windows.rs | 149 +++++++++++------------- 2 files changed, 71 insertions(+), 79 deletions(-) diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 7747030d8a93e..437519a7d2a29 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -241,7 +241,6 @@ impl SortStream { sort_time: Arc, ) -> Self { let (tx, rx) = futures::channel::oneshot::channel(); - let schema = input.schema(); tokio::spawn(async move { let schema = input.schema(); diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 659d2183819d3..7eb14943facf1 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -19,7 +19,7 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ - aggregates, + aggregates, common, expressions::{Literal, NthValue, RowNumber}, type_coercion::coerce, window_functions::signature_for_built_in, @@ -29,20 +29,18 @@ use crate::physical_plan::{ RecordBatchStream, SendableRecordBatchStream, WindowAccumulator, WindowExpr, }; use crate::scalar::ScalarValue; -use arrow::compute::concat; use arrow::{ - array::{Array, ArrayRef}, + array::ArrayRef, datatypes::{Field, Schema, SchemaRef}, error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, }; use async_trait::async_trait; -use futures::stream::{Stream, StreamExt}; +use futures::stream::Stream; use futures::Future; use pin_project_lite::pin_project; use std::any::Any; use std::convert::TryInto; -use std::iter; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; @@ -339,22 +337,15 @@ fn window_aggregate_batch( window_accumulators: &mut [WindowAccumulatorItem], expressions: &[Vec>], ) -> Result>> { - // 1.1 iterate accumulators and respective expressions together - // 1.2 evaluate expressions - // 1.3 update / merge window accumulators with the expressions' values - - // 1.1 window_accumulators .iter_mut() .zip(expressions) .map(|(window_acc, expr)| { - // 1.2 let values = &expr .iter() - .map(|e| e.evaluate(batch)) + .map(|e| e.evaluate(&batch)) .map(|r| r.map(|v| v.into_array(batch.num_rows()))) .collect::>>()?; - 
window_acc.scan_batch(batch.num_rows(), values) }) .into_iter() @@ -380,60 +371,50 @@ fn create_window_accumulators( .collect::>>() } -async fn compute_window_aggregate( - schema: SchemaRef, +/// Compute the window aggregate columns +/// +/// 1. get a list of window accumulators +/// 2. evaluate the args +/// 3. scan args with window functions +/// 4. concat with final aggregations +/// +/// FIXME so far this fn does not support: +/// 1. partition by +/// 2. order by +/// 3. window frame +/// +/// which will require further work: +/// 1. inter-partition order by using vec partition-point (https://github.com/apache/arrow-datafusion/issues/360) +/// 2. inter-partition parallelism using one-shot channel (https://github.com/apache/arrow-datafusion/issues/299) +/// 3. convert aggregation based window functions to be self-contain so that: (https://github.com/apache/arrow-datafusion/issues/361) +/// a. some can be grow-only window-accumulating +/// b. some can be grow-and-shrink window-accumulating +/// c. some can be based on segment tree +fn compute_window_aggregates( window_expr: Vec>, - mut input: SendableRecordBatchStream, -) -> ArrowResult { - let mut window_accumulators = create_window_accumulators(&window_expr) - .map_err(DataFusionError::into_arrow_external_error)?; - - let expressions = window_expressions(&window_expr) - .map_err(DataFusionError::into_arrow_external_error)?; - - let expressions = Arc::new(expressions); - - // TODO each element shall have some size hint - let mut accumulator: Vec> = - iter::repeat(vec![]).take(window_expr.len()).collect(); - - let mut original_batches: Vec = vec![]; - - let mut total_num_rows = 0; - - while let Some(batch) = input.next().await { - let batch = batch?; - total_num_rows += batch.num_rows(); - original_batches.push(batch.clone()); - - let batch_aggregated = - window_aggregate_batch(&batch, &mut window_accumulators, &expressions) - .map_err(DataFusionError::into_arrow_external_error)?; - accumulator.iter_mut().zip(batch_aggregated).for_each( - |(acc_for_window, window_batch)| { - if let Some(data) = window_batch { - acc_for_window.push(data); - } - }, - ); + batch: &RecordBatch, +) -> Result> { + let mut window_accumulators = create_window_accumulators(&window_expr)?; + let expressions = Arc::new(window_expressions(&window_expr)?); + let num_rows = batch.num_rows(); + let window_aggregates = + window_aggregate_batch(batch, &mut window_accumulators, &expressions)?; + let final_aggregates = finalize_window_aggregation(&window_accumulators)?; + + // both must equal to window_expr.len() + if window_aggregates.len() != final_aggregates.len() { + return Err(DataFusionError::Internal( + "Impossibly got len mismatch".to_owned(), + )); } - let aggregated_mapped = finalize_window_aggregation(&window_accumulators) - .map_err(DataFusionError::into_arrow_external_error)?; - - let mut columns: Vec = accumulator + window_aggregates .iter() - .zip(aggregated_mapped) - .map(|(acc, agg)| { - Ok(match (acc, agg) { - (acc, Some(scalar_value)) if acc.is_empty() => { - scalar_value.to_array_of_size(total_num_rows) - } - (acc, None) if !acc.is_empty() => { - let vec_array: Vec<&dyn Array> = - acc.iter().map(|arc| arc.as_ref()).collect(); - concat(&vec_array)? 
- } + .zip(final_aggregates) + .map(|(wa, fa)| { + Ok(match (wa, fa) { + (None, Some(fa)) => fa.to_array_of_size(num_rows), + (Some(wa), None) if wa.len() == num_rows => wa.clone(), _ => { return Err(DataFusionError::Execution( "Invalid window function behavior".to_owned(), @@ -441,20 +422,7 @@ async fn compute_window_aggregate( } }) }) - .collect::>>() - .map_err(DataFusionError::into_arrow_external_error)?; - - for i in 0..(schema.fields().len() - window_expr.len()) { - let col = concat( - &original_batches - .iter() - .map(|batch| batch.column(i).as_ref()) - .collect::>(), - )?; - columns.push(col); - } - - RecordBatch::try_new(schema.clone(), columns) + .collect() } impl WindowAggStream { @@ -467,7 +435,8 @@ impl WindowAggStream { let (tx, rx) = futures::channel::oneshot::channel(); let schema_clone = schema.clone(); tokio::spawn(async move { - let result = compute_window_aggregate(schema_clone, window_expr, input).await; + let schema = schema_clone.clone(); + let result = WindowAggStream::process(input, window_expr, schema).await; tx.send(result) }); @@ -477,6 +446,30 @@ impl WindowAggStream { schema, } } + + async fn process( + input: SendableRecordBatchStream, + window_expr: Vec>, + schema: SchemaRef, + ) -> ArrowResult { + let input_schema = input.schema(); + let batches = common::collect(input) + .await + .map_err(DataFusionError::into_arrow_external_error)?; + let batch = common::combine_batches(&batches, input_schema.clone())?; + if let Some(batch) = batch { + // calculate window cols + let mut columns = compute_window_aggregates(window_expr, &batch) + .map_err(DataFusionError::into_arrow_external_error)?; + // combine with the original cols + // note the setup of window aggregates is that they newly calculated window + // expressions are always prepended to the columns + columns.extend_from_slice(batch.columns()); + RecordBatch::try_new(schema, columns) + } else { + Ok(RecordBatch::new_empty(schema)) + } + } } impl Stream for WindowAggStream { From e39f3116684b836829a2e02c9013d8a84d87b82e Mon Sep 17 00:00:00 2001 From: Rich Date: Tue, 8 Jun 2021 23:26:03 +0800 Subject: [PATCH 169/329] 110 support group by positions (#519) * 110 support group by positions * try resolve positions via array, not map * Add comment for i64 and simplify the pattern match * combine match and if condition, add more test cases * replace '0 as i64' with 0_i64 --- datafusion/src/sql/planner.rs | 42 +++++++++++++++++++++++++++++++---- datafusion/src/sql/utils.rs | 22 ++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 6bf7b776c8db7..7df0068c5f547 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -56,6 +56,7 @@ use super::{ can_columns_satisfy_exprs, expand_wildcard, expr_as_column_expr, extract_aliases, find_aggregate_exprs, find_column_exprs, find_window_exprs, group_window_expr_by_sort_keys, rebase_expr, resolve_aliases_to_exprs, + resolve_positions_to_exprs, }, }; @@ -582,15 +583,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // All of the aggregate expressions (deduplicated). 
let aggr_exprs = find_aggregate_exprs(&aggr_expr_haystack); + let alias_map = extract_aliases(&select_exprs); let group_by_exprs = select .group_by .iter() .map(|e| { let group_by_expr = self.sql_expr_to_logical_expr(e)?; - let group_by_expr = resolve_aliases_to_exprs( - &group_by_expr, - &extract_aliases(&select_exprs), - )?; + let group_by_expr = resolve_aliases_to_exprs(&group_by_expr, &alias_map)?; + let group_by_expr = + resolve_positions_to_exprs(&group_by_expr, &select_exprs)?; self.validate_schema_satisfies_exprs( plan.schema(), &[group_by_expr.clone()], @@ -2326,6 +2327,39 @@ mod tests { ); } + #[test] + fn select_simple_aggregate_with_groupby_can_use_positions() { + quick_test( + "SELECT state, age AS b, COUNT(1) FROM person GROUP BY 1, 2", + "Projection: #state, #age AS b, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#state, #age]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: person projection=None", + ); + quick_test( + "SELECT state, age AS b, COUNT(1) FROM person GROUP BY 2, 1", + "Projection: #state, #age AS b, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#age, #state]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: person projection=None", + ); + } + + #[test] + fn select_simple_aggregate_with_groupby_position_out_of_range() { + let sql = "SELECT state, MIN(age) FROM person GROUP BY 0"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"Projection references non-aggregate values\")", + format!("{:?}", err) + ); + + let sql2 = "SELECT state, MIN(age) FROM person GROUP BY 5"; + let err2 = logical_plan(sql2).expect_err("query should have failed"); + assert_eq!( + "Plan(\"Projection references non-aggregate values\")", + format!("{:?}", err2) + ); + } + #[test] fn select_simple_aggregate_with_groupby_can_use_alias() { quick_test( diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 7a5dc0da1b535..848fb3ee31fc3 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -18,6 +18,7 @@ //! SQL Utility Functions use crate::logical_plan::{DFSchema, Expr, LogicalPlan}; +use crate::scalar::ScalarValue; use crate::{ error::{DataFusionError, Result}, logical_plan::{ExpressionVisitor, Recursion}, @@ -392,6 +393,27 @@ pub(crate) fn extract_aliases(exprs: &[Expr]) -> HashMap { .collect::>() } +pub(crate) fn resolve_positions_to_exprs( + expr: &Expr, + select_exprs: &[Expr], +) -> Result { + match expr { + // sql_expr_to_logical_expr maps number to i64 + // https://github.com/apache/arrow-datafusion/blob/8d175c759e17190980f270b5894348dc4cff9bbf/datafusion/src/sql/planner.rs#L882-L887 + Expr::Literal(ScalarValue::Int64(Some(position))) + if position > &0_i64 && position <= &(select_exprs.len() as i64) => + { + let index = (position - 1) as usize; + let select_expr = &select_exprs[index]; + match select_expr { + Expr::Alias(nested_expr, _alias_name) => Ok(*nested_expr.clone()), + _ => Ok(select_expr.clone()), + } + } + _ => Ok(expr.clone()), + } +} + /// Rebuilds an `Expr` with columns that refer to aliases replaced by the /// alias' underlying `Expr`. pub(crate) fn resolve_aliases_to_exprs( From 8495f95d7b510109c70cf2b4b606ba020bffd27a Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Tue, 8 Jun 2021 23:32:19 +0200 Subject: [PATCH 170/329] Wrong aggregation arguments error. 
(#505) * Fix aggregate fn with invalid column * Fix error message * Fix error message * fix clippy * fix message and test * avoid unwrap in test_aggregation_with_bad_arguments * Update datafusion/tests/sql.rs Co-authored-by: Andrew Lamb * Fix test_aggregation_with_bad_arguments Co-authored-by: Andrew Lamb --- datafusion/src/physical_plan/aggregates.rs | 9 ++++++++- datafusion/tests/sql.rs | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 3607f29debba1..60025a316228d 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -113,7 +113,14 @@ pub fn create_aggregate_expr( name: String, ) -> Result> { // coerce - let arg = coerce(args, input_schema, &signature(fun))?[0].clone(); + let arg = coerce(args, input_schema, &signature(fun))?; + if arg.is_empty() { + return Err(DataFusionError::Plan(format!( + "Invalid or wrong number of arguments passed to aggregate: '{}'", + name, + ))); + } + let arg = arg[0].clone(); let arg_types = args .iter() diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index d77671e7f4ffd..5ce1884049d84 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -3437,3 +3437,15 @@ async fn test_physical_plan_display_indent_multi_children() { expected, actual ); } + +#[tokio::test] +async fn test_aggregation_with_bad_arguments() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_csv(&mut ctx)?; + let sql = "SELECT COUNT(DISTINCT) FROM aggregate_test_100"; + let logical_plan = ctx.create_logical_plan(&sql)?; + let physical_plan = ctx.create_physical_plan(&logical_plan); + let err = physical_plan.unwrap_err(); + assert_eq!(err.to_string(), "Error during planning: Invalid or wrong number of arguments passed to aggregate: 'COUNT(DISTINCT )'"); + Ok(()) +} From 42f908e2b5d2bd2abd4e396ade1a94fb0ff28ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 9 Jun 2021 20:23:23 +0200 Subject: [PATCH 171/329] Define the unittests using pytest (#493) * Use pytest * Formatting * Update GHA conf * Remove TODO note * Format * Test requirements file * Update workflow file * Merge requirements file * Update workflow file --- .github/workflows/python_test.yaml | 2 +- dev/release/rat_exclude_files.txt | 1 + python/requirements.in | 1 + python/requirements.txt | 47 ++-- python/tests/generic.py | 51 ++-- python/tests/test_df.py | 136 +++++----- python/tests/test_sql.py | 416 +++++++++++------------------ python/tests/test_udaf.py | 86 +++--- 8 files changed, 328 insertions(+), 412 deletions(-) diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index 13516ff699dac..e689396b5dcd1 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -53,7 +53,7 @@ jobs: pip install -r requirements.txt maturin develop - python -m unittest discover tests + pytest -v . 
env: CARGO_HOME: "/home/runner/.cargo" CARGO_TARGET_DIR: "/home/runner/target" diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 6126699bbc1fa..96beccd0af81e 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -105,3 +105,4 @@ benchmarks/queries/q*.sql ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock python/rust-toolchain +python/requirements*.txt diff --git a/python/requirements.in b/python/requirements.in index 3ef9f18966d4b..4ff7f4ee618ba 100644 --- a/python/requirements.in +++ b/python/requirements.in @@ -17,3 +17,4 @@ maturin toml pyarrow +pytest diff --git a/python/requirements.txt b/python/requirements.txt index ff02b80cf6fc3..f7ede1ebd58e2 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,25 +1,17 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. # # This file is autogenerated by pip-compile # To update, run: # -# pip-compile --generate-hashes +# pip-compile --generate-hashes requirements.in # +attrs==21.2.0 \ + --hash=sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1 \ + --hash=sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb + # via pytest +iniconfig==1.1.1 \ + --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ + --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 + # via pytest maturin==0.10.6 \ --hash=sha256:0e81496f70a4805e6ea7dda7b0425246c111ccb119a2e22c64abeff131f4dd21 \ --hash=sha256:3b5d5429bc05a816824420d99973f0cab39d8e274f6c3647bfd9afd95a030304 \ @@ -59,6 +51,18 @@ numpy==1.20.3 \ --hash=sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17 \ --hash=sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48 # via pyarrow +packaging==20.9 \ + --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 \ + --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a + # via pytest +pluggy==0.13.1 \ + --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 \ + --hash=sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d + # via pytest +py==1.10.0 \ + --hash=sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3 \ + --hash=sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a + # via pytest pyarrow==4.0.1 \ --hash=sha256:04be0f7cb9090bd029b5b53bed628548fef569e5d0b5c6cd7f6d0106dbbc782d \ --hash=sha256:0fde9c7a3d5d37f3fe5d18c4ed015e8f585b68b26d72a10d7012cad61afe43ff \ @@ -86,9 +90,18 @@ pyarrow==4.0.1 \ --hash=sha256:fa7b165cfa97158c1e6d15c68428317b4f4ae786d1dc2dbab43f1328c1eb43aa \ --hash=sha256:fe976695318560a97c6d31bba828eeca28c44c6f6401005e54ba476a28ac0a10 # via -r requirements.in 
+pyparsing==2.4.7 \ + --hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 \ + --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b + # via packaging +pytest==6.2.4 \ + --hash=sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b \ + --hash=sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890 + # via -r requirements.in toml==0.10.2 \ --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f # via # -r requirements.in # maturin + # pytest diff --git a/python/tests/generic.py b/python/tests/generic.py index 267d6f656ce01..e61542e6ab37f 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -16,24 +16,30 @@ # under the License. import datetime -import numpy -import pyarrow + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq # used to write parquet files -import pyarrow.parquet def data(): - data = numpy.concatenate( - [numpy.random.normal(0, 0.01, size=50), numpy.random.normal(50, 0.01, size=50)] + np.random.seed(1) + data = np.concatenate( + [ + np.random.normal(0, 0.01, size=50), + np.random.normal(50, 0.01, size=50), + ] ) - return pyarrow.array(data) + return pa.array(data) def data_with_nans(): - data = numpy.random.normal(0, 0.01, size=50) - mask = numpy.random.randint(0, 2, size=50) - data[mask == 0] = numpy.NaN + np.random.seed(0) + data = np.random.normal(0, 0.01, size=50) + mask = np.random.randint(0, 2, size=50) + data[mask == 0] = np.NaN return data @@ -43,8 +49,19 @@ def data_datetime(f): datetime.datetime.now() - datetime.timedelta(days=1), datetime.datetime.now() + datetime.timedelta(days=1), ] - return pyarrow.array( - data, type=pyarrow.timestamp(f), mask=numpy.array([False, True, False]) + return pa.array( + data, type=pa.timestamp(f), mask=np.array([False, True, False]) + ) + + +def data_date32(): + data = [ + datetime.date(2000, 1, 1), + datetime.date(1980, 1, 1), + datetime.date(2030, 1, 1), + ] + return pa.array( + data, type=pa.date32(), mask=np.array([False, True, False]) ) @@ -54,16 +71,16 @@ def data_timedelta(f): datetime.timedelta(days=1), datetime.timedelta(seconds=1), ] - return pyarrow.array( - data, type=pyarrow.duration(f), mask=numpy.array([False, True, False]) + return pa.array( + data, type=pa.duration(f), mask=np.array([False, True, False]) ) def data_binary_other(): - return numpy.array([1, 0, 0], dtype="u4") + return np.array([1, 0, 0], dtype="u4") def write_parquet(path, data): - table = pyarrow.Table.from_arrays([data], names=["a"]) - pyarrow.parquet.write_table(table, path) - return path + table = pa.Table.from_arrays([data], names=["a"]) + pq.write_table(table, path) + return str(path) diff --git a/python/tests/test_df.py b/python/tests/test_df.py index fdafdfa7f509c..5b6cbddbd74ba 100644 --- a/python/tests/test_df.py +++ b/python/tests/test_df.py @@ -15,100 +15,98 @@ # specific language governing permissions and limitations # under the License. 
-import unittest - import pyarrow as pa -import datafusion +import pytest +from datafusion import ExecutionContext +from datafusion import functions as f + + +@pytest.fixture +def df(): + ctx = ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) -f = datafusion.functions + return ctx.create_dataframe([[batch]]) -class TestCase(unittest.TestCase): - def _prepare(self): - ctx = datafusion.ExecutionContext() +def test_select(df): + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ) - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - return ctx.create_dataframe([[batch]]) + # execute and collect the first (and only) batch + result = df.collect()[0] - def test_select(self): - df = self._prepare() + assert result.column(0) == pa.array([5, 7, 9]) + assert result.column(1) == pa.array([-3, -3, -3]) - df = df.select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), - ) - # execute and collect the first (and only) batch - result = df.collect()[0] +def test_filter(df): + df = df.select( + f.col("a") + f.col("b"), + f.col("a") - f.col("b"), + ).filter(f.col("a") > f.lit(2)) - self.assertEqual(result.column(0), pa.array([5, 7, 9])) - self.assertEqual(result.column(1), pa.array([-3, -3, -3])) + # execute and collect the first (and only) batch + result = df.collect()[0] - def test_filter(self): - df = self._prepare() + assert result.column(0) == pa.array([9]) + assert result.column(1) == pa.array([-3]) - df = df.select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), - ).filter(f.col("a") > f.lit(2)) - # execute and collect the first (and only) batch - result = df.collect()[0] +def test_sort(df): + df = df.sort([f.col("b").sort(ascending=False)]) - self.assertEqual(result.column(0), pa.array([9])) - self.assertEqual(result.column(1), pa.array([-3])) + table = pa.Table.from_batches(df.collect()) + expected = {"a": [3, 2, 1], "b": [6, 5, 4]} - def test_sort(self): - df = self._prepare() - df = df.sort([f.col("b").sort(ascending=False)]) + assert table.to_pydict() == expected - table = pa.Table.from_batches(df.collect()) - expected = {"a": [3, 2, 1], "b": [6, 5, 4]} - self.assertEqual(table.to_pydict(), expected) - def test_limit(self): - df = self._prepare() +def test_limit(df): + df = df.limit(1) - df = df.limit(1) + # execute and collect the first (and only) batch + result = df.collect()[0] - # execute and collect the first (and only) batch - result = df.collect()[0] + assert len(result.column(0)) == 1 + assert len(result.column(1)) == 1 - self.assertEqual(len(result.column(0)), 1) - self.assertEqual(len(result.column(1)), 1) - def test_udf(self): - df = self._prepare() +def test_udf(df): + # is_null is a pa function over arrays + udf = f.udf(lambda x: x.is_null(), [pa.int64()], pa.bool_()) - # is_null is a pa function over arrays - udf = f.udf(lambda x: x.is_null(), [pa.int64()], pa.bool_()) + df = df.select(udf(f.col("a"))) + result = df.collect()[0].column(0) - df = df.select(udf(f.col("a"))) + assert result == pa.array([False, False, False]) - self.assertEqual(df.collect()[0].column(0), pa.array([False, False, False])) - def test_join(self): - ctx = datafusion.ExecutionContext() +def test_join(): + ctx = ExecutionContext() - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - 
df = ctx.create_dataframe([[batch]]) + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2]), pa.array([8, 10])], - names=["a", "c"], - ) - df1 = ctx.create_dataframe([[batch]]) + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([8, 10])], + names=["a", "c"], + ) + df1 = ctx.create_dataframe([[batch]]) - df = df.join(df1, on="a", how="inner") - df = df.sort([f.col("a").sort(ascending=True)]) - table = pa.Table.from_batches(df.collect()) + df = df.join(df1, on="a", how="inner") + df = df.sort([f.col("a").sort(ascending=True)]) + table = pa.Table.from_batches(df.collect()) - expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} - self.assertEqual(table.to_pydict(), expected) + expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} + assert table.to_pydict() == expected diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 117284973fb77..361526d069702 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -15,286 +15,182 @@ # specific language governing permissions and limitations # under the License. -import unittest -import tempfile -import datetime -import os.path -import shutil +import numpy as np +import pyarrow as pa +import pytest +from datafusion import ExecutionContext -import numpy -import pyarrow -import datafusion +from . import generic as helpers -# used to write parquet files -import pyarrow.parquet -from tests.generic import * +@pytest.fixture +def ctx(): + return ExecutionContext() -class TestCase(unittest.TestCase): - def setUp(self): - # Create a temporary directory - self.test_dir = tempfile.mkdtemp() - numpy.random.seed(1) +def test_no_table(ctx): + with pytest.raises(Exception, match="DataFusion error"): + ctx.sql("SELECT a FROM b").collect() - def tearDown(self): - # Remove the directory after the test - shutil.rmtree(self.test_dir) - def test_no_table(self): - with self.assertRaises(Exception): - datafusion.Context().sql("SELECT a FROM b").collect() +def test_register(ctx, tmp_path): + path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) + ctx.register_parquet("t", path) - def test_register(self): - ctx = datafusion.ExecutionContext() + assert ctx.tables() == {"t"} - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) - ctx.register_parquet("t", path) +def test_execute(ctx, tmp_path): + data = [1, 1, 2, 2, 3, 11, 12] - self.assertEqual(ctx.tables(), {"t"}) + # single column, "a" + path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) + ctx.register_parquet("t", path) - def test_execute(self): - data = [1, 1, 2, 2, 3, 11, 12] + assert ctx.tables() == {"t"} - ctx = datafusion.ExecutionContext() + # count + result = ctx.sql("SELECT COUNT(a) FROM t").collect() - # single column, "a" - path = write_parquet( - os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data) - ) - ctx.register_parquet("t", path) + expected = pa.array([7], pa.uint64()) + expected = [pa.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + assert result == expected - self.assertEqual(ctx.tables(), {"t"}) + # where + expected = pa.array([2], pa.uint64()) + expected = [pa.RecordBatch.from_arrays([expected], ["COUNT(a)"])] + result = ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect() + assert result == expected - # count - result = ctx.sql("SELECT COUNT(a) FROM t").collect() + # group by + results = ctx.sql( + "SELECT CAST(a as int), COUNT(a) FROM t GROUP 
BY CAST(a as int)" + ).collect() - expected = pyarrow.array([7], pyarrow.uint64()) - expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] - self.assertEqual(expected, result) + # group by returns batches + result_keys = [] + result_values = [] + for result in results: + pydict = result.to_pydict() + result_keys.extend(pydict["CAST(a AS Int32)"]) + result_values.extend(pydict["COUNT(a)"]) - # where - expected = pyarrow.array([2], pyarrow.uint64()) - expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])] - self.assertEqual( - expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect() - ) + result_keys, result_values = ( + list(t) for t in zip(*sorted(zip(result_keys, result_values))) + ) - # group by - results = ctx.sql( - "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)" - ).collect() - - # group by returns batches - result_keys = [] - result_values = [] - for result in results: - pydict = result.to_pydict() - result_keys.extend(pydict["CAST(a AS Int32)"]) - result_values.extend(pydict["COUNT(a)"]) - - result_keys, result_values = ( - list(t) for t in zip(*sorted(zip(result_keys, result_values))) - ) + assert result_keys == [1, 2, 3, 11, 12] + assert result_values == [2, 2, 1, 1, 1] - self.assertEqual(result_keys, [1, 2, 3, 11, 12]) - self.assertEqual(result_values, [2, 2, 1, 1, 1]) - - # order by - result = ctx.sql( - "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2" - ).collect() - expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64()) - expected_cast = pyarrow.array([50, 50], pyarrow.int32()) - expected = [ - pyarrow.RecordBatch.from_arrays( - [expected_a, expected_cast], ["a", "CAST(a AS Int32)"] - ) - ] - numpy.testing.assert_equal(expected[0].column(1), expected[0].column(1)) - - def test_cast(self): - """ - Verify that we can cast - """ - ctx = datafusion.ExecutionContext() - - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data()) - ctx.register_parquet("t", path) - - valid_types = [ - "smallint", - "int", - "bigint", - "float(32)", - "float(64)", - "float", - ] - - select = ", ".join( - [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] + # order by + result = ctx.sql( + "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2" + ).collect() + expected_a = pa.array([50.0219, 50.0152], pa.float64()) + expected_cast = pa.array([50, 50], pa.int32()) + expected = [ + pa.RecordBatch.from_arrays( + [expected_a, expected_cast], ["a", "CAST(a AS Int32)"] ) - - # can execute, which implies that we can cast - ctx.sql(f"SELECT {select} FROM t").collect() - - def _test_udf(self, udf, args, return_type, array, expected): - ctx = datafusion.ExecutionContext() - - # write to disk - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array) - ctx.register_parquet("t", path) - - ctx.register_udf("udf", udf, args, return_type) - - batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() - - result = batches[0].column(0) - - self.assertEqual(expected, result) - - def test_udf_identity(self): - self._test_udf( + ] + np.testing.assert_equal(expected[0].column(1), expected[0].column(1)) + + +def test_cast(ctx, tmp_path): + """ + Verify that we can cast + """ + path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) + ctx.register_parquet("t", path) + + valid_types = [ + "smallint", + "int", + "bigint", + "float(32)", + "float(64)", + "float", + ] + + select = ", ".join( + [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] + ) + + # can execute, which implies that 
we can cast + ctx.sql(f"SELECT {select} FROM t").collect() + + +@pytest.mark.parametrize( + ("fn", "input_types", "output_type", "input_values", "expected_values"), + [ + ( lambda x: x, - [pyarrow.float64()], - pyarrow.float64(), - pyarrow.array([-1.2, None, 1.2]), - pyarrow.array([-1.2, None, 1.2]), - ) - - def test_udf(self): - self._test_udf( + [pa.float64()], + pa.float64(), + [-1.2, None, 1.2], + [-1.2, None, 1.2], + ), + ( lambda x: x.is_null(), - [pyarrow.float64()], - pyarrow.bool_(), - pyarrow.array([-1.2, None, 1.2]), - pyarrow.array([False, True, False]), - ) - - -class TestIO(unittest.TestCase): - def setUp(self): - # Create a temporary directory - self.test_dir = tempfile.mkdtemp() - - def tearDown(self): - # Remove the directory after the test - shutil.rmtree(self.test_dir) - - def _test_data(self, data): - ctx = datafusion.ExecutionContext() - - # write to disk - path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data) - ctx.register_parquet("t", path) - - batches = ctx.sql("SELECT a AS tt FROM t").collect() - - result = batches[0].column(0) - - numpy.testing.assert_equal(data, result) - - def test_nans(self): - self._test_data(data_with_nans()) - - def test_utf8(self): - array = pyarrow.array( - ["a", "b", "c"], pyarrow.utf8(), numpy.array([False, True, False]) - ) - self._test_data(array) - - def test_large_utf8(self): - array = pyarrow.array( - ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False]) - ) - self._test_data(array) - - # Error from Arrow - @unittest.expectedFailure - def test_datetime_s(self): - self._test_data(data_datetime("s")) - - # C data interface missing - @unittest.expectedFailure - def test_datetime_ms(self): - self._test_data(data_datetime("ms")) - - # C data interface missing - @unittest.expectedFailure - def test_datetime_us(self): - self._test_data(data_datetime("us")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_datetime_ns(self): - self._test_data(data_datetime("ns")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_s(self): - self._test_data(data_timedelta("s")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_ms(self): - self._test_data(data_timedelta("ms")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_us(self): - self._test_data(data_timedelta("us")) - - # Not writtable to parquet - @unittest.expectedFailure - def test_timedelta_ns(self): - self._test_data(data_timedelta("ns")) - - def test_date32(self): - array = pyarrow.array( - [ - datetime.date(2000, 1, 1), - datetime.date(1980, 1, 1), - datetime.date(2030, 1, 1), - ], - pyarrow.date32(), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_binary_variable(self): - array = pyarrow.array( - [b"1", b"2", b"3"], pyarrow.binary(), numpy.array([False, True, False]) - ) - self._test_data(array) - - # C data interface missing - @unittest.expectedFailure - def test_binary_fixed(self): - array = pyarrow.array( - [b"1111", b"2222", b"3333"], - pyarrow.binary(4), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_large_binary(self): - array = pyarrow.array( - [b"1111", b"2222", b"3333"], - pyarrow.large_binary(), - numpy.array([False, True, False]), - ) - self._test_data(array) - - def test_binary_other(self): - self._test_data(data_binary_other()) - - def test_bool(self): - array = pyarrow.array( - [False, True, True], None, numpy.array([False, True, False]) - ) - self._test_data(array) - - def 
test_u32(self): - array = pyarrow.array([0, 1, 2], None, numpy.array([False, True, False])) - self._test_data(array) + [pa.float64()], + pa.bool_(), + [-1.2, None, 1.2], + [False, True, False], + ), + ], +) +def test_udf( + ctx, tmp_path, fn, input_types, output_type, input_values, expected_values +): + # write to disk + path = helpers.write_parquet( + tmp_path / "a.parquet", pa.array(input_values) + ) + ctx.register_parquet("t", path) + ctx.register_udf("udf", fn, input_types, output_type) + + batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect() + result = batches[0].column(0) + + assert result == pa.array(expected_values) + + +_null_mask = np.array([False, True, False]) + + +@pytest.mark.parametrize( + "arr", + [ + pa.array(["a", "b", "c"], pa.utf8(), _null_mask), + pa.array(["a", "b", "c"], pa.large_utf8(), _null_mask), + pa.array([b"1", b"2", b"3"], pa.binary(), _null_mask), + pa.array([b"1111", b"2222", b"3333"], pa.large_binary(), _null_mask), + pa.array([False, True, True], None, _null_mask), + pa.array([0, 1, 2], None), + helpers.data_binary_other(), + helpers.data_date32(), + helpers.data_with_nans(), + # C data interface missing + pytest.param( + pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), + marks=pytest.mark.xfail, + ), + pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail), + pytest.param(helpers.data_datetime("ns"), marks=pytest.mark.xfail), + # Not writtable to parquet + pytest.param(helpers.data_timedelta("s"), marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("ms"), marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("us"), marks=pytest.mark.xfail), + pytest.param(helpers.data_timedelta("ns"), marks=pytest.mark.xfail), + ], +) +def test_simple_select(ctx, tmp_path, arr): + path = helpers.write_parquet(tmp_path / "a.parquet", arr) + ctx.register_parquet("t", path) + + batches = ctx.sql("SELECT a AS tt FROM t").collect() + result = batches[0].column(0) + + np.testing.assert_equal(result, arr) diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index e1e4f933a9b47..b24c08dbc8674 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -15,12 +15,11 @@ # specific language governing permissions and limitations # under the License. -import unittest -import pyarrow -import pyarrow.compute -import datafusion - -f = datafusion.functions +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from datafusion import ExecutionContext +from datafusion import functions as f class Accumulator: @@ -29,63 +28,54 @@ class Accumulator: """ def __init__(self): - self._sum = pyarrow.scalar(0.0) + self._sum = pa.scalar(0.0) - def to_scalars(self) -> [pyarrow.Scalar]: + def to_scalars(self) -> [pa.Scalar]: return [self._sum] - def update(self, values: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(values).as_py() - ) + def update(self, values: pa.Array) -> None: + # Not nice since pyarrow scalars can't be summed yet. + # This breaks on `None` + self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: pyarrow.Array) -> None: - # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(states).as_py() - ) + def merge(self, states: pa.Array) -> None: + # Not nice since pyarrow scalars can't be summed yet. + # This breaks on `None` + self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py()) - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: return self._sum -class TestCase(unittest.TestCase): - def _prepare(self): - ctx = datafusion.ExecutionContext() +@pytest.fixture +def df(): + ctx = ExecutionContext() - # create a RecordBatch and a new DataFrame from it - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 4, 6])], - names=["a", "b"], - ) - return ctx.create_dataframe([[batch]]) + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) - def test_aggregate(self): - df = self._prepare() - udaf = f.udaf( - Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] - ) +def test_aggregate(df): + udaf = f.udaf(Accumulator, pa.float64(), pa.float64(), [pa.float64()]) - df = df.aggregate([], [udaf(f.col("a"))]) + df = df.aggregate([], [udaf(f.col("a"))]) - # execute and collect the first (and only) batch - result = df.collect()[0] + # execute and collect the first (and only) batch + result = df.collect()[0] - self.assertEqual(result.column(0), pyarrow.array([1.0 + 2.0 + 3.0])) + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) - def test_group_by(self): - df = self._prepare() - udaf = f.udaf( - Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()] - ) +def test_group_by(df): + udaf = f.udaf(Accumulator, pa.float64(), pa.float64(), [pa.float64()]) - df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) + df = df.aggregate([f.col("b")], [udaf(f.col("a"))]) - # execute and collect the first (and only) batch - batches = df.collect() - arrays = [batch.column(1) for batch in batches] - joined = pyarrow.concat_arrays(arrays) - self.assertEqual(joined, pyarrow.array([1.0 + 2.0, 3.0])) + batches = df.collect() + arrays = [batch.column(1) for batch in batches] + joined = pa.concat_arrays(arrays) + assert joined == pa.array([1.0 + 2.0, 3.0]) From d5bca0e350d94a1e1063bed8a0da0cb09c6e3e1c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 10 Jun 2021 02:26:01 +0800 Subject: [PATCH 172/329] Add `partition by` constructs in window functions and modify logical planning (#501) * closing up type checks * add fmt --- ballista/rust/core/proto/ballista.proto | 2 +- .../core/src/serde/logical_plan/from_proto.rs | 8 + .../core/src/serde/logical_plan/to_proto.rs | 6 + .../src/serde/physical_plan/from_proto.rs | 8 + datafusion/src/logical_plan/expr.rs | 14 +- datafusion/src/logical_plan/plan.rs | 6 +- datafusion/src/optimizer/utils.rs | 46 +++- datafusion/src/sql/planner.rs | 217 +++++++++++++----- datafusion/src/sql/utils.rs | 57 ++++- 9 files changed, 280 insertions(+), 84 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 38d87e934e5fa..85af9023fb468 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -174,7 +174,7 @@ message WindowExprNode { // udaf = 3 } LogicalExprNode expr = 4; - // repeated LogicalExprNode partition_by = 5; + repeated LogicalExprNode partition_by = 5; repeated LogicalExprNode order_by = 6; // repeated LogicalExprNode filter = 7; 
oneof window_frame { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 36a37a1e472c0..86daeb063c471 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -910,6 +910,12 @@ impl TryInto for &protobuf::LogicalExprNode { .window_function .as_ref() .ok_or_else(|| proto_error("Received empty window function"))?; + let partition_by = expr + .partition_by + .iter() + .map(|e| e.try_into()) + .into_iter() + .collect::, _>>()?; let order_by = expr .order_by .iter() @@ -940,6 +946,7 @@ impl TryInto for &protobuf::LogicalExprNode { AggregateFunction::from(aggr_function), ), args: vec![parse_required_expr(&expr.expr)?], + partition_by, order_by, window_frame, }) @@ -960,6 +967,7 @@ impl TryInto for &protobuf::LogicalExprNode { BuiltInWindowFunction::from(built_in_function), ), args: vec![parse_required_expr(&expr.expr)?], + partition_by, order_by, window_frame, }) diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index fb1383daab3a6..5d996843d6248 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -1006,6 +1006,7 @@ impl TryInto for &Expr { Expr::WindowFunction { ref fun, ref args, + ref partition_by, ref order_by, ref window_frame, .. @@ -1023,6 +1024,10 @@ impl TryInto for &Expr { } }; let arg = &args[0]; + let partition_by = partition_by + .iter() + .map(|e| e.try_into()) + .collect::, _>>()?; let order_by = order_by .iter() .map(|e| e.try_into()) @@ -1035,6 +1040,7 @@ impl TryInto for &Expr { let window_expr = Box::new(protobuf::WindowExprNode { expr: Some(Box::new(arg.try_into()?)), window_function: Some(window_function), + partition_by, order_by, window_frame, }); diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 5fcc971527c67..b319d5b25f121 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -236,7 +236,9 @@ impl TryInto> for &protobuf::PhysicalPlanNode { Expr::WindowFunction { fun, args, + partition_by, order_by, + window_frame, .. } => { let arg = df_planner @@ -248,9 +250,15 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .map_err(|e| { BallistaError::General(format!("{:?}", e)) })?; + if !partition_by.is_empty() { + return Err(BallistaError::NotImplemented("Window function with partition by is not yet implemented".to_owned())); + } if !order_by.is_empty() { return Err(BallistaError::NotImplemented("Window function with order by is not yet implemented".to_owned())); } + if window_frame.is_some() { + return Err(BallistaError::NotImplemented("Window function with window frame is not yet implemented".to_owned())); + } let window_expr = create_window_expr( &fun, &[arg], diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index d5c92dbd21438..58dba16f02efe 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -194,6 +194,8 @@ pub enum Expr { fun: window_functions::WindowFunction, /// List of expressions to feed to the functions as arguments args: Vec, + /// List of partition by expressions + partition_by: Vec, /// List of order by expressions order_by: Vec, /// Window frame @@ -588,10 +590,18 @@ impl Expr { Expr::ScalarUDF { args, .. 
} => args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor)), - Expr::WindowFunction { args, order_by, .. } => { + Expr::WindowFunction { + args, + partition_by, + order_by, + .. + } => { let visitor = args .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor))?; + let visitor = partition_by + .iter() + .try_fold(visitor, |visitor, arg| arg.accept(visitor))?; let visitor = order_by .iter() .try_fold(visitor, |visitor, arg| arg.accept(visitor))?; @@ -733,11 +743,13 @@ impl Expr { Expr::WindowFunction { args, fun, + partition_by, order_by, window_frame, } => Expr::WindowFunction { args: rewrite_vec(args, rewriter)?, fun, + partition_by: rewrite_vec(partition_by, rewriter)?, order_by: rewrite_vec(order_by, rewriter)?, window_frame, }, diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 25cf9e33d2ca7..3344dce1d81df 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -687,11 +687,7 @@ impl LogicalPlan { LogicalPlan::Window { ref window_expr, .. } => { - write!( - f, - "WindowAggr: windowExpr=[{:?}] partitionBy=[]", - window_expr - ) + write!(f, "WindowAggr: windowExpr=[{:?}]", window_expr) } LogicalPlan::Aggregate { ref group_expr, diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 65c95bee20d46..e707d30bc9ace 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -36,6 +36,7 @@ use crate::{ const CASE_EXPR_MARKER: &str = "__DATAFUSION_CASE_EXPR__"; const CASE_ELSE_MARKER: &str = "__DATAFUSION_CASE_ELSE__"; +const WINDOW_PARTITION_MARKER: &str = "__DATAFUSION_WINDOW_PARTITION__"; const WINDOW_SORT_MARKER: &str = "__DATAFUSION_WINDOW_SORT__"; /// Recursively walk a list of expression trees, collecting the unique set of column @@ -258,9 +259,16 @@ pub fn expr_sub_expressions(expr: &Expr) -> Result> { Expr::IsNotNull(e) => Ok(vec![e.as_ref().to_owned()]), Expr::ScalarFunction { args, .. } => Ok(args.clone()), Expr::ScalarUDF { args, .. } => Ok(args.clone()), - Expr::WindowFunction { args, order_by, .. } => { + Expr::WindowFunction { + args, + partition_by, + order_by, + .. + } => { let mut expr_list: Vec = vec![]; expr_list.extend(args.clone()); + expr_list.push(lit(WINDOW_PARTITION_MARKER)); + expr_list.extend(partition_by.clone()); expr_list.push(lit(WINDOW_SORT_MARKER)); expr_list.extend(order_by.clone()); Ok(expr_list) @@ -340,7 +348,20 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { Expr::WindowFunction { fun, window_frame, .. 
} => { - let index = expressions + let partition_index = expressions + .iter() + .position(|expr| { + matches!(expr, Expr::Literal(ScalarValue::Utf8(Some(str))) + if str == WINDOW_PARTITION_MARKER) + }) + .ok_or_else(|| { + DataFusionError::Internal( + "Ill-formed window function expressions: unexpected marker" + .to_owned(), + ) + })?; + + let sort_index = expressions .iter() .position(|expr| { matches!(expr, Expr::Literal(ScalarValue::Utf8(Some(str))) @@ -351,12 +372,21 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { "Ill-formed window function expressions".to_owned(), ) })?; - Ok(Expr::WindowFunction { - fun: fun.clone(), - args: expressions[..index].to_vec(), - order_by: expressions[index + 1..].to_vec(), - window_frame: *window_frame, - }) + + if partition_index >= sort_index { + Err(DataFusionError::Internal( + "Ill-formed window function expressions: partition index too large" + .to_owned(), + )) + } else { + Ok(Expr::WindowFunction { + fun: fun.clone(), + args: expressions[..partition_index].to_vec(), + partition_by: expressions[partition_index + 1..sort_index].to_vec(), + order_by: expressions[sort_index + 1..].to_vec(), + window_frame: *window_frame, + }) + } } Expr::AggregateFunction { fun, distinct, .. } => Ok(Expr::AggregateFunction { fun: fun.clone(), diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 7df0068c5f547..53f22ecaf3f25 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1122,52 +1122,53 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // then, window function if let Some(window) = &function.over { - if window.partition_by.is_empty() { - let order_by = window - .order_by - .iter() - .map(|e| self.order_by_to_sort_expr(e)) - .into_iter() - .collect::>>()?; - let window_frame = window - .window_frame - .as_ref() - .map(|window_frame| window_frame.clone().try_into()) - .transpose()?; - let fun = window_functions::WindowFunction::from_str(&name); - if let Ok(window_functions::WindowFunction::AggregateFunction( + let partition_by = window + .partition_by + .iter() + .map(|e| self.sql_expr_to_logical_expr(e)) + .into_iter() + .collect::>>()?; + let order_by = window + .order_by + .iter() + .map(|e| self.order_by_to_sort_expr(e)) + .into_iter() + .collect::>>()?; + let window_frame = window + .window_frame + .as_ref() + .map(|window_frame| window_frame.clone().try_into()) + .transpose()?; + let fun = window_functions::WindowFunction::from_str(&name)?; + match fun { + window_functions::WindowFunction::AggregateFunction( aggregate_fun, - )) = fun - { + ) => { return Ok(Expr::WindowFunction { fun: window_functions::WindowFunction::AggregateFunction( aggregate_fun.clone(), ), args: self .aggregate_fn_to_expr(&aggregate_fun, function)?, + partition_by, order_by, window_frame, }); - } else if let Ok( - window_functions::WindowFunction::BuiltInWindowFunction( - window_fun, - ), - ) = fun - { + } + window_functions::WindowFunction::BuiltInWindowFunction( + window_fun, + ) => { return Ok(Expr::WindowFunction { fun: window_functions::WindowFunction::BuiltInWindowFunction( window_fun, ), args: self.function_args_to_expr(function)?, + partition_by, order_by, window_frame, }); } } - return Err(DataFusionError::NotImplemented(format!( - "Unsupported OVER clause ({})", - window - ))); } // next, aggregate built-ins @@ -2775,7 +2776,7 @@ mod tests { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(order_id)\ - \n WindowAggr: 
windowExpr=[[MAX(#order_id)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2785,7 +2786,7 @@ mod tests { let sql = "SELECT order_id oid, MAX(order_id) OVER () max_oid from orders"; let expected = "\ Projection: #order_id AS oid, #MAX(order_id) AS max_oid\ - \n WindowAggr: windowExpr=[[MAX(#order_id)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#order_id)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2795,7 +2796,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(qty Multiply Float64(1.1))\ - \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2806,20 +2807,29 @@ mod tests { "SELECT order_id, MAX(qty) OVER (), min(qty) over (), aVg(qty) OVER () from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #MIN(qty), #AVG(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------- + /// WindowAgg (cost=69.83..87.33 rows=1000 width=8) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` #[test] - fn over_partition_by_not_supported() { - let sql = - "SELECT order_id, MAX(delivered) OVER (PARTITION BY order_id) from orders"; - let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - "NotImplemented(\"Unsupported OVER clause (PARTITION BY order_id)\")", - format!("{:?}", err) - ); + fn over_partition_by() { + let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); } /// psql result @@ -2839,9 +2849,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2852,9 +2862,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; 
quick_test(sql, expected); @@ -2865,9 +2875,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2878,9 +2888,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #order_id, #MAX(qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2903,9 +2913,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY (order_id + 1)) from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id Plus Int64(1) ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2929,10 +2939,10 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[SUM(#qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2956,10 +2966,10 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[SUM(#qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2987,15 +2997,108 @@ mod tests { let expected = "\ Sort: #order_id ASC NULLS FIRST\ \n Projection: #order_id, 
#MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]] partitionBy=[]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[SUM(#qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]] partitionBy=[]\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------- + /// WindowAgg (cost=69.83..89.83 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id, qty + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + #[test] + fn over_partition_by_order_by() { + let sql = + "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------- + /// WindowAgg (cost=69.83..89.83 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id, qty + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + #[test] + fn over_partition_by_order_by_no_dup() { + let sql = + "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ---------------------------------------------------------------------------------- + /// WindowAgg (cost=142.16..162.16 rows=1000 width=16) + /// -> Sort (cost=142.16..144.66 rows=1000 width=12) + /// Sort Key: qty, order_id + /// -> WindowAgg (cost=69.83..92.33 rows=1000 width=12) + /// -> Sort (cost=69.83..72.33 rows=1000 width=8) + /// Sort Key: order_id, qty + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=8) + /// ``` + #[test] + fn over_partition_by_order_by_mix_up() { + let sql = + "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ + \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + /// psql result + /// ``` + /// QUERY PLAN + /// ----------------------------------------------------------------------------- + /// WindowAgg (cost=69.83..109.83 rows=1000 width=24) + /// -> WindowAgg (cost=69.83..92.33 rows=1000 width=20) + /// -> Sort (cost=69.83..72.33 rows=1000 width=16) + /// Sort Key: order_id, qty, price + /// -> Seq Scan on orders (cost=0.00..20.00 rows=1000 width=16) + /// ``` + /// FIXME: for now we are not detecting prefix of sorting keys in order to save one sort exec phase + #[test] + fn 
over_partition_by_order_by_mix_up_prefix() { + let sql = + "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders"; + let expected = "\ + Projection: #order_id, #MAX(qty), #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#qty)]]\ + \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST, #price ASC NULLS FIRST\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn only_union_all_supported() { let sql = "SELECT order_id from orders EXCEPT SELECT order_id FROM orders"; diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 848fb3ee31fc3..5e9b9526ea834 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -239,6 +239,7 @@ where Expr::WindowFunction { fun, args, + partition_by, order_by, window_frame, } => Ok(Expr::WindowFunction { @@ -247,6 +248,10 @@ where .iter() .map(|e| clone_with_replacement(e, replacement_fn)) .collect::>>()?, + partition_by: partition_by + .iter() + .map(|e| clone_with_replacement(e, replacement_fn)) + .collect::>>()?, order_by: order_by .iter() .map(|e| clone_with_replacement(e, replacement_fn)) @@ -432,19 +437,38 @@ pub(crate) fn resolve_aliases_to_exprs( }) } +type WindowSortKey = Vec; + +fn generate_sort_key(partition_by: &[Expr], order_by: &[Expr]) -> WindowSortKey { + let mut sort_key = vec![]; + partition_by.iter().for_each(|e| { + let e = e.clone().sort(true, true); + if !sort_key.contains(&e) { + sort_key.push(e); + } + }); + order_by.iter().for_each(|e| { + if !sort_key.contains(&e) { + sort_key.push(e.clone()); + } + }); + sort_key +} + /// group a slice of window expression expr by their order by expressions pub(crate) fn group_window_expr_by_sort_keys( window_expr: &[Expr], -) -> Result)>> { +) -> Result)>> { let mut result = vec![]; window_expr.iter().try_for_each(|expr| match expr { - Expr::WindowFunction { order_by, .. } => { + Expr::WindowFunction { partition_by, order_by, .. 
} => { + let sort_key = generate_sort_key(partition_by, order_by); if let Some((_, values)) = result.iter_mut().find( - |group: &&mut (&[Expr], Vec<&Expr>)| matches!(group, (key, _) if key == order_by), + |group: &&mut (WindowSortKey, Vec<&Expr>)| matches!(group, (key, _) if *key == sort_key), ) { values.push(expr); } else { - result.push((order_by, vec![expr])) + result.push((sort_key, vec![expr])) } Ok(()) } @@ -466,7 +490,7 @@ mod tests { #[test] fn test_group_window_expr_by_sort_keys_empty_case() -> Result<()> { let result = group_window_expr_by_sort_keys(&[])?; - let expected: Vec<(&[Expr], Vec<&Expr>)> = vec![]; + let expected: Vec<(WindowSortKey, Vec<&Expr>)> = vec![]; assert_eq!(expected, result); Ok(()) } @@ -476,32 +500,35 @@ mod tests { let max1 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], + partition_by: vec![], order_by: vec![], window_frame: None, }; let max2 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], + partition_by: vec![], order_by: vec![], window_frame: None, }; let min3 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Min), args: vec![col("name")], + partition_by: vec![], order_by: vec![], window_frame: None, }; let sum4 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), args: vec![col("age")], + partition_by: vec![], order_by: vec![], window_frame: None, }; - // FIXME use as_ref let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; let result = group_window_expr_by_sort_keys(exprs)?; - let key = &[]; - let expected: Vec<(&[Expr], Vec<&Expr>)> = + let key = vec![]; + let expected: Vec<(WindowSortKey, Vec<&Expr>)> = vec![(key, vec![&max1, &max2, &min3, &sum4])]; assert_eq!(expected, result); Ok(()) @@ -527,24 +554,28 @@ mod tests { let max1 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], + partition_by: vec![], order_by: vec![age_asc.clone(), name_desc.clone()], window_frame: None, }; let max2 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], + partition_by: vec![], order_by: vec![], window_frame: None, }; let min3 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Min), args: vec![col("name")], + partition_by: vec![], order_by: vec![age_asc.clone(), name_desc.clone()], window_frame: None, }; let sum4 = Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), args: vec![col("age")], + partition_by: vec![], order_by: vec![name_desc.clone(), age_asc.clone(), created_at_desc.clone()], window_frame: None, }; @@ -552,11 +583,11 @@ mod tests { let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; let result = group_window_expr_by_sort_keys(exprs)?; - let key1 = &[age_asc.clone(), name_desc.clone()]; - let key2 = &[]; - let key3 = &[name_desc, age_asc, created_at_desc]; + let key1 = vec![age_asc.clone(), name_desc.clone()]; + let key2 = vec![]; + let key3 = vec![name_desc, age_asc, created_at_desc]; - let expected: Vec<(&[Expr], Vec<&Expr>)> = vec![ + let expected: Vec<(WindowSortKey, Vec<&Expr>)> = vec![ (key1, vec![&max1, &min3]), (key2, vec![&max2]), (key3, vec![&sum4]), @@ -571,6 +602,7 @@ mod tests { Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Max), args: vec![col("name")], + partition_by: vec![], order_by: 
vec![ Expr::Sort { expr: Box::new(col("age")), @@ -588,6 +620,7 @@ mod tests { Expr::WindowFunction { fun: WindowFunction::AggregateFunction(AggregateFunction::Sum), args: vec![col("age")], + partition_by: vec![], order_by: vec![ Expr::Sort { expr: Box::new(col("name")), From 8f84564edab1679163d91691f63381f38907d515 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 10 Jun 2021 09:18:15 -0400 Subject: [PATCH 173/329] Return errors properly from RepartitionExec (#521) --- datafusion/src/physical_plan/repartition.rs | 205 ++++++++++++++++++-- datafusion/src/test/exec.rs | 183 ++++++++++++++++- 2 files changed, 372 insertions(+), 16 deletions(-) diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index e5747dda88b75..37d98c7d118b6 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -147,12 +147,13 @@ impl ExecutionPlan for RepartitionExec { let fetch_time = self.fetch_time_nanos.clone(); let repart_time = self.repart_time_nanos.clone(); let send_time = self.send_time_nanos.clone(); - let mut txs: HashMap<_, _> = channels + let txs: HashMap<_, _> = channels .iter() .map(|(partition, (tx, _rx))| (*partition, tx.clone())) .collect(); let partitioning = self.partitioning.clone(); - let _: JoinHandle> = tokio::spawn(async move { + let mut txs_captured = txs.clone(); + let input_task: JoinHandle> = tokio::spawn(async move { // execute the child operator let now = Instant::now(); let mut stream = input.execute(i).await?; @@ -170,13 +171,13 @@ impl ExecutionPlan for RepartitionExec { if result.is_none() { break; } - let result = result.unwrap(); + let result: ArrowResult = result.unwrap(); match &partitioning { Partitioning::RoundRobinBatch(_) => { let now = Instant::now(); let output_partition = counter % num_output_partitions; - let tx = txs.get_mut(&output_partition).unwrap(); + let tx = txs_captured.get_mut(&output_partition).unwrap(); tx.send(Some(result)).map_err(|e| { DataFusionError::Execution(e.to_string()) })?; @@ -230,7 +231,9 @@ impl ExecutionPlan for RepartitionExec { ); repart_time.add(now.elapsed().as_nanos() as usize); let now = Instant::now(); - let tx = txs.get_mut(&num_output_partition).unwrap(); + let tx = txs_captured + .get_mut(&num_output_partition) + .unwrap(); tx.send(Some(output_batch)).map_err(|e| { DataFusionError::Execution(e.to_string()) })?; @@ -249,13 +252,12 @@ impl ExecutionPlan for RepartitionExec { counter += 1; } - // notify each output partition that this input partition has no more data - for (_, tx) in txs { - tx.send(None) - .map_err(|e| DataFusionError::Execution(e.to_string()))?; - } Ok(()) }); + + // In a separate task, wait for each input to be done + // (and pass along any errors) + tokio::spawn(async move { Self::wait_for_task(input_task, txs).await }); } } @@ -308,6 +310,45 @@ impl RepartitionExec { send_time_nanos: SQLMetric::time_nanos(), }) } + + /// Waits for `input_task` which is consuming one of the inputs to + /// complete. Upon each successful completion, sends a `None` to + /// each of the output tx channels to signal one of the inputs is + /// complete. Upon error, propagates the errors to all output tx + /// channels. + async fn wait_for_task( + input_task: JoinHandle>, + txs: HashMap>>>, + ) { + // wait for completion, and propagate error + // note we ignore errors on send (.ok) as that means the receiver has already shutdown. 
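        // Channel protocol, for reference: each output partition's receiver
        // sees Some(Ok(batch)) for data, Some(Err(e)) if the input task fails
        // or panics, and a final None once this input partition is exhausted.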
+ match input_task.await { + // Error in joining task + Err(e) => { + for (_, tx) in txs { + let err = DataFusionError::Execution(format!("Join Error: {}", e)); + let err = Err(err.into_arrow_external_error()); + tx.send(Some(err)).ok(); + } + } + // Error from running input task + Ok(Err(e)) => { + for (_, tx) in txs { + // wrap it because need to send error to all output partitions + let err = DataFusionError::Execution(e.to_string()); + let err = Err(err.into_arrow_external_error()); + tx.send(Some(err)).ok(); + } + } + // Input task completed successfully + Ok(Ok(())) => { + // notify each output partition that this input partition has no more data + for (_, tx) in txs { + tx.send(None).ok(); + } + } + } + } } struct RepartitionStream { @@ -356,10 +397,17 @@ impl RecordBatchStream for RepartitionStream { #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::memory::MemoryExec; - use arrow::array::UInt32Array; + use crate::{ + assert_batches_sorted_eq, + physical_plan::memory::MemoryExec, + test::exec::{ErrorExec, MockExec}, + }; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; + use arrow::{ + array::{ArrayRef, StringArray, UInt32Array}, + error::ArrowError, + }; #[tokio::test] async fn one_to_many_round_robin() -> Result<()> { @@ -517,4 +565,137 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn unsupported_partitioning() { + // have to send at least one batch through to provoke error + let batch = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["foo", "bar"])) as ArrayRef, + )]) + .unwrap(); + + let schema = batch.schema(); + let input = MockExec::new(vec![Ok(batch)], schema); + // This generates an error (partitioning type not supported) + // but only after the plan is executed. The error should be + // returned and no results produced + let partitioning = Partitioning::UnknownPartitioning(1); + let exec = RepartitionExec::try_new(Arc::new(input), partitioning).unwrap(); + let output_stream = exec.execute(0).await.unwrap(); + + // Expect that an error is returned + let result_string = crate::physical_plan::common::collect(output_stream) + .await + .unwrap_err() + .to_string(); + assert!( + result_string + .contains("Unsupported repartitioning scheme UnknownPartitioning(1)"), + "actual: {}", + result_string + ); + } + + #[tokio::test] + async fn error_for_input_exec() { + // This generates an error on a call to execute. The error + // should be returned and no results produced. + + let input = ErrorExec::new(); + let partitioning = Partitioning::RoundRobinBatch(1); + let exec = RepartitionExec::try_new(Arc::new(input), partitioning).unwrap(); + + // Note: this should pass (the stream can be created) but the + // error when the input is executed should get passed back + let output_stream = exec.execute(0).await.unwrap(); + + // Expect that an error is returned + let result_string = crate::physical_plan::common::collect(output_stream) + .await + .unwrap_err() + .to_string(); + assert!( + result_string.contains("ErrorExec, unsurprisingly, errored in partition 0"), + "actual: {}", + result_string + ); + } + + #[tokio::test] + async fn repartition_with_error_in_stream() { + let batch = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["foo", "bar"])) as ArrayRef, + )]) + .unwrap(); + + // input stream returns one good batch and then one error. The + // error should be returned. 
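        // ComputeError is presumably used here because MockExec's clone_error
        // helper (added in test/exec.rs below) only knows how to clone that variant.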
+ let err = Err(ArrowError::ComputeError("bad data error".to_string())); + + let schema = batch.schema(); + let input = MockExec::new(vec![Ok(batch), err], schema); + let partitioning = Partitioning::RoundRobinBatch(1); + let exec = RepartitionExec::try_new(Arc::new(input), partitioning).unwrap(); + + // Note: this should pass (the stream can be created) but the + // error when the input is executed should get passed back + let output_stream = exec.execute(0).await.unwrap(); + + // Expect that an error is returned + let result_string = crate::physical_plan::common::collect(output_stream) + .await + .unwrap_err() + .to_string(); + assert!( + result_string.contains("bad data error"), + "actual: {}", + result_string + ); + } + + #[tokio::test] + async fn repartition_with_delayed_stream() { + let batch1 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["foo", "bar"])) as ArrayRef, + )]) + .unwrap(); + + let batch2 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["frob", "baz"])) as ArrayRef, + )]) + .unwrap(); + + // The mock exec doesn't return immediately (instead it + // requires the input to wait at least once) + let schema = batch1.schema(); + let expected_batches = vec![batch1.clone(), batch2.clone()]; + let input = MockExec::new(vec![Ok(batch1), Ok(batch2)], schema); + let partitioning = Partitioning::RoundRobinBatch(1); + + let exec = RepartitionExec::try_new(Arc::new(input), partitioning).unwrap(); + + let expected = vec![ + "+------------------+", + "| my_awesome_field |", + "+------------------+", + "| foo |", + "| bar |", + "| frob |", + "| baz |", + "+------------------+", + ]; + + assert_batches_sorted_eq!(&expected, &expected_batches); + + let output_stream = exec.execute(0).await.unwrap(); + let batches = crate::physical_plan::common::collect(output_stream) + .await + .unwrap(); + + assert_batches_sorted_eq!(&expected, &batches); + } } diff --git a/datafusion/src/test/exec.rs b/datafusion/src/test/exec.rs index 04cd29530c016..bcd94dd6d6397 100644 --- a/datafusion/src/test/exec.rs +++ b/datafusion/src/test/exec.rs @@ -17,14 +17,25 @@ //! Simple iterator over batches for use in testing -use std::task::{Context, Poll}; +use async_trait::async_trait; +use std::{ + any::Any, + sync::Arc, + task::{Context, Poll}, +}; use arrow::{ - datatypes::SchemaRef, error::Result as ArrowResult, record_batch::RecordBatch, + datatypes::{DataType, Field, Schema, SchemaRef}, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, }; -use futures::Stream; +use futures::{Stream, StreamExt}; +use tokio_stream::wrappers::ReceiverStream; -use crate::physical_plan::RecordBatchStream; +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ + ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, +}; /// Index into the data that has been returned so far #[derive(Debug, Default, Clone)] @@ -100,3 +111,167 @@ impl RecordBatchStream for TestStream { self.data[0].schema() } } + +/// A Mock ExecutionPlan that can be used for writing tests of other ExecutionPlans +/// +#[derive(Debug)] +pub struct MockExec { + /// the results to send back + data: Vec>, + schema: SchemaRef, +} + +impl MockExec { + /// Create a new exec with a single partition that returns the + /// record batches in this Exec. Note the batches are not produced + /// immediately (the caller has to actually yield and another task + /// must run) to ensure any poll loops are correct. 
+ pub fn new(data: Vec>, schema: SchemaRef) -> Self { + Self { data, schema } + } +} + +#[async_trait] +impl ExecutionPlan for MockExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn children(&self) -> Vec> { + unimplemented!() + } + + fn with_new_children( + &self, + _children: Vec>, + ) -> Result> { + unimplemented!() + } + + /// Returns a stream which yields data + async fn execute(&self, partition: usize) -> Result { + assert_eq!(partition, 0); + + let schema = self.schema(); + + // Result doesn't implement clone, so do it ourself + let data: Vec<_> = self + .data + .iter() + .map(|r| match r { + Ok(batch) => Ok(batch.clone()), + Err(e) => Err(clone_error(e)), + }) + .collect(); + + let (tx, rx) = tokio::sync::mpsc::channel(2); + + // task simply sends data in order but in a separate + // thread (to ensure the batches are not available without the + // DelayedStream yielding). + tokio::task::spawn(async move { + for batch in data { + println!("Sending batch via delayed stream"); + if let Err(e) = tx.send(batch).await { + println!("ERROR batch via delayed stream: {}", e); + } + } + }); + + // returned stream simply reads off the rx stream + let stream = DelayedStream { + schema, + inner: ReceiverStream::new(rx), + }; + Ok(Box::pin(stream)) + } +} + +fn clone_error(e: &ArrowError) -> ArrowError { + use ArrowError::*; + match e { + ComputeError(msg) => ComputeError(msg.to_string()), + _ => unimplemented!(), + } +} + +#[derive(Debug)] +pub struct DelayedStream { + schema: SchemaRef, + inner: ReceiverStream>, +} + +impl Stream for DelayedStream { + type Item = ArrowResult; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + self.inner.poll_next_unpin(cx) + } +} + +impl RecordBatchStream for DelayedStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +/// A mock execution plan that errors on a call to execute +#[derive(Debug)] +pub struct ErrorExec { + schema: SchemaRef, +} +impl ErrorExec { + pub fn new() -> Self { + let schema = Arc::new(Schema::new(vec![Field::new( + "dummy", + DataType::Int64, + true, + )])); + Self { schema } + } +} + +#[async_trait] +impl ExecutionPlan for ErrorExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn children(&self) -> Vec> { + unimplemented!() + } + + fn with_new_children( + &self, + _children: Vec>, + ) -> Result> { + unimplemented!() + } + + /// Returns a stream which yields data + async fn execute(&self, partition: usize) -> Result { + Err(DataFusionError::Internal(format!( + "ErrorExec, unsurprisingly, errored in partition {}", + partition + ))) + } +} From 77775b77967a1912b2a423618e4eaa44192bdc23 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 10 Jun 2021 22:55:59 +0800 Subject: [PATCH 174/329] add boundary check (#530) --- .../core/src/serde/logical_plan/from_proto.rs | 10 +- datafusion/src/logical_plan/window_frames.rs | 35 ++ datafusion/src/physical_plan/mod.rs | 1 - datafusion/src/physical_plan/window_frames.rs | 337 ------------------ datafusion/src/sql/planner.rs | 58 ++- 5 files changed, 95 insertions(+), 346 deletions(-) delete mode 100644 datafusion/src/physical_plan/window_frames.rs diff --git 
a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 86daeb063c471..894a5f0a7d985 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -927,10 +927,18 @@ impl TryInto for &protobuf::LogicalExprNode { .as_ref() .map::, _>(|e| match e { window_expr_node::WindowFrame::Frame(frame) => { - frame.clone().try_into() + let window_frame: WindowFrame = frame.clone().try_into()?; + if WindowFrameUnits::Range == window_frame.units + && order_by.len() != 1 + { + Err(proto_error("With window frame of type RANGE, the order by expression must be of length 1")) + } else { + Ok(window_frame) + } } }) .transpose()?; + match window_function { window_expr_node::WindowFunction::AggrFunction(i) => { let aggr_function = protobuf::AggregateFunction::from_i32(*i) diff --git a/datafusion/src/logical_plan/window_frames.rs b/datafusion/src/logical_plan/window_frames.rs index f0be5a221fbf7..8aaebd3155c19 100644 --- a/datafusion/src/logical_plan/window_frames.rs +++ b/datafusion/src/logical_plan/window_frames.rs @@ -82,6 +82,22 @@ impl TryFrom for WindowFrame { ))) } else { let units = value.units.into(); + if units == WindowFrameUnits::Range { + for bound in &[start_bound, end_bound] { + match bound { + WindowFrameBound::Preceding(Some(v)) + | WindowFrameBound::Following(Some(v)) + if *v > 0 => + { + Err(DataFusionError::NotImplemented(format!( + "With WindowFrameUnits={}, the bound cannot be {} PRECEDING or FOLLOWING at the moment", + units, v + ))) + } + _ => Ok(()), + }?; + } + } Ok(Self { units, start_bound, @@ -270,6 +286,25 @@ mod tests { result.err().unwrap().to_string(), "Execution error: Invalid window frame: start bound (1 PRECEDING) cannot be larger than end bound (2 PRECEDING)".to_owned() ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Range, + start_bound: ast::WindowFrameBound::Preceding(Some(2)), + end_bound: Some(ast::WindowFrameBound::Preceding(Some(1))), + }; + let result = WindowFrame::try_from(window_frame); + assert_eq!( + result.err().unwrap().to_string(), + "This feature is not implemented: With WindowFrameUnits=RANGE, the bound cannot be 2 PRECEDING or FOLLOWING at the moment".to_owned() + ); + + let window_frame = ast::WindowFrame { + units: ast::WindowFrameUnits::Rows, + start_bound: ast::WindowFrameBound::Preceding(Some(2)), + end_bound: Some(ast::WindowFrameBound::Preceding(Some(1))), + }; + let result = WindowFrame::try_from(window_frame); + assert!(result.is_ok()); Ok(()) } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 490e02875c428..af6969c43cbd6 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -617,6 +617,5 @@ pub mod udf; #[cfg(feature = "unicode_expressions")] pub mod unicode_expressions; pub mod union; -pub mod window_frames; pub mod window_functions; pub mod windows; diff --git a/datafusion/src/physical_plan/window_frames.rs b/datafusion/src/physical_plan/window_frames.rs deleted file mode 100644 index f0be5a221fbf7..0000000000000 --- a/datafusion/src/physical_plan/window_frames.rs +++ /dev/null @@ -1,337 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Window frame -//! -//! The frame-spec determines which output rows are read by an aggregate window function. The frame-spec consists of four parts: -//! - A frame type - either ROWS, RANGE or GROUPS, -//! - A starting frame boundary, -//! - An ending frame boundary, -//! - An EXCLUDE clause. - -use crate::error::{DataFusionError, Result}; -use sqlparser::ast; -use std::cmp::Ordering; -use std::convert::{From, TryFrom}; -use std::fmt; - -/// The frame-spec determines which output rows are read by an aggregate window function. -/// -/// The ending frame boundary can be omitted (if the BETWEEN and AND keywords that surround the -/// starting frame boundary are also omitted), in which case the ending frame boundary defaults to -/// CURRENT ROW. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct WindowFrame { - /// A frame type - either ROWS, RANGE or GROUPS - pub units: WindowFrameUnits, - /// A starting frame boundary - pub start_bound: WindowFrameBound, - /// An ending frame boundary - pub end_bound: WindowFrameBound, -} - -impl fmt::Display for WindowFrame { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{} BETWEEN {} AND {}", - self.units, self.start_bound, self.end_bound - )?; - Ok(()) - } -} - -impl TryFrom for WindowFrame { - type Error = DataFusionError; - - fn try_from(value: ast::WindowFrame) -> Result { - let start_bound = value.start_bound.into(); - let end_bound = value - .end_bound - .map(WindowFrameBound::from) - .unwrap_or(WindowFrameBound::CurrentRow); - - if let WindowFrameBound::Following(None) = start_bound { - Err(DataFusionError::Execution( - "Invalid window frame: start bound cannot be unbounded following" - .to_owned(), - )) - } else if let WindowFrameBound::Preceding(None) = end_bound { - Err(DataFusionError::Execution( - "Invalid window frame: end bound cannot be unbounded preceding" - .to_owned(), - )) - } else if start_bound > end_bound { - Err(DataFusionError::Execution(format!( - "Invalid window frame: start bound ({}) cannot be larger than end bound ({})", - start_bound, end_bound - ))) - } else { - let units = value.units.into(); - Ok(Self { - units, - start_bound, - end_bound, - }) - } - } -} - -impl Default for WindowFrame { - fn default() -> Self { - WindowFrame { - units: WindowFrameUnits::Range, - start_bound: WindowFrameBound::Preceding(None), - end_bound: WindowFrameBound::CurrentRow, - } - } -} - -/// There are five ways to describe starting and ending frame boundaries: -/// -/// 1. UNBOUNDED PRECEDING -/// 2. PRECEDING -/// 3. CURRENT ROW -/// 4. FOLLOWING -/// 5. UNBOUNDED FOLLOWING -/// -/// in this implementation we'll only allow to be u64 (i.e. no dynamic boundary) -#[derive(Debug, Clone, Copy, Eq)] -pub enum WindowFrameBound { - /// 1. UNBOUNDED PRECEDING - /// The frame boundary is the first row in the partition. - /// - /// 2. PRECEDING - /// must be a non-negative constant numeric expression. 
The boundary is a row that - /// is "units" prior to the current row. - Preceding(Option), - /// 3. The current row. - /// - /// For RANGE and GROUPS frame types, peers of the current row are also - /// included in the frame, unless specifically excluded by the EXCLUDE clause. - /// This is true regardless of whether CURRENT ROW is used as the starting or ending frame - /// boundary. - CurrentRow, - /// 4. This is the same as " PRECEDING" except that the boundary is units after the - /// current rather than before the current row. - /// - /// 5. UNBOUNDED FOLLOWING - /// The frame boundary is the last row in the partition. - Following(Option), -} - -impl From for WindowFrameBound { - fn from(value: ast::WindowFrameBound) -> Self { - match value { - ast::WindowFrameBound::Preceding(v) => Self::Preceding(v), - ast::WindowFrameBound::Following(v) => Self::Following(v), - ast::WindowFrameBound::CurrentRow => Self::CurrentRow, - } - } -} - -impl fmt::Display for WindowFrameBound { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - WindowFrameBound::CurrentRow => f.write_str("CURRENT ROW"), - WindowFrameBound::Preceding(None) => f.write_str("UNBOUNDED PRECEDING"), - WindowFrameBound::Following(None) => f.write_str("UNBOUNDED FOLLOWING"), - WindowFrameBound::Preceding(Some(n)) => write!(f, "{} PRECEDING", n), - WindowFrameBound::Following(Some(n)) => write!(f, "{} FOLLOWING", n), - } - } -} - -impl PartialEq for WindowFrameBound { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl PartialOrd for WindowFrameBound { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for WindowFrameBound { - fn cmp(&self, other: &Self) -> Ordering { - self.get_rank().cmp(&other.get_rank()) - } -} - -impl WindowFrameBound { - /// get the rank of this window frame bound. - /// - /// the rank is a tuple of (u8, u64) because we'll firstly compare the kind and then the value - /// which requires special handling e.g. with preceding the larger the value the smaller the - /// rank and also for 0 preceding / following it is the same as current row - fn get_rank(&self) -> (u8, u64) { - match self { - WindowFrameBound::Preceding(None) => (0, 0), - WindowFrameBound::Following(None) => (4, 0), - WindowFrameBound::Preceding(Some(0)) - | WindowFrameBound::CurrentRow - | WindowFrameBound::Following(Some(0)) => (2, 0), - WindowFrameBound::Preceding(Some(v)) => (1, u64::MAX - *v), - WindowFrameBound::Following(Some(v)) => (3, *v), - } - } -} - -/// There are three frame types: ROWS, GROUPS, and RANGE. The frame type determines how the -/// starting and ending boundaries of the frame are measured. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum WindowFrameUnits { - /// The ROWS frame type means that the starting and ending boundaries for the frame are - /// determined by counting individual rows relative to the current row. - Rows, - /// The RANGE frame type requires that the ORDER BY clause of the window have exactly one - /// term. Call that term "X". With the RANGE frame type, the elements of the frame are - /// determined by computing the value of expression X for all rows in the partition and framing - /// those rows for which the value of X is within a certain range of the value of X for the - /// current row. - Range, - /// The GROUPS frame type means that the starting and ending boundaries are determine - /// by counting "groups" relative to the current group. 
A "group" is a set of rows that all have - /// equivalent values for all all terms of the window ORDER BY clause. - Groups, -} - -impl fmt::Display for WindowFrameUnits { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str(match self { - WindowFrameUnits::Rows => "ROWS", - WindowFrameUnits::Range => "RANGE", - WindowFrameUnits::Groups => "GROUPS", - }) - } -} - -impl From for WindowFrameUnits { - fn from(value: ast::WindowFrameUnits) -> Self { - match value { - ast::WindowFrameUnits::Range => Self::Range, - ast::WindowFrameUnits::Groups => Self::Groups, - ast::WindowFrameUnits::Rows => Self::Rows, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_window_frame_creation() -> Result<()> { - let window_frame = ast::WindowFrame { - units: ast::WindowFrameUnits::Range, - start_bound: ast::WindowFrameBound::Following(None), - end_bound: None, - }; - let result = WindowFrame::try_from(window_frame); - assert_eq!( - result.err().unwrap().to_string(), - "Execution error: Invalid window frame: start bound cannot be unbounded following".to_owned() - ); - - let window_frame = ast::WindowFrame { - units: ast::WindowFrameUnits::Range, - start_bound: ast::WindowFrameBound::Preceding(None), - end_bound: Some(ast::WindowFrameBound::Preceding(None)), - }; - let result = WindowFrame::try_from(window_frame); - assert_eq!( - result.err().unwrap().to_string(), - "Execution error: Invalid window frame: end bound cannot be unbounded preceding".to_owned() - ); - - let window_frame = ast::WindowFrame { - units: ast::WindowFrameUnits::Range, - start_bound: ast::WindowFrameBound::Preceding(Some(1)), - end_bound: Some(ast::WindowFrameBound::Preceding(Some(2))), - }; - let result = WindowFrame::try_from(window_frame); - assert_eq!( - result.err().unwrap().to_string(), - "Execution error: Invalid window frame: start bound (1 PRECEDING) cannot be larger than end bound (2 PRECEDING)".to_owned() - ); - Ok(()) - } - - #[test] - fn test_eq() { - assert_eq!( - WindowFrameBound::Preceding(Some(0)), - WindowFrameBound::CurrentRow - ); - assert_eq!( - WindowFrameBound::CurrentRow, - WindowFrameBound::Following(Some(0)) - ); - assert_eq!( - WindowFrameBound::Following(Some(2)), - WindowFrameBound::Following(Some(2)) - ); - assert_eq!( - WindowFrameBound::Following(None), - WindowFrameBound::Following(None) - ); - assert_eq!( - WindowFrameBound::Preceding(Some(2)), - WindowFrameBound::Preceding(Some(2)) - ); - assert_eq!( - WindowFrameBound::Preceding(None), - WindowFrameBound::Preceding(None) - ); - } - - #[test] - fn test_ord() { - assert!(WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::CurrentRow); - // ! yes this is correct! 
- assert!( - WindowFrameBound::Preceding(Some(2)) < WindowFrameBound::Preceding(Some(1)) - ); - assert!( - WindowFrameBound::Preceding(Some(u64::MAX)) - < WindowFrameBound::Preceding(Some(u64::MAX - 1)) - ); - assert!( - WindowFrameBound::Preceding(None) - < WindowFrameBound::Preceding(Some(1000000)) - ); - assert!( - WindowFrameBound::Preceding(None) - < WindowFrameBound::Preceding(Some(u64::MAX)) - ); - assert!(WindowFrameBound::Preceding(None) < WindowFrameBound::Following(Some(0))); - assert!( - WindowFrameBound::Preceding(Some(1)) < WindowFrameBound::Following(Some(1)) - ); - assert!(WindowFrameBound::CurrentRow < WindowFrameBound::Following(Some(1))); - assert!( - WindowFrameBound::Following(Some(1)) < WindowFrameBound::Following(Some(2)) - ); - assert!(WindowFrameBound::Following(Some(2)) < WindowFrameBound::Following(None)); - assert!( - WindowFrameBound::Following(Some(u64::MAX)) - < WindowFrameBound::Following(None) - ); - } -} diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 53f22ecaf3f25..c128634091a08 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -19,6 +19,7 @@ use crate::catalog::TableReference; use crate::datasource::TableProvider; +use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::logical_plan::Expr::Alias; use crate::logical_plan::{ and, lit, DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, @@ -1137,7 +1138,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let window_frame = window .window_frame .as_ref() - .map(|window_frame| window_frame.clone().try_into()) + .map(|window_frame| { + let window_frame: WindowFrame = window_frame.clone().try_into()?; + if WindowFrameUnits::Range == window_frame.units + && order_by.len() != 1 + { + Err(DataFusionError::Plan(format!( + "With window frame of type RANGE, the order by expression must be of length 1, got {}", order_by.len()))) + } else { + Ok(window_frame) + } + + }) .transpose()?; let fun = window_functions::WindowFunction::from_str(&name)?; match fun { @@ -2859,10 +2871,10 @@ mod tests { #[test] fn over_order_by_with_window_frame_double_end() { - let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ + Projection: #order_id, #MAX(qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ \n Sort: #order_id ASC NULLS FIRST\ \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ @@ -2872,10 +2884,10 @@ mod tests { #[test] fn over_order_by_with_window_frame_single_end() { - let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) RANGE BETWEEN 3 PRECEDING AND CURRENT ROW]]\ + Projection: #order_id, 
#MAX(qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ + \n WindowAggr: windowExpr=[[MAX(#qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ \n Sort: #order_id ASC NULLS FIRST\ \n WindowAggr: windowExpr=[[MIN(#qty)]]\ \n Sort: #order_id DESC NULLS FIRST\ @@ -2883,6 +2895,38 @@ mod tests { quick_test(sql, expected); } + #[test] + fn over_order_by_with_window_frame_range_value_check() { + let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id RANGE 3 PRECEDING) from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "NotImplemented(\"With WindowFrameUnits=RANGE, the bound cannot be 3 PRECEDING or FOLLOWING at the moment\")", + format!("{:?}", err) + ); + } + + #[test] + fn over_order_by_with_window_frame_range_order_by_check() { + let sql = + "SELECT order_id, MAX(qty) OVER (RANGE UNBOUNDED PRECEDING) from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"With window frame of type RANGE, the order by expression must be of length 1, got 0\")", + format!("{:?}", err) + ); + } + + #[test] + fn over_order_by_with_window_frame_range_order_by_check_2() { + let sql = + "SELECT order_id, MAX(qty) OVER (ORDER BY order_id, qty RANGE UNBOUNDED PRECEDING) from orders"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"With window frame of type RANGE, the order by expression must be of length 1, got 2\")", + format!("{:?}", err) + ); + } + #[test] fn over_order_by_with_window_frame_single_end_groups() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; From 5c88450a0286c98cdd4b0679f6b09b7eee1c3570 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 10 Jun 2021 22:58:19 +0800 Subject: [PATCH 175/329] remove redundant into_iter() calls (#527) --- ballista/rust/core/src/serde/logical_plan/from_proto.rs | 2 -- datafusion/src/physical_plan/windows.rs | 1 - datafusion/src/sql/planner.rs | 4 ---- 3 files changed, 7 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 894a5f0a7d985..c2c1001b939c1 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -914,13 +914,11 @@ impl TryInto for &protobuf::LogicalExprNode { .partition_by .iter() .map(|e| e.try_into()) - .into_iter() .collect::, _>>()?; let order_by = expr .order_by .iter() .map(|e| e.try_into()) - .into_iter() .collect::, _>>()?; let window_frame = expr .window_frame diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 7eb14943facf1..9a6b92985b519 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -348,7 +348,6 @@ fn window_aggregate_batch( .collect::>>()?; window_acc.scan_batch(batch.num_rows(), values) }) - .into_iter() .collect::>>() } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index c128634091a08..860d21714ec66 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -714,7 +714,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let select_exprs = select_exprs .iter() .map(|expr| rebase_expr(expr, &window_exprs, &plan)) - .into_iter() .collect::>>()?; Ok((plan, select_exprs)) } @@ -811,7 +810,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let order_by_rex = order_by .iter() .map(|e| 
self.order_by_to_sort_expr(e)) - .into_iter() .collect::>>()?; LogicalPlanBuilder::from(&plan).sort(order_by_rex)?.build() @@ -1127,13 +1125,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .partition_by .iter() .map(|e| self.sql_expr_to_logical_expr(e)) - .into_iter() .collect::>>()?; let order_by = window .order_by .iter() .map(|e| self.order_by_to_sort_expr(e)) - .into_iter() .collect::>>()?; let window_frame = window .window_frame From 3ef7f3495b9501f9a14db64a6ae4d923f681c649 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 11 Jun 2021 06:16:21 +0800 Subject: [PATCH 176/329] use nightly nightly-2021-05-10 (#536) --- .env | 2 +- .github/workflows/python_build.yml | 2 +- .github/workflows/python_test.yaml | 4 ++-- python/rust-toolchain | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.env b/.env index 4fb24bed40a12..05517d00f8e44 100644 --- a/.env +++ b/.env @@ -47,7 +47,7 @@ FEDORA=33 PYTHON=3.6 LLVM=11 CLANG_TOOLS=8 -RUST=nightly-2021-03-24 +RUST=nightly-2021-05-10 GO=1.15 NODE=14 MAVEN=3.5.4 diff --git a/.github/workflows/python_build.yml b/.github/workflows/python_build.yml index eba11b8e3a41f..1f083de7827f3 100644 --- a/.github/workflows/python_build.yml +++ b/.github/workflows/python_build.yml @@ -39,7 +39,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: - toolchain: nightly-2021-01-06 + toolchain: nightly-2021-05-10 - name: Install dependencies run: | diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index e689396b5dcd1..ebf5e9f594c0b 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -25,8 +25,8 @@ jobs: - uses: actions/checkout@v2 - name: Setup Rust toolchain run: | - rustup toolchain install nightly-2021-01-06 - rustup default nightly-2021-01-06 + rustup toolchain install nightly-2021-05-10 + rustup default nightly-2021-05-10 rustup component add rustfmt - name: Cache Cargo uses: actions/cache@v2 diff --git a/python/rust-toolchain b/python/rust-toolchain index 9d0cf79d367d6..6231a95e3036d 100644 --- a/python/rust-toolchain +++ b/python/rust-toolchain @@ -1 +1 @@ -nightly-2021-01-06 +nightly-2021-05-10 From 63e3045c9e0dd0579ec2be92bb174401f898833f Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Fri, 11 Jun 2021 17:45:27 +0200 Subject: [PATCH 177/329] Make BallistaContext::collect streaming (#535) --- ballista/rust/client/src/context.rs | 113 ++++++++++++++++++---------- 1 file changed, 72 insertions(+), 41 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 4c0ab4244be35..4e5cc1a7a76b6 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -24,21 +24,27 @@ use std::{collections::HashMap, convert::TryInto}; use std::{fs, time::Duration}; use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient; +use ballista_core::serde::protobuf::PartitionLocation; use ballista_core::serde::protobuf::{ execute_query_params::Query, job_status, ExecuteQueryParams, GetJobStatusParams, GetJobStatusResult, }; use ballista_core::{ - client::BallistaClient, datasource::DfTableAdapter, memory_stream::MemoryStream, - utils::create_datafusion_context, + client::BallistaClient, datasource::DfTableAdapter, utils::create_datafusion_context, }; use datafusion::arrow::datatypes::Schema; +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::error::Result as ArrowResult; +use datafusion::arrow::record_batch::RecordBatch; use datafusion::catalog::TableReference; use 
datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::csv::CsvReadOptions; use datafusion::{dataframe::DataFrame, physical_plan::RecordBatchStream}; +use futures::future; +use futures::Stream; +use futures::StreamExt; use log::{error, info}; #[allow(dead_code)] @@ -68,6 +74,32 @@ impl BallistaContextState { } } +struct WrappedStream { + stream: Pin> + Send + Sync>>, + schema: SchemaRef, +} + +impl RecordBatchStream for WrappedStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +impl Stream for WrappedStream { + type Item = ArrowResult; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.stream.poll_next_unpin(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.stream.size_hint() + } +} + #[allow(dead_code)] pub struct BallistaContext { @@ -155,6 +187,29 @@ impl BallistaContext { ctx.sql(sql) } + async fn fetch_partition( + location: PartitionLocation, + ) -> Result>> { + let metadata = location.executor_meta.ok_or_else(|| { + DataFusionError::Internal("Received empty executor metadata".to_owned()) + })?; + let partition_id = location.partition_id.ok_or_else(|| { + DataFusionError::Internal("Received empty partition id".to_owned()) + })?; + let mut ballista_client = + BallistaClient::try_new(metadata.host.as_str(), metadata.port as u16) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + Ok(ballista_client + .fetch_partition( + &partition_id.job_id, + partition_id.stage_id as usize, + partition_id.partition_id as usize, + ) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) + } + pub async fn collect( &self, plan: &LogicalPlan, @@ -222,45 +277,21 @@ impl BallistaContext { break Err(DataFusionError::Execution(msg)); } job_status::Status::Completed(completed) => { - // TODO: use streaming. 
Probably need to change the signature of fetch_partition to achieve that - let mut result = vec![]; - for location in completed.partition_location { - let metadata = location.executor_meta.ok_or_else(|| { - DataFusionError::Internal( - "Received empty executor metadata".to_owned(), - ) - })?; - let partition_id = location.partition_id.ok_or_else(|| { - DataFusionError::Internal( - "Received empty partition id".to_owned(), - ) - })?; - let mut ballista_client = BallistaClient::try_new( - metadata.host.as_str(), - metadata.port as u16, - ) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let stream = ballista_client - .fetch_partition( - &partition_id.job_id, - partition_id.stage_id as usize, - partition_id.partition_id as usize, - ) - .await - .map_err(|e| { - DataFusionError::Execution(format!("{:?}", e)) - })?; - result.append( - &mut datafusion::physical_plan::common::collect(stream) - .await?, - ); - } - break Ok(Box::pin(MemoryStream::try_new( - result, - Arc::new(schema), - None, - )?)); + let result = future::join_all( + completed + .partition_location + .into_iter() + .map(BallistaContext::fetch_partition), + ) + .await + .into_iter() + .collect::>>()?; + + let result = WrappedStream { + stream: Box::pin(futures::stream::iter(result).flatten()), + schema: Arc::new(schema), + }; + break Ok(Box::pin(result)); } }; } From ad70a1e91667174436f2110a70e3e557c7069e9a Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 12 Jun 2021 02:50:23 +0800 Subject: [PATCH 178/329] reuse datafusion physical planner in ballista building from protobuf (#532) * use logical planner in ballista building * simplify statement * fix unit test * fix per comment --- .../src/serde/physical_plan/from_proto.rs | 142 ++++-------------- datafusion/src/physical_plan/planner.rs | 116 +++++++++++--- datafusion/src/physical_plan/windows.rs | 44 ++++-- 3 files changed, 153 insertions(+), 149 deletions(-) diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index b319d5b25f121..d49d53cf8d855 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -36,7 +36,7 @@ use datafusion::execution::context::{ ExecutionConfig, ExecutionContextState, ExecutionProps, }; use datafusion::logical_plan::{DFSchema, Expr}; -use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; +use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::expressions::col; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::PartitionMode; @@ -45,7 +45,6 @@ use datafusion::physical_plan::planner::DefaultPhysicalPlanner; use datafusion::physical_plan::window_functions::{ BuiltInWindowFunction, WindowFunction, }; -use datafusion::physical_plan::windows::create_window_expr; use datafusion::physical_plan::windows::WindowAggExec; use datafusion::physical_plan::{ coalesce_batches::CoalesceBatchesExec, @@ -205,76 +204,27 @@ impl TryInto> for &protobuf::PhysicalPlanNode { ) })? 
.clone(); - let physical_schema: SchemaRef = SchemaRef::new((&input_schema).try_into()?); - - let catalog_list = - Arc::new(MemoryCatalogList::new()) as Arc; - let ctx_state = ExecutionContextState { - catalog_list, - scalar_functions: Default::default(), - var_provider: Default::default(), - aggregate_functions: Default::default(), - config: ExecutionConfig::new(), - execution_props: ExecutionProps::new(), - }; - + let ctx_state = ExecutionContextState::new(); let window_agg_expr: Vec<(Expr, String)> = window_agg .window_expr .iter() .zip(window_agg.window_expr_name.iter()) .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) .collect::, _>>()?; - - let mut physical_window_expr = vec![]; - let df_planner = DefaultPhysicalPlanner::default(); - - for (expr, name) in &window_agg_expr { - match expr { - Expr::WindowFunction { - fun, - args, - partition_by, - order_by, - window_frame, - .. - } => { - let arg = df_planner - .create_physical_expr( - &args[0], - &physical_schema, - &ctx_state, - ) - .map_err(|e| { - BallistaError::General(format!("{:?}", e)) - })?; - if !partition_by.is_empty() { - return Err(BallistaError::NotImplemented("Window function with partition by is not yet implemented".to_owned())); - } - if !order_by.is_empty() { - return Err(BallistaError::NotImplemented("Window function with order by is not yet implemented".to_owned())); - } - if window_frame.is_some() { - return Err(BallistaError::NotImplemented("Window function with window frame is not yet implemented".to_owned())); - } - let window_expr = create_window_expr( - &fun, - &[arg], - &physical_schema, - name.to_owned(), - )?; - physical_window_expr.push(window_expr); - } - _ => { - return Err(BallistaError::General( - "Invalid expression for WindowAggrExec".to_string(), - )); - } - } - } - + let physical_window_expr = window_agg_expr + .iter() + .map(|(expr, name)| { + df_planner.create_window_expr_with_name( + expr, + name.to_string(), + &physical_schema, + &ctx_state, + ) + }) + .collect::, _>>()?; Ok(Arc::new(WindowAggExec::try_new( physical_window_expr, input, @@ -297,7 +247,6 @@ impl TryInto> for &protobuf::PhysicalPlanNode { AggregateMode::FinalPartitioned } }; - let group = hash_agg .group_expr .iter() @@ -306,25 +255,13 @@ impl TryInto> for &protobuf::PhysicalPlanNode { compile_expr(expr, &input.schema()).map(|e| (e, name.to_string())) }) .collect::, _>>()?; - let logical_agg_expr: Vec<(Expr, String)> = hash_agg .aggr_expr .iter() .zip(hash_agg.aggr_expr_name.iter()) .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) .collect::, _>>()?; - - let catalog_list = - Arc::new(MemoryCatalogList::new()) as Arc; - let ctx_state = ExecutionContextState { - catalog_list, - scalar_functions: Default::default(), - var_provider: Default::default(), - aggregate_functions: Default::default(), - config: ExecutionConfig::new(), - execution_props: ExecutionProps::new(), - }; - + let ctx_state = ExecutionContextState::new(); let input_schema = hash_agg .input_schema .as_ref() @@ -336,37 +273,18 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .clone(); let physical_schema: SchemaRef = SchemaRef::new((&input_schema).try_into()?); - - let mut physical_aggr_expr = vec![]; - let df_planner = DefaultPhysicalPlanner::default(); - for (expr, name) in &logical_agg_expr { - match expr { - Expr::AggregateFunction { fun, args, .. 
} => { - let arg = df_planner - .create_physical_expr( - &args[0], - &physical_schema, - &ctx_state, - ) - .map_err(|e| { - BallistaError::General(format!("{:?}", e)) - })?; - physical_aggr_expr.push(create_aggregate_expr( - &fun, - false, - &[arg], - &physical_schema, - name.to_string(), - )?); - } - _ => { - return Err(BallistaError::General( - "Invalid expression for HashAggregateExec".to_string(), - )) - } - } - } + let physical_aggr_expr = logical_agg_expr + .iter() + .map(|(expr, name)| { + df_planner.create_aggregate_expr_with_name( + expr, + name.to_string(), + &physical_schema, + &ctx_state, + ) + }) + .collect::, _>>()?; Ok(Arc::new(HashAggregateExec::try_new( agg_mode, group, @@ -484,15 +402,7 @@ fn compile_expr( schema: &Schema, ) -> Result, BallistaError> { let df_planner = DefaultPhysicalPlanner::default(); - let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc; - let state = ExecutionContextState { - catalog_list, - scalar_functions: HashMap::new(), - var_provider: HashMap::new(), - aggregate_functions: HashMap::new(), - config: ExecutionConfig::new(), - execution_props: ExecutionProps::new(), - }; + let state = ExecutionContextState::new(); let expr: Expr = expr.try_into()?; df_planner .create_physical_expr(&expr, schema, &state) diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index d7451c7870961..d42948a8666c6 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -731,34 +731,82 @@ impl DefaultPhysicalPlanner { } } - /// Create a window expression from a logical expression - pub fn create_window_expr( + /// Create a window expression with a name from a logical expression + pub fn create_window_expr_with_name( &self, e: &Expr, - logical_input_schema: &DFSchema, + name: String, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { - // unpack aliased logical expressions, e.g. "sum(col) over () as total" - let (name, e) = match e { - Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), - _ => (e.name(logical_input_schema)?, e), - }; - match e { - Expr::WindowFunction { fun, args, .. 
} => { + Expr::WindowFunction { + fun, + args, + partition_by, + order_by, + window_frame, + } => { let args = args .iter() .map(|e| { self.create_physical_expr(e, physical_input_schema, ctx_state) }) .collect::>>()?; - // if !order_by.is_empty() { - // return Err(DataFusionError::NotImplemented( - // "Window function with order by is not yet implemented".to_owned(), - // )); - // } - windows::create_window_expr(fun, &args, physical_input_schema, name) + let partition_by = partition_by + .iter() + .map(|e| { + self.create_physical_expr(e, physical_input_schema, ctx_state) + }) + .collect::>>()?; + let order_by = order_by + .iter() + .map(|e| match e { + Expr::Sort { + expr, + asc, + nulls_first, + } => self.create_physical_sort_expr( + expr, + &physical_input_schema, + SortOptions { + descending: !*asc, + nulls_first: *nulls_first, + }, + &ctx_state, + ), + _ => Err(DataFusionError::Plan( + "Sort only accepts sort expressions".to_string(), + )), + }) + .collect::>>()?; + if !partition_by.is_empty() { + return Err(DataFusionError::NotImplemented( + "window expression with non-empty partition by clause is not yet supported" + .to_owned(), + )); + } + if !order_by.is_empty() { + return Err(DataFusionError::NotImplemented( + "window expression with non-empty order by clause is not yet supported" + .to_owned(), + )); + } + if window_frame.is_some() { + return Err(DataFusionError::NotImplemented( + "window expression with window frame definition is not yet supported" + .to_owned(), + )); + } + windows::create_window_expr( + fun, + name, + &args, + &partition_by, + &order_by, + *window_frame, + physical_input_schema, + ) } other => Err(DataFusionError::Internal(format!( "Invalid window expression '{:?}'", @@ -767,20 +815,30 @@ impl DefaultPhysicalPlanner { } } - /// Create an aggregate expression from a logical expression - pub fn create_aggregate_expr( + /// Create a window expression from a logical expression or an alias + pub fn create_window_expr( &self, e: &Expr, logical_input_schema: &DFSchema, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, - ) -> Result> { - // unpack aliased logical expressions, e.g. "sum(col) as total" + ) -> Result> { + // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), _ => (e.name(logical_input_schema)?, e), }; + self.create_window_expr_with_name(e, name, physical_input_schema, ctx_state) + } + /// Create an aggregate expression with a name from a logical expression + pub fn create_aggregate_expr_with_name( + &self, + e: &Expr, + name: String, + physical_input_schema: &Schema, + ctx_state: &ExecutionContextState, + ) -> Result> { match e { Expr::AggregateFunction { fun, @@ -819,7 +877,23 @@ impl DefaultPhysicalPlanner { } } - /// Create an aggregate expression from a logical expression + /// Create an aggregate expression from a logical expression or an alias + pub fn create_aggregate_expr( + &self, + e: &Expr, + logical_input_schema: &DFSchema, + physical_input_schema: &Schema, + ctx_state: &ExecutionContextState, + ) -> Result> { + // unpack aliased logical expressions, e.g. 
"sum(col) as total" + let (name, e) = match e { + Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), + _ => (e.name(logical_input_schema)?, e), + }; + self.create_aggregate_expr_with_name(e, name, physical_input_schema, ctx_state) + } + + /// Create a physical sort expression from a logical expression pub fn create_physical_sort_expr( &self, e: &Expr, diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 9a6b92985b519..565a9eef28575 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -18,9 +18,11 @@ //! Execution plan for window functions use crate::error::{DataFusionError, Result}; + +use crate::logical_plan::window_frames::WindowFrame; use crate::physical_plan::{ aggregates, common, - expressions::{Literal, NthValue, RowNumber}, + expressions::{Literal, NthValue, PhysicalSortExpr, RowNumber}, type_coercion::coerce, window_functions::signature_for_built_in, window_functions::BuiltInWindowFunctionExpr, @@ -61,12 +63,18 @@ pub struct WindowAggExec { /// Create a physical expression for window function pub fn create_window_expr( fun: &WindowFunction, + name: String, args: &[Arc], + // https://github.com/apache/arrow-datafusion/issues/299 + _partition_by: &[Arc], + // https://github.com/apache/arrow-datafusion/issues/360 + _order_by: &[PhysicalSortExpr], + // https://github.com/apache/arrow-datafusion/issues/361 + _window_frame: Option, input_schema: &Schema, - name: String, ) -> Result> { - match fun { - WindowFunction::AggregateFunction(fun) => Ok(Arc::new(AggregateWindowExpr { + Ok(match fun { + WindowFunction::AggregateFunction(fun) => Arc::new(AggregateWindowExpr { aggregate: aggregates::create_aggregate_expr( fun, false, @@ -74,11 +82,11 @@ pub fn create_window_expr( input_schema, name, )?, - })), - WindowFunction::BuiltInWindowFunction(fun) => Ok(Arc::new(BuiltInWindowExpr { + }), + WindowFunction::BuiltInWindowFunction(fun) => Arc::new(BuiltInWindowExpr { window: create_built_in_window_expr(fun, args, input_schema, name)?, - })), - } + }), + }) } fn create_built_in_window_expr( @@ -537,9 +545,12 @@ mod tests { let window_exec = Arc::new(WindowAggExec::try_new( vec![create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Count), + "count".to_owned(), &[col("c3")], + &[], + &[], + Some(WindowFrame::default()), schema.as_ref(), - "count".to_owned(), )?], input, schema.clone(), @@ -567,21 +578,30 @@ mod tests { vec![ create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Count), + "count".to_owned(), &[col("c3")], + &[], + &[], + Some(WindowFrame::default()), schema.as_ref(), - "count".to_owned(), )?, create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Max), + "max".to_owned(), &[col("c3")], + &[], + &[], + Some(WindowFrame::default()), schema.as_ref(), - "max".to_owned(), )?, create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Min), + "min".to_owned(), &[col("c3")], + &[], + &[], + Some(WindowFrame::default()), schema.as_ref(), - "min".to_owned(), )?, ], input, From 8f4078d83f7ea0348fa43906d26156bf8a95de4c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 12 Jun 2021 06:45:06 -0600 Subject: [PATCH 179/329] ShuffleReaderExec now supports multiple locations per partition (#541) * ShuffleReaderExec now supports multiple locations per partition * Remove TODO * avoid clone --- ballista/rust/client/src/context.rs | 39 +------- ballista/rust/core/proto/ballista.proto | 7 +- 
.../src/execution_plans/shuffle_reader.rs | 94 +++++++++++-------- .../src/serde/physical_plan/from_proto.rs | 12 ++- .../core/src/serde/physical_plan/to_proto.rs | 18 ++-- ballista/rust/core/src/utils.rs | 40 +++++++- ballista/rust/scheduler/src/planner.rs | 2 +- ballista/rust/scheduler/src/state/mod.rs | 6 +- 8 files changed, 130 insertions(+), 88 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 4e5cc1a7a76b6..695045d220d07 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -29,21 +29,18 @@ use ballista_core::serde::protobuf::{ execute_query_params::Query, job_status, ExecuteQueryParams, GetJobStatusParams, GetJobStatusResult, }; +use ballista_core::utils::WrappedStream; use ballista_core::{ client::BallistaClient, datasource::DfTableAdapter, utils::create_datafusion_context, }; use datafusion::arrow::datatypes::Schema; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::error::Result as ArrowResult; -use datafusion::arrow::record_batch::RecordBatch; use datafusion::catalog::TableReference; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::csv::CsvReadOptions; use datafusion::{dataframe::DataFrame, physical_plan::RecordBatchStream}; use futures::future; -use futures::Stream; use futures::StreamExt; use log::{error, info}; @@ -74,32 +71,6 @@ impl BallistaContextState { } } -struct WrappedStream { - stream: Pin> + Send + Sync>>, - schema: SchemaRef, -} - -impl RecordBatchStream for WrappedStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -impl Stream for WrappedStream { - type Item = ArrowResult; - - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - self.stream.poll_next_unpin(cx) - } - - fn size_hint(&self) -> (usize, Option) { - self.stream.size_hint() - } -} - #[allow(dead_code)] pub struct BallistaContext { @@ -287,10 +258,10 @@ impl BallistaContext { .into_iter() .collect::>>()?; - let result = WrappedStream { - stream: Box::pin(futures::stream::iter(result).flatten()), - schema: Arc::new(schema), - }; + let result = WrappedStream::new( + Box::pin(futures::stream::iter(result).flatten()), + Arc::new(schema), + ); break Ok(Box::pin(result)); } }; diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 85af9023fb468..5aafd00cf1b05 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -489,10 +489,15 @@ message HashAggregateExecNode { } message ShuffleReaderExecNode { - repeated PartitionLocation partition_location = 1; + repeated ShuffleReaderPartition partition = 1; Schema schema = 2; } +message ShuffleReaderPartition { + // each partition of a shuffle read can read data from multiple locations + repeated PartitionLocation location = 1; +} + message GlobalLimitExecNode { PhysicalPlanNode input = 1; uint32 limit = 2; diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index db29cf13b5fed..3a7f795f1a7fd 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use std::fmt::Formatter; use std::sync::Arc; use std::{any::Any, pin::Pin}; @@ -22,35 +23,35 @@ use crate::client::BallistaClient; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionLocation; +use crate::utils::WrappedStream; use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef; +use datafusion::arrow::error::Result as ArrowResult; +use datafusion::arrow::record_batch::RecordBatch; use datafusion::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use datafusion::{ error::{DataFusionError, Result}, physical_plan::RecordBatchStream, }; +use futures::{future, Stream, StreamExt}; use log::info; -use std::fmt::Formatter; -/// ShuffleReaderExec reads partitions that have already been materialized by an executor. +/// ShuffleReaderExec reads partitions that have already been materialized by a query stage +/// being executed by an executor #[derive(Debug, Clone)] pub struct ShuffleReaderExec { - // The query stage that is responsible for producing the shuffle partitions that - // this operator will read - pub(crate) partition_location: Vec, + /// Each partition of a shuffle can read data from multiple locations + pub(crate) partition: Vec>, pub(crate) schema: SchemaRef, } impl ShuffleReaderExec { /// Create a new ShuffleReaderExec pub fn try_new( - partition_meta: Vec, + partition: Vec>, schema: SchemaRef, ) -> Result { - Ok(Self { - partition_location: partition_meta, - schema, - }) + Ok(Self { partition, schema }) } } @@ -65,7 +66,7 @@ impl ExecutionPlan for ShuffleReaderExec { } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.partition_location.len()) + Partitioning::UnknownPartitioning(self.partition.len()) } fn children(&self) -> Vec> { @@ -86,23 +87,18 @@ impl ExecutionPlan for ShuffleReaderExec { partition: usize, ) -> Result>> { info!("ShuffleReaderExec::execute({})", partition); - let partition_location = &self.partition_location[partition]; - - let mut client = BallistaClient::try_new( - &partition_location.executor_meta.host, - partition_location.executor_meta.port, - ) - .await - .map_err(|e| DataFusionError::Execution(format!("Ballista Error: {:?}", e)))?; - client - .fetch_partition( - &partition_location.partition_id.job_id, - partition_location.partition_id.stage_id, - partition, - ) + let partition_locations = &self.partition[partition]; + let result = future::join_all(partition_locations.iter().map(fetch_partition)) .await - .map_err(|e| DataFusionError::Execution(format!("Ballista Error: {:?}", e))) + .into_iter() + .collect::>>()?; + + let result = WrappedStream::new( + Box::pin(futures::stream::iter(result).flatten()), + Arc::new(self.schema.as_ref().clone()), + ); + Ok(Box::pin(result)) } fn fmt_as( @@ -113,22 +109,46 @@ impl ExecutionPlan for ShuffleReaderExec { match t { DisplayFormatType::Default => { let loc_str = self - .partition_location + .partition .iter() - .map(|l| { - format!( - "[executor={} part={}:{}:{} stats={:?}]", - l.executor_meta.id, - l.partition_id.job_id, - l.partition_id.stage_id, - l.partition_id.partition_id, - l.partition_stats - ) + .map(|x| { + x.iter() + .map(|l| { + format!( + "[executor={} part={}:{}:{} stats={:?}]", + l.executor_meta.id, + l.partition_id.job_id, + l.partition_id.stage_id, + l.partition_id.partition_id, + l.partition_stats + ) + }) + .collect::>() + .join(",") }) .collect::>() - .join(","); + .join("\n"); write!(f, "ShuffleReaderExec: partition_locations={}", loc_str) } } } } + +async fn fetch_partition( + location: 
&PartitionLocation, +) -> Result>> { + let metadata = &location.executor_meta; + let partition_id = &location.partition_id; + let mut ballista_client = + BallistaClient::try_new(metadata.host.as_str(), metadata.port as u16) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + Ok(ballista_client + .fetch_partition( + &partition_id.job_id, + partition_id.stage_id as usize, + partition_id.partition_id as usize, + ) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) +} diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index d49d53cf8d855..a2c9db9ecafbf 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -25,6 +25,7 @@ use crate::error::BallistaError; use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::protobuf::LogicalExprNode; +use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; use crate::serde::{proto_error, protobuf}; use crate::{convert_box_required, convert_required}; @@ -327,10 +328,15 @@ impl TryInto> for &protobuf::PhysicalPlanNode { } PhysicalPlanType::ShuffleReader(shuffle_reader) => { let schema = Arc::new(convert_required!(shuffle_reader.schema)?); - let partition_location: Vec = shuffle_reader - .partition_location + let partition_location: Vec> = shuffle_reader + .partition .iter() - .map(|p| p.clone().try_into()) + .map(|p| { + p.location + .iter() + .map(|l| l.clone().try_into()) + .collect::, _>>() + }) .collect::, BallistaError>>()?; let shuffle_reader = ShuffleReaderExec::try_new(partition_location, schema)?; diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 26092e74a096a..15d5d4b931ff2 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -57,6 +57,7 @@ use protobuf::physical_plan_node::PhysicalPlanType; use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; +use crate::serde::scheduler::PartitionLocation; use crate::serde::{protobuf, BallistaError}; use datafusion::physical_plan::functions::{BuiltinScalarFunction, ScalarFunctionExpr}; use datafusion::physical_plan::merge::MergeExec; @@ -268,16 +269,19 @@ impl TryInto for Arc { )), }) } else if let Some(exec) = plan.downcast_ref::() { - let partition_location = exec - .partition_location - .iter() - .map(|l| l.clone().try_into()) - .collect::>()?; - + let mut partition = vec![]; + for location in &exec.partition { + partition.push(protobuf::ShuffleReaderPartition { + location: location + .iter() + .map(|l| l.clone().try_into()) + .collect::, _>>()?, + }); + } Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ShuffleReader( protobuf::ShuffleReaderExecNode { - partition_location, + partition, schema: Some(exec.schema().as_ref().into()), }, )), diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 4ba6ec40fec90..b58be2800f7b1 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -27,11 +27,12 @@ use crate::execution_plans::{QueryStageExec, UnresolvedShuffleExec}; use crate::memory_stream::MemoryStream; use 
crate::serde::scheduler::PartitionStats; +use datafusion::arrow::error::Result as ArrowResult; use datafusion::arrow::{ array::{ ArrayBuilder, ArrayRef, StructArray, StructBuilder, UInt64Array, UInt64Builder, }, - datatypes::{DataType, Field}, + datatypes::{DataType, Field, SchemaRef}, ipc::reader::FileReader, ipc::writer::FileWriter, record_batch::RecordBatch, @@ -54,7 +55,7 @@ use datafusion::physical_plan::sort::SortExec; use datafusion::physical_plan::{ AggregateExpr, ExecutionPlan, PhysicalExpr, RecordBatchStream, }; -use futures::StreamExt; +use futures::{future, Stream, StreamExt}; /// Stream data to disk in Arrow IPC format @@ -234,3 +235,38 @@ pub fn create_datafusion_context() -> ExecutionContext { .with_physical_optimizer_rules(rules); ExecutionContext::with_config(config) } + +pub struct WrappedStream { + stream: Pin> + Send + Sync>>, + schema: SchemaRef, +} + +impl WrappedStream { + pub fn new( + stream: Pin> + Send + Sync>>, + schema: SchemaRef, + ) -> Self { + Self { stream, schema } + } +} + +impl RecordBatchStream for WrappedStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +impl Stream for WrappedStream { + type Item = ArrowResult; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.stream.poll_next_unpin(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.stream.size_hint() + } +} diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 445ef9a07787b..2ac9f6121e00d 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -186,7 +186,7 @@ impl DistributedPlanner { pub fn remove_unresolved_shuffles( stage: &dyn ExecutionPlan, - partition_locations: &HashMap>, + partition_locations: &HashMap>>, ) -> Result> { let mut new_children: Vec> = vec![]; for child in stage.children() { diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index a15efd618ff13..506fd1c0db985 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -234,7 +234,7 @@ impl SchedulerState { let unresolved_shuffles = find_unresolved_shuffles(&plan)?; let mut partition_locations: HashMap< usize, - Vec, + Vec>, > = HashMap::new(); for unresolved_shuffle in unresolved_shuffles { for stage_id in unresolved_shuffle.query_stage_ids { @@ -256,7 +256,7 @@ impl SchedulerState { let empty = vec![]; let locations = partition_locations.entry(stage_id).or_insert(empty); - locations.push( + locations.push(vec![ ballista_core::serde::scheduler::PartitionLocation { partition_id: ballista_core::serde::scheduler::PartitionId { @@ -271,7 +271,7 @@ impl SchedulerState { .clone(), partition_stats: PartitionStats::default(), }, - ); + ]); } else { continue 'tasks; } From 519698a0cd792a9c263d96079d341816f746c6ec Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 13 Jun 2021 19:12:26 +0800 Subject: [PATCH 180/329] Refactor hash aggregates's planner building code (#539) * refactor hash aggregates * remove stale comments --- datafusion/src/physical_plan/mod.rs | 5 +-- datafusion/src/physical_plan/planner.rs | 54 +++++++++++-------------- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index af6969c43cbd6..ebc6fd6ce94a2 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -341,9 +341,8 @@ pub async fn collect_partitioned( pub enum 
Partitioning { /// Allocate batches using a round-robin algorithm and the specified number of partitions RoundRobinBatch(usize), - /// Allocate rows based on a hash of one of more expressions and the specified - /// number of partitions - /// This partitioning scheme is not yet fully supported. See [ARROW-11011](https://issues.apache.org/jira/browse/ARROW-11011) + /// Allocate rows based on a hash of one of more expressions and the specified number of + /// partitions Hash(Vec>, usize), /// Unknown partitioning scheme with a known number of partitions UnknownPartitioning(usize), diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index d42948a8666c6..adae9224a19aa 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -222,11 +222,15 @@ impl DefaultPhysicalPlanner { .flat_map(|x| x.0.data_type(physical_input_schema.as_ref())) .any(|x| matches!(x, DataType::Dictionary(_, _))); - if !groups.is_empty() + let can_repartition = !groups.is_empty() && ctx_state.config.concurrency > 1 && ctx_state.config.repartition_aggregations - && !contains_dict - { + && !contains_dict; + + let (initial_aggr, next_partition_mode): ( + Arc, + AggregateMode, + ) = if can_repartition { // Divide partial hash aggregates into multiple partitions by hash key let hash_repartition = Arc::new(RepartitionExec::try_new( initial_aggr, @@ -235,35 +239,25 @@ impl DefaultPhysicalPlanner { ctx_state.config.concurrency, ), )?); - - // Combine hashaggregates within the partition - Ok(Arc::new(HashAggregateExec::try_new( - AggregateMode::FinalPartitioned, - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) - .collect(), - aggregates, - hash_repartition, - input_schema, - )?)) + // Combine hash aggregates within the partition + (hash_repartition, AggregateMode::FinalPartitioned) } else { - // construct a second aggregation, keeping the final column name equal to the first aggregation - // and the expressions corresponding to the respective aggregate + // construct a second aggregation, keeping the final column name equal to the + // first aggregation and the expressions corresponding to the respective aggregate + (initial_aggr, AggregateMode::Final) + }; - Ok(Arc::new(HashAggregateExec::try_new( - AggregateMode::Final, - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) - .collect(), - aggregates, - initial_aggr, - input_schema, - )?)) - } + Ok(Arc::new(HashAggregateExec::try_new( + next_partition_mode, + final_group + .iter() + .enumerate() + .map(|(i, expr)| (expr.clone(), groups[i].1.clone())) + .collect(), + aggregates, + initial_aggr, + input_schema, + )?)) } LogicalPlan::Projection { input, expr, .. 
} => { let input_exec = self.create_initial_plan(input, ctx_state)?; From 738f13b39de21224396ab447572d9ef573d06bc8 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 13 Jun 2021 19:13:04 +0800 Subject: [PATCH 181/329] turn on clippy rule for needless borrow (#545) * turn on clippy rule for needless borrow * do a format round * use warn not deny --- .../core/src/execution_plans/query_stage.rs | 2 +- .../core/src/serde/logical_plan/to_proto.rs | 4 +- ballista/rust/executor/src/flight_service.rs | 2 +- ballista/rust/scheduler/src/state/mod.rs | 2 +- benchmarks/src/bin/tpch.rs | 16 ++--- datafusion/benches/aggregate_query_sql.rs | 2 +- datafusion/benches/filter_query_sql.rs | 2 +- datafusion/benches/math_query_sql.rs | 2 +- datafusion/benches/sort_limit_query_sql.rs | 2 +- datafusion/src/datasource/csv.rs | 2 +- datafusion/src/datasource/json.rs | 2 +- datafusion/src/execution/context.rs | 8 +-- datafusion/src/execution/dataframe_impl.rs | 2 +- datafusion/src/lib.rs | 2 +- datafusion/src/logical_plan/dfschema.rs | 4 +- datafusion/src/logical_plan/plan.rs | 32 ++++----- datafusion/src/optimizer/filter_push_down.rs | 20 +++--- .../src/optimizer/projection_push_down.rs | 14 ++-- .../src/optimizer/simplify_expressions.rs | 4 +- datafusion/src/physical_optimizer/pruning.rs | 15 ++--- datafusion/src/physical_plan/aggregates.rs | 2 +- .../src/physical_plan/expressions/case.rs | 2 +- .../physical_plan/expressions/row_number.rs | 2 +- datafusion/src/physical_plan/filter.rs | 2 +- datafusion/src/physical_plan/functions.rs | 6 +- .../src/physical_plan/hash_aggregate.rs | 8 +-- datafusion/src/physical_plan/hash_join.rs | 24 +++---- datafusion/src/physical_plan/planner.rs | 26 ++++---- datafusion/src/physical_plan/projection.rs | 2 +- datafusion/src/physical_plan/repartition.rs | 2 +- .../physical_plan/sort_preserving_merge.rs | 2 +- .../src/physical_plan/string_expressions.rs | 6 +- datafusion/src/physical_plan/type_coercion.rs | 6 +- datafusion/src/physical_plan/windows.rs | 6 +- datafusion/src/sql/planner.rs | 66 +++++++++---------- datafusion/src/sql/utils.rs | 4 +- datafusion/tests/sql.rs | 28 ++++---- 37 files changed, 160 insertions(+), 173 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index 233dee5b9b529..264c44dc43dca 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -139,7 +139,7 @@ impl ExecutionPlan for QueryStageExec { info!("Writing results to {}", path); // stream results to disk - let stats = utils::write_stream_to_disk(&mut stream, &path) + let stats = utils::write_stream_to_disk(&mut stream, path) .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 5d996843d6248..c454d03257f0a 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -1033,9 +1033,7 @@ impl TryInto for &Expr { .map(|e| e.try_into()) .collect::, _>>()?; let window_frame = window_frame.map(|window_frame| { - protobuf::window_expr_node::WindowFrame::Frame( - window_frame.clone().into(), - ) + protobuf::window_expr_node::WindowFrame::Frame(window_frame.into()) }); let window_expr = Box::new(protobuf::WindowExprNode { expr: Some(Box::new(arg.try_into()?)), diff --git a/ballista/rust/executor/src/flight_service.rs 
b/ballista/rust/executor/src/flight_service.rs index d4eb1229c294d..99424b6e8db46 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -279,7 +279,7 @@ fn create_flight_iter( options: &IpcWriteOptions, ) -> Box>> { let (flight_dictionaries, flight_batch) = - arrow_flight::utils::flight_data_from_arrow_batch(batch, &options); + arrow_flight::utils::flight_data_from_arrow_batch(batch, options); Box::new( flight_dictionaries .into_iter() diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index 506fd1c0db985..75f1574ef1257 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -223,7 +223,7 @@ impl SchedulerState { .collect(); let executors = self.get_executors_metadata().await?; 'tasks: for (_key, value) in kvs.iter() { - let mut status: TaskStatus = decode_protobuf(&value)?; + let mut status: TaskStatus = decode_protobuf(value)?; if status.status.is_none() { let partition = status.partition_id.as_ref().unwrap(); let plan = self diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 9ac66e136dbdb..34b8d3a27b194 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -350,7 +350,7 @@ async fn execute_query( if debug { println!("Logical plan:\n{:?}", plan); } - let plan = ctx.optimize(&plan)?; + let plan = ctx.optimize(plan)?; if debug { println!("Optimized logical plan:\n{:?}", plan); } @@ -921,9 +921,9 @@ mod tests { .iter() .map(|field| { Field::new( - Field::name(&field), + Field::name(field), DataType::Utf8, - Field::is_nullable(&field), + Field::is_nullable(field), ) }) .collect::>(), @@ -939,8 +939,8 @@ mod tests { .iter() .map(|field| { Field::new( - Field::name(&field), - Field::data_type(&field).to_owned(), + Field::name(field), + Field::data_type(field).to_owned(), true, ) }) @@ -990,10 +990,10 @@ mod tests { .map(|field| { Expr::Alias( Box::new(Cast { - expr: Box::new(trim(col(Field::name(&field)))), - data_type: Field::data_type(&field).to_owned(), + expr: Box::new(trim(col(Field::name(field)))), + data_type: Field::data_type(field).to_owned(), }), - Field::name(&field).to_string(), + Field::name(field).to_string(), ) }) .collect::>(), diff --git a/datafusion/benches/aggregate_query_sql.rs b/datafusion/benches/aggregate_query_sql.rs index 8f1a97e198d3b..74798ae572cd5 100644 --- a/datafusion/benches/aggregate_query_sql.rs +++ b/datafusion/benches/aggregate_query_sql.rs @@ -47,7 +47,7 @@ fn query(ctx: Arc>, sql: &str) { let rt = Runtime::new().unwrap(); // execute the query - let df = ctx.lock().unwrap().sql(&sql).unwrap(); + let df = ctx.lock().unwrap().sql(sql).unwrap(); rt.block_on(df.collect()).unwrap(); } diff --git a/datafusion/benches/filter_query_sql.rs b/datafusion/benches/filter_query_sql.rs index 8600bdc88c6af..253ef455f5af2 100644 --- a/datafusion/benches/filter_query_sql.rs +++ b/datafusion/benches/filter_query_sql.rs @@ -28,7 +28,7 @@ use std::sync::Arc; async fn query(ctx: &mut ExecutionContext, sql: &str) { // execute the query - let df = ctx.sql(&sql).unwrap(); + let df = ctx.sql(sql).unwrap(); let results = df.collect().await.unwrap(); // display the relation diff --git a/datafusion/benches/math_query_sql.rs b/datafusion/benches/math_query_sql.rs index 1aaa2d3403cfd..51e52e8acddb4 100644 --- a/datafusion/benches/math_query_sql.rs +++ b/datafusion/benches/math_query_sql.rs @@ -40,7 +40,7 @@ fn query(ctx: Arc>, sql: &str) { let rt = Runtime::new().unwrap(); // execute the query 
- let df = ctx.lock().unwrap().sql(&sql).unwrap(); + let df = ctx.lock().unwrap().sql(sql).unwrap(); rt.block_on(df.collect()).unwrap(); } diff --git a/datafusion/benches/sort_limit_query_sql.rs b/datafusion/benches/sort_limit_query_sql.rs index 1e8339ea31eb1..5a875d3d8799f 100644 --- a/datafusion/benches/sort_limit_query_sql.rs +++ b/datafusion/benches/sort_limit_query_sql.rs @@ -35,7 +35,7 @@ fn query(ctx: Arc>, sql: &str) { let rt = Runtime::new().unwrap(); // execute the query - let df = ctx.lock().unwrap().sql(&sql).unwrap(); + let df = ctx.lock().unwrap().sql(sql).unwrap(); rt.block_on(df.collect()).unwrap(); } diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index e1a61595f2eeb..906a1ce415f60 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -204,7 +204,7 @@ impl TableProvider for CsvFile { } } Source::Path(p) => { - CsvExec::try_new(&p, opts, projection.clone(), batch_size, limit)? + CsvExec::try_new(p, opts, projection.clone(), batch_size, limit)? } }; Ok(Arc::new(exec)) diff --git a/datafusion/src/datasource/json.rs b/datafusion/src/datasource/json.rs index f916f6c1e382c..90fedfd6f528d 100644 --- a/datafusion/src/datasource/json.rs +++ b/datafusion/src/datasource/json.rs @@ -149,7 +149,7 @@ impl TableProvider for NdJsonFile { } } Source::Path(p) => { - NdJsonExec::try_new(&p, opts, projection.clone(), batch_size, limit)? + NdJsonExec::try_new(p, opts, projection.clone(), batch_size, limit)? } }; Ok(Arc::new(exec)) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 950ba2b88691c..f09d7f4f90c93 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -275,7 +275,7 @@ impl ExecutionContext { ) -> Result> { Ok(Arc::new(DataFrameImpl::new( self.state.clone(), - &LogicalPlanBuilder::scan_csv(&filename, options, None)?.build()?, + &LogicalPlanBuilder::scan_csv(filename, options, None)?.build()?, ))) } @@ -284,7 +284,7 @@ impl ExecutionContext { Ok(Arc::new(DataFrameImpl::new( self.state.clone(), &LogicalPlanBuilder::scan_parquet( - &filename, + filename, None, self.state.lock().unwrap().config.concurrency, )? @@ -328,7 +328,7 @@ impl ExecutionContext { /// executed against this context. pub fn register_parquet(&mut self, name: &str, filename: &str) -> Result<()> { let table = ParquetTable::try_new( - &filename, + filename, self.state.lock().unwrap().config.concurrency, )?; self.register_table(name, Arc::new(table))?; @@ -3205,7 +3205,7 @@ mod tests { .expect("Executing CREATE EXTERNAL TABLE"); let sql = "SELECT * from csv_with_timestamps"; - let result = plan_and_collect(&mut ctx, &sql).await.unwrap(); + let result = plan_and_collect(&mut ctx, sql).await.unwrap(); let expected = vec![ "+--------+-------------------------+", "| name | ts |", diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 19f71eb79268f..a674e3cdb0f1b 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -373,7 +373,7 @@ mod tests { ctx.register_csv( "aggregate_test_100", &format!("{}/csv/aggregate_test_100.csv", testdata), - CsvReadOptions::new().schema(&schema.as_ref()), + CsvReadOptions::new().schema(schema.as_ref()), )?; Ok(()) } diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index e4501a78ada41..64cc0a1349a23 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -14,7 +14,7 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. -#![warn(missing_docs)] +#![warn(missing_docs, clippy::needless_borrow)] // Clippy lints, some should be disabled incrementally #![allow( clippy::float_cmp, diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index 9adb22b43d075..5a9167e58b053 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -325,12 +325,12 @@ impl DFField { /// Returns an immutable reference to the `DFField`'s unqualified name pub fn name(&self) -> &String { - &self.field.name() + self.field.name() } /// Returns an immutable reference to the `DFField`'s data-type pub fn data_type(&self) -> &DataType { - &self.field.data_type() + self.field.data_type() } /// Indicates whether this `DFField` supports null values diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 3344dce1d81df..a80bc54b4a2f1 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -221,23 +221,23 @@ impl LogicalPlan { /// Get a reference to the logical plan's schema pub fn schema(&self) -> &DFSchemaRef { match self { - LogicalPlan::EmptyRelation { schema, .. } => &schema, + LogicalPlan::EmptyRelation { schema, .. } => schema, LogicalPlan::TableScan { projected_schema, .. - } => &projected_schema, - LogicalPlan::Projection { schema, .. } => &schema, + } => projected_schema, + LogicalPlan::Projection { schema, .. } => schema, LogicalPlan::Filter { input, .. } => input.schema(), - LogicalPlan::Window { schema, .. } => &schema, - LogicalPlan::Aggregate { schema, .. } => &schema, + LogicalPlan::Window { schema, .. } => schema, + LogicalPlan::Aggregate { schema, .. } => schema, LogicalPlan::Sort { input, .. } => input.schema(), - LogicalPlan::Join { schema, .. } => &schema, - LogicalPlan::CrossJoin { schema, .. } => &schema, + LogicalPlan::Join { schema, .. } => schema, + LogicalPlan::CrossJoin { schema, .. } => schema, LogicalPlan::Repartition { input, .. } => input.schema(), LogicalPlan::Limit { input, .. } => input.schema(), - LogicalPlan::CreateExternalTable { schema, .. } => &schema, - LogicalPlan::Explain { schema, .. } => &schema, - LogicalPlan::Extension { node } => &node.schema(), - LogicalPlan::Union { schema, .. } => &schema, + LogicalPlan::CreateExternalTable { schema, .. } => schema, + LogicalPlan::Explain { schema, .. } => schema, + LogicalPlan::Extension { node } => node.schema(), + LogicalPlan::Union { schema, .. } => schema, } } @@ -246,12 +246,12 @@ impl LogicalPlan { match self { LogicalPlan::TableScan { projected_schema, .. - } => vec![&projected_schema], + } => vec![projected_schema], LogicalPlan::Window { input, schema, .. } | LogicalPlan::Aggregate { input, schema, .. } | LogicalPlan::Projection { input, schema, .. } => { let mut schemas = input.all_schemas(); - schemas.insert(0, &schema); + schemas.insert(0, schema); schemas } LogicalPlan::Join { @@ -267,16 +267,16 @@ impl LogicalPlan { } => { let mut schemas = left.all_schemas(); schemas.extend(right.all_schemas()); - schemas.insert(0, &schema); + schemas.insert(0, schema); schemas } LogicalPlan::Union { schema, .. } => { vec![schema] } - LogicalPlan::Extension { node } => vec![&node.schema()], + LogicalPlan::Extension { node } => vec![node.schema()], LogicalPlan::Explain { schema, .. } | LogicalPlan::EmptyRelation { schema, .. } - | LogicalPlan::CreateExternalTable { schema, .. 
} => vec![&schema], + | LogicalPlan::CreateExternalTable { schema, .. } => vec![schema], LogicalPlan::Limit { input, .. } | LogicalPlan::Repartition { input, .. } | LogicalPlan::Sort { input, .. } diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 4b1ae76927b4a..85d1f812f41ac 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -137,7 +137,7 @@ fn get_join_predicates<'a>( let all_in_right = right.len() == columns.len(); !all_in_left && !all_in_right }) - .map(|((ref a, ref b), _)| (a, b)) + .map(|((a, b), _)| (a, b)) .unzip(); (pushable_to_left, pushable_to_right, keep) } @@ -151,7 +151,7 @@ fn push_down(state: &State, plan: &LogicalPlan) -> Result { .collect::>>()?; let expr = plan.expressions(); - utils::from_plan(&plan, &expr, &new_inputs) + utils::from_plan(plan, &expr, &new_inputs) } /// returns a new [LogicalPlan] that wraps `plan` in a [LogicalPlan::Filter] with @@ -225,8 +225,8 @@ fn split_members<'a>(predicate: &'a Expr, predicates: &mut Vec<&'a Expr>) { op: Operator::And, left, } => { - split_members(&left, predicates); - split_members(&right, predicates); + split_members(left, predicates); + split_members(right, predicates); } other => predicates.push(other), } @@ -297,7 +297,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { // optimize inner let new_input = optimize(input, state)?; - utils::from_plan(&plan, &expr, &[new_input]) + utils::from_plan(plan, expr, &[new_input]) } LogicalPlan::Aggregate { input, aggr_expr, .. @@ -335,7 +335,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { LogicalPlan::Join { left, right, .. } | LogicalPlan::CrossJoin { left, right, .. } => { let (pushable_to_left, pushable_to_right, keep) = - get_join_predicates(&state, &left.schema(), &right.schema()); + get_join_predicates(&state, left.schema(), right.schema()); let mut left_state = state.clone(); left_state.filters = keep_filters(&left_state.filters, &pushable_to_left); @@ -347,7 +347,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { // create a new Join with the new `left` and `right` let expr = plan.expressions(); - let plan = utils::from_plan(&plan, &expr, &[left, right])?; + let plan = utils::from_plan(plan, &expr, &[left, right])?; if keep.0.is_empty() { Ok(plan) @@ -437,11 +437,11 @@ impl FilterPushDown { /// replaces columns by its name on the projection. 
fn rewrite(expr: &Expr, projection: &HashMap) -> Result { - let expressions = utils::expr_sub_expressions(&expr)?; + let expressions = utils::expr_sub_expressions(expr)?; let expressions = expressions .iter() - .map(|e| rewrite(e, &projection)) + .map(|e| rewrite(e, projection)) .collect::>>()?; if let Expr::Column(name) = expr { @@ -450,7 +450,7 @@ fn rewrite(expr: &Expr, projection: &HashMap) -> Result { } } - utils::rewrite_expression(&expr, &expressions) + utils::rewrite_expression(expr, &expressions) } #[cfg(test)] diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index f0b364ab9852a..ad795f5f5dd52 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -146,7 +146,7 @@ fn optimize_plan( let new_input = optimize_plan( optimizer, - &input, + input, &new_required_columns, true, execution_props, @@ -176,14 +176,14 @@ fn optimize_plan( Ok(LogicalPlan::Join { left: Arc::new(optimize_plan( optimizer, - &left, + left, &new_required_columns, true, execution_props, )?), right: Arc::new(optimize_plan( optimizer, - &right, + right, &new_required_columns, true, execution_props, @@ -204,7 +204,7 @@ fn optimize_plan( let mut new_window_expr = Vec::new(); { window_expr.iter().try_for_each(|expr| { - let name = &expr.name(&schema)?; + let name = &expr.name(schema)?; if required_columns.contains(name) { new_window_expr.push(expr.clone()); new_required_columns.insert(name.clone()); @@ -235,7 +235,7 @@ fn optimize_plan( window_expr: new_window_expr, input: Arc::new(optimize_plan( optimizer, - &input, + input, &new_required_columns, true, execution_props, @@ -259,7 +259,7 @@ fn optimize_plan( // Gather all columns needed for expressions in this Aggregate let mut new_aggr_expr = Vec::new(); aggr_expr.iter().try_for_each(|expr| { - let name = &expr.name(&schema)?; + let name = &expr.name(schema)?; if required_columns.contains(name) { new_aggr_expr.push(expr.clone()); @@ -286,7 +286,7 @@ fn optimize_plan( aggr_expr: new_aggr_expr, input: Arc::new(optimize_plan( optimizer, - &input, + input, &new_required_columns, true, execution_props, diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs index 0697d689c4019..9ad7a94d8bfe2 100644 --- a/datafusion/src/optimizer/simplify_expressions.rs +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -248,7 +248,7 @@ fn simplify(expr: &Expr) -> Expr { }) .unwrap_or_else(|| expr.clone()), Expr::BinaryExpr { left, op, right } => Expr::BinaryExpr { - left: Box::new(simplify(&left)), + left: Box::new(simplify(left)), op: *op, right: Box::new(simplify(right)), }, @@ -267,7 +267,7 @@ fn optimize(plan: &LogicalPlan) -> Result { .into_iter() .map(|x| simplify(&x)) .collect::>(); - utils::from_plan(&plan, &expr, &new_inputs) + utils::from_plan(plan, &expr, &new_inputs) } impl OptimizerRule for SimplifyExpressions { diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index c65733bd75267..da82d53871a84 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -420,7 +420,7 @@ impl<'a> PruningExpressionBuilder<'a> { fn min_column_expr(&mut self) -> Result { self.required_columns.min_column_expr( &self.column_name, - &self.column_expr, + self.column_expr, self.field, ) } @@ -428,7 +428,7 @@ impl<'a> PruningExpressionBuilder<'a> { fn max_column_expr(&mut self) -> Result { 
self.required_columns.max_column_expr( &self.column_name, - &self.column_expr, + self.column_expr, self.field, ) } @@ -440,7 +440,7 @@ fn rewrite_column_expr( column_old_name: &str, column_new_name: &str, ) -> Result { - let expressions = utils::expr_sub_expressions(&expr)?; + let expressions = utils::expr_sub_expressions(expr)?; let expressions = expressions .iter() .map(|e| rewrite_column_expr(e, column_old_name, column_new_name)) @@ -451,7 +451,7 @@ fn rewrite_column_expr( return Ok(Expr::Column(column_new_name.to_string())); } } - utils::rewrite_expression(&expr, &expressions) + utils::rewrite_expression(expr, &expressions) } /// Given a column reference to `column_name`, returns a pruning @@ -515,16 +515,15 @@ fn build_predicate_expression( let (left, op, right) = match expr { Expr::BinaryExpr { left, op, right } => (left, *op, right), Expr::Column(name) => { - let expr = build_single_column_expr(&name, schema, required_columns, false) + let expr = build_single_column_expr(name, schema, required_columns, false) .unwrap_or(unhandled); return Ok(expr); } // match !col (don't do so recursively) Expr::Not(input) => { if let Expr::Column(name) = input.as_ref() { - let expr = - build_single_column_expr(&name, schema, required_columns, true) - .unwrap_or(unhandled); + let expr = build_single_column_expr(name, schema, required_columns, true) + .unwrap_or(unhandled); return Ok(expr); } else { return Ok(unhandled); diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 60025a316228d..897c78fd46ff6 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -127,7 +127,7 @@ pub fn create_aggregate_expr( .map(|e| e.data_type(input_schema)) .collect::>>()?; - let return_type = return_type(&fun, &arg_types)?; + let return_type = return_type(fun, &arg_types)?; Ok(match (fun, distinct) { (AggregateFunction::Count, false) => { diff --git a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index 95ae5325af119..f89ea8d1e2964 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -377,7 +377,7 @@ impl CaseExpr { let then_value = then_value.into_array(batch.num_rows()); current_value = Some(if_then_else( - &when_value, + when_value, then_value, current_value.unwrap(), &return_type, diff --git a/datafusion/src/physical_plan/expressions/row_number.rs b/datafusion/src/physical_plan/expressions/row_number.rs index f399995461f70..eaf9b21cbc649 100644 --- a/datafusion/src/physical_plan/expressions/row_number.rs +++ b/datafusion/src/physical_plan/expressions/row_number.rs @@ -49,7 +49,7 @@ impl BuiltInWindowFunctionExpr for RowNumber { fn field(&self) -> Result { let nullable = false; let data_type = DataType::UInt64; - Ok(Field::new(&self.name(), data_type, nullable)) + Ok(Field::new(self.name(), data_type, nullable)) } fn expressions(&self) -> Vec> { diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs index bc2b17aa4f47d..0a8c825aba1ae 100644 --- a/datafusion/src/physical_plan/filter.rs +++ b/datafusion/src/physical_plan/filter.rs @@ -151,7 +151,7 @@ fn batch_filter( predicate: &Arc, ) -> ArrowResult { predicate - .evaluate(&batch) + .evaluate(batch) .map(|v| v.into_array(batch.num_rows())) .map_err(DataFusionError::into_arrow_external_error) .and_then(|array| { diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs 
index eb312cabd7f0f..49ca79a004960 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -344,7 +344,7 @@ pub fn return_type( // or the execution panics. // verify that this is a valid set of data types for this function - data_types(&arg_types, &signature(fun))?; + data_types(arg_types, &signature(fun))?; // the return type of the built in function. // Some built-in functions' return type depends on the incoming type. @@ -624,7 +624,7 @@ pub fn create_physical_expr( &format!("{}", fun), fun_expr, args, - &return_type(&fun, &arg_types)?, + &return_type(fun, &arg_types)?, ))); } BuiltinScalarFunction::InitCap => |args| match args[0].data_type() { @@ -953,7 +953,7 @@ pub fn create_physical_expr( &format!("{}", fun), fun_expr, args, - &return_type(&fun, &arg_types)?, + &return_type(fun, &arg_types)?, ))) } diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index ffb51b2e8a1f2..453d500e98bd8 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -120,8 +120,8 @@ fn create_schema( for (expr, name) in group_expr { fields.push(Field::new( name, - expr.data_type(&input_schema)?, - expr.nullable(&input_schema)?, + expr.data_type(input_schema)?, + expr.nullable(input_schema)?, )) } @@ -413,7 +413,7 @@ fn group_aggregate_batch( let mut offset_so_far = 0; for key in batch_keys.iter() { let (_, _, indices) = accumulators.get_mut(key).unwrap(); - batch_indices.append_slice(&indices)?; + batch_indices.append_slice(indices)?; offset_so_far += indices.len(); offsets.push(offset_so_far); } @@ -779,7 +779,7 @@ fn evaluate( batch: &RecordBatch, ) -> Result> { expr.iter() - .map(|expr| expr.evaluate(&batch)) + .map(|expr| expr.evaluate(batch)) .map(|r| r.map(|v| v.into_array(batch.num_rows()))) .collect::>>() } diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index d12e249cbe347..1b0322b521a52 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -133,13 +133,13 @@ impl HashJoinExec { ) -> Result { let left_schema = left.schema(); let right_schema = right.schema(); - check_join_is_valid(&left_schema, &right_schema, &on)?; + check_join_is_valid(&left_schema, &right_schema, on)?; let schema = Arc::new(build_join_schema( &left_schema, &right_schema, on, - &join_type, + join_type, )); let on = on @@ -289,7 +289,7 @@ impl ExecutionPlan for HashJoinExec { hashes_buffer.resize(batch.num_rows(), 0); update_hash( &on_left, - &batch, + batch, &mut hashmap, offset, &self.random_state, @@ -342,7 +342,7 @@ impl ExecutionPlan for HashJoinExec { hashes_buffer.resize(batch.num_rows(), 0); update_hash( &on_left, - &batch, + batch, &mut hashmap, offset, &self.random_state, @@ -436,7 +436,7 @@ fn update_hash( .collect::>>()?; // calculate the hash values - let hash_values = create_hashes(&keys_values, &random_state, hashes_buffer)?; + let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?; // insert hashes to key of the hashmap for (row, hash_value) in hash_values.iter().enumerate() { @@ -538,15 +538,9 @@ fn build_batch( column_indices: &[ColumnIndex], random_state: &RandomState, ) -> ArrowResult<(RecordBatch, UInt64Array)> { - let (left_indices, right_indices) = build_join_indexes( - &left_data, - &batch, - join_type, - on_left, - on_right, - random_state, - ) - .unwrap(); + let (left_indices, right_indices) = + 
build_join_indexes(left_data, batch, join_type, on_left, on_right, random_state) + .unwrap(); if matches!(join_type, JoinType::Semi | JoinType::Anti) { return Ok(( @@ -613,7 +607,7 @@ fn build_join_indexes( }) .collect::>>()?; let hashes_buffer = &mut vec![0; keys_values[0].len()]; - let hash_values = create_hashes(&keys_values, &random_state, hashes_buffer)?; + let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?; let left = &left_data.0; match join_type { diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index adae9224a19aa..31b3749dd3549 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -155,7 +155,7 @@ impl DefaultPhysicalPlanner { .map(|e| { self.create_window_expr( e, - &logical_input_schema, + logical_input_schema, &physical_input_schema, ctx_state, ) @@ -189,7 +189,7 @@ impl DefaultPhysicalPlanner { &physical_input_schema, ctx_state, ), - e.name(&logical_input_schema), + e.name(logical_input_schema), )) }) .collect::>>()?; @@ -198,7 +198,7 @@ impl DefaultPhysicalPlanner { .map(|e| { self.create_aggregate_expr( e, - &logical_input_schema, + logical_input_schema, &physical_input_schema, ctx_state, ) @@ -266,12 +266,8 @@ impl DefaultPhysicalPlanner { .iter() .map(|e| { tuple_err(( - self.create_physical_expr( - e, - &input_exec.schema(), - &ctx_state, - ), - e.name(&input_schema), + self.create_physical_expr(e, &input_exec.schema(), ctx_state), + e.name(input_schema), )) }) .collect::>>()?; @@ -307,7 +303,7 @@ impl DefaultPhysicalPlanner { let runtime_expr = expr .iter() .map(|e| { - self.create_physical_expr(e, &input_schema, &ctx_state) + self.create_physical_expr(e, &input_schema, ctx_state) }) .collect::>>()?; Partitioning::Hash(runtime_expr, *n) @@ -378,7 +374,7 @@ impl DefaultPhysicalPlanner { right, Partitioning::Hash(right_expr, ctx_state.config.concurrency), )?), - &keys, + keys, &physical_join_type, PartitionMode::Partitioned, )?)) @@ -386,7 +382,7 @@ impl DefaultPhysicalPlanner { Ok(Arc::new(HashJoinExec::try_new( left, right, - &keys, + keys, &physical_join_type, PartitionMode::CollectLeft, )?)) @@ -504,7 +500,7 @@ impl DefaultPhysicalPlanner { } Expr::Column(name) => { // check that name exists - input_schema.field_with_name(&name)?; + input_schema.field_with_name(name)?; Ok(Arc::new(Column::new(name))) } Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), @@ -762,12 +758,12 @@ impl DefaultPhysicalPlanner { nulls_first, } => self.create_physical_sort_expr( expr, - &physical_input_schema, + physical_input_schema, SortOptions { descending: !*asc, nulls_first: *nulls_first, }, - &ctx_state, + ctx_state, ), _ => Err(DataFusionError::Plan( "Sort only accepts sort expressions".to_string(), diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index c0d78ff7168bf..d4c0459c211be 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -166,7 +166,7 @@ fn batch_project( ) -> ArrowResult { expressions .iter() - .map(|expr| expr.evaluate(&batch)) + .map(|expr| expr.evaluate(batch)) .map(|r| r.map(|v| v.into_array(batch.num_rows()))) .collect::>>() .map_or_else( diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 37d98c7d118b6..5d1f8d7760cf1 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -479,7 +479,7 @@ mod tests { partitions, 
Partitioning::Hash( vec![Arc::new(crate::physical_plan::expressions::Column::new( - &"c0", + "c0", ))], 8, ), diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs index 283294a43ec75..c39acc474d315 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -376,7 +376,7 @@ impl SortPreservingMergeStream { match min_cursor { None => min_cursor = Some((idx, candidate)), - Some((_, ref min)) => { + Some((_, min)) => { if min.compare(candidate, &self.sort_options)? == Ordering::Greater { diff --git a/datafusion/src/physical_plan/string_expressions.rs b/datafusion/src/physical_plan/string_expressions.rs index 882fe30502fdf..09e19c4dfa47a 100644 --- a/datafusion/src/physical_plan/string_expressions.rs +++ b/datafusion/src/physical_plan/string_expressions.rs @@ -299,7 +299,7 @@ pub fn concat(args: &[ColumnarValue]) -> Result { ColumnarValue::Array(v) => { if v.is_valid(index) { let v = v.as_any().downcast_ref::().unwrap(); - owned_string.push_str(&v.value(index)); + owned_string.push_str(v.value(index)); } } _ => unreachable!(), @@ -353,10 +353,10 @@ pub fn concat_ws(args: &[ArrayRef]) -> Result { for arg_index in 1..args.len() { let arg = &args[arg_index]; if !arg.is_null(index) { - owned_string.push_str(&arg.value(index)); + owned_string.push_str(arg.value(index)); // if not last push separator if arg_index != args.len() - 1 { - owned_string.push_str(&sep); + owned_string.push_str(sep); } } } diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index 06d3739b53b27..fe87ecda872cb 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -60,7 +60,7 @@ pub fn coerce( expressions .iter() .enumerate() - .map(|(i, expr)| try_cast(expr.clone(), &schema, new_types[i].clone())) + .map(|(i, expr)| try_cast(expr.clone(), schema, new_types[i].clone())) .collect::>>() } @@ -85,7 +85,7 @@ pub fn data_types( } for valid_types in valid_types { - if let Some(types) = maybe_data_types(&valid_types, ¤t_types) { + if let Some(types) = maybe_data_types(&valid_types, current_types) { return Ok(types); } } @@ -157,7 +157,7 @@ fn maybe_data_types( new_type.push(current_type.clone()) } else { // attempt to coerce - if can_coerce_from(valid_type, ¤t_type) { + if can_coerce_from(valid_type, current_type) { new_type.push(valid_type.clone()) } else { // not possible diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 565a9eef28575..f95dd446844d0 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -145,7 +145,7 @@ impl WindowExpr for BuiltInWindowExpr { } fn name(&self) -> &str { - &self.window.name() + self.window.name() } fn field(&self) -> Result { @@ -191,7 +191,7 @@ impl WindowExpr for AggregateWindowExpr { } fn name(&self) -> &str { - &self.aggregate.name() + self.aggregate.name() } fn field(&self) -> Result { @@ -351,7 +351,7 @@ fn window_aggregate_batch( .map(|(window_acc, expr)| { let values = &expr .iter() - .map(|e| e.evaluate(&batch)) + .map(|e| e.evaluate(batch)) .map(|r| r.map(|v| v.into_array(batch.num_rows()))) .collect::>>()?; window_acc.scan_batch(batch.num_rows(), values) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 860d21714ec66..7e7462ef390e0 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ 
-86,8 +86,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Generate a logical plan from an DataFusion SQL statement pub fn statement_to_plan(&self, statement: &DFStatement) -> Result { match statement { - DFStatement::CreateExternalTable(s) => self.external_table_to_plan(&s), - DFStatement::Statement(s) => self.sql_statement_to_plan(&s), + DFStatement::CreateExternalTable(s) => self.external_table_to_plan(s), + DFStatement::Statement(s) => self.sql_statement_to_plan(s), } } @@ -98,9 +98,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { verbose, statement, analyze: _, - } => self.explain_statement_to_plan(*verbose, &statement), - Statement::Query(query) => self.query_to_plan(&query), - Statement::ShowVariable { variable } => self.show_variable_to_plan(&variable), + } => self.explain_statement_to_plan(*verbose, statement), + Statement::Query(query) => self.query_to_plan(query), + Statement::ShowVariable { variable } => self.show_variable_to_plan(variable), Statement::ShowColumns { extended, full, @@ -232,7 +232,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { FileType::NdJson => {} }; - let schema = self.build_schema(&columns)?; + let schema = self.build_schema(columns)?; Ok(LogicalPlan::CreateExternalTable { schema: schema.to_dfschema_ref()?, @@ -250,7 +250,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { verbose: bool, statement: &Statement, ) -> Result { - let plan = self.sql_statement_to_plan(&statement)?; + let plan = self.sql_statement_to_plan(statement)?; let stringified_plans = vec![StringifiedPlan::new( PlanType::LogicalPlan, @@ -370,7 +370,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { left: &LogicalPlan, right: &LogicalPlan, ) -> Result { - LogicalPlanBuilder::from(&left).cross_join(&right)?.build() + LogicalPlanBuilder::from(left).cross_join(right)?.build() } fn parse_join( @@ -383,7 +383,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match constraint { JoinConstraint::On(sql_expr) => { let mut keys: Vec<(String, String)> = vec![]; - let join_schema = left.schema().join(&right.schema())?; + let join_schema = left.schema().join(right.schema())?; // parse ON expression let expr = self.sql_to_rex(sql_expr, &join_schema)?; @@ -396,14 +396,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { keys.iter().map(|pair| pair.1.as_str()).collect(); // return the logical plan representing the join - LogicalPlanBuilder::from(&left) - .join(&right, join_type, &left_keys, &right_keys)? + LogicalPlanBuilder::from(left) + .join(right, join_type, &left_keys, &right_keys)? .build() } JoinConstraint::Using(idents) => { let keys: Vec<&str> = idents.iter().map(|x| x.value.as_str()).collect(); - LogicalPlanBuilder::from(&left) - .join(&right, join_type, &keys, &keys)? + LogicalPlanBuilder::from(left) + .join(right, join_type, &keys, &keys)? .build() } JoinConstraint::Natural => { @@ -472,7 +472,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // build join schema let mut fields = vec![]; for plan in &plans { - fields.extend_from_slice(&plan.schema().fields()); + fields.extend_from_slice(plan.schema().fields()); } let join_schema = DFSchema::new(fields)?; @@ -673,16 +673,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(projection .iter() - .map(|expr| self.sql_select_to_rex(&expr, &input_schema)) + .map(|expr| self.sql_select_to_rex(expr, input_schema)) .collect::>>()? 
.iter() - .flat_map(|expr| expand_wildcard(&expr, &input_schema)) + .flat_map(|expr| expand_wildcard(expr, input_schema)) .collect::>()) } /// Wrap a plan in a projection fn project(&self, input: &LogicalPlan, expr: Vec) -> Result { - self.validate_schema_satisfies_exprs(&input.schema(), &expr)?; + self.validate_schema_satisfies_exprs(input.schema(), &expr)?; LogicalPlanBuilder::from(input).project(expr)?.build() } @@ -733,7 +733,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .cloned() .collect::>(); - let plan = LogicalPlanBuilder::from(&input) + let plan = LogicalPlanBuilder::from(input) .aggregate(group_by_exprs, aggr_exprs)? .build()?; @@ -784,14 +784,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn limit(&self, input: &LogicalPlan, limit: &Option) -> Result { match *limit { Some(ref limit_expr) => { - let n = match self.sql_to_rex(&limit_expr, &input.schema())? { + let n = match self.sql_to_rex(limit_expr, input.schema())? { Expr::Literal(ScalarValue::Int64(Some(n))) => Ok(n as usize), _ => Err(DataFusionError::Plan( "Unexpected expression for LIMIT clause".to_string(), )), }?; - LogicalPlanBuilder::from(&input).limit(n)?.build() + LogicalPlanBuilder::from(input).limit(n)?.build() } _ => Ok(input.clone()), } @@ -812,7 +812,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .map(|e| self.order_by_to_sort_expr(e)) .collect::>>()?; - LogicalPlanBuilder::from(&plan).sort(order_by_rex)?.build() + LogicalPlanBuilder::from(plan).sort(order_by_rex)?.build() } /// convert sql OrderByExpr to Expr::Sort @@ -836,7 +836,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .iter() .try_for_each(|col| match col { Expr::Column(name) => { - schema.field_with_unqualified_name(&name).map_err(|_| { + schema.field_with_unqualified_name(name).map_err(|_| { DataFusionError::Plan(format!( "Invalid identifier '{}' for schema {}", name, @@ -854,7 +854,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match sql { SelectItem::UnnamedExpr(expr) => self.sql_to_rex(expr, schema), SelectItem::ExprWithAlias { expr, alias } => Ok(Alias( - Box::new(self.sql_to_rex(&expr, schema)?), + Box::new(self.sql_to_rex(expr, schema)?), alias.value.clone(), )), SelectItem::Wildcard => Ok(Expr::Wildcard), @@ -977,7 +977,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref expr, ref data_type, } => Ok(Expr::Cast { - expr: Box::new(self.sql_expr_to_logical_expr(&expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr)?), data_type: convert_data_type(data_type)?, }), @@ -985,7 +985,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref expr, ref data_type, } => Ok(Expr::TryCast { - expr: Box::new(self.sql_expr_to_logical_expr(&expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr)?), data_type: convert_data_type(data_type)?, }), @@ -1040,10 +1040,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref low, ref high, } => Ok(Expr::Between { - expr: Box::new(self.sql_expr_to_logical_expr(&expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr)?), negated: *negated, - low: Box::new(self.sql_expr_to_logical_expr(&low)?), - high: Box::new(self.sql_expr_to_logical_expr(&high)?), + low: Box::new(self.sql_expr_to_logical_expr(low)?), + high: Box::new(self.sql_expr_to_logical_expr(high)?), }), SQLExpr::InList { @@ -1057,7 +1057,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .collect::>>()?; Ok(Expr::InList { - expr: Box::new(self.sql_expr_to_logical_expr(&expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr)?), list: list_expr, negated: *negated, }) @@ -1091,9 +1091,9 @@ impl<'a, S: 
ContextProvider> SqlToRel<'a, S> { }?; Ok(Expr::BinaryExpr { - left: Box::new(self.sql_expr_to_logical_expr(&left)?), + left: Box::new(self.sql_expr_to_logical_expr(left)?), op: operator, - right: Box::new(self.sql_expr_to_logical_expr(&right)?), + right: Box::new(self.sql_expr_to_logical_expr(right)?), }) } @@ -1209,7 +1209,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - SQLExpr::Nested(e) => self.sql_expr_to_logical_expr(&e), + SQLExpr::Nested(e) => self.sql_expr_to_logical_expr(e), _ => Err(DataFusionError::NotImplemented(format!( "Unsupported ast node {:?} in sqltorel", @@ -3167,7 +3167,7 @@ mod tests { fn logical_plan(sql: &str) -> Result { let planner = SqlToRel::new(&MockContextProvider {}); - let result = DFParser::parse_sql(&sql); + let result = DFParser::parse_sql(sql); let ast = result.unwrap(); planner.statement_to_plan(&ast[0]) } diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 5e9b9526ea834..82431c2314ab2 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -146,7 +146,7 @@ where pub(crate) fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(_) => Ok(expr.clone()), - _ => Ok(Expr::Column(expr.name(&plan.schema())?)), + _ => Ok(Expr::Column(expr.name(plan.schema())?)), } } @@ -448,7 +448,7 @@ fn generate_sort_key(partition_by: &[Expr], order_by: &[Expr]) -> WindowSortKey } }); order_by.iter().for_each(|e| { - if !sort_key.contains(&e) { + if !sort_key.contains(e) { sort_key.push(e.clone()); } }); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 5ce1884049d84..d9d77648c7427 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -130,7 +130,7 @@ async fn parquet_single_nan_schema() { ctx.register_parquet("single_nan", &format!("{}/single_nan.parquet", testdata)) .unwrap(); let sql = "SELECT mycol FROM single_nan"; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let plan = ctx.create_physical_plan(&plan).unwrap(); let results = collect(plan).await.unwrap(); @@ -165,7 +165,7 @@ async fn parquet_list_columns() { ])); let sql = "SELECT int64_list, utf8_list FROM list_columns"; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let plan = ctx.create_physical_plan(&plan).unwrap(); let results = collect(plan).await.unwrap(); @@ -647,7 +647,7 @@ async fn csv_query_error() -> Result<()> { let mut ctx = create_ctx()?; register_aggregate_csv(&mut ctx)?; let sql = "SELECT sin(c1) FROM aggregate_test_100"; - let plan = ctx.create_logical_plan(&sql); + let plan = ctx.create_logical_plan(sql); assert!(plan.is_err()); Ok(()) } @@ -748,7 +748,7 @@ async fn csv_query_avg_multi_batch() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT avg(c12) FROM aggregate_test_100"; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let plan = ctx.create_physical_plan(&plan).unwrap(); let results = collect(plan).await.unwrap(); @@ -1615,7 +1615,7 @@ async fn csv_explain_plans() { // Logical plan // Create plan let msg = format!("Creating logical plan for '{}'", sql); - let plan = ctx.create_logical_plan(&sql).expect(&msg); + let plan = ctx.create_logical_plan(sql).expect(&msg); let logical_schema = plan.schema(); // println!("SQL: 
{}", sql); @@ -1812,7 +1812,7 @@ async fn csv_explain_verbose_plans() { // Logical plan // Create plan let msg = format!("Creating logical plan for '{}'", sql); - let plan = ctx.create_logical_plan(&sql).expect(&msg); + let plan = ctx.create_logical_plan(sql).expect(&msg); let logical_schema = plan.schema(); // println!("SQL: {}", sql); @@ -2088,7 +2088,7 @@ fn register_alltypes_parquet(ctx: &mut ExecutionContext) { /// `result[row][column]` async fn execute(ctx: &mut ExecutionContext, sql: &str) -> Vec> { let msg = format!("Creating logical plan for '{}'", sql); - let plan = ctx.create_logical_plan(&sql).expect(&msg); + let plan = ctx.create_logical_plan(sql).expect(&msg); let logical_schema = plan.schema(); let msg = format!("Optimizing logical plan for '{}': {:?}", sql, plan); @@ -2561,7 +2561,7 @@ async fn query_cte_incorrect() -> Result<()> { // self reference let sql = "WITH t AS (SELECT * FROM t) SELECT * from u"; - let plan = ctx.create_logical_plan(&sql); + let plan = ctx.create_logical_plan(sql); assert!(plan.is_err()); assert_eq!( format!("{}", plan.unwrap_err()), @@ -2570,7 +2570,7 @@ async fn query_cte_incorrect() -> Result<()> { // forward referencing let sql = "WITH t AS (SELECT * FROM u), u AS (SELECT 1) SELECT * from u"; - let plan = ctx.create_logical_plan(&sql); + let plan = ctx.create_logical_plan(sql); assert!(plan.is_err()); assert_eq!( format!("{}", plan.unwrap_err()), @@ -2579,7 +2579,7 @@ async fn query_cte_incorrect() -> Result<()> { // wrapping should hide u let sql = "WITH t AS (WITH u as (SELECT 1) SELECT 1) SELECT * from u"; - let plan = ctx.create_logical_plan(&sql); + let plan = ctx.create_logical_plan(sql); assert!(plan.is_err()); assert_eq!( format!("{}", plan.unwrap_err()), @@ -3326,7 +3326,7 @@ async fn test_cast_expressions_error() -> Result<()> { let mut ctx = create_ctx()?; register_aggregate_csv(&mut ctx)?; let sql = "SELECT CAST(c1 AS INT) FROM aggregate_test_100"; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let plan = ctx.create_physical_plan(&plan).unwrap(); let result = collect(plan).await; @@ -3355,7 +3355,7 @@ async fn test_physical_plan_display_indent() { GROUP BY c1 \ ORDER BY the_min DESC \ LIMIT 10"; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let physical_plan = ctx.create_physical_plan(&plan).unwrap(); @@ -3403,7 +3403,7 @@ async fn test_physical_plan_display_indent_multi_children() { ON c1=c2\ "; - let plan = ctx.create_logical_plan(&sql).unwrap(); + let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); let physical_plan = ctx.create_physical_plan(&plan).unwrap(); @@ -3443,7 +3443,7 @@ async fn test_aggregation_with_bad_arguments() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT COUNT(DISTINCT) FROM aggregate_test_100"; - let logical_plan = ctx.create_logical_plan(&sql)?; + let logical_plan = ctx.create_logical_plan(sql)?; let physical_plan = ctx.create_physical_plan(&logical_plan); let err = physical_plan.unwrap_err(); assert_eq!(err.to_string(), "Error during planning: Invalid or wrong number of arguments passed to aggregate: 'COUNT(DISTINCT )'"); From 2568323dbd85e05f2bf3e6e484f7cc39983ff26c Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Sun, 13 Jun 2021 13:15:24 +0200 Subject: [PATCH 182/329] #420: Support for not_eq predicate in 
pruning predicates (#544) --- datafusion/src/physical_optimizer/pruning.rs | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index da82d53871a84..a7e1fb00c230b 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -552,6 +552,14 @@ fn build_predicate_expression( }; let corrected_op = expr_builder.correct_operator(op); let statistics_expr = match corrected_op { + Operator::NotEq => { + // column != literal => (min, max) = literal => min > literal || literal > max + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .gt(expr_builder.scalar_expr().clone()) + .or(expr_builder.scalar_expr().clone().gt(max_column_expr)) + } Operator::Eq => { // column = literal => (min, max) = literal => min <= literal && literal <= max // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) @@ -929,6 +937,26 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_not_eq() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "#c1_min Gt Int32(1) Or Int32(1) Gt #c1_max"; + + // test column on the left + let expr = col("c1").not_eq(lit(1)); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + // test column on the right + let expr = lit(1).not_eq(col("c1")); + let predicate_expr = + build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?; + assert_eq!(format!("{:?}", predicate_expr), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_gt() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From d3828541a61b5681b93590a47e22d63715949136 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 13 Jun 2021 07:34:07 -0400 Subject: [PATCH 183/329] Cleanup Repartition Exec code (#538) * Cleanup RepartitionExec code * cleanup metric handling * Add elapsed_nanos --- datafusion/src/physical_plan/mod.rs | 5 + datafusion/src/physical_plan/repartition.rs | 279 +++++++++++--------- 2 files changed, 157 insertions(+), 127 deletions(-) diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index ebc6fd6ce94a2..2dcba802560a2 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -107,6 +107,11 @@ impl SQLMetric { self.value.fetch_add(n, Ordering::Relaxed); } + /// Add elapsed nanoseconds since `start`to self + pub fn add_elapsed(&self, start: std::time::Instant) { + self.add(start.elapsed().as_nanos() as usize) + } + /// Get the current value pub fn value(&self) -> usize { self.value.load(Ordering::Relaxed) diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 5d1f8d7760cf1..7ef1948490741 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -38,7 +38,7 @@ use futures::stream::Stream; use futures::StreamExt; use hashbrown::HashMap; use tokio::sync::{ - mpsc::{UnboundedReceiver, UnboundedSender}, + mpsc::{self, UnboundedReceiver, UnboundedSender}, Mutex, }; use tokio::task::JoinHandle; @@ -60,12 +60,40 @@ pub struct RepartitionExec { HashMap, UnboundedReceiver)>, >, >, + + /// Execution metrics + metrics: RepartitionMetrics, +} + +#[derive(Debug, 
Clone)] +struct RepartitionMetrics { /// Time in nanos to execute child operator and fetch batches - fetch_time_nanos: Arc, + fetch_nanos: Arc, /// Time in nanos to perform repartitioning - repart_time_nanos: Arc, + repart_nanos: Arc, /// Time in nanos for sending resulting batches to channels - send_time_nanos: Arc, + send_nanos: Arc, +} + +impl RepartitionMetrics { + fn new() -> Self { + Self { + fetch_nanos: SQLMetric::time_nanos(), + repart_nanos: SQLMetric::time_nanos(), + send_nanos: SQLMetric::time_nanos(), + } + } + /// Convert into the external metrics form + fn to_hashmap(&self) -> HashMap { + let mut metrics = HashMap::new(); + metrics.insert("fetchTime".to_owned(), self.fetch_nanos.as_ref().clone()); + metrics.insert( + "repartitionTime".to_owned(), + self.repart_nanos.as_ref().clone(), + ); + metrics.insert("sendTime".to_owned(), self.send_nanos.as_ref().clone()); + metrics + } } impl RepartitionExec { @@ -132,9 +160,8 @@ impl ExecutionPlan for RepartitionExec { // being read yet. This may cause high memory usage if the next operator is // reading output partitions in order rather than concurrently. One workaround // for this would be to add spill-to-disk capabilities. - let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::< - Option>, - >(); + let (sender, receiver) = + mpsc::unbounded_channel::>>(); channels.insert(partition, (sender, receiver)); } // Use fixed random state @@ -142,122 +169,24 @@ impl ExecutionPlan for RepartitionExec { // launch one async task per *input* partition for i in 0..num_input_partitions { - let random_state = random.clone(); - let input = self.input.clone(); - let fetch_time = self.fetch_time_nanos.clone(); - let repart_time = self.repart_time_nanos.clone(); - let send_time = self.send_time_nanos.clone(); let txs: HashMap<_, _> = channels .iter() .map(|(partition, (tx, _rx))| (*partition, tx.clone())) .collect(); - let partitioning = self.partitioning.clone(); - let mut txs_captured = txs.clone(); - let input_task: JoinHandle> = tokio::spawn(async move { - // execute the child operator - let now = Instant::now(); - let mut stream = input.execute(i).await?; - fetch_time.add(now.elapsed().as_nanos() as usize); - let mut counter = 0; - let hashes_buf = &mut vec![]; - - loop { - // fetch the next batch - let now = Instant::now(); - let result = stream.next().await; - fetch_time.add(now.elapsed().as_nanos() as usize); - - if result.is_none() { - break; - } - let result: ArrowResult = result.unwrap(); - - match &partitioning { - Partitioning::RoundRobinBatch(_) => { - let now = Instant::now(); - let output_partition = counter % num_output_partitions; - let tx = txs_captured.get_mut(&output_partition).unwrap(); - tx.send(Some(result)).map_err(|e| { - DataFusionError::Execution(e.to_string()) - })?; - send_time.add(now.elapsed().as_nanos() as usize); - } - Partitioning::Hash(exprs, _) => { - let now = Instant::now(); - let input_batch = result?; - let arrays = exprs - .iter() - .map(|expr| { - Ok(expr - .evaluate(&input_batch)? 
- .into_array(input_batch.num_rows())) - }) - .collect::>>()?; - hashes_buf.clear(); - hashes_buf.resize(arrays[0].len(), 0); - // Hash arrays and compute buckets based on number of partitions - let hashes = - create_hashes(&arrays, &random_state, hashes_buf)?; - let mut indices = vec![vec![]; num_output_partitions]; - for (index, hash) in hashes.iter().enumerate() { - indices - [(*hash % num_output_partitions as u64) as usize] - .push(index as u64) - } - repart_time.add(now.elapsed().as_nanos() as usize); - for (num_output_partition, partition_indices) in - indices.into_iter().enumerate() - { - let now = Instant::now(); - let indices = partition_indices.into(); - // Produce batches based on indices - let columns = input_batch - .columns() - .iter() - .map(|c| { - take(c.as_ref(), &indices, None).map_err( - |e| { - DataFusionError::Execution( - e.to_string(), - ) - }, - ) - }) - .collect::>>>()?; - let output_batch = RecordBatch::try_new( - input_batch.schema(), - columns, - ); - repart_time.add(now.elapsed().as_nanos() as usize); - let now = Instant::now(); - let tx = txs_captured - .get_mut(&num_output_partition) - .unwrap(); - tx.send(Some(output_batch)).map_err(|e| { - DataFusionError::Execution(e.to_string()) - })?; - send_time.add(now.elapsed().as_nanos() as usize); - } - } - other => { - // this should be unreachable as long as the validation logic - // in the constructor is kept up-to-date - return Err(DataFusionError::NotImplemented(format!( - "Unsupported repartitioning scheme {:?}", - other - ))); - } - } - counter += 1; - } - - Ok(()) - }); + let input_task: JoinHandle> = + tokio::spawn(Self::pull_from_input( + random.clone(), + self.input.clone(), + i, + txs.clone(), + self.partitioning.clone(), + self.metrics.clone(), + )); // In a separate task, wait for each input to be done - // (and pass along any errors) - tokio::spawn(async move { Self::wait_for_task(input_task, txs).await }); + // (and pass along any errors, including panic!s) + tokio::spawn(Self::wait_for_task(input_task, txs)); } } @@ -272,14 +201,7 @@ impl ExecutionPlan for RepartitionExec { } fn metrics(&self) -> HashMap { - let mut metrics = HashMap::new(); - metrics.insert("fetchTime".to_owned(), (*self.fetch_time_nanos).clone()); - metrics.insert( - "repartitionTime".to_owned(), - (*self.repart_time_nanos).clone(), - ); - metrics.insert("sendTime".to_owned(), (*self.send_time_nanos).clone()); - metrics + self.metrics.to_hashmap() } fn fmt_as( @@ -305,12 +227,115 @@ impl RepartitionExec { input, partitioning, channels: Arc::new(Mutex::new(HashMap::new())), - fetch_time_nanos: SQLMetric::time_nanos(), - repart_time_nanos: SQLMetric::time_nanos(), - send_time_nanos: SQLMetric::time_nanos(), + metrics: RepartitionMetrics::new(), }) } + /// Pulls data from the specified input plan, feeding it to the + /// output partitions based on the desired partitioning + /// + /// i is the input partition index + /// + /// txs hold the output sending channels for each output partition + async fn pull_from_input( + random_state: ahash::RandomState, + input: Arc, + i: usize, + mut txs: HashMap>>>, + partitioning: Partitioning, + metrics: RepartitionMetrics, + ) -> Result<()> { + let num_output_partitions = txs.len(); + + // execute the child operator + let now = Instant::now(); + let mut stream = input.execute(i).await?; + metrics.fetch_nanos.add_elapsed(now); + + let mut counter = 0; + let hashes_buf = &mut vec![]; + + loop { + // fetch the next batch + let now = Instant::now(); + let result = stream.next().await; + 
metrics.fetch_nanos.add_elapsed(now); + + if result.is_none() { + break; + } + let result: ArrowResult = result.unwrap(); + + match &partitioning { + Partitioning::RoundRobinBatch(_) => { + let now = Instant::now(); + let output_partition = counter % num_output_partitions; + let tx = txs.get_mut(&output_partition).unwrap(); + tx.send(Some(result)) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + metrics.send_nanos.add_elapsed(now); + } + Partitioning::Hash(exprs, _) => { + let now = Instant::now(); + let input_batch = result?; + let arrays = exprs + .iter() + .map(|expr| { + Ok(expr + .evaluate(&input_batch)? + .into_array(input_batch.num_rows())) + }) + .collect::>>()?; + hashes_buf.clear(); + hashes_buf.resize(arrays[0].len(), 0); + // Hash arrays and compute buckets based on number of partitions + let hashes = create_hashes(&arrays, &random_state, hashes_buf)?; + let mut indices = vec![vec![]; num_output_partitions]; + for (index, hash) in hashes.iter().enumerate() { + indices[(*hash % num_output_partitions as u64) as usize] + .push(index as u64) + } + metrics.repart_nanos.add_elapsed(now); + for (num_output_partition, partition_indices) in + indices.into_iter().enumerate() + { + let now = Instant::now(); + let indices = partition_indices.into(); + // Produce batches based on indices + let columns = input_batch + .columns() + .iter() + .map(|c| { + take(c.as_ref(), &indices, None).map_err(|e| { + DataFusionError::Execution(e.to_string()) + }) + }) + .collect::>>>()?; + let output_batch = + RecordBatch::try_new(input_batch.schema(), columns); + metrics.repart_nanos.add_elapsed(now); + let now = Instant::now(); + let tx = txs.get_mut(&num_output_partition).unwrap(); + tx.send(Some(output_batch)) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + metrics.send_nanos.add_elapsed(now); + } + } + other => { + // this should be unreachable as long as the validation logic + // in the constructor is kept up-to-date + return Err(DataFusionError::NotImplemented(format!( + "Unsupported repartitioning scheme {:?}", + other + ))); + } + } + counter += 1; + } + + Ok(()) + } + /// Waits for `input_task` which is consuming one of the inputs to /// complete. Upon each successful completion, sends a `None` to /// each of the output tx channels to signal one of the inputs is From 91af8203faa80e959e30a6350b1486c9ddc25247 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 14 Jun 2021 00:02:39 -0700 Subject: [PATCH 184/329] support table alias in join clause (#547) * support table alias in join clause * Update datafusion/src/sql/planner.rs Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- datafusion/src/sql/planner.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 7e7462ef390e0..e860bd74641dc 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -424,7 +424,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ctes: &mut HashMap, ) -> Result { match relation { - TableFactor::Table { name, .. } => { + TableFactor::Table { name, alias, .. 
} => { let table_name = name.to_string(); let cte = ctes.get(&table_name); match ( @@ -432,10 +432,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { self.schema_provider.get_table_provider(name.try_into()?), ) { (Some(cte_plan), _) => Ok(cte_plan.clone()), - (_, Some(provider)) => { - LogicalPlanBuilder::scan(&table_name, provider, None)?.build() - } - (_, None) => Err(DataFusionError::Plan(format!( + (_, Some(provider)) => LogicalPlanBuilder::scan( + // take alias into account to support `JOIN table1 as table2` + alias + .as_ref() + .map(|a| a.name.value.as_str()) + .unwrap_or(&table_name), + provider, + None, + )? + .build(), + (None, None) => Err(DataFusionError::Plan(format!( "Table or CTE with name '{}' not found", name ))), From fe810e980834db2582b530188823e308ed9f097c Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 14 Jun 2021 06:35:09 -0700 Subject: [PATCH 185/329] reuse code for now function expr creation (#548) --- datafusion/src/physical_plan/functions.rs | 332 +++++++++++----------- 1 file changed, 168 insertions(+), 164 deletions(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 49ca79a004960..1e423c367cd8f 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -512,39 +512,35 @@ macro_rules! invoke_if_unicode_expressions_feature_flag { }; } -/// Create a physical (function) expression. -/// This function errors when `args`' can't be coerced to a valid argument type of the function. -pub fn create_physical_expr( +/// Create a physical scalar function. +pub fn create_physical_fun( fun: &BuiltinScalarFunction, - args: &[Arc], - input_schema: &Schema, ctx_state: &ExecutionContextState, -) -> Result> { - let fun_expr: ScalarFunctionImplementation = Arc::new(match fun { +) -> Result { + Ok(match fun { // math functions - BuiltinScalarFunction::Abs => math_expressions::abs, - BuiltinScalarFunction::Acos => math_expressions::acos, - BuiltinScalarFunction::Asin => math_expressions::asin, - BuiltinScalarFunction::Atan => math_expressions::atan, - BuiltinScalarFunction::Ceil => math_expressions::ceil, - BuiltinScalarFunction::Cos => math_expressions::cos, - BuiltinScalarFunction::Exp => math_expressions::exp, - BuiltinScalarFunction::Floor => math_expressions::floor, - BuiltinScalarFunction::Log => math_expressions::log10, - BuiltinScalarFunction::Ln => math_expressions::ln, - BuiltinScalarFunction::Log10 => math_expressions::log10, - BuiltinScalarFunction::Log2 => math_expressions::log2, - BuiltinScalarFunction::Random => math_expressions::random, - BuiltinScalarFunction::Round => math_expressions::round, - BuiltinScalarFunction::Signum => math_expressions::signum, - BuiltinScalarFunction::Sin => math_expressions::sin, - BuiltinScalarFunction::Sqrt => math_expressions::sqrt, - BuiltinScalarFunction::Tan => math_expressions::tan, - BuiltinScalarFunction::Trunc => math_expressions::trunc, - + BuiltinScalarFunction::Abs => Arc::new(math_expressions::abs), + BuiltinScalarFunction::Acos => Arc::new(math_expressions::acos), + BuiltinScalarFunction::Asin => Arc::new(math_expressions::asin), + BuiltinScalarFunction::Atan => Arc::new(math_expressions::atan), + BuiltinScalarFunction::Ceil => Arc::new(math_expressions::ceil), + BuiltinScalarFunction::Cos => Arc::new(math_expressions::cos), + BuiltinScalarFunction::Exp => Arc::new(math_expressions::exp), + BuiltinScalarFunction::Floor => Arc::new(math_expressions::floor), + BuiltinScalarFunction::Log => 
Arc::new(math_expressions::log10), + BuiltinScalarFunction::Ln => Arc::new(math_expressions::ln), + BuiltinScalarFunction::Log10 => Arc::new(math_expressions::log10), + BuiltinScalarFunction::Log2 => Arc::new(math_expressions::log2), + BuiltinScalarFunction::Random => Arc::new(math_expressions::random), + BuiltinScalarFunction::Round => Arc::new(math_expressions::round), + BuiltinScalarFunction::Signum => Arc::new(math_expressions::signum), + BuiltinScalarFunction::Sin => Arc::new(math_expressions::sin), + BuiltinScalarFunction::Sqrt => Arc::new(math_expressions::sqrt), + BuiltinScalarFunction::Tan => Arc::new(math_expressions::tan), + BuiltinScalarFunction::Trunc => Arc::new(math_expressions::trunc), // string functions - BuiltinScalarFunction::Array => array_expressions::array, - BuiltinScalarFunction::Ascii => |args| match args[0].data_type() { + BuiltinScalarFunction::Array => Arc::new(array_expressions::array), + BuiltinScalarFunction::Ascii => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::ascii::)(args) } @@ -555,8 +551,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function ascii", other, ))), - }, - BuiltinScalarFunction::BitLength => |args| match &args[0] { + }), + BuiltinScalarFunction::BitLength => Arc::new(|args| match &args[0] { ColumnarValue::Array(v) => Ok(ColumnarValue::Array(bit_length(v.as_ref())?)), ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( @@ -567,8 +563,8 @@ pub fn create_physical_expr( )), _ => unreachable!(), }, - }, - BuiltinScalarFunction::Btrim => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Btrim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::btrim::)(args) } @@ -579,55 +575,47 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function btrim", other, ))), - }, - BuiltinScalarFunction::CharacterLength => |args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - character_length, - Int32Type, - "character_length" - ); - make_scalar_function(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_unicode_expressions_feature_flag!( - character_length, - Int64Type, - "character_length" - ); - make_scalar_function(func)(args) - } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {:?} for function character_length", - other, - ))), - }, + }), + BuiltinScalarFunction::CharacterLength => { + Arc::new(|args| match args[0].data_type() { + DataType::Utf8 => { + let func = invoke_if_unicode_expressions_feature_flag!( + character_length, + Int32Type, + "character_length" + ); + make_scalar_function(func)(args) + } + DataType::LargeUtf8 => { + let func = invoke_if_unicode_expressions_feature_flag!( + character_length, + Int64Type, + "character_length" + ); + make_scalar_function(func)(args) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function character_length", + other, + ))), + }) + } BuiltinScalarFunction::Chr => { - |args| make_scalar_function(string_expressions::chr)(args) + Arc::new(|args| make_scalar_function(string_expressions::chr)(args)) } - BuiltinScalarFunction::Concat => string_expressions::concat, + BuiltinScalarFunction::Concat => Arc::new(string_expressions::concat), BuiltinScalarFunction::ConcatWithSeparator => { - |args| make_scalar_function(string_expressions::concat_ws)(args) + 
Arc::new(|args| make_scalar_function(string_expressions::concat_ws)(args)) } - BuiltinScalarFunction::DatePart => datetime_expressions::date_part, - BuiltinScalarFunction::DateTrunc => datetime_expressions::date_trunc, + BuiltinScalarFunction::DatePart => Arc::new(datetime_expressions::date_part), + BuiltinScalarFunction::DateTrunc => Arc::new(datetime_expressions::date_trunc), BuiltinScalarFunction::Now => { // bind value for now at plan time - let fun_expr = Arc::new(datetime_expressions::make_now( + Arc::new(datetime_expressions::make_now( ctx_state.execution_props.query_execution_start_time, - )); - - // TODO refactor code to not return here, but instead fall through below - let args = vec![]; - let arg_types = vec![]; // has no args - return Ok(Arc::new(ScalarFunctionExpr::new( - &format!("{}", fun), - fun_expr, - args, - &return_type(fun, &arg_types)?, - ))); + )) } - BuiltinScalarFunction::InitCap => |args| match args[0].data_type() { + BuiltinScalarFunction::InitCap => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::initcap::)(args) } @@ -638,8 +626,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function initcap", other, ))), - }, - BuiltinScalarFunction::Left => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Left => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(left, i32, "left"); make_scalar_function(func)(args) @@ -652,9 +640,9 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function left", other, ))), - }, - BuiltinScalarFunction::Lower => string_expressions::lower, - BuiltinScalarFunction::Lpad => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Lower => Arc::new(string_expressions::lower), + BuiltinScalarFunction::Lpad => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(lpad, i32, "lpad"); make_scalar_function(func)(args) @@ -667,8 +655,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function lpad", other, ))), - }, - BuiltinScalarFunction::Ltrim => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Ltrim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::ltrim::)(args) } @@ -679,12 +667,12 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function ltrim", other, ))), - }, + }), BuiltinScalarFunction::MD5 => { - invoke_if_crypto_expressions_feature_flag!(md5, "md5") + Arc::new(invoke_if_crypto_expressions_feature_flag!(md5, "md5")) } - BuiltinScalarFunction::NullIf => nullif_func, - BuiltinScalarFunction::OctetLength => |args| match &args[0] { + BuiltinScalarFunction::NullIf => Arc::new(nullif_func), + BuiltinScalarFunction::OctetLength => Arc::new(|args| match &args[0] { ColumnarValue::Array(v) => Ok(ColumnarValue::Array(length(v.as_ref())?)), ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( @@ -695,52 +683,56 @@ pub fn create_physical_expr( )), _ => unreachable!(), }, - }, - BuiltinScalarFunction::RegexpMatch => |args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_regex_expressions_feature_flag!( - regexp_match, - i32, - "regexp_match" - ); - make_scalar_function(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_regex_expressions_feature_flag!( - regexp_match, - i64, - "regexp_match" - ); - 
make_scalar_function(func)(args) - } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {:?} for function regexp_match", - other - ))), - }, - BuiltinScalarFunction::RegexpReplace => |args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_if_regex_expressions_feature_flag!( - regexp_replace, - i32, - "regexp_replace" - ); - make_scalar_function(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_if_regex_expressions_feature_flag!( - regexp_replace, - i64, - "regexp_replace" - ); - make_scalar_function(func)(args) - } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {:?} for function regexp_replace", - other, - ))), - }, - BuiltinScalarFunction::Repeat => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::RegexpMatch => { + Arc::new(|args| match args[0].data_type() { + DataType::Utf8 => { + let func = invoke_if_regex_expressions_feature_flag!( + regexp_match, + i32, + "regexp_match" + ); + make_scalar_function(func)(args) + } + DataType::LargeUtf8 => { + let func = invoke_if_regex_expressions_feature_flag!( + regexp_match, + i64, + "regexp_match" + ); + make_scalar_function(func)(args) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function regexp_match", + other + ))), + }) + } + BuiltinScalarFunction::RegexpReplace => { + Arc::new(|args| match args[0].data_type() { + DataType::Utf8 => { + let func = invoke_if_regex_expressions_feature_flag!( + regexp_replace, + i32, + "regexp_replace" + ); + make_scalar_function(func)(args) + } + DataType::LargeUtf8 => { + let func = invoke_if_regex_expressions_feature_flag!( + regexp_replace, + i64, + "regexp_replace" + ); + make_scalar_function(func)(args) + } + other => Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function regexp_replace", + other, + ))), + }) + } + BuiltinScalarFunction::Repeat => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::repeat::)(args) } @@ -751,8 +743,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function repeat", other, ))), - }, - BuiltinScalarFunction::Replace => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Replace => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::replace::)(args) } @@ -763,8 +755,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function replace", other, ))), - }, - BuiltinScalarFunction::Reverse => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Reverse => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(reverse, i32, "reverse"); @@ -779,8 +771,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function reverse", other, ))), - }, - BuiltinScalarFunction::Right => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Right => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(right, i32, "right"); @@ -795,8 +787,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function right", other, ))), - }, - BuiltinScalarFunction::Rpad => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Rpad => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(rpad, i32, "rpad"); make_scalar_function(func)(args) @@ -809,8 
+801,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function rpad", other, ))), - }, - BuiltinScalarFunction::Rtrim => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Rtrim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::rtrim::)(args) } @@ -821,20 +813,20 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function rtrim", other, ))), - }, + }), BuiltinScalarFunction::SHA224 => { - invoke_if_crypto_expressions_feature_flag!(sha224, "sha224") + Arc::new(invoke_if_crypto_expressions_feature_flag!(sha224, "sha224")) } BuiltinScalarFunction::SHA256 => { - invoke_if_crypto_expressions_feature_flag!(sha256, "sha256") + Arc::new(invoke_if_crypto_expressions_feature_flag!(sha256, "sha256")) } BuiltinScalarFunction::SHA384 => { - invoke_if_crypto_expressions_feature_flag!(sha384, "sha384") + Arc::new(invoke_if_crypto_expressions_feature_flag!(sha384, "sha384")) } BuiltinScalarFunction::SHA512 => { - invoke_if_crypto_expressions_feature_flag!(sha512, "sha512") + Arc::new(invoke_if_crypto_expressions_feature_flag!(sha512, "sha512")) } - BuiltinScalarFunction::SplitPart => |args| match args[0].data_type() { + BuiltinScalarFunction::SplitPart => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::split_part::)(args) } @@ -845,8 +837,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function split_part", other, ))), - }, - BuiltinScalarFunction::StartsWith => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::StartsWith => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::starts_with::)(args) } @@ -857,8 +849,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function starts_with", other, ))), - }, - BuiltinScalarFunction::Strpos => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Strpos => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( strpos, Int32Type, "strpos" @@ -875,8 +867,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function strpos", other, ))), - }, - BuiltinScalarFunction::Substr => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::Substr => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!(substr, i32, "substr"); @@ -891,8 +883,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function substr", other, ))), - }, - BuiltinScalarFunction::ToHex => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::ToHex => Arc::new(|args| match args[0].data_type() { DataType::Int32 => { make_scalar_function(string_expressions::to_hex::)(args) } @@ -903,9 +895,11 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function to_hex", other, ))), - }, - BuiltinScalarFunction::ToTimestamp => datetime_expressions::to_timestamp, - BuiltinScalarFunction::Translate => |args| match args[0].data_type() { + }), + BuiltinScalarFunction::ToTimestamp => { + Arc::new(datetime_expressions::to_timestamp) + } + BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( translate, @@ -926,8 +920,8 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function translate", other, ))), - }, - BuiltinScalarFunction::Trim => |args| match 
args[0].data_type() { + }), + BuiltinScalarFunction::Trim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { make_scalar_function(string_expressions::btrim::)(args) } @@ -938,10 +932,20 @@ pub fn create_physical_expr( "Unsupported data type {:?} for function trim", other, ))), - }, - BuiltinScalarFunction::Upper => string_expressions::upper, - }); - // coerce + }), + BuiltinScalarFunction::Upper => Arc::new(string_expressions::upper), + }) +} + +/// Create a physical (function) expression. +/// This function errors when `args`' can't be coerced to a valid argument type of the function. +pub fn create_physical_expr( + fun: &BuiltinScalarFunction, + args: &[Arc], + input_schema: &Schema, + ctx_state: &ExecutionContextState, +) -> Result> { + let fun_expr = create_physical_fun(fun, ctx_state)?; let args = coerce(args, input_schema, &signature(fun))?; let arg_types = args From 396a50b452aefa369c0e3cdb7e4db8f471132d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 14 Jun 2021 17:51:54 +0200 Subject: [PATCH 186/329] Filter push down for Union (#559) * Push down filter through UNION * Fix comments * Fmt --- datafusion/src/optimizer/filter_push_down.rs | 22 ++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 85d1f812f41ac..dc4d5e993a380 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -322,6 +322,10 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { // sort is filter-commutable push_down(&state, plan) } + LogicalPlan::Union { .. } => { + // union all is filter-commutable + push_down(&state, plan) + } LogicalPlan::Limit { input, .. } => { // limit is _not_ filter-commutable => collect all columns from its input let used_columns = input @@ -766,6 +770,24 @@ mod tests { Ok(()) } + #[test] + fn union_all() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(&table_scan) + .union(LogicalPlanBuilder::from(&table_scan).build()?)? + .filter(col("a").eq(lit(1i64)))? + .build()?; + // filter appears below Union + let expected = "\ + Union\ + \n Filter: #a Eq Int64(1)\ + \n TableScan: test projection=None\ + \n Filter: #a Eq Int64(1)\ + \n TableScan: test projection=None"; + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + /// verifies that filters with the same columns are correctly placed #[test] fn filter_2_breaks_limits() -> Result<()> { From e46920a091473e9414e449770fce4c0bc57c3fa5 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 14 Jun 2021 08:52:18 -0700 Subject: [PATCH 187/329] hash float arrays using primitive usigned integer type (#556) Generate hash using u8 slices will be less efficient and breaks sepcialization in ahash. See https://github.com/tkaitchuck/aHash/issues/93 for more details. --- datafusion/src/physical_plan/hash_join.rs | 36 ++++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 1b0322b521a52..644d2d486c854 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -877,13 +877,19 @@ macro_rules! 
hash_array_float { if $multi_col { for (hash, value) in $hashes.iter_mut().zip(values.iter()) { *hash = combine_hashes( - $ty::get_hash(&value.to_le_bytes(), $random_state), + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), *hash, ); } } else { for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = $ty::get_hash(&value.to_le_bytes(), $random_state) + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ) } } } else { @@ -893,7 +899,10 @@ macro_rules! hash_array_float { { if !array.is_null(i) { *hash = combine_hashes( - $ty::get_hash(&value.to_le_bytes(), $random_state), + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), *hash, ); } @@ -903,7 +912,10 @@ macro_rules! hash_array_float { $hashes.iter_mut().zip(values.iter()).enumerate() { if !array.is_null(i) { - *hash = $ty::get_hash(&value.to_le_bytes(), $random_state); + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ); } } } @@ -1838,6 +1850,22 @@ mod tests { Ok(()) } + #[test] + fn create_hashes_for_float_arrays() -> Result<()> { + let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; f32_arr.len()]; + let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + Ok(()) + } + #[test] fn join_with_hash_collision() -> Result<()> { let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {}); From e3e7e293c4482af1475406bf4e922d5c99e7a792 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Jun 2021 15:49:48 -0400 Subject: [PATCH 188/329] Fix pruning on not equal predicate (#561) --- datafusion/src/physical_optimizer/pruning.rs | 38 +++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index a7e1fb00c230b..9e8d9fa778583 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -553,12 +553,14 @@ fn build_predicate_expression( let corrected_op = expr_builder.correct_operator(op); let statistics_expr = match corrected_op { Operator::NotEq => { - // column != literal => (min, max) = literal => min > literal || literal > max + // column != literal => (min, max) = literal => + // !(min != literal && max != literal) ==> + // min != literal || literal != max let min_column_expr = expr_builder.min_column_expr()?; let max_column_expr = expr_builder.max_column_expr()?; min_column_expr - .gt(expr_builder.scalar_expr().clone()) - .or(expr_builder.scalar_expr().clone().gt(max_column_expr)) + .not_eq(expr_builder.scalar_expr().clone()) + .or(expr_builder.scalar_expr().clone().not_eq(max_column_expr)) } Operator::Eq => { // column = literal => (min, max) = literal => min <= literal && literal <= max @@ -940,7 +942,7 @@ mod tests { #[test] fn row_group_predicate_not_eq() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "#c1_min Gt Int32(1) Or Int32(1) Gt #c1_max"; + let expected_expr = "#c1_min NotEq Int32(1) Or Int32(1) NotEq #c1_max"; // test column on the left let expr = col("c1").not_eq(lit(1)); @@ -1190,6 +1192,34 @@ 
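// A minimal, self-contained sketch (plain std, not DataFusion code) of the idea
// behind the hash_array_float change above: hashing the float's bytes as a
// &[u8] slice defeats the integer specialization in hashers such as ahash,
// while reinterpreting the IEEE-754 bits as an unsigned integer of the same
// width (u64::from_le_bytes(value.to_le_bytes()), i.e. value.to_bits()) keeps
// the fast path. The helper names below are illustrative only.
use std::hash::{BuildHasher, Hash, Hasher};

/// Hash an f64 by its bit pattern, preserving the hasher's integer fast path.
fn hash_f64_as_bits<S: BuildHasher>(value: f64, state: &S) -> u64 {
    let bits = u64::from_le_bytes(value.to_le_bytes()); // same bits as value.to_bits()
    let mut hasher = state.build_hasher();
    bits.hash(&mut hasher);
    hasher.finish()
}

/// The variant the patch replaces: feeding the raw bytes to the hasher as a slice.
fn hash_f64_as_byte_slice<S: BuildHasher>(value: f64, state: &S) -> u64 {
    let mut hasher = state.build_hasher();
    value.to_le_bytes().as_ref().hash(&mut hasher);
    hasher.finish()
}

fn main() {
    let state = std::collections::hash_map::RandomState::new();
    // Both are deterministic for a given RandomState, but they hash different
    // representations of the same value.
    println!("{}", hash_f64_as_bits(444.7, &state));
    println!("{}", hash_f64_as_byte_slice(444.7, &state));
}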
mod tests { assert_eq!(result, expected); } + #[test] + fn prune_not_eq_data() { + let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); + + // Prune using s2 != 'M' + let expr = col("s1").not_eq(lit("M")); + + let statistics = TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![Some("A"), Some("A"), Some("N"), Some("M"), None, Some("A")], // min + vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None], // max + ), + ); + + // s1 [A, Z] ==> might have values that pass predicate + // s1 [A, L] ==> all rows pass the predicate + // s1 [N, Z] ==> all rows pass the predicate + // s1 [M, M] ==> all rows do not pass the predicate + // No stats for s2 ==> some rows could pass + // s2 [3, None] (null max) ==> some rows could pass + + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + let expected = vec![true, true, true, false, true, true]; + assert_eq!(result, expected); + } + /// Creates setup for boolean chunk pruning /// /// For predicate "b1" (boolean expr) From 51e5445fa51cef4f72df5db7804906a729fc5aa6 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 16 Jun 2021 18:25:35 +0800 Subject: [PATCH 189/329] add window function implementation with order_by clause (#520) --- datafusion/src/execution/context.rs | 55 +++- .../physical_plan/expressions/nth_value.rs | 137 ++++------ .../physical_plan/expressions/row_number.rs | 89 +------ .../src/physical_plan/hash_aggregate.rs | 4 +- datafusion/src/physical_plan/mod.rs | 130 +++++----- datafusion/src/physical_plan/planner.rs | 15 +- .../src/physical_plan/window_functions.rs | 14 +- datafusion/src/physical_plan/windows.rs | 244 +++++++++--------- datafusion/src/scalar.rs | 2 +- datafusion/src/sql/planner.rs | 2 +- datafusion/tests/sql.rs | 147 +++++++++-- .../simple_window_ordered_aggregation.sql | 26 ++ integration-tests/test_psql_parity.py | 2 +- 13 files changed, 476 insertions(+), 391 deletions(-) create mode 100644 integration-tests/sqls/simple_window_ordered_aggregation.sql diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index f09d7f4f90c93..1835244979402 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1273,7 +1273,17 @@ mod tests { #[tokio::test] async fn window() -> Result<()> { let results = execute( - "SELECT c1, c2, SUM(c2) OVER (), COUNT(c2) OVER (), MAX(c2) OVER (), MIN(c2) OVER (), AVG(c2) OVER () FROM test ORDER BY c1, c2 LIMIT 5", + "SELECT \ + c1, \ + c2, \ + SUM(c2) OVER (), \ + COUNT(c2) OVER (), \ + MAX(c2) OVER (), \ + MIN(c2) OVER (), \ + AVG(c2) OVER () \ + FROM test \ + ORDER BY c1, c2 \ + LIMIT 5", 4, ) .await?; @@ -1299,6 +1309,49 @@ mod tests { Ok(()) } + #[tokio::test] + async fn window_order_by() -> Result<()> { + let results = execute( + "SELECT \ + c1, \ + c2, \ + ROW_NUMBER() OVER (ORDER BY c1, c2), \ + FIRST_VALUE(c2) OVER (ORDER BY c1, c2), \ + LAST_VALUE(c2) OVER (ORDER BY c1, c2), \ + NTH_VALUE(c2, 2) OVER (ORDER BY c1, c2), \ + SUM(c2) OVER (ORDER BY c1, c2), \ + COUNT(c2) OVER (ORDER BY c1, c2), \ + MAX(c2) OVER (ORDER BY c1, c2), \ + MIN(c2) OVER (ORDER BY c1, c2), \ + AVG(c2) OVER (ORDER BY c1, c2) \ + FROM test \ + ORDER BY c1, c2 \ + LIMIT 5", + 4, + ) + .await?; + // result in one batch, although e.g. 
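// A standalone sketch (not the DataFusion implementation) of the corrected
// row-group pruning rule for `col != literal` from the pruning.rs hunk above:
// a container may be skipped only when min == literal && max == literal,
// i.e. it must be kept when min != literal || literal != max. The statistics
// and helper names here are illustrative; the real PruningPredicate evaluates
// null statistics through the expression machinery rather than a hand-written match.
fn keep_container_for_not_eq(min: Option<&str>, max: Option<&str>, literal: &str) -> bool {
    match (min, max) {
        // Missing statistics: nothing can be proven, keep the container.
        (None, _) | (_, None) => true,
        (Some(min), Some(max)) => min != literal || literal != max,
    }
}

fn main() {
    // Mirrors the expectations of the prune_not_eq_data test for s1 != 'M'.
    let stats = [
        (Some("A"), Some("Z")), // might contain matching rows -> keep
        (Some("A"), Some("L")), // all rows pass the predicate -> keep
        (Some("N"), Some("Z")), // all rows pass the predicate -> keep
        (Some("M"), Some("M")), // only 'M' values              -> prune
        (None, None),           // no statistics                -> keep
        (Some("A"), None),      // null max                     -> keep
    ];
    let result: Vec<bool> = stats
        .iter()
        .map(|(min, max)| keep_container_for_not_eq(*min, *max, "M"))
        .collect();
    assert_eq!(result, vec![true, true, true, false, true, true]);
}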
having 2 batches do not change + // result semantics, having a len=1 assertion upfront keeps surprises + // at bay + assert_eq!(results.len(), 1); + + let expected = vec![ + "+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", + "| c1 | c2 | ROW_NUMBER() | FIRST_VALUE(c2) | LAST_VALUE(c2) | NTH_VALUE(c2,Int64(2)) | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", + "+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", + "| 0 | 1 | 1 | 1 | 10 | 2 | 1 | 1 | 1 | 1 | 1 |", + "| 0 | 2 | 2 | 1 | 10 | 2 | 3 | 2 | 2 | 1 | 1.5 |", + "| 0 | 3 | 3 | 1 | 10 | 2 | 6 | 3 | 3 | 1 | 2 |", + "| 0 | 4 | 4 | 1 | 10 | 2 | 10 | 4 | 4 | 1 | 2.5 |", + "| 0 | 5 | 5 | 1 | 10 | 2 | 15 | 5 | 5 | 1 | 3 |", + "+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", + ]; + + // window function shall respect ordering + assert_batches_eq!(expected, &results); + Ok(()) + } + #[tokio::test] async fn aggregate() -> Result<()> { let results = execute("SELECT SUM(c1), SUM(c2) FROM test", 4).await?; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index fb0e79f7ad3c6..98083fa26eaa9 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -18,13 +18,11 @@ //! Defines physical expressions that can evaluated at runtime during query execution use crate::error::{DataFusionError, Result}; -use crate::physical_plan::{ - window_functions::BuiltInWindowFunctionExpr, PhysicalExpr, WindowAccumulator, -}; +use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use crate::scalar::ScalarValue; +use arrow::array::{new_empty_array, ArrayRef}; use arrow::datatypes::{DataType, Field}; use std::any::Any; -use std::convert::TryFrom; use std::sync::Arc; /// nth_value kind @@ -113,54 +111,32 @@ impl BuiltInWindowFunctionExpr for NthValue { &self.name } - fn create_accumulator(&self) -> Result> { - Ok(Box::new(NthValueAccumulator::try_new( - self.kind, - self.data_type.clone(), - )?)) - } -} - -#[derive(Debug)] -struct NthValueAccumulator { - kind: NthValueKind, - offset: u32, - value: ScalarValue, -} - -impl NthValueAccumulator { - /// new count accumulator - pub fn try_new(kind: NthValueKind, data_type: DataType) -> Result { - Ok(Self { - kind, - offset: 0, - // null value of that data_type by default - value: ScalarValue::try_from(&data_type)?, - }) - } -} - -impl WindowAccumulator for NthValueAccumulator { - fn scan(&mut self, values: &[ScalarValue]) -> Result> { - self.offset += 1; - match self.kind { - NthValueKind::Last => { - self.value = values[0].clone(); - } - NthValueKind::First if self.offset == 1 => { - self.value = values[0].clone(); - } - NthValueKind::Nth(n) if self.offset == n => { - self.value = values[0].clone(); - } - _ => {} + fn evaluate(&self, num_rows: usize, values: &[ArrayRef]) -> Result { + if values.is_empty() { + return Err(DataFusionError::Execution(format!( + "No arguments supplied to {}", + self.name() + ))); } - - Ok(None) - } - - fn evaluate(&self) -> Result> { - Ok(Some(self.value.clone())) + let value = &values[0]; + if value.len() != num_rows { + return Err(DataFusionError::Execution(format!( + "Invalid data supplied to {}, expect {} rows, got {} rows", + self.name(), + 
num_rows, + value.len() + ))); + } + if num_rows == 0 { + return Ok(new_empty_array(value.data_type())); + } + let index: usize = match self.kind { + NthValueKind::First => 0, + NthValueKind::Last => (num_rows as usize) - 1, + NthValueKind::Nth(n) => (n as usize) - 1, + }; + let value = ScalarValue::try_from_array(value, index)?; + Ok(value.to_array_of_size(num_rows)) } } @@ -172,68 +148,47 @@ mod tests { use arrow::record_batch::RecordBatch; use arrow::{array::*, datatypes::*}; - fn test_i32_result(expr: Arc, expected: i32) -> Result<()> { + fn test_i32_result(expr: NthValue, expected: Vec) -> Result<()> { let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, -2, 3, -4, 5, -6, 7, 8])); + let values = vec![arr]; let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); - let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; - - let mut acc = expr.create_accumulator()?; - let expr = expr.expressions(); - let values = expr - .iter() - .map(|e| e.evaluate(&batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - let result = acc.scan_batch(batch.num_rows(), &values)?; - assert_eq!(false, result.is_some()); - let result = acc.evaluate()?; - assert_eq!(Some(ScalarValue::Int32(Some(expected))), result); + let batch = RecordBatch::try_new(Arc::new(schema), values.clone())?; + let result = expr.evaluate(batch.num_rows(), &values)?; + let result = result.as_any().downcast_ref::().unwrap(); + let result = result.values(); + assert_eq!(expected, result); Ok(()) } #[test] fn first_value() -> Result<()> { - let first_value = Arc::new(NthValue::first_value( - "first_value".to_owned(), - col("arr"), - DataType::Int32, - )); - test_i32_result(first_value, 1)?; + let first_value = + NthValue::first_value("first_value".to_owned(), col("arr"), DataType::Int32); + test_i32_result(first_value, vec![1; 8])?; Ok(()) } #[test] fn last_value() -> Result<()> { - let last_value = Arc::new(NthValue::last_value( - "last_value".to_owned(), - col("arr"), - DataType::Int32, - )); - test_i32_result(last_value, 8)?; + let last_value = + NthValue::last_value("last_value".to_owned(), col("arr"), DataType::Int32); + test_i32_result(last_value, vec![8; 8])?; Ok(()) } #[test] fn nth_value_1() -> Result<()> { - let nth_value = Arc::new(NthValue::nth_value( - "nth_value".to_owned(), - col("arr"), - DataType::Int32, - 1, - )?); - test_i32_result(nth_value, 1)?; + let nth_value = + NthValue::nth_value("nth_value".to_owned(), col("arr"), DataType::Int32, 1)?; + test_i32_result(nth_value, vec![1; 8])?; Ok(()) } #[test] fn nth_value_2() -> Result<()> { - let nth_value = Arc::new(NthValue::nth_value( - "nth_value".to_owned(), - col("arr"), - DataType::Int32, - 2, - )?); - test_i32_result(nth_value, -2)?; + let nth_value = + NthValue::nth_value("nth_value".to_owned(), col("arr"), DataType::Int32, 2)?; + test_i32_result(nth_value, vec![-2; 8])?; Ok(()) } } diff --git a/datafusion/src/physical_plan/expressions/row_number.rs b/datafusion/src/physical_plan/expressions/row_number.rs index eaf9b21cbc649..0444ee971f40d 100644 --- a/datafusion/src/physical_plan/expressions/row_number.rs +++ b/datafusion/src/physical_plan/expressions/row_number.rs @@ -18,10 +18,7 @@ //! 
Defines physical expression for `row_number` that can evaluated at runtime during query execution use crate::error::Result; -use crate::physical_plan::{ - window_functions::BuiltInWindowFunctionExpr, PhysicalExpr, WindowAccumulator, -}; -use crate::scalar::ScalarValue; +use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use arrow::array::{ArrayRef, UInt64Array}; use arrow::datatypes::{DataType, Field}; use std::any::Any; @@ -60,46 +57,10 @@ impl BuiltInWindowFunctionExpr for RowNumber { self.name.as_str() } - fn create_accumulator(&self) -> Result> { - Ok(Box::new(RowNumberAccumulator::new())) - } -} - -#[derive(Debug)] -struct RowNumberAccumulator { - row_number: u64, -} - -impl RowNumberAccumulator { - /// new row_number accumulator - pub fn new() -> Self { - // row number is 1 based - Self { row_number: 1 } - } -} - -impl WindowAccumulator for RowNumberAccumulator { - fn scan(&mut self, _values: &[ScalarValue]) -> Result> { - let result = Some(ScalarValue::UInt64(Some(self.row_number))); - self.row_number += 1; - Ok(result) - } - - fn scan_batch( - &mut self, - num_rows: usize, - _values: &[ArrayRef], - ) -> Result> { - let new_row_number = self.row_number + (num_rows as u64); - // TODO: probably would be nice to have a (optimized) kernel for this at some point to - // generate an array like this. - let result = UInt64Array::from_iter_values(self.row_number..new_row_number); - self.row_number = new_row_number; - Ok(Some(Arc::new(result))) - } - - fn evaluate(&self) -> Result> { - Ok(None) + fn evaluate(&self, num_rows: usize, _values: &[ArrayRef]) -> Result { + Ok(Arc::new(UInt64Array::from_iter_values( + (1..num_rows + 1).map(|i| i as u64), + ))) } } @@ -117,27 +78,11 @@ mod tests { ])); let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; - - let row_number = Arc::new(RowNumber::new("row_number".to_owned())); - - let mut acc = row_number.create_accumulator()?; - let expr = row_number.expressions(); - let values = expr - .iter() - .map(|e| e.evaluate(&batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - - let result = acc.scan_batch(batch.num_rows(), &values)?; - assert_eq!(true, result.is_some()); - - let result = result.unwrap(); + let row_number = RowNumber::new("row_number".to_owned()); + let result = row_number.evaluate(batch.num_rows(), &[])?; let result = result.as_any().downcast_ref::().unwrap(); let result = result.values(); assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); - - let result = acc.evaluate()?; - assert_eq!(false, result.is_some()); Ok(()) } @@ -148,27 +93,11 @@ mod tests { ])); let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; - - let row_number = Arc::new(RowNumber::new("row_number".to_owned())); - - let mut acc = row_number.create_accumulator()?; - let expr = row_number.expressions(); - let values = expr - .iter() - .map(|e| e.evaluate(&batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - - let result = acc.scan_batch(batch.num_rows(), &values)?; - assert_eq!(true, result.is_some()); - - let result = result.unwrap(); + let row_number = RowNumber::new("row_number".to_owned()); + let result = row_number.evaluate(batch.num_rows(), &[])?; let result = result.as_any().downcast_ref::().unwrap(); let result = result.values(); assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); - - let result = 
acc.evaluate()?; - assert_eq!(false, result.is_some()); Ok(()) } } diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 453d500e98bd8..f1611ebd7a775 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -500,7 +500,7 @@ fn dictionary_create_key_for_col( let dict_col = col.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_col.keys_array(); + let keys_col = dict_col.keys(); let values_index = keys_col.value(row).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", @@ -1083,7 +1083,7 @@ fn dictionary_create_group_by_value( let dict_col = col.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_col.keys_array(); + let keys_col = dict_col.keys(); let values_index = keys_col.value(row).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 2dcba802560a2..713956f00a9e3 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,17 +17,16 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. -use std::fmt; -use std::fmt::{Debug, Display}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; - +use self::{display::DisplayableExecutionPlan, merge::MergeExec}; use crate::execution::context::ExecutionContextState; use crate::logical_plan::LogicalPlan; +use crate::physical_plan::expressions::PhysicalSortExpr; use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, }; +use arrow::compute::kernels::partition::lexicographical_partition_ranges; +use arrow::compute::kernels::sort::{SortColumn, SortOptions}; use arrow::datatypes::{DataType, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -35,10 +34,13 @@ use arrow::{array::ArrayRef, datatypes::Field}; use async_trait::async_trait; pub use display::DisplayFormatType; use futures::stream::Stream; -use std::{any::Any, pin::Pin}; - -use self::{display::DisplayableExecutionPlan, merge::MergeExec}; use hashbrown::HashMap; +use std::fmt; +use std::fmt::{Debug, Display}; +use std::ops::Range; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::{any::Any, pin::Pin}; /// Trait for types that stream [arrow::record_batch::RecordBatch] pub trait RecordBatchStream: Stream> { @@ -465,15 +467,65 @@ pub trait WindowExpr: Send + Sync + Debug { "WindowExpr: default name" } - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; - /// expressions that are passed to the WindowAccumulator. /// Functions which take a single input argument, such as `sum`, return a single [`Expr`], /// others (e.g. `cov`) return many. fn expressions(&self) -> Vec>; + + /// evaluate the window function arguments against the batch and return + /// array ref, normally the resulting vec is a single element one. 
+ fn evaluate_args(&self, batch: &RecordBatch) -> Result> { + self.expressions() + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect() + } + + /// evaluate the window function values against the batch + fn evaluate(&self, batch: &RecordBatch) -> Result; + + /// evaluate the sort partition points + fn evaluate_sort_partition_points( + &self, + batch: &RecordBatch, + ) -> Result>> { + let sort_columns = self.sort_columns(batch)?; + if sort_columns.is_empty() { + Ok(vec![Range { + start: 0, + end: batch.num_rows(), + }]) + } else { + lexicographical_partition_ranges(&sort_columns) + .map_err(DataFusionError::ArrowError) + } + } + + /// expressions that's from the window function's partition by clause, empty if absent + fn partition_by(&self) -> &[Arc]; + + /// expressions that's from the window function's order by clause, empty if absent + fn order_by(&self) -> &[PhysicalSortExpr]; + + /// get sort columns that can be used for partitioning, empty if absent + fn sort_columns(&self, batch: &RecordBatch) -> Result> { + self.partition_by() + .iter() + .map(|expr| { + PhysicalSortExpr { + expr: expr.clone(), + options: SortOptions::default(), + } + .evaluate_to_sort_column(batch) + }) + .chain( + self.order_by() + .iter() + .map(|e| e.evaluate_to_sort_column(batch)), + ) + .collect() + } } /// An accumulator represents a stateful object that lives throughout the evaluation of multiple rows and @@ -528,58 +580,6 @@ pub trait Accumulator: Send + Sync + Debug { fn evaluate(&self) -> Result; } -/// A window accumulator represents a stateful object that lives throughout the evaluation of multiple -/// rows and generically accumulates values. -/// -/// An accumulator knows how to: -/// * update its state from inputs via `update` -/// * convert its internal state to a vector of scalar values -/// * update its state from multiple accumulators' states via `merge` -/// * compute the final value from its internal state via `evaluate` -pub trait WindowAccumulator: Send + Sync + Debug { - /// scans the accumulator's state from a vector of scalars, similar to Accumulator it also - /// optionally generates values. - fn scan(&mut self, values: &[ScalarValue]) -> Result>; - - /// scans the accumulator's state from a vector of arrays. - fn scan_batch( - &mut self, - num_rows: usize, - values: &[ArrayRef], - ) -> Result> { - if values.is_empty() { - return Ok(None); - }; - // transpose columnar to row based so that we can apply window - let result = (0..num_rows) - .map(|index| { - let v = values - .iter() - .map(|array| ScalarValue::try_from_array(array, index)) - .collect::>>()?; - self.scan(&v) - }) - .collect::>>>()? - .into_iter() - .collect::>>(); - - Ok(match result { - Some(arr) if num_rows == arr.len() => Some(ScalarValue::iter_to_array(arr)?), - None => None, - Some(arr) => { - return Err(DataFusionError::Internal(format!( - "expect scan batch to return {:?} rows, but got {:?}", - num_rows, - arr.len() - ))) - } - }) - } - - /// returns its value based on its current state. - fn evaluate(&self) -> Result>; -} - pub mod aggregates; pub mod array_expressions; pub mod coalesce_batches; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 31b3749dd3549..1121c28184bd7 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -143,7 +143,12 @@ impl DefaultPhysicalPlanner { LogicalPlan::Window { input, window_expr, .. 
} => { - // Initially need to perform the aggregate and then merge the partitions + if window_expr.is_empty() { + return Err(DataFusionError::Internal( + "Impossibly got empty window expression".to_owned(), + )); + } + let input_exec = self.create_initial_plan(input, ctx_state)?; let input_schema = input_exec.schema(); @@ -364,7 +369,7 @@ impl DefaultPhysicalPlanner { let left_expr = keys.iter().map(|x| col(&x.0)).collect(); let right_expr = keys.iter().map(|x| col(&x.1)).collect(); - // Use hash partition by defualt to parallelize hash joins + // Use hash partition by default to parallelize hash joins Ok(Arc::new(HashJoinExec::try_new( Arc::new(RepartitionExec::try_new( left, @@ -776,12 +781,6 @@ impl DefaultPhysicalPlanner { .to_owned(), )); } - if !order_by.is_empty() { - return Err(DataFusionError::NotImplemented( - "window expression with non-empty order by clause is not yet supported" - .to_owned(), - )); - } if window_frame.is_some() { return Err(DataFusionError::NotImplemented( "window expression with window frame definition is not yet supported" diff --git a/datafusion/src/physical_plan/window_functions.rs b/datafusion/src/physical_plan/window_functions.rs index e6afcaad8ad6b..4f56aa7d38262 100644 --- a/datafusion/src/physical_plan/window_functions.rs +++ b/datafusion/src/physical_plan/window_functions.rs @@ -20,11 +20,12 @@ //! //! see also https://www.postgresql.org/docs/current/functions-window.html +use crate::arrow::array::ArrayRef; use crate::arrow::datatypes::Field; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ aggregates, aggregates::AggregateFunction, functions::Signature, - type_coercion::data_types, PhysicalExpr, WindowAccumulator, + type_coercion::data_types, PhysicalExpr, }; use arrow::datatypes::DataType; use std::any::Any; @@ -207,7 +208,10 @@ pub(super) fn signature_for_built_in(fun: &BuiltInWindowFunction) -> Signature { } } -/// A window expression that is a built-in window function +/// A window expression that is a built-in window function. +/// +/// Note that unlike aggregation based window functions, built-in window functions normally ignore +/// window frame spec, with the exception of first_value, last_value, and nth_value. pub trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be /// downcast to a specific implementation. @@ -226,10 +230,8 @@ pub trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { "BuiltInWindowFunctionExpr: default name" } - /// the accumulator used to accumulate values from the expressions. - /// the accumulator expects the same number of arguments as `expressions` and must - /// return states with the same description as `state_fields` - fn create_accumulator(&self) -> Result>; + /// Evaluate the built-in window function against the number of rows and the arguments + fn evaluate(&self, num_rows: usize, values: &[ArrayRef]) -> Result; } #[cfg(test)] diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index f95dd446844d0..e5570971cf166 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -18,8 +18,7 @@ //! 
Execution plan for window functions use crate::error::{DataFusionError, Result}; - -use crate::logical_plan::window_frames::WindowFrame; +use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::physical_plan::{ aggregates, common, expressions::{Literal, NthValue, PhysicalSortExpr, RowNumber}, @@ -28,9 +27,9 @@ use crate::physical_plan::{ window_functions::BuiltInWindowFunctionExpr, window_functions::{BuiltInWindowFunction, WindowFunction}, Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, - RecordBatchStream, SendableRecordBatchStream, WindowAccumulator, WindowExpr, + RecordBatchStream, SendableRecordBatchStream, WindowExpr, }; -use crate::scalar::ScalarValue; +use arrow::compute::concat; use arrow::{ array::ArrayRef, datatypes::{Field, Schema, SchemaRef}, @@ -43,6 +42,7 @@ use futures::Future; use pin_project_lite::pin_project; use std::any::Any; use std::convert::TryInto; +use std::ops::Range; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; @@ -65,12 +65,9 @@ pub fn create_window_expr( fun: &WindowFunction, name: String, args: &[Arc], - // https://github.com/apache/arrow-datafusion/issues/299 - _partition_by: &[Arc], - // https://github.com/apache/arrow-datafusion/issues/360 - _order_by: &[PhysicalSortExpr], - // https://github.com/apache/arrow-datafusion/issues/361 - _window_frame: Option, + partition_by: &[Arc], + order_by: &[PhysicalSortExpr], + window_frame: Option, input_schema: &Schema, ) -> Result> { Ok(match fun { @@ -82,9 +79,15 @@ pub fn create_window_expr( input_schema, name, )?, + partition_by: partition_by.to_vec(), + order_by: order_by.to_vec(), + window_frame, }), WindowFunction::BuiltInWindowFunction(fun) => Arc::new(BuiltInWindowExpr { window: create_built_in_window_expr(fun, args, input_schema, name)?, + partition_by: partition_by.to_vec(), + order_by: order_by.to_vec(), + window_frame, }), }) } @@ -136,6 +139,9 @@ fn create_built_in_window_expr( #[derive(Debug)] pub struct BuiltInWindowExpr { window: Arc, + partition_by: Vec>, + order_by: Vec, + window_frame: Option, } impl WindowExpr for BuiltInWindowExpr { @@ -156,8 +162,20 @@ impl WindowExpr for BuiltInWindowExpr { self.window.expressions() } - fn create_accumulator(&self) -> Result> { - self.window.create_accumulator() + fn partition_by(&self) -> &[Arc] { + &self.partition_by + } + + fn order_by(&self) -> &[PhysicalSortExpr] { + &self.order_by + } + + fn evaluate(&self, batch: &RecordBatch) -> Result { + // FIXME, for now we assume all the rows belong to the same partition, which will not be the + // case when partition_by is supported, in which case we'll parallelize the calls. + // See https://github.com/apache/arrow-datafusion/issues/299 + let values = self.evaluate_args(batch)?; + self.window.evaluate(batch.num_rows(), &values) } } @@ -165,22 +183,51 @@ impl WindowExpr for BuiltInWindowExpr { #[derive(Debug)] pub struct AggregateWindowExpr { aggregate: Arc, + partition_by: Vec>, + order_by: Vec, + window_frame: Option, } -#[derive(Debug)] -struct AggregateWindowAccumulator { - accumulator: Box, -} +impl AggregateWindowExpr { + /// the aggregate window function operates based on window frame, and by default the mode is + /// "range". 
+ fn evaluation_mode(&self) -> WindowFrameUnits { + self.window_frame.unwrap_or_default().units + } -impl WindowAccumulator for AggregateWindowAccumulator { - fn scan(&mut self, values: &[ScalarValue]) -> Result> { - self.accumulator.update(values)?; - Ok(None) + /// create a new accumulator based on the underlying aggregation function + fn create_accumulator(&self) -> Result { + let accumulator = self.aggregate.create_accumulator()?; + Ok(AggregateWindowAccumulator { accumulator }) } - /// returns its value based on its current state. - fn evaluate(&self) -> Result> { - Ok(Some(self.accumulator.evaluate()?)) + /// peer based evaluation based on the fact that batch is pre-sorted given the sort columns + /// and then per partition point we'll evaluate the peer group (e.g. SUM or MAX gives the same + /// results for peers) and concatenate the results. + fn peer_based_evaluate(&self, batch: &RecordBatch) -> Result { + let sort_partition_points = self.evaluate_sort_partition_points(batch)?; + let mut window_accumulators = self.create_accumulator()?; + let values = self.evaluate_args(batch)?; + let results = sort_partition_points + .iter() + .map(|peer_range| window_accumulators.scan_peers(&values, peer_range)) + .collect::>>()?; + let results = results.iter().map(|i| i.as_ref()).collect::>(); + concat(&results).map_err(DataFusionError::ArrowError) + } + + fn group_based_evaluate(&self, _batch: &RecordBatch) -> Result { + Err(DataFusionError::NotImplemented(format!( + "Group based evaluation for {} is not yet implemented", + self.name() + ))) + } + + fn row_based_evaluate(&self, _batch: &RecordBatch) -> Result { + Err(DataFusionError::NotImplemented(format!( + "Row based evaluation for {} is not yet implemented", + self.name() + ))) } } @@ -202,9 +249,55 @@ impl WindowExpr for AggregateWindowExpr { self.aggregate.expressions() } - fn create_accumulator(&self) -> Result> { - let accumulator = self.aggregate.create_accumulator()?; - Ok(Box::new(AggregateWindowAccumulator { accumulator })) + fn partition_by(&self) -> &[Arc] { + &self.partition_by + } + + fn order_by(&self) -> &[PhysicalSortExpr] { + &self.order_by + } + + /// evaluate the window function values against the batch + fn evaluate(&self, batch: &RecordBatch) -> Result { + // FIXME, for now we assume all the rows belong to the same partition, which will not be the + // case when partition_by is supported, in which case we'll parallelize the calls. + // See https://github.com/apache/arrow-datafusion/issues/299 + match self.evaluation_mode() { + WindowFrameUnits::Range => self.peer_based_evaluate(batch), + WindowFrameUnits::Rows => self.row_based_evaluate(batch), + WindowFrameUnits::Groups => self.group_based_evaluate(batch), + } + } +} + +/// Aggregate window accumulator utilizes the accumulator from aggregation and do a accumulative sum +/// across evaluation arguments based on peer equivalences. +#[derive(Debug)] +struct AggregateWindowAccumulator { + accumulator: Box, +} + +impl AggregateWindowAccumulator { + /// scan one peer group of values (as arguments to window function) given by the value_range + /// and return evaluation result that are of the same number of rows. 
+ fn scan_peers( + &mut self, + values: &[ArrayRef], + value_range: &Range, + ) -> Result { + if value_range.is_empty() { + return Err(DataFusionError::Internal( + "Value range cannot be empty".to_owned(), + )); + } + let len = value_range.end - value_range.start; + let values = values + .iter() + .map(|v| v.slice(value_range.start, len)) + .collect::>(); + self.accumulator.update_batch(&values)?; + let value = self.accumulator.evaluate()?; + Ok(value.to_array_of_size(len)) } } @@ -329,106 +422,17 @@ pin_project! { } } -type WindowAccumulatorItem = Box; - -fn window_expressions( - window_expr: &[Arc], -) -> Result>>> { - Ok(window_expr - .iter() - .map(|expr| expr.expressions()) - .collect::>()) -} - -fn window_aggregate_batch( - batch: &RecordBatch, - window_accumulators: &mut [WindowAccumulatorItem], - expressions: &[Vec>], -) -> Result>> { - window_accumulators - .iter_mut() - .zip(expressions) - .map(|(window_acc, expr)| { - let values = &expr - .iter() - .map(|e| e.evaluate(batch)) - .map(|r| r.map(|v| v.into_array(batch.num_rows()))) - .collect::>>()?; - window_acc.scan_batch(batch.num_rows(), values) - }) - .collect::>>() -} - -/// returns a vector of ArrayRefs, where each entry corresponds to one window expr -fn finalize_window_aggregation( - window_accumulators: &[WindowAccumulatorItem], -) -> Result>> { - window_accumulators - .iter() - .map(|window_accumulator| window_accumulator.evaluate()) - .collect::>>() -} - -fn create_window_accumulators( - window_expr: &[Arc], -) -> Result> { - window_expr - .iter() - .map(|expr| expr.create_accumulator()) - .collect::>>() -} - /// Compute the window aggregate columns -/// -/// 1. get a list of window accumulators -/// 2. evaluate the args -/// 3. scan args with window functions -/// 4. concat with final aggregations -/// -/// FIXME so far this fn does not support: -/// 1. partition by -/// 2. order by -/// 3. window frame -/// -/// which will require further work: -/// 1. inter-partition order by using vec partition-point (https://github.com/apache/arrow-datafusion/issues/360) -/// 2. inter-partition parallelism using one-shot channel (https://github.com/apache/arrow-datafusion/issues/299) -/// 3. convert aggregation based window functions to be self-contain so that: (https://github.com/apache/arrow-datafusion/issues/361) -/// a. some can be grow-only window-accumulating -/// b. some can be grow-and-shrink window-accumulating -/// c. some can be based on segment tree fn compute_window_aggregates( window_expr: Vec>, batch: &RecordBatch, ) -> Result> { - let mut window_accumulators = create_window_accumulators(&window_expr)?; - let expressions = Arc::new(window_expressions(&window_expr)?); - let num_rows = batch.num_rows(); - let window_aggregates = - window_aggregate_batch(batch, &mut window_accumulators, &expressions)?; - let final_aggregates = finalize_window_aggregation(&window_accumulators)?; - - // both must equal to window_expr.len() - if window_aggregates.len() != final_aggregates.len() { - return Err(DataFusionError::Internal( - "Impossibly got len mismatch".to_owned(), - )); - } - - window_aggregates + // FIXME, for now we assume all the rows belong to the same partition, which will not be the + // case when partition_by is supported, in which case we'll parallelize the calls. 
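// Standalone sketch (plain Vecs instead of Arrow arrays) of the peer-based
// evaluation used by AggregateWindowExpr above: the pre-sorted input is split
// into peer groups (rows with equal ORDER BY keys), the accumulator is advanced
// once per group, and each group's result is broadcast to all of its rows.
// peer_ranges below is a hand-rolled stand-in for arrow's
// lexicographical_partition_ranges over the evaluated sort columns.
use std::ops::Range;

fn peer_ranges<T: PartialEq>(sort_keys: &[T]) -> Vec<Range<usize>> {
    let mut ranges = Vec::new();
    let mut start = 0;
    for i in 1..=sort_keys.len() {
        if i == sort_keys.len() || sort_keys[i] != sort_keys[start] {
            ranges.push(start..i);
            start = i;
        }
    }
    ranges
}

/// SUM(value) OVER (ORDER BY sort_key): a running sum that stays constant
/// within each peer group, mirroring scan_peers followed by concat.
fn cumulative_sum_over(sort_keys: &[i32], values: &[i64]) -> Vec<i64> {
    let mut out = Vec::with_capacity(values.len());
    let mut running = 0i64; // plays the role of the accumulator
    for range in peer_ranges(sort_keys) {
        running += values[range.clone()].iter().sum::<i64>(); // update_batch
        out.extend(std::iter::repeat(running).take(range.len())); // to_array_of_size
    }
    out
}

fn main() {
    // keys are already sorted; rows 1 and 2 are peers (equal key)
    let keys = [1, 2, 2, 3];
    let vals = [10, 20, 30, 40];
    assert_eq!(cumulative_sum_over(&keys, &vals), vec![10, 60, 60, 100]);
}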
+ // See https://github.com/apache/arrow-datafusion/issues/299 + window_expr .iter() - .zip(final_aggregates) - .map(|(wa, fa)| { - Ok(match (wa, fa) { - (None, Some(fa)) => fa.to_array_of_size(num_rows), - (Some(wa), None) if wa.len() == num_rows => wa.clone(), - _ => { - return Err(DataFusionError::Execution( - "Invalid window function behavior".to_owned(), - )) - } - }) - }) + .map(|window_expr| window_expr.evaluate(batch)) .collect() } diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index ac7deeed22c74..933bb8cebcb1c 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -771,7 +771,7 @@ impl ScalarValue { let dict_array = array.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary - let keys_col = dict_array.keys_array(); + let keys_col = dict_array.keys(); let values_index = keys_col.value(index).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( "Can not convert index to usize in dictionary of type creating group by value {:?}", diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index e860bd74641dc..4c1d8610dfdd9 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -703,7 +703,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut plan = input; let mut groups = group_window_expr_by_sort_keys(&window_exprs)?; // sort by sort_key len descending, so that more deeply sorted plans gets nested further - // down as children; to further minic the behavior of PostgreSQL, we want stable sort + // down as children; to further mimic the behavior of PostgreSQL, we want stable sort // and a reverse so that tieing sort keys are reversed in order; note that by this rule // if there's an empty over, it'll be at the top level groups.sort_by(|(key_a, _), (key_b, _)| key_a.len().cmp(&key_b.len())); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index d9d77648c7427..21da793b55385 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -802,25 +802,142 @@ async fn csv_query_window_with_empty_over() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "select \ - c2, \ - sum(c3) over (), \ - avg(c3) over (), \ - count(c3) over (), \ - max(c3) over (), \ - min(c3) over (), \ - first_value(c3) over (), \ - last_value(c3) over (), \ - nth_value(c3, 2) over () + c9, \ + count(c5) over (), \ + max(c5) over (), \ + min(c5) over (), \ + first_value(c5) over (), \ + last_value(c5) over (), \ + nth_value(c5, 2) over () \ from aggregate_test_100 \ - order by c2 + order by c9 \ limit 5"; let actual = execute(&mut ctx, sql).await; let expected = vec![ - vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], - vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], - vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], - vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], - vec!["1", "781", "7.81", "100", "125", "-117", "1", "30", "-40"], + vec![ + "28774375", + "100", + "2143473091", + "-2141999138", + "2033001162", + "61035129", + "706441268", + ], + vec![ + "63044568", + "100", + "2143473091", + "-2141999138", + "2033001162", + "61035129", + "706441268", + ], + vec![ + "141047417", + "100", + "2143473091", + "-2141999138", + "2033001162", + "61035129", + "706441268", + ], + vec![ + "141680161", + "100", + "2143473091", + "-2141999138", + "2033001162", + "61035129", + "706441268", + ], + vec![ + "145294611", + "100", + "2143473091", + "-2141999138", + 
"2033001162", + "61035129", + "706441268", + ], + ]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn csv_query_window_with_order_by() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_csv(&mut ctx)?; + let sql = "select \ + c9, \ + sum(c5) over (order by c9), \ + avg(c5) over (order by c9), \ + count(c5) over (order by c9), \ + max(c5) over (order by c9), \ + min(c5) over (order by c9), \ + first_value(c5) over (order by c9), \ + last_value(c5) over (order by c9), \ + nth_value(c5, 2) over (order by c9) \ + from aggregate_test_100 \ + order by c9 \ + limit 5"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec![ + "28774375", + "61035129", + "61035129", + "1", + "61035129", + "61035129", + "61035129", + "2025611582", + "-108973366", + ], + vec![ + "63044568", + "-47938237", + "-23969118.5", + "2", + "61035129", + "-108973366", + "61035129", + "2025611582", + "-108973366", + ], + vec![ + "141047417", + "575165281", + "191721760.33333334", + "3", + "623103518", + "-108973366", + "61035129", + "2025611582", + "-108973366", + ], + vec![ + "141680161", + "-1352462829", + "-338115707.25", + "4", + "623103518", + "-1927628110", + "61035129", + "2025611582", + "-108973366", + ], + vec![ + "145294611", + "-3251637940", + "-650327588", + "5", + "623103518", + "-1927628110", + "61035129", + "2025611582", + "-108973366", + ], ]; assert_eq!(expected, actual); Ok(()) diff --git a/integration-tests/sqls/simple_window_ordered_aggregation.sql b/integration-tests/sqls/simple_window_ordered_aggregation.sql new file mode 100644 index 0000000000000..d9f467b0cb09a --- /dev/null +++ b/integration-tests/sqls/simple_window_ordered_aggregation.sql @@ -0,0 +1,26 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language gOVERning permissions and +-- limitations under the License. 
+ +SELECT + c9, + row_number() OVER (ORDER BY c2, c9) AS row_number, + count(c3) OVER (ORDER BY c9) AS count_c3, + avg(c3) OVER (ORDER BY c2) AS avg_c3_by_c2, + sum(c3) OVER (ORDER BY c2) AS sum_c3_by_c2, + max(c3) OVER (ORDER BY c2) AS max_c3_by_c2, + min(c3) OVER (ORDER BY c2) AS min_c3_by_c2 +FROM test +ORDER BY row_number; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 51861c583f8a6..4e0878c24b818 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 6, msg="tests are missed") + self.assertEqual(len(files), 7, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From e510bd6ac897c3dcfdeae2d6b984ae3e9124fc74 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 18 Jun 2021 21:49:31 +0800 Subject: [PATCH 190/329] implement default for execution config (#570) --- datafusion/src/execution/context.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 1835244979402..c1a40ba10552f 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -642,9 +642,8 @@ pub struct ExecutionConfig { pub repartition_aggregations: bool, } -impl ExecutionConfig { - /// Create an execution config with default setting - pub fn new() -> Self { +impl Default for ExecutionConfig { + fn default() -> Self { Self { concurrency: num_cpus::get(), batch_size: 8192, @@ -671,6 +670,13 @@ impl ExecutionConfig { repartition_aggregations: true, } } +} + +impl ExecutionConfig { + /// Create an execution config with default setting + pub fn new() -> Self { + Default::default() + } /// Customize max_concurrency pub fn with_concurrency(mut self, n: usize) -> Self { From 4a55364448c81182731f4fa4c3e65aef5b31fa0f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 18 Jun 2021 12:30:47 -0400 Subject: [PATCH 191/329] RepartitionExec should not error if output has hung up (#576) * RepartitionExec should not error if output has hung up * Remove debug logging --- datafusion/src/physical_plan/repartition.rs | 128 ++++++++++++++++++-- datafusion/src/test/exec.rs | 90 ++++++++++++++ 2 files changed, 210 insertions(+), 8 deletions(-) diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index 7ef1948490741..a7b17c4161b0a 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -255,12 +255,15 @@ impl RepartitionExec { let mut counter = 0; let hashes_buf = &mut vec![]; - loop { + // While there are still outputs to send to, keep + // pulling inputs + while !txs.is_empty() { // fetch the next batch let now = Instant::now(); let result = stream.next().await; metrics.fetch_nanos.add_elapsed(now); + // Input is done if result.is_none() { break; } @@ -270,9 +273,13 @@ impl RepartitionExec { Partitioning::RoundRobinBatch(_) => { let now = Instant::now(); let output_partition = counter % num_output_partitions; - let tx = txs.get_mut(&output_partition).unwrap(); - tx.send(Some(result)) - .map_err(|e| DataFusionError::Execution(e.to_string()))?; + // if there is still a receiver, send to it + if let Some(tx) = txs.get_mut(&output_partition) { + if tx.send(Some(result)).is_err() 
{ + // If the other end has hung up, it was an early shutdown (e.g. LIMIT) + txs.remove(&output_partition); + } + } metrics.send_nanos.add_elapsed(now); } Partitioning::Hash(exprs, _) => { @@ -315,9 +322,13 @@ impl RepartitionExec { RecordBatch::try_new(input_batch.schema(), columns); metrics.repart_nanos.add_elapsed(now); let now = Instant::now(); - let tx = txs.get_mut(&num_output_partition).unwrap(); - tx.send(Some(output_batch)) - .map_err(|e| DataFusionError::Execution(e.to_string()))?; + // if there is still a receiver, send to it + if let Some(tx) = txs.get_mut(&num_output_partition) { + if tx.send(Some(output_batch)).is_err() { + // If the other end has hung up, it was an early shutdown (e.g. LIMIT) + txs.remove(&num_output_partition); + } + } metrics.send_nanos.add_elapsed(now); } } @@ -425,7 +436,7 @@ mod tests { use crate::{ assert_batches_sorted_eq, physical_plan::memory::MemoryExec, - test::exec::{ErrorExec, MockExec}, + test::exec::{BarrierExec, ErrorExec, MockExec}, }; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; @@ -723,4 +734,105 @@ mod tests { assert_batches_sorted_eq!(&expected, &batches); } + + #[tokio::test] + async fn repartition_with_dropping_output_stream() { + #[derive(Debug)] + struct Case<'a> { + partitioning: Partitioning, + expected: Vec<&'a str>, + } + + let cases = vec![ + Case { + partitioning: Partitioning::RoundRobinBatch(2), + expected: vec![ + "+------------------+", + "| my_awesome_field |", + "+------------------+", + "| baz |", + "| frob |", + "| gaz |", + "| grob |", + "+------------------+", + ], + }, + Case { + partitioning: Partitioning::Hash( + vec![Arc::new(crate::physical_plan::expressions::Column::new( + "my_awesome_field", + ))], + 2, + ), + expected: vec![ + "+------------------+", + "| my_awesome_field |", + "+------------------+", + "| frob |", + "+------------------+", + ], + }, + ]; + + for case in cases { + println!("Running case {:?}", case.partitioning); + + // The barrier exec waits to be pinged + // requires the input to wait at least once) + let input = Arc::new(make_barrier_exec()); + + // partition into two output streams + let exec = + RepartitionExec::try_new(input.clone(), case.partitioning).unwrap(); + + let output_stream0 = exec.execute(0).await.unwrap(); + let output_stream1 = exec.execute(1).await.unwrap(); + + // now, purposely drop output stream 0 + // *before* any outputs are produced + std::mem::drop(output_stream0); + + // Now, start sending input + input.wait().await; + + // output stream 1 should *not* error and have one of the input batches + let batches = crate::physical_plan::common::collect(output_stream1) + .await + .unwrap(); + + assert_batches_sorted_eq!(&case.expected, &batches); + } + } + + /// Create a BarrierExec that returns two partitions of two batches each + fn make_barrier_exec() -> BarrierExec { + let batch1 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["foo", "bar"])) as ArrayRef, + )]) + .unwrap(); + + let batch2 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["frob", "baz"])) as ArrayRef, + )]) + .unwrap(); + + let batch3 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["goo", "gar"])) as ArrayRef, + )]) + .unwrap(); + + let batch4 = RecordBatch::try_from_iter(vec![( + "my_awesome_field", + Arc::new(StringArray::from(vec!["grob", "gaz"])) as ArrayRef, + )]) + .unwrap(); + + // The barrier exec waits to be pinged + // 
requires the input to wait at least once) + let schema = batch1.schema(); + BarrierExec::new(vec![vec![batch1, batch2], vec![batch3, batch4]], schema) + } } diff --git a/datafusion/src/test/exec.rs b/datafusion/src/test/exec.rs index bcd94dd6d6397..3971db3adf823 100644 --- a/datafusion/src/test/exec.rs +++ b/datafusion/src/test/exec.rs @@ -23,6 +23,7 @@ use std::{ sync::Arc, task::{Context, Poll}, }; +use tokio::sync::Barrier; use arrow::{ datatypes::{DataType, Field, Schema, SchemaRef}, @@ -226,6 +227,95 @@ impl RecordBatchStream for DelayedStream { } } +/// A Mock ExecutionPlan that does not start producing input until a +/// barrier is called +/// +#[derive(Debug)] +pub struct BarrierExec { + /// partitions to send back + data: Vec>, + schema: SchemaRef, + + /// all streams wait on this barrier to produce + barrier: Arc, +} + +impl BarrierExec { + /// Create a new exec with some number of partitions. + pub fn new(data: Vec>, schema: SchemaRef) -> Self { + // wait for all streams and the input + let barrier = Arc::new(Barrier::new(data.len() + 1)); + Self { + data, + schema, + barrier, + } + } + + /// wait until all the input streams and this function is ready + pub async fn wait(&self) { + println!("BarrierExec::wait waiting on barrier"); + self.barrier.wait().await; + println!("BarrierExec::wait done waiting"); + } +} + +#[async_trait] +impl ExecutionPlan for BarrierExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.data.len()) + } + + fn children(&self) -> Vec> { + unimplemented!() + } + + fn with_new_children( + &self, + _children: Vec>, + ) -> Result> { + unimplemented!() + } + + /// Returns a stream which yields data + async fn execute(&self, partition: usize) -> Result { + assert!(partition < self.data.len()); + + let schema = self.schema(); + + let (tx, rx) = tokio::sync::mpsc::channel(2); + + // task simply sends data in order after barrier is reached + let data = self.data[partition].clone(); + let b = self.barrier.clone(); + tokio::task::spawn(async move { + println!("Partition {} waiting on barrier", partition); + b.wait().await; + for batch in data { + println!("Partition {} sending batch", partition); + if let Err(e) = tx.send(Ok(batch)).await { + println!("ERROR batch via barrier stream stream: {}", e); + } + } + }); + + // returned stream simply reads off the rx stream + let stream = DelayedStream { + schema, + inner: ReceiverStream::new(rx), + }; + Ok(Box::pin(stream)) + } +} + /// A mock execution plan that errors on a call to execute #[derive(Debug)] pub struct ErrorExec { From 330c809f7f396ea16f773a666f66fa64b3536c81 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 19 Jun 2021 06:05:55 +0800 Subject: [PATCH 192/329] fix clippy warnings (#581) * fix clippy warnings * fix test case * upgrade tarpaulin * Disable coverage job while it is failing Co-authored-by: Andrew Lamb --- .github/workflows/rust.yml | 76 ++++++++++--------- ballista/rust/core/src/serde/scheduler/mod.rs | 2 +- datafusion-cli/src/print_format.rs | 2 +- datafusion/src/execution/context.rs | 7 +- datafusion/src/logical_plan/dfschema.rs | 2 + datafusion/src/logical_plan/display.rs | 2 +- datafusion/src/optimizer/utils.rs | 2 +- .../src/physical_plan/expressions/not.rs | 2 +- datafusion/src/physical_plan/hash_join.rs | 3 +- .../src/physical_plan/regex_expressions.rs | 2 +- datafusion/src/scalar.rs | 4 +- datafusion/src/sql/planner.rs | 32 ++++---- 
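// Standalone sketch (std channels, not the DataFusion types) of the behaviour
// introduced by the RepartitionExec patch above: a failed send means that the
// output partition's receiver was dropped (for example its consumer satisfied a
// LIMIT), so that sender is removed and distribution continues to the remaining
// outputs, stopping once no receivers are left. Channel types and values are
// illustrative only.
use std::collections::HashMap;
use std::sync::mpsc;

fn main() {
    let mut txs: HashMap<usize, mpsc::Sender<i32>> = HashMap::new();
    let (tx0, rx0) = mpsc::channel();
    let (tx1, rx1) = mpsc::channel();
    txs.insert(0, tx0);
    txs.insert(1, tx1);

    // Output 0 hangs up early, as a LIMIT consumer would.
    drop(rx0);

    let mut counter = 0usize;
    for batch in 0..6 {
        if txs.is_empty() {
            break; // no outputs left: stop pulling input
        }
        let output_partition = counter % 2; // round-robin over 2 partitions
        if let Some(tx) = txs.get_mut(&output_partition) {
            if tx.send(batch).is_err() {
                // Receiver hung up: drop this output instead of erroring.
                txs.remove(&output_partition);
            }
        }
        counter += 1;
    }

    // Only the live output received data.
    let received: Vec<i32> = rx1.try_iter().collect();
    assert_eq!(received, vec![1, 3, 5]);
}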
12 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 933b51353d06b..7a2890c98b9f4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -321,41 +321,43 @@ jobs: # Ignore MIRI errors until we can get a clean run cargo miri test || true - coverage: - name: Coverage - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - # this key is not equal because the user is different than on a container (runner vs github) - key: cargo-coverage-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because coverage uses different compilation flags. - key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}- - - name: Run coverage - run: | - export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data +# Coverage job was failing. https://github.com/apache/arrow-datafusion/issues/590 tracks re-instating it - # 2020-11-15: There is a cargo-tarpaulin regression in 0.17.0 - # see https://github.com/xd009642/tarpaulin/issues/618 - cargo install --version 0.16.0 cargo-tarpaulin - cargo tarpaulin --out Xml - env: - CARGO_HOME: "/home/runner/.cargo" - CARGO_TARGET_DIR: "/home/runner/target" - - name: Report coverage - continue-on-error: true - run: bash <(curl -s https://codecov.io/bash) + # coverage: + # name: Coverage + # runs-on: ubuntu-latest + # strategy: + # matrix: + # arch: [amd64] + # rust: [stable] + # steps: + # - uses: actions/checkout@v2 + # with: + # submodules: true + # - name: Cache Cargo + # uses: actions/cache@v2 + # with: + # path: /home/runner/.cargo + # # this key is not equal because the user is different than on a container (runner vs github) + # key: cargo-coverage-cache- + # - name: Cache Rust dependencies + # uses: actions/cache@v2 + # with: + # path: /home/runner/target + # # this key is not equal because coverage uses different compilation flags. 
+ # key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}- + # - name: Run coverage + # run: | + # export ARROW_TEST_DATA=$(pwd)/testing/data + # export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data + + # # 2020-11-15: There is a cargo-tarpaulin regression in 0.17.0 + # # see https://github.com/xd009642/tarpaulin/issues/618 + # cargo install --version 0.16.0 cargo-tarpaulin + # cargo tarpaulin --out Xml + # env: + # CARGO_HOME: "/home/runner/.cargo" + # CARGO_TARGET_DIR: "/home/runner/target" + # - name: Report coverage + # continue-on-error: true + # run: bash <(curl -s https://codecov.io/bash) diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index b502c325595ff..c9bd1e93db2c4 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -142,7 +142,7 @@ impl PartitionStats { ] } - pub fn to_arrow_arrayref(&self) -> Result, BallistaError> { + pub fn to_arrow_arrayref(self) -> Result, BallistaError> { let mut field_builders = Vec::new(); let mut num_rows_builder = UInt64Builder::new(1); diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs index 34cf5e1f65e91..dadee4c7c8449 100644 --- a/datafusion-cli/src/print_format.rs +++ b/datafusion-cli/src/print_format.rs @@ -151,7 +151,7 @@ mod tests { #[test] fn test_from_str_failure() { - assert_eq!(true, "pretty".parse::().is_err()); + assert!("pretty".parse::().is_err()); } #[test] diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index c1a40ba10552f..ef652c28d1edf 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1131,7 +1131,7 @@ mod tests { let ctx = create_ctx(&tmp_dir, 1)?; let schema: Schema = ctx.table("test").unwrap().schema().clone().into(); - assert_eq!(schema.field_with_name("c1")?.is_nullable(), false); + assert!(!schema.field_with_name("c1")?.is_nullable()); let plan = LogicalPlanBuilder::scan_empty("", &schema, None)? .project(vec![col("c1")])? 
@@ -1139,10 +1139,7 @@ mod tests { let plan = ctx.optimize(&plan)?; let physical_plan = ctx.create_physical_plan(&Arc::new(plan))?; - assert_eq!( - physical_plan.schema().field_with_name("c1")?.is_nullable(), - false - ); + assert!(!physical_plan.schema().field_with_name("c1")?.is_nullable()); Ok(()) } diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index 5a9167e58b053..c5437b3af953c 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -248,12 +248,14 @@ where } impl ToDFSchema for Schema { + #[allow(clippy::wrong_self_convention)] fn to_dfschema(self) -> Result { DFSchema::try_from(self) } } impl ToDFSchema for SchemaRef { + #[allow(clippy::wrong_self_convention)] fn to_dfschema(self) -> Result { // Attempt to use the Schema directly if there are no other // references, otherwise clone diff --git a/datafusion/src/logical_plan/display.rs b/datafusion/src/logical_plan/display.rs index f285534fdf1b6..8178ef4484b25 100644 --- a/datafusion/src/logical_plan/display.rs +++ b/datafusion/src/logical_plan/display.rs @@ -197,7 +197,7 @@ impl<'a, 'b> PlanVisitor for GraphvizVisitor<'a, 'b> { // id [label="foo"] let label = if self.with_schema { format!( - "{}\\nSchema: {}", + r"{}\nSchema: {}", plan.display(), display_schema(&plan.schema().as_ref().to_owned().into()) ) diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index e707d30bc9ace..014ec74a0bfe1 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -551,7 +551,7 @@ mod tests { stringified_plans, .. } => { - assert_eq!(*verbose, true); + assert!(*verbose); let expected_stringified_plans = vec![ StringifiedPlan::new(PlanType::LogicalPlan, "..."), diff --git a/datafusion/src/physical_plan/expressions/not.rs b/datafusion/src/physical_plan/expressions/not.rs index 23a1a46651dee..7a997b61b488a 100644 --- a/datafusion/src/physical_plan/expressions/not.rs +++ b/datafusion/src/physical_plan/expressions/not.rs @@ -129,7 +129,7 @@ mod tests { let expr = not(col("a"), &schema)?; assert_eq!(expr.data_type(&schema)?, DataType::Boolean); - assert_eq!(expr.nullable(&schema)?, true); + assert!(expr.nullable(&schema)?); let input = BooleanArray::from(vec![Some(true), None, Some(false)]); let expected = &BooleanArray::from(vec![Some(false), None, Some(true)]); diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 644d2d486c854..928392a844337 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -684,11 +684,10 @@ fn build_join_indexes( &keys_values, )? { left_indices.append_value(i)?; - right_indices.append_value(row as u32)?; } else { left_indices.append_null()?; - right_indices.append_value(row as u32)?; } + right_indices.append_value(row as u32)?; } } None => { diff --git a/datafusion/src/physical_plan/regex_expressions.rs b/datafusion/src/physical_plan/regex_expressions.rs index b526e7259ef61..69b27ffb26626 100644 --- a/datafusion/src/physical_plan/regex_expressions.rs +++ b/datafusion/src/physical_plan/regex_expressions.rs @@ -62,7 +62,7 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result String { lazy_static! 
{ - static ref CAPTURE_GROUPS_RE: Regex = Regex::new("(\\\\)(\\d*)").unwrap(); + static ref CAPTURE_GROUPS_RE: Regex = Regex::new(r"(\\)(\d*)").unwrap(); } CAPTURE_GROUPS_RE .replace_all(replacement, "$${$2}") diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 933bb8cebcb1c..c7afbf55e367c 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -1123,7 +1123,7 @@ mod tests { let array = value.to_array(); let array = array.as_any().downcast_ref::().unwrap(); assert_eq!(array.len(), 1); - assert_eq!(false, array.is_null(0)); + assert!(!array.is_null(0)); assert_eq!(array.value(0), 13); let value = ScalarValue::UInt64(None); @@ -1139,7 +1139,7 @@ mod tests { let array = value.to_array(); let array = array.as_any().downcast_ref::().unwrap(); assert_eq!(array.len(), 1); - assert_eq!(false, array.is_null(0)); + assert!(!array.is_null(0)); assert_eq!(array.value(0), 13); let value = ScalarValue::UInt32(None); diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 4c1d8610dfdd9..547e9afd38d91 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1653,7 +1653,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'doesnotexist\\\' for schema {}\")", + r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -1665,7 +1665,7 @@ mod tests { let sql = "SELECT age, age FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#age\\\" at position 0 and \\\"#age\\\" at position 1 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#age\" at position 0 and \"#age\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -1675,7 +1675,7 @@ mod tests { let sql = "SELECT *, age FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#age\\\" at position 3 and \\\"#age\\\" at position 8 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#age\" at position 3 and \"#age\" at position 8 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -1714,7 +1714,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'doesnotexist\\\' for schema {}\")", + r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -1727,7 +1727,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'x\\\' for schema {}\")", + r#"Plan("Invalid identifier 'x' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -2200,7 +2200,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'doesnotexist\\\' for schema {}\")", + r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -2212,7 +2212,7 @@ mod tests { let sql = "SELECT MIN(age), MIN(age) FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#MIN(age)\\\" at position 0 and \\\"#MIN(age)\\\" at position 1 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#MIN(age)\" at position 0 and \"#MIN(age)\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2242,7 +2242,7 @@ mod tests { let sql = "SELECT MIN(age) AS a, MIN(age) AS a FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#MIN(age) AS a\\\" at position 0 and \\\"#MIN(age) AS a\\\" at position 1 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#MIN(age) AS a\" at position 0 and \"#MIN(age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2272,7 +2272,7 @@ mod tests { let sql = "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#state AS a\\\" at position 0 and \\\"#MIN(age) AS a\\\" at position 1 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#state AS a\" at position 0 and \"#MIN(age) AS a\" at position 1 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2293,7 +2293,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'doesnotexist\\\' for schema {}\")", + r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -2306,7 +2306,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( format!( - "Plan(\"Invalid identifier \\\'doesnotexist\\\' for schema {}\")", + r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, PERSON_COLUMN_NAMES ), format!("{:?}", err) @@ -2318,7 +2318,7 @@ mod tests { let sql = "SELECT INTERVAL '100000000000000000 day'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "NotImplemented(\"Interval field value out of range: \\\"100000000000000000 day\\\"\")", + r#"NotImplemented("Interval field value out of range: \"100000000000000000 day\"")"#, format!("{:?}", err) ); } @@ -2328,7 +2328,7 @@ mod tests { let sql = "SELECT INTERVAL '1 year 1 day'"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "NotImplemented(\"DF does not support intervals that have both a Year/Month part as well as Days/Hours/Mins/Seconds: \\\"1 year 1 day\\\". Hint: try breaking the interval into two parts, one with Year/Month and the other with Days/Hours/Mins/Seconds - e.g. (NOW() + INTERVAL \\\'1 year\\\') + INTERVAL \\\'1 day\\\'\")", + r#"NotImplemented("DF does not support intervals that have both a Year/Month part as well as Days/Hours/Mins/Seconds: \"1 year 1 day\". Hint: try breaking the interval into two parts, one with Year/Month and the other with Days/Hours/Mins/Seconds - e.g. (NOW() + INTERVAL '1 year') + INTERVAL '1 day'")"#, format!("{:?}", err) ); } @@ -2391,7 +2391,7 @@ mod tests { let sql = "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projections require unique expression names but the expression \\\"#MIN(age)\\\" at position 1 and \\\"#MIN(age)\\\" at position 2 have the same name. Consider aliasing (\\\"AS\\\") one of them.\")", + r##"Plan("Projections require unique expression names but the expression \"#MIN(age)\" at position 1 and \"#MIN(age)\" at position 2 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2451,7 +2451,7 @@ mod tests { "SELECT ((age + 1) / 2) * (age + 9), MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projection references non-aggregate values\")", + r#"Plan("Projection references non-aggregate values")"#, format!("{:?}", err) ); } @@ -2462,7 +2462,7 @@ mod tests { let sql = "SELECT age, MIN(first_name) FROM person GROUP BY age + 1"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - "Plan(\"Projection references non-aggregate values\")", + r#"Plan("Projection references non-aggregate values")"#, format!("{:?}", err) ); } From 5900b4c6829b0bdbe69e1f95fb74e935bc8f33d4 Mon Sep 17 00:00:00 2001 From: Evan Chan Date: Sat, 19 Jun 2021 04:06:47 -0700 Subject: [PATCH 193/329] `to_timestamp_millis()`, `to_timestamp_micros()`, `to_timestamp_seconds()` (#567) * to_timestamp_millis(): support casting to Timestamp(Milliseconds, _) from Int64 * Add testing setup to instructions * to_timestamp_millis(): Convert timestamp strings to TimestampMillis * [functions] Add to_timestamp_micros() and to_timestamp_seconds() functions * Update datafusion/tests/sql.rs Co-authored-by: Andrew Lamb * CR feedback and fix build * Add ability for to_timestamp_xxx() functions to cast from other Timestamp types * For consistency, let to_timestamp() also perform casts * Prettier / clippy * Add docs for to_timestamp() functions Co-authored-by: Evan Chan Co-authored-by: Andrew Lamb --- DEVELOPERS.md | 9 +- README.md | 54 ++-- datafusion/Cargo.toml | 4 +- .../src/physical_plan/datetime_expressions.rs | 32 ++- .../src/physical_plan/expressions/binary.rs | 19 ++ .../src/physical_plan/expressions/cast.rs | 38 +-- .../src/physical_plan/expressions/mod.rs | 4 +- .../src/physical_plan/expressions/nullif.rs | 6 +- datafusion/src/physical_plan/functions.rs | 161 ++++++++++- datafusion/src/scalar.rs | 5 +- datafusion/tests/sql.rs | 267 +++++++++++++++++- docs/user-guide/src/SUMMARY.md | 1 + .../src/sql/datafusion-functions.md | 86 ++++++ 13 files changed, 618 insertions(+), 68 deletions(-) create mode 100644 docs/user-guide/src/sql/datafusion-functions.md diff --git a/DEVELOPERS.md b/DEVELOPERS.md index cd0792f7fa2ca..85384680c02eb 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -33,6 +33,13 @@ DataFusion is written in Rust and it uses a standard rust toolkit: - `cargo test` to test - etc. +Testing setup: + +- `git submodule init` +- `git submodule update` +- `export PARQUET_TEST_DATA=parquet_testing/` +- `export ARROW_TEST_DATA=testing/data/` + ## How to add a new scalar function Below is a checklist of what you need to do to add a new scalar function to DataFusion: @@ -47,7 +54,7 @@ Below is a checklist of what you need to do to add a new scalar function to Data - a new entry to `FromStr` with the name of the function as called by SQL - a new line in `return_type` with the expected return type of the function, given an incoming type - a new line in `signature` with the signature of the function (number and types of its arguments) - - a new line in `create_physical_expr` mapping the built-in to the implementation + - a new line in `create_physical_expr`/`create_physical_fun` mapping the built-in to the implementation - tests to the function. - In [tests/sql.rs](datafusion/tests/sql.rs), add a new test where the function is called through SQL against well known data and returns the expected result. 
- In [src/logical_plan/expr](datafusion/src/logical_plan/expr.rs), add: diff --git a/README.md b/README.md index 730bbc34d7038..195d1a7b3c316 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,10 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [ ] Basic date functions - [ ] Basic time functions - [x] Basic timestamp functions + - [x] [to_timestamp](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp) + - [x] [to_timestamp_millis](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_millis) + - [x] [to_timestamp_micros](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_micros) + - [x] [to_timestamp_seconds](docs/user-guide/book/sql/datafusion-functions.html#to_timestamp_seconds) - nested functions - [x] Array of columns - [x] Schema Queries @@ -320,31 +324,31 @@ execution. The SQL types from [sqlparser-rs](https://github.com/ballista-compute/sqlparser-rs/blob/main/src/ast/data_type.rs#L57) are mapped to Arrow types according to the following table -| SQL Data Type | Arrow DataType | -| ------------- | ------------------------------- | -| `CHAR` | `Utf8` | -| `VARCHAR` | `Utf8` | -| `UUID` | _Not yet supported_ | -| `CLOB` | _Not yet supported_ | -| `BINARY` | _Not yet supported_ | -| `VARBINARY` | _Not yet supported_ | -| `DECIMAL` | `Float64` | -| `FLOAT` | `Float32` | -| `SMALLINT` | `Int16` | -| `INT` | `Int32` | -| `BIGINT` | `Int64` | -| `REAL` | `Float64` | -| `DOUBLE` | `Float64` | -| `BOOLEAN` | `Boolean` | -| `DATE` | `Date32` | -| `TIME` | `Time64(TimeUnit::Millisecond)` | -| `TIMESTAMP` | `Date64` | -| `INTERVAL` | _Not yet supported_ | -| `REGCLASS` | _Not yet supported_ | -| `TEXT` | _Not yet supported_ | -| `BYTEA` | _Not yet supported_ | -| `CUSTOM` | _Not yet supported_ | -| `ARRAY` | _Not yet supported_ | +| SQL Data Type | Arrow DataType | +| ------------- | --------------------------------- | +| `CHAR` | `Utf8` | +| `VARCHAR` | `Utf8` | +| `UUID` | _Not yet supported_ | +| `CLOB` | _Not yet supported_ | +| `BINARY` | _Not yet supported_ | +| `VARBINARY` | _Not yet supported_ | +| `DECIMAL` | `Float64` | +| `FLOAT` | `Float32` | +| `SMALLINT` | `Int16` | +| `INT` | `Int32` | +| `BIGINT` | `Int64` | +| `REAL` | `Float64` | +| `DOUBLE` | `Float64` | +| `BOOLEAN` | `Boolean` | +| `DATE` | `Date32` | +| `TIME` | `Time64(TimeUnit::Millisecond)` | +| `TIMESTAMP` | `Timestamp(TimeUnit::Nanosecond)` | +| `INTERVAL` | _Not yet supported_ | +| `REGCLASS` | _Not yet supported_ | +| `TEXT` | _Not yet supported_ | +| `BYTEA` | _Not yet supported_ | +| `CUSTOM` | _Not yet supported_ | +| `ARRAY` | _Not yet supported_ | # Architecture Overview diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 0668ec016ba1f..5da2469a764fb 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { version = "4.0", features = ["prettyprint"] } -parquet = { version = "4.0", features = ["arrow"] } +arrow = { version = "4.3", features = ["prettyprint"] } +parquet = { version = "4.3", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" diff --git a/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs index ec52e6bc4d528..e17ded29749ea 100644 --- a/datafusion/src/physical_plan/datetime_expressions.rs +++ b/datafusion/src/physical_plan/datetime_expressions.rs @@ -25,7 +25,10 @@ use crate::{ }; use arrow::{ 
array::{Array, ArrayRef, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait}, - datatypes::{ArrowPrimitiveType, DataType, TimestampNanosecondType}, + datatypes::{ + ArrowPrimitiveType, DataType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, + }, }; use arrow::{ array::{ @@ -268,6 +271,33 @@ pub fn to_timestamp(args: &[ColumnarValue]) -> Result { ) } +/// to_timestamp_millis SQL function +pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { + handle::( + args, + |s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000), + "to_timestamp_millis", + ) +} + +/// to_timestamp_micros SQL function +pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { + handle::( + args, + |s| string_to_timestamp_nanos(s).map(|n| n / 1_000), + "to_timestamp_micros", + ) +} + +/// to_timestamp_seconds SQL function +pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { + handle::( + args, + |s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000_000), + "to_timestamp_seconds", + ) +} + /// Create an implementation of `now()` that always returns the /// specified timestamp. /// diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 5c2d9ce02f51f..5ed0c74463a6e 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -17,6 +17,7 @@ use std::{any::Any, sync::Arc}; +use arrow::array::TimestampMillisecondArray; use arrow::array::*; use arrow::compute::kernels::arithmetic::{ add, divide, divide_scalar, multiply, subtract, @@ -256,6 +257,15 @@ macro_rules! binary_array_op_scalar { DataType::Timestamp(TimeUnit::Nanosecond, None) => { compute_op_scalar!($LEFT, $RIGHT, $OP, TimestampNanosecondArray) } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + compute_op_scalar!($LEFT, $RIGHT, $OP, TimestampMicrosecondArray) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + compute_op_scalar!($LEFT, $RIGHT, $OP, TimestampMillisecondArray) + } + DataType::Timestamp(TimeUnit::Second, None) => { + compute_op_scalar!($LEFT, $RIGHT, $OP, TimestampSecondArray) + } DataType::Date32 => { compute_op_scalar!($LEFT, $RIGHT, $OP, Date32Array) } @@ -288,6 +298,15 @@ macro_rules! 
binary_array_op { DataType::Timestamp(TimeUnit::Nanosecond, None) => { compute_op!($LEFT, $RIGHT, $OP, TimestampNanosecondArray) } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + compute_op!($LEFT, $RIGHT, $OP, TimestampMicrosecondArray) + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + compute_op!($LEFT, $RIGHT, $OP, TimestampMillisecondArray) + } + DataType::Timestamp(TimeUnit::Second, None) => { + compute_op!($LEFT, $RIGHT, $OP, TimestampSecondArray) + } DataType::Date32 => { compute_op!($LEFT, $RIGHT, $OP, Date32Array) } diff --git a/datafusion/src/physical_plan/expressions/cast.rs b/datafusion/src/physical_plan/expressions/cast.rs index ba395f54d917c..558b1e5d7e8b8 100644 --- a/datafusion/src/physical_plan/expressions/cast.rs +++ b/datafusion/src/physical_plan/expressions/cast.rs @@ -91,24 +91,26 @@ impl PhysicalExpr for CastExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let value = self.expr.evaluate(batch)?; - match value { - ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(kernels::cast::cast_with_options( - &array, - &self.cast_type, - &self.cast_options, - )?)) - } - ColumnarValue::Scalar(scalar) => { - let scalar_array = scalar.to_array(); - let cast_array = kernels::cast::cast_with_options( - &scalar_array, - &self.cast_type, - &self.cast_options, - )?; - let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; - Ok(ColumnarValue::Scalar(cast_scalar)) - } + cast_column(&value, &self.cast_type, &self.cast_options) + } +} + +/// Internal cast function for casting ColumnarValue -> ColumnarValue for cast_type +pub fn cast_column( + value: &ColumnarValue, + cast_type: &DataType, + cast_options: &CastOptions, +) -> Result { + match value { + ColumnarValue::Array(array) => Ok(ColumnarValue::Array( + kernels::cast::cast_with_options(array, cast_type, cast_options)?, + )), + ColumnarValue::Scalar(scalar) => { + let scalar_array = scalar.to_array(); + let cast_array = + kernels::cast::cast_with_options(&scalar_array, cast_type, cast_options)?; + let cast_scalar = ScalarValue::try_from_array(&cast_array, 0)?; + Ok(ColumnarValue::Scalar(cast_scalar)) } } } diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index d18365c47ed5e..f8cb40cbacbdc 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -49,7 +49,9 @@ mod try_cast; pub use average::{avg_return_type, Avg, AvgAccumulator}; pub use binary::{binary, binary_operator_data_type, BinaryExpr}; pub use case::{case, CaseExpr}; -pub use cast::{cast, cast_with_options, CastExpr}; +pub use cast::{ + cast, cast_column, cast_with_options, CastExpr, DEFAULT_DATAFUSION_CAST_OPTIONS, +}; pub use column::{col, Column}; pub use count::Count; pub use in_list::{in_list, InListExpr}; diff --git a/datafusion/src/physical_plan/expressions/nullif.rs b/datafusion/src/physical_plan/expressions/nullif.rs index 7cc58ed2318f4..55e7bda40f83f 100644 --- a/datafusion/src/physical_plan/expressions/nullif.rs +++ b/datafusion/src/physical_plan/expressions/nullif.rs @@ -21,11 +21,7 @@ use super::ColumnarValue; use crate::error::{DataFusionError, Result}; use crate::scalar::ScalarValue; use arrow::array::Array; -use arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, TimestampNanosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; +use arrow::array::*; use 
arrow::compute::kernels::boolean::nullif; use arrow::compute::kernels::comparison::{eq, eq_scalar, eq_utf8, eq_utf8_scalar}; use arrow::datatypes::{DataType, TimeUnit}; diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 1e423c367cd8f..0e2be51d3ebc1 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -36,7 +36,9 @@ use super::{ use crate::execution::context::ExecutionContextState; use crate::physical_plan::array_expressions; use crate::physical_plan::datetime_expressions; -use crate::physical_plan::expressions::{nullif_func, SUPPORTED_NULLIF_TYPES}; +use crate::physical_plan::expressions::{ + cast_column, nullif_func, DEFAULT_DATAFUSION_CAST_OPTIONS, SUPPORTED_NULLIF_TYPES, +}; use crate::physical_plan::math_expressions; use crate::physical_plan::string_expressions; use crate::{ @@ -205,6 +207,12 @@ pub enum BuiltinScalarFunction { ToHex, /// to_timestamp ToTimestamp, + /// to_timestamp_millis + ToTimestampMillis, + /// to_timestamp_micros + ToTimestampMicros, + /// to_timestamp_seconds + ToTimestampSeconds, ///now Now, /// translate @@ -298,6 +306,9 @@ impl FromStr for BuiltinScalarFunction { "substr" => BuiltinScalarFunction::Substr, "to_hex" => BuiltinScalarFunction::ToHex, "to_timestamp" => BuiltinScalarFunction::ToTimestamp, + "to_timestamp_millis" => BuiltinScalarFunction::ToTimestampMillis, + "to_timestamp_micros" => BuiltinScalarFunction::ToTimestampMicros, + "to_timestamp_seconds" => BuiltinScalarFunction::ToTimestampSeconds, "now" => BuiltinScalarFunction::Now, "translate" => BuiltinScalarFunction::Translate, "trim" => BuiltinScalarFunction::Trim, @@ -412,6 +423,15 @@ pub fn return_type( BuiltinScalarFunction::ToTimestamp => { Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } + BuiltinScalarFunction::ToTimestampMillis => { + Ok(DataType::Timestamp(TimeUnit::Millisecond, None)) + } + BuiltinScalarFunction::ToTimestampMicros => { + Ok(DataType::Timestamp(TimeUnit::Microsecond, None)) + } + BuiltinScalarFunction::ToTimestampSeconds => { + Ok(DataType::Timestamp(TimeUnit::Second, None)) + } BuiltinScalarFunction::Now => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), BuiltinScalarFunction::Translate => utf8_to_str_type(&arg_types[0], "translate"), BuiltinScalarFunction::Trim => utf8_to_str_type(&arg_types[0], "trim"), @@ -896,9 +916,6 @@ pub fn create_physical_fun( other, ))), }), - BuiltinScalarFunction::ToTimestamp => { - Arc::new(datetime_expressions::to_timestamp) - } BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { let func = invoke_if_unicode_expressions_feature_flag!( @@ -934,6 +951,12 @@ pub fn create_physical_fun( ))), }), BuiltinScalarFunction::Upper => Arc::new(string_expressions::upper), + _ => { + return Err(DataFusionError::Internal(format!( + "create_physical_fun: Unsupported scalar function {:?}", + fun + ))) + } }) } @@ -945,7 +968,94 @@ pub fn create_physical_expr( input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { - let fun_expr = create_physical_fun(fun, ctx_state)?; + let fun_expr: ScalarFunctionImplementation = match fun { + // These functions need args and input schema to pick an implementation + // Unlike the string functions, which actually figure out the function to use with each array, + // here we return either a cast fn or string timestamp translation based on the expression data type + // so we don't have to pay a per-array/batch cost. 
+ BuiltinScalarFunction::ToTimestamp => { + Arc::new(match args[0].data_type(input_schema) { + Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { + |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DEFAULT_DATAFUSION_CAST_OPTIONS, + ) + } + } + Ok(DataType::Utf8) => datetime_expressions::to_timestamp, + other => { + return Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function to_timestamp", + other, + ))) + } + }) + } + BuiltinScalarFunction::ToTimestampMillis => { + Arc::new(match args[0].data_type(input_schema) { + Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { + |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Millisecond, None), + &DEFAULT_DATAFUSION_CAST_OPTIONS, + ) + } + } + Ok(DataType::Utf8) => datetime_expressions::to_timestamp_millis, + other => { + return Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function to_timestamp_millis", + other, + ))) + } + }) + } + BuiltinScalarFunction::ToTimestampMicros => { + Arc::new(match args[0].data_type(input_schema) { + Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { + |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Microsecond, None), + &DEFAULT_DATAFUSION_CAST_OPTIONS, + ) + } + } + Ok(DataType::Utf8) => datetime_expressions::to_timestamp_micros, + other => { + return Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function to_timestamp_micros", + other, + ))) + } + }) + } + BuiltinScalarFunction::ToTimestampSeconds => Arc::new({ + match args[0].data_type(input_schema) { + Ok(DataType::Int64) | Ok(DataType::Timestamp(_, None)) => { + |col_values: &[ColumnarValue]| { + cast_column( + &col_values[0], + &DataType::Timestamp(TimeUnit::Second, None), + &DEFAULT_DATAFUSION_CAST_OPTIONS, + ) + } + } + Ok(DataType::Utf8) => datetime_expressions::to_timestamp_seconds, + other => { + return Err(DataFusionError::Internal(format!( + "Unsupported data type {:?} for function to_timestamp_seconds", + other, + ))) + } + } + }), + // These don't need args and input schema + _ => create_physical_fun(fun, ctx_state)?, + }; let args = coerce(args, input_schema, &signature(fun))?; let arg_types = args @@ -1026,7 +1136,46 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature { Signature::Exact(vec![DataType::Utf8, DataType::Int64]), Signature::Exact(vec![DataType::LargeUtf8, DataType::Int64]), ]), - BuiltinScalarFunction::ToTimestamp => Signature::Uniform(1, vec![DataType::Utf8]), + BuiltinScalarFunction::ToTimestamp => Signature::Uniform( + 1, + vec![ + DataType::Utf8, + DataType::Int64, + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Second, None), + ], + ), + BuiltinScalarFunction::ToTimestampMillis => Signature::Uniform( + 1, + vec![ + DataType::Utf8, + DataType::Int64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Second, None), + ], + ), + BuiltinScalarFunction::ToTimestampMicros => Signature::Uniform( + 1, + vec![ + DataType::Utf8, + DataType::Int64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Second, None), + ], + ), + BuiltinScalarFunction::ToTimestampSeconds => Signature::Uniform( + 1, + vec![ + 
DataType::Utf8, + DataType::Int64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + ], + ), BuiltinScalarFunction::DateTrunc => Signature::Exact(vec![ DataType::Utf8, DataType::Timestamp(TimeUnit::Nanosecond, None), diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index c7afbf55e367c..c23674bd59db0 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -900,7 +900,10 @@ impl TryFrom for i64 { fn try_from(value: ScalarValue) -> Result { match value { ScalarValue::Int64(Some(inner_value)) - | ScalarValue::TimestampNanosecond(Some(inner_value)) => Ok(inner_value), + | ScalarValue::TimestampNanosecond(Some(inner_value)) + | ScalarValue::TimestampMicrosecond(Some(inner_value)) + | ScalarValue::TimestampMillisecond(Some(inner_value)) + | ScalarValue::TimestampSecond(Some(inner_value)) => Ok(inner_value), _ => Err(DataFusionError::Internal(format!( "Cannot convert {:?} to {}", value, diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 21da793b55385..b6393e91e321b 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -27,7 +27,11 @@ extern crate datafusion; use arrow::{array::*, datatypes::TimeUnit}; use arrow::{datatypes::Int32Type, datatypes::Int64Type, record_batch::RecordBatch}; use arrow::{ - datatypes::{DataType, Field, Schema, SchemaRef}, + datatypes::{ + ArrowNativeType, ArrowPrimitiveType, ArrowTimestampType, DataType, Field, Schema, + SchemaRef, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, + }, util::display::array_value_to_string, }; @@ -1023,6 +1027,188 @@ async fn csv_query_cast_literal() -> Result<()> { Ok(()) } +#[tokio::test] +async fn query_cast_timestamp_millis() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + let t1_schema = Arc::new(Schema::new(vec![Field::new("ts", DataType::Int64, true)])); + let t1_data = RecordBatch::try_new( + t1_schema.clone(), + vec![Arc::new(Int64Array::from(vec![ + 1235865600000, + 1235865660000, + 1238544000000, + ]))], + )?; + let t1_table = MemTable::try_new(t1_schema, vec![vec![t1_data]])?; + ctx.register_table("t1", Arc::new(t1_table))?; + + let sql = "SELECT to_timestamp_millis(ts) FROM t1 LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2009-03-01 00:00:00"], + vec!["2009-03-01 00:01:00"], + vec!["2009-04-01 00:00:00"], + ]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn query_cast_timestamp_micros() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + let t1_schema = Arc::new(Schema::new(vec![Field::new("ts", DataType::Int64, true)])); + let t1_data = RecordBatch::try_new( + t1_schema.clone(), + vec![Arc::new(Int64Array::from(vec![ + 1235865600000000, + 1235865660000000, + 1238544000000000, + ]))], + )?; + let t1_table = MemTable::try_new(t1_schema, vec![vec![t1_data]])?; + ctx.register_table("t1", Arc::new(t1_table))?; + + let sql = "SELECT to_timestamp_micros(ts) FROM t1 LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2009-03-01 00:00:00"], + vec!["2009-03-01 00:01:00"], + vec!["2009-04-01 00:00:00"], + ]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn query_cast_timestamp_seconds() -> Result<()> { + let mut ctx = ExecutionContext::new(); + + let t1_schema = Arc::new(Schema::new(vec![Field::new("ts", DataType::Int64, true)])); + let t1_data = RecordBatch::try_new( + 
t1_schema.clone(), + vec![Arc::new(Int64Array::from(vec![ + 1235865600, 1235865660, 1238544000, + ]))], + )?; + let t1_table = MemTable::try_new(t1_schema, vec![vec![t1_data]])?; + ctx.register_table("t1", Arc::new(t1_table))?; + + let sql = "SELECT to_timestamp_seconds(ts) FROM t1 LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2009-03-01 00:00:00"], + vec!["2009-03-01 00:01:00"], + vec!["2009-04-01 00:00:00"], + ]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn query_cast_timestamp_nanos_to_others() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table("ts_data", make_timestamp_nano_table()?)?; + + // Original column is nanos, convert to millis and check timestamp + let sql = "SELECT to_timestamp_millis(ts) FROM ts_data LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29.190"], + vec!["2020-09-08 12:42:29.190"], + vec!["2020-09-08 11:42:29.190"], + ]; + assert_eq!(expected, actual); + + let sql = "SELECT to_timestamp_micros(ts) FROM ts_data LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29.190855"], + vec!["2020-09-08 12:42:29.190855"], + vec!["2020-09-08 11:42:29.190855"], + ]; + assert_eq!(expected, actual); + + let sql = "SELECT to_timestamp_seconds(ts) FROM ts_data LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29"], + vec!["2020-09-08 12:42:29"], + vec!["2020-09-08 11:42:29"], + ]; + assert_eq!(expected, actual); + + Ok(()) +} + +#[tokio::test] +async fn query_cast_timestamp_seconds_to_others() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table("ts_secs", make_timestamp_table::()?)?; + + // Original column is seconds, convert to millis and check timestamp + let sql = "SELECT to_timestamp_millis(ts) FROM ts_secs LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29"], + vec!["2020-09-08 12:42:29"], + vec!["2020-09-08 11:42:29"], + ]; + assert_eq!(expected, actual); + + // Original column is seconds, convert to micros and check timestamp + let sql = "SELECT to_timestamp_micros(ts) FROM ts_secs LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + + // to nanos + let sql = "SELECT to_timestamp(ts) FROM ts_secs LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn query_cast_timestamp_micros_to_others() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "ts_micros", + make_timestamp_table::()?, + )?; + + // Original column is micros, convert to millis and check timestamp + let sql = "SELECT to_timestamp_millis(ts) FROM ts_micros LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29.190"], + vec!["2020-09-08 12:42:29.190"], + vec!["2020-09-08 11:42:29.190"], + ]; + assert_eq!(expected, actual); + + // Original column is micros, convert to seconds and check timestamp + let sql = "SELECT to_timestamp_seconds(ts) FROM ts_micros LIMIT 3"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29"], + vec!["2020-09-08 12:42:29"], + vec!["2020-09-08 11:42:29"], + ]; + assert_eq!(expected, actual); + + // Original column is micros, convert to nanos and check timestamp + let sql = "SELECT to_timestamp(ts) FROM ts_micros LIMIT 3"; + let actual = 
execute(&mut ctx, sql).await; + let expected = vec![ + vec!["2020-09-08 13:42:29.190855"], + vec!["2020-09-08 12:42:29.190855"], + vec!["2020-09-08 11:42:29.190855"], + ]; + assert_eq!(expected, actual); + Ok(()) +} + #[tokio::test] async fn union_all() -> Result<()> { let mut ctx = ExecutionContext::new(); @@ -2439,17 +2625,33 @@ async fn like() -> Result<()> { Ok(()) } -fn make_timestamp_nano_table() -> Result> { +fn make_timestamp_table() -> Result> +where + A: ArrowTimestampType, +{ let schema = Arc::new(Schema::new(vec![ - Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), false), + Field::new("ts", DataType::Timestamp(A::get_time_unit(), None), false), Field::new("value", DataType::Int32, true), ])); - let mut builder = TimestampNanosecondArray::builder(3); - - builder.append_value(1599572549190855000)?; // 2020-09-08T13:42:29.190855+00:00 - builder.append_value(1599568949190855000)?; // 2020-09-08T12:42:29.190855+00:00 - builder.append_value(1599565349190855000)?; // 2020-09-08T11:42:29.190855+00:00 + let mut builder = PrimitiveBuilder::::new(3); + + let nanotimestamps = vec![ + 1599572549190855000i64, // 2020-09-08T13:42:29.190855+00:00 + 1599568949190855000, // 2020-09-08T12:42:29.190855+00:00 + 1599565349190855000, //2020-09-08T11:42:29.190855+00:00 + ]; // 2020-09-08T11:42:29.190855+00:00 + let divisor = match A::get_time_unit() { + TimeUnit::Nanosecond => 1, + TimeUnit::Microsecond => 1000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Second => 1_000_000_000, + }; + for ts in nanotimestamps { + builder.append_value( + ::Native::from_i64(ts / divisor).unwrap(), + )?; + } let data = RecordBatch::try_new( schema.clone(), @@ -2462,6 +2664,10 @@ fn make_timestamp_nano_table() -> Result> { Ok(Arc::new(table)) } +fn make_timestamp_nano_table() -> Result> { + make_timestamp_table::() +} + #[tokio::test] async fn to_timestamp() -> Result<()> { let mut ctx = ExecutionContext::new(); @@ -2475,6 +2681,51 @@ async fn to_timestamp() -> Result<()> { Ok(()) } +#[tokio::test] +async fn to_timestamp_millis() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "ts_data", + make_timestamp_table::()?, + )?; + + let sql = "SELECT COUNT(*) FROM ts_data where ts > to_timestamp_millis('2020-09-08T12:00:00+00:00')"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["2"]]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn to_timestamp_micros() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "ts_data", + make_timestamp_table::()?, + )?; + + let sql = "SELECT COUNT(*) FROM ts_data where ts > to_timestamp_micros('2020-09-08T12:00:00+00:00')"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["2"]]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn to_timestamp_seconds() -> Result<()> { + let mut ctx = ExecutionContext::new(); + ctx.register_table("ts_data", make_timestamp_table::()?)?; + + let sql = "SELECT COUNT(*) FROM ts_data where ts > to_timestamp_seconds('2020-09-08T12:00:00+00:00')"; + let actual = execute(&mut ctx, sql).await; + + let expected = vec![vec!["2"]]; + assert_eq!(expected, actual); + Ok(()) +} + #[tokio::test] async fn count_distinct_timestamps() -> Result<()> { let mut ctx = ExecutionContext::new(); diff --git a/docs/user-guide/src/SUMMARY.md b/docs/user-guide/src/SUMMARY.md index aa101b3de1173..516fccece99c9 100644 --- a/docs/user-guide/src/SUMMARY.md +++ b/docs/user-guide/src/SUMMARY.md @@ -27,6 +27,7 @@ - 
[SELECT](sql/select.md) - [DDL](sql/ddl.md) - [CREATE EXTERNAL TABLE](sql/ddl.md) + - [DataFusion Specific Functions](sql/datafusion-functions.md) - [Distributed](distributed/introduction.md) - [Create a Ballista Cluster](distributed/deployment.md)
diff --git a/docs/user-guide/src/sql/datafusion-functions.md b/docs/user-guide/src/sql/datafusion-functions.md new file mode 100644 index 0000000000000..8431baf2a3b17 --- /dev/null +++ b/docs/user-guide/src/sql/datafusion-functions.md @@ -0,0 +1,86 @@
+
+
+# DataFusion-Specific Functions
+
+These SQL functions are specific to DataFusion, or are well known but behave in a DataFusion-specific way. In particular, the `to_timestamp_xx()` functions exist because Arrow supports multiple timestamp resolutions.
+
+## `to_timestamp`
+
+`to_timestamp()` is similar to the standard SQL function. It performs conversions to type `Timestamp(Nanoseconds, None)`, from:
+
+- Timestamp strings
+  - `1997-01-31T09:26:56.123Z` # RFC3339
+  - `1997-01-31T09:26:56.123-05:00` # RFC3339
+  - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T
+  - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
+  - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
+  - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
+- An Int64 array/column, values are nanoseconds since Epoch UTC
+- Other Timestamp() columns or values
+
+Note that conversions from other Timestamp and Int64 types can also be performed using `CAST(.. AS Timestamp)`. However, the conversion functionality here is present for consistency with the other `to_timestamp_xx()` functions.
+
+## `to_timestamp_millis`
+
+`to_timestamp_millis()` does conversions to type `Timestamp(Milliseconds, None)`, from:
+
+- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of milliseconds resolution)
+  - `1997-01-31T09:26:56.123Z` # RFC3339
+  - `1997-01-31T09:26:56.123-05:00` # RFC3339
+  - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T
+  - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
+  - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
+  - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
+- An Int64 array/column, values are milliseconds since Epoch UTC
+- Other Timestamp() columns or values
+
+Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to millisecond resolution.
+
+## `to_timestamp_micros`
+
+`to_timestamp_micros()` does conversions to type `Timestamp(Microseconds, None)`, from:
+
+- Timestamp strings, the same as supported by the regular `to_timestamp()` function (except the output is a timestamp of microseconds resolution)
+  - `1997-01-31T09:26:56.123Z` # RFC3339
+  - `1997-01-31T09:26:56.123-05:00` # RFC3339
+  - `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T
+  - `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
+  - `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
+  - `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
+- An Int64 array/column, values are microseconds since Epoch UTC
+- Other Timestamp() columns or values
+
+Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to microsecond resolution.
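As a quick end-to-end illustration of the functions documented above, here is a minimal sketch modeled on the `query_cast_timestamp_millis` test added in this patch. The `events` table, its column, and the data are invented for the example, and the snippet assumes the `datafusion`, `arrow`, and `tokio` crates already used throughout this workspace:

```rust
use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use arrow::util::pretty::print_batches;
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let mut ctx = ExecutionContext::new();

    // A single Int64 column holding milliseconds since the Unix epoch.
    let schema = Arc::new(Schema::new(vec![Field::new("ts", DataType::Int64, true)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![1235865600000, 1238544000000]))],
    )?;
    let table = MemTable::try_new(schema, vec![vec![batch]])?;
    ctx.register_table("events", Arc::new(table))?;

    // to_timestamp_millis() yields Timestamp(Millisecond, None); the string
    // argument is parsed with the same rules as to_timestamp().
    let df = ctx.sql(
        "SELECT to_timestamp_millis(ts) AS ts \
         FROM events \
         WHERE to_timestamp_millis(ts) > to_timestamp_millis('2009-03-15T00:00:00Z')",
    )?;
    print_batches(&df.collect().await?)?;
    Ok(())
}
```

The WHERE clause relies on the `Timestamp(Millisecond, _)` comparison arms wired up in the `binary.rs` hunk earlier in this patch.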
+ +## `to_timestamp_seconds` + +`to_timestamp_seconds()` does conversions to type `Timestamp(Seconds, None)`, from: + +- Timestamp strings, the same as supported by the regular timestamp() function (except the output is a timestamp of secondseconds resolution) + - `1997-01-31T09:26:56.123Z` # RCF3339 + - `1997-01-31T09:26:56.123-05:00` # RCF3339 + - `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space er than T + - `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone et specified + - `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and timezone offset + - `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +- An Int64 array/column, values are seconds since Epoch UTC +- Other Timestamp() columns or values + +Note that `CAST(.. AS Timestamp)` converts to Timestamps with Nanosecond resolution; this function is the only way to convert/cast to seconds resolution. From 05d5f01fa8ec7bf9baa3aa632ccedb914d0b49a2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 21 Jun 2021 18:57:04 +0800 Subject: [PATCH 194/329] implement window functions with partition by (#558) --- datafusion/src/execution/context.rs | 74 +++++++++++++++++++ .../physical_plan/expressions/nth_value.rs | 10 ++- datafusion/src/physical_plan/mod.rs | 36 +++++---- datafusion/src/physical_plan/planner.rs | 6 -- datafusion/src/physical_plan/windows.rs | 61 +++++++++++++-- datafusion/tests/sql.rs | 64 ++++++++++++++++ .../simple_window_partition_aggregation.sql | 26 +++++++ ...ple_window_partition_order_aggregation.sql | 26 +++++++ integration-tests/test_psql_parity.py | 2 +- 9 files changed, 275 insertions(+), 30 deletions(-) create mode 100644 integration-tests/sqls/simple_window_partition_aggregation.sql create mode 100644 integration-tests/sqls/simple_window_partition_order_aggregation.sql diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index ef652c28d1edf..b42695b0c4c64 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1355,6 +1355,80 @@ mod tests { Ok(()) } + #[tokio::test] + async fn window_partition_by() -> Result<()> { + let results = execute( + "SELECT \ + c1, \ + c2, \ + SUM(c2) OVER (PARTITION BY c2), \ + COUNT(c2) OVER (PARTITION BY c2), \ + MAX(c2) OVER (PARTITION BY c2), \ + MIN(c2) OVER (PARTITION BY c2), \ + AVG(c2) OVER (PARTITION BY c2) \ + FROM test \ + ORDER BY c1, c2 \ + LIMIT 5", + 4, + ) + .await?; + + let expected = vec![ + "+----+----+---------+-----------+---------+---------+---------+", + "| c1 | c2 | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", + "+----+----+---------+-----------+---------+---------+---------+", + "| 0 | 1 | 4 | 4 | 1 | 1 | 1 |", + "| 0 | 2 | 8 | 4 | 2 | 2 | 2 |", + "| 0 | 3 | 12 | 4 | 3 | 3 | 3 |", + "| 0 | 4 | 16 | 4 | 4 | 4 | 4 |", + "| 0 | 5 | 20 | 4 | 5 | 5 | 5 |", + "+----+----+---------+-----------+---------+---------+---------+", + ]; + + // window function shall respect ordering + assert_batches_eq!(expected, &results); + Ok(()) + } + + #[tokio::test] + async fn window_partition_by_order_by() -> Result<()> { + let results = execute( + "SELECT \ + c1, \ + c2, \ + ROW_NUMBER() OVER (PARTITION BY c2 ORDER BY c1), \ + FIRST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1), \ + LAST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1), \ + NTH_VALUE(c2 + c1, 2) OVER (PARTITION BY c2 ORDER BY c1), \ + SUM(c2) OVER (PARTITION BY c2 ORDER BY c1), \ + COUNT(c2) OVER (PARTITION BY c2 ORDER BY c1), \ + MAX(c2) OVER (PARTITION BY c2 ORDER BY c1), \ + 
MIN(c2) OVER (PARTITION BY c2 ORDER BY c1), \ + AVG(c2) OVER (PARTITION BY c2 ORDER BY c1) \ + FROM test \ + ORDER BY c1, c2 \ + LIMIT 5", + 4, + ) + .await?; + + let expected = vec![ + "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", + "| c1 | c2 | ROW_NUMBER() | FIRST_VALUE(c2 Plus c1) | LAST_VALUE(c2 Plus c1) | NTH_VALUE(c2 Plus c1,Int64(2)) | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", + "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", + "| 0 | 1 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | 1 |", + "| 0 | 2 | 1 | 2 | 5 | 3 | 2 | 1 | 2 | 2 | 2 |", + "| 0 | 3 | 1 | 3 | 6 | 4 | 3 | 1 | 3 | 3 | 3 |", + "| 0 | 4 | 1 | 4 | 7 | 5 | 4 | 1 | 4 | 4 | 4 |", + "| 0 | 5 | 1 | 5 | 8 | 6 | 5 | 1 | 5 | 5 | 5 |", + "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", + ]; + + // window function shall respect ordering + assert_batches_eq!(expected, &results); + Ok(()) + } + #[tokio::test] async fn aggregate() -> Result<()> { let results = execute("SELECT SUM(c1), SUM(c2) FROM test", 4).await?; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index 98083fa26eaa9..16897d45119f0 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -20,7 +20,7 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::array::{new_empty_array, ArrayRef}; +use arrow::array::{new_empty_array, new_null_array, ArrayRef}; use arrow::datatypes::{DataType, Field}; use std::any::Any; use std::sync::Arc; @@ -135,8 +135,12 @@ impl BuiltInWindowFunctionExpr for NthValue { NthValueKind::Last => (num_rows as usize) - 1, NthValueKind::Nth(n) => (n as usize) - 1, }; - let value = ScalarValue::try_from_array(value, index)?; - Ok(value.to_array_of_size(num_rows)) + Ok(if index >= num_rows { + new_null_array(value.data_type(), num_rows) + } else { + let value = ScalarValue::try_from_array(value, index)?; + value.to_array_of_size(num_rows) + }) } } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 713956f00a9e3..50c30a57b5fea 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -485,19 +485,20 @@ pub trait WindowExpr: Send + Sync + Debug { /// evaluate the window function values against the batch fn evaluate(&self, batch: &RecordBatch) -> Result; - /// evaluate the sort partition points - fn evaluate_sort_partition_points( + /// evaluate the partition points given the sort columns; if the sort columns are + /// empty then the result will be a single element vec of the whole column rows. 
+ fn evaluate_partition_points( &self, - batch: &RecordBatch, + num_rows: usize, + partition_columns: &[SortColumn], ) -> Result>> { - let sort_columns = self.sort_columns(batch)?; - if sort_columns.is_empty() { + if partition_columns.is_empty() { Ok(vec![Range { start: 0, - end: batch.num_rows(), + end: num_rows, }]) } else { - lexicographical_partition_ranges(&sort_columns) + lexicographical_partition_ranges(partition_columns) .map_err(DataFusionError::ArrowError) } } @@ -508,8 +509,8 @@ pub trait WindowExpr: Send + Sync + Debug { /// expressions that's from the window function's order by clause, empty if absent fn order_by(&self) -> &[PhysicalSortExpr]; - /// get sort columns that can be used for partitioning, empty if absent - fn sort_columns(&self, batch: &RecordBatch) -> Result> { + /// get partition columns that can be used for partitioning, empty if absent + fn partition_columns(&self, batch: &RecordBatch) -> Result> { self.partition_by() .iter() .map(|expr| { @@ -519,13 +520,20 @@ pub trait WindowExpr: Send + Sync + Debug { } .evaluate_to_sort_column(batch) }) - .chain( - self.order_by() - .iter() - .map(|e| e.evaluate_to_sort_column(batch)), - ) .collect() } + + /// get sort columns that can be used for peer evaluation, empty if absent + fn sort_columns(&self, batch: &RecordBatch) -> Result> { + let mut sort_columns = self.partition_columns(batch)?; + let order_by_columns = self + .order_by() + .iter() + .map(|e| e.evaluate_to_sort_column(batch)) + .collect::>>()?; + sort_columns.extend(order_by_columns); + Ok(sort_columns) + } } /// An accumulator represents a stateful object that lives throughout the evaluation of multiple rows and diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 1121c28184bd7..af0e60f2194ca 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -775,12 +775,6 @@ impl DefaultPhysicalPlanner { )), }) .collect::>>()?; - if !partition_by.is_empty() { - return Err(DataFusionError::NotImplemented( - "window expression with non-empty partition by clause is not yet supported" - .to_owned(), - )); - } if window_frame.is_some() { return Err(DataFusionError::NotImplemented( "window expression with window frame definition is not yet supported" diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index e5570971cf166..466cc51b447d0 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -175,10 +175,45 @@ impl WindowExpr for BuiltInWindowExpr { // case when partition_by is supported, in which case we'll parallelize the calls. // See https://github.com/apache/arrow-datafusion/issues/299 let values = self.evaluate_args(batch)?; - self.window.evaluate(batch.num_rows(), &values) + let partition_points = self.evaluate_partition_points( + batch.num_rows(), + &self.partition_columns(batch)?, + )?; + let results = partition_points + .iter() + .map(|partition_range| { + let start = partition_range.start; + let len = partition_range.end - start; + let values = values + .iter() + .map(|arr| arr.slice(start, len)) + .collect::>(); + self.window.evaluate(len, &values) + }) + .collect::>>()? 
+ .into_iter() + .collect::>(); + let results = results.iter().map(|i| i.as_ref()).collect::>(); + concat(&results).map_err(DataFusionError::ArrowError) } } +/// Given a partition range, and the full list of sort partition points, given that the sort +/// partition points are sorted using [partition columns..., order columns...], the split +/// boundaries would align (what's sorted on [partition columns...] would definitely be sorted +/// on finer columns), so this will use binary search to find ranges that are within the +/// partition range and return the valid slice. +fn find_ranges_in_range<'a>( + partition_range: &Range, + sort_partition_points: &'a [Range], +) -> &'a [Range] { + let start_idx = sort_partition_points + .partition_point(|sort_range| sort_range.start < partition_range.start); + let end_idx = sort_partition_points + .partition_point(|sort_range| sort_range.end <= partition_range.end); + &sort_partition_points[start_idx..end_idx] +} + /// A window expr that takes the form of an aggregate function #[derive(Debug)] pub struct AggregateWindowExpr { @@ -205,13 +240,27 @@ impl AggregateWindowExpr { /// and then per partition point we'll evaluate the peer group (e.g. SUM or MAX gives the same /// results for peers) and concatenate the results. fn peer_based_evaluate(&self, batch: &RecordBatch) -> Result { - let sort_partition_points = self.evaluate_sort_partition_points(batch)?; - let mut window_accumulators = self.create_accumulator()?; + let num_rows = batch.num_rows(); + let partition_points = + self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; + let sort_partition_points = + self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; let values = self.evaluate_args(batch)?; - let results = sort_partition_points + let results = partition_points .iter() - .map(|peer_range| window_accumulators.scan_peers(&values, peer_range)) - .collect::>>()?; + .map(|partition_range| { + let sort_partition_points = + find_ranges_in_range(partition_range, &sort_partition_points); + let mut window_accumulators = self.create_accumulator()?; + sort_partition_points + .iter() + .map(|range| window_accumulators.scan_peers(&values, range)) + .collect::>>() + }) + .collect::>>>()? 
+ .into_iter() + .flatten() + .collect::>(); let results = results.iter().map(|i| i.as_ref()).collect::>(); concat(&results).map_err(DataFusionError::ArrowError) } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index b6393e91e321b..cfdb6f4bc9e4b 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -868,6 +868,70 @@ async fn csv_query_window_with_empty_over() -> Result<()> { Ok(()) } +#[tokio::test] +async fn csv_query_window_with_partition_by() -> Result<()> { + let mut ctx = ExecutionContext::new(); + register_aggregate_csv(&mut ctx)?; + let sql = "select \ + c9, \ + sum(cast(c4 as Int)) over (partition by c3), \ + avg(cast(c4 as Int)) over (partition by c3), \ + count(cast(c4 as Int)) over (partition by c3), \ + max(cast(c4 as Int)) over (partition by c3), \ + min(cast(c4 as Int)) over (partition by c3), \ + first_value(cast(c4 as Int)) over (partition by c3), \ + last_value(cast(c4 as Int)) over (partition by c3), \ + nth_value(cast(c4 as Int), 2) over (partition by c3) \ + from aggregate_test_100 \ + order by c9 \ + limit 5"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![ + vec![ + "28774375", "-16110", "-16110", "1", "-16110", "-16110", "-16110", "-16110", + "NULL", + ], + vec![ + "63044568", "3917", "3917", "1", "3917", "3917", "3917", "3917", "NULL", + ], + vec![ + "141047417", + "-38455", + "-19227.5", + "2", + "-16974", + "-21481", + "-16974", + "-21481", + "-21481", + ], + vec![ + "141680161", + "-1114", + "-1114", + "1", + "-1114", + "-1114", + "-1114", + "-1114", + "NULL", + ], + vec![ + "145294611", + "15673", + "15673", + "1", + "15673", + "15673", + "15673", + "15673", + "NULL", + ], + ]; + assert_eq!(expected, actual); + Ok(()) +} + #[tokio::test] async fn csv_query_window_with_order_by() -> Result<()> { let mut ctx = ExecutionContext::new(); diff --git a/integration-tests/sqls/simple_window_partition_aggregation.sql b/integration-tests/sqls/simple_window_partition_aggregation.sql new file mode 100644 index 0000000000000..f395671db8cc8 --- /dev/null +++ b/integration-tests/sqls/simple_window_partition_aggregation.sql @@ -0,0 +1,26 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language gOVERning permissions and +-- limitations under the License. 
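The `find_ranges_in_range` helper added above leans on `slice::partition_point` (stable as of Rust 1.52, the toolchain this series moves to) to run two binary searches over the already-sorted peer ranges. A minimal standalone sketch of that lookup, with illustrative names and data that are not taken from the patch:

```rust
use std::ops::Range;

/// Return the peer ranges fully contained in `partition`, assuming the peer
/// ranges are sorted and never straddle a partition boundary; that invariant
/// holds above because peers are split on [partition cols..., order cols...].
fn ranges_within<'a>(
    partition: &Range<usize>,
    peers: &'a [Range<usize>],
) -> &'a [Range<usize>] {
    let start = peers.partition_point(|r| r.start < partition.start);
    let end = peers.partition_point(|r| r.end <= partition.end);
    &peers[start..end]
}

fn main() {
    // Partition [0, 5) contains two peer groups, partition [5, 8) contains one.
    let peers: Vec<Range<usize>> = vec![0..2, 2..5, 5..8];
    assert_eq!(ranges_within(&(0..5), &peers), &peers[0..2]);
    assert_eq!(ranges_within(&(5..8), &peers), &peers[2..3]);
    println!("peer-range lookup ok");
}
```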
+ +SELECT + c9, + row_number() OVER (PARTITION BY c2, c9) AS row_number, + count(c3) OVER (PARTITION BY c2) AS count_c3, + avg(c3) OVER (PARTITION BY c2) AS avg_c3_by_c2, + sum(c3) OVER (PARTITION BY c2) AS sum_c3_by_c2, + max(c3) OVER (PARTITION BY c2) AS max_c3_by_c2, + min(c3) OVER (PARTITION BY c2) AS min_c3_by_c2 +FROM test +ORDER BY c9; diff --git a/integration-tests/sqls/simple_window_partition_order_aggregation.sql b/integration-tests/sqls/simple_window_partition_order_aggregation.sql new file mode 100644 index 0000000000000..a11a9ec6e4b1e --- /dev/null +++ b/integration-tests/sqls/simple_window_partition_order_aggregation.sql @@ -0,0 +1,26 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language gOVERning permissions and +-- limitations under the License. + +SELECT + c9, + row_number() OVER (PARTITION BY c2 ORDER BY c9) AS row_number, + count(c3) OVER (PARTITION BY c2 ORDER BY c9) AS count_c3, + avg(c3) OVER (PARTITION BY c2 ORDER BY c9) AS avg_c3_by_c2, + sum(c3) OVER (PARTITION BY c2 ORDER BY c9) AS sum_c3_by_c2, + max(c3) OVER (PARTITION BY c2 ORDER BY c9) AS max_c3_by_c2, + min(c3) OVER (PARTITION BY c2 ORDER BY c9) AS min_c3_by_c2 +FROM test +ORDER BY c9; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 4e0878c24b818..c4b5a7596ae94 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 7, msg="tests are missed") + self.assertEqual(len(files), 9, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From 0bf1c09b513a68f03378f5466d3e0030dad570a0 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 22 Jun 2021 17:15:25 +0800 Subject: [PATCH 195/329] add benchmark for window functions (#564) --- datafusion/Cargo.toml | 4 + datafusion/benches/aggregate_query_sql.rs | 126 +----------- datafusion/benches/data_utils/mod.rs | 154 +++++++++++++++ datafusion/benches/window_query_sql.rs | 227 ++++++++++++++++++++++ 4 files changed, 393 insertions(+), 118 deletions(-) create mode 100644 datafusion/benches/data_utils/mod.rs create mode 100644 datafusion/benches/window_query_sql.rs diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 5da2469a764fb..a001fc7c58035 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -88,6 +88,10 @@ harness = false name = "filter_query_sql" harness = false +[[bench]] +name = "window_query_sql" +harness = false + [[bench]] name = "scalar" harness = false diff --git a/datafusion/benches/aggregate_query_sql.rs b/datafusion/benches/aggregate_query_sql.rs index 74798ae572cd5..b8fe06fd91452 100644 --- 
a/datafusion/benches/aggregate_query_sql.rs +++ b/datafusion/benches/aggregate_query_sql.rs @@ -17,68 +17,21 @@ #[macro_use] extern crate criterion; -use criterion::Criterion; - -use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; -use std::sync::{Arc, Mutex}; -use tokio::runtime::Runtime; - extern crate arrow; extern crate datafusion; -use arrow::{ - array::Float32Array, - array::Float64Array, - array::StringArray, - array::UInt64Array, - datatypes::{DataType, Field, Schema}, - record_batch::RecordBatch, -}; - -use datafusion::datasource::MemTable; +mod data_utils; +use crate::criterion::Criterion; +use data_utils::create_table_provider; use datafusion::error::Result; use datafusion::execution::context::ExecutionContext; - -pub fn seedable_rng() -> StdRng { - StdRng::seed_from_u64(42) -} +use std::sync::{Arc, Mutex}; +use tokio::runtime::Runtime; fn query(ctx: Arc>, sql: &str) { let rt = Runtime::new().unwrap(); - - // execute the query let df = ctx.lock().unwrap().sql(sql).unwrap(); - rt.block_on(df.collect()).unwrap(); -} - -fn create_data(size: usize, null_density: f64) -> Vec> { - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = seedable_rng(); - - (0..size) - .map(|_| { - if rng.gen::() > null_density { - None - } else { - Some(rng.gen::()) - } - }) - .collect() -} - -fn create_integer_data(size: usize, value_density: f64) -> Vec> { - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = seedable_rng(); - - (0..size) - .map(|_| { - if rng.gen::() > value_density { - None - } else { - Some(rng.gen::()) - } - }) - .collect() + criterion::black_box(rt.block_on(df.collect()).unwrap()); } fn create_context( @@ -86,72 +39,9 @@ fn create_context( array_len: usize, batch_size: usize, ) -> Result>> { - // define a schema. - let schema = Arc::new(Schema::new(vec![ - Field::new("utf8", DataType::Utf8, false), - Field::new("f32", DataType::Float32, false), - Field::new("f64", DataType::Float64, false), - // This field will contain integers randomly selected from a large - // range of values, i.e. [0, u64::MAX], such that there are none (or - // very few) repeated values. - Field::new("u64_wide", DataType::UInt64, false), - // This field will contain integers randomly selected from a narrow - // range of values such that there are a few distinct values, but they - // are repeated often. - Field::new("u64_narrow", DataType::UInt64, false), - ])); - - let mut rng = seedable_rng(); - - // define data. - let partitions = (0..partitions_len) - .map(|_| { - (0..array_len / batch_size / partitions_len) - .map(|i| { - // the 4 here is the number of different keys. - // a higher number increase sparseness - let vs = vec![0, 1, 2, 3]; - let keys: Vec = (0..batch_size) - .map( - // use random numbers to avoid spurious compiler optimizations wrt to branching - |_| format!("hi{:?}", vs.choose(&mut rng)), - ) - .collect(); - let keys: Vec<&str> = keys.iter().map(|e| &**e).collect(); - - let values = create_data(batch_size, 0.5); - - // Integer values between [0, u64::MAX]. - let integer_values_wide = create_integer_data(batch_size, 9.0); - - // Integer values between [0, 9]. 
- let integer_values_narrow_choices = (0..10).collect::>(); - let integer_values_narrow = (0..batch_size) - .map(|_| *integer_values_narrow_choices.choose(&mut rng).unwrap()) - .collect::>(); - - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(keys)), - Arc::new(Float32Array::from(vec![i as f32; batch_size])), - Arc::new(Float64Array::from(values)), - Arc::new(UInt64Array::from(integer_values_wide)), - Arc::new(UInt64Array::from(integer_values_narrow)), - ], - ) - .unwrap() - }) - .collect::>() - }) - .collect::>(); - let mut ctx = ExecutionContext::new(); - - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). - let provider = MemTable::try_new(schema, partitions)?; - ctx.register_table("t", Arc::new(provider))?; - + let provider = create_table_provider(partitions_len, array_len, batch_size)?; + ctx.register_table("t", provider)?; Ok(Arc::new(Mutex::new(ctx))) } diff --git a/datafusion/benches/data_utils/mod.rs b/datafusion/benches/data_utils/mod.rs new file mode 100644 index 0000000000000..4fd8f57fa190e --- /dev/null +++ b/datafusion/benches/data_utils/mod.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module provides the in-memory table for more realistic benchmarking. + +use arrow::{ + array::Float32Array, + array::Float64Array, + array::StringArray, + array::UInt64Array, + datatypes::{DataType, Field, Schema, SchemaRef}, + record_batch::RecordBatch, +}; +use datafusion::datasource::MemTable; +use datafusion::error::Result; +use rand::rngs::StdRng; +use rand::seq::SliceRandom; +use rand::{Rng, SeedableRng}; +use std::sync::Arc; + +/// create an in-memory table given the partition len, array len, and batch size, +/// and the result table will be of array_len in total, and then partitioned, and batched. +pub(crate) fn create_table_provider( + partitions_len: usize, + array_len: usize, + batch_size: usize, +) -> Result> { + let schema = Arc::new(create_schema()); + let partitions = + create_record_batches(schema.clone(), array_len, partitions_len, batch_size); + // declare a table in memory. In spark API, this corresponds to createDataFrame(...). + MemTable::try_new(schema, partitions).map(Arc::new) +} + +/// create a seedable [`StdRng`](rand::StdRng) +fn seedable_rng() -> StdRng { + StdRng::seed_from_u64(42) +} + +fn create_schema() -> Schema { + Schema::new(vec![ + Field::new("utf8", DataType::Utf8, false), + Field::new("f32", DataType::Float32, false), + Field::new("f64", DataType::Float64, false), + // This field will contain integers randomly selected from a large + // range of values, i.e. [0, u64::MAX], such that there are none (or + // very few) repeated values. 
+ Field::new("u64_wide", DataType::UInt64, false), + // This field will contain integers randomly selected from a narrow + // range of values such that there are a few distinct values, but they + // are repeated often. + Field::new("u64_narrow", DataType::UInt64, false), + ]) +} + +fn create_data(size: usize, null_density: f64) -> Vec> { + // use random numbers to avoid spurious compiler optimizations wrt to branching + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() > null_density { + None + } else { + Some(rng.gen::()) + } + }) + .collect() +} + +fn create_integer_data(size: usize, value_density: f64) -> Vec> { + // use random numbers to avoid spurious compiler optimizations wrt to branching + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() > value_density { + None + } else { + Some(rng.gen::()) + } + }) + .collect() +} + +fn create_record_batch( + schema: SchemaRef, + rng: &mut StdRng, + batch_size: usize, + i: usize, +) -> RecordBatch { + // the 4 here is the number of different keys. + // a higher number increase sparseness + let vs = vec![0, 1, 2, 3]; + let keys: Vec = (0..batch_size) + .map( + // use random numbers to avoid spurious compiler optimizations wrt to branching + |_| format!("hi{:?}", vs.choose(rng)), + ) + .collect(); + let keys: Vec<&str> = keys.iter().map(|e| &**e).collect(); + + let values = create_data(batch_size, 0.5); + + // Integer values between [0, u64::MAX]. + let integer_values_wide = create_integer_data(batch_size, 9.0); + + // Integer values between [0, 9]. + let integer_values_narrow = (0..batch_size) + .map(|_| rng.gen_range(0_u64..10)) + .collect::>(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(keys)), + Arc::new(Float32Array::from(vec![i as f32; batch_size])), + Arc::new(Float64Array::from(values)), + Arc::new(UInt64Array::from(integer_values_wide)), + Arc::new(UInt64Array::from(integer_values_narrow)), + ], + ) + .unwrap() +} + +fn create_record_batches( + schema: SchemaRef, + array_len: usize, + partitions_len: usize, + batch_size: usize, +) -> Vec> { + let mut rng = seedable_rng(); + (0..partitions_len) + .map(|_| { + (0..array_len / batch_size / partitions_len) + .map(|i| create_record_batch(schema.clone(), &mut rng, batch_size, i)) + .collect::>() + }) + .collect::>() +} diff --git a/datafusion/benches/window_query_sql.rs b/datafusion/benches/window_query_sql.rs new file mode 100644 index 0000000000000..7c323be2b5edd --- /dev/null +++ b/datafusion/benches/window_query_sql.rs @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#[macro_use] +extern crate criterion; +extern crate arrow; +extern crate datafusion; + +mod data_utils; +use crate::criterion::Criterion; +use data_utils::create_table_provider; +use datafusion::error::Result; +use datafusion::execution::context::ExecutionContext; +use std::sync::{Arc, Mutex}; +use tokio::runtime::Runtime; + +fn query(ctx: Arc>, sql: &str) { + let rt = Runtime::new().unwrap(); + let df = ctx.lock().unwrap().sql(sql).unwrap(); + criterion::black_box(rt.block_on(df.collect()).unwrap()); +} + +fn create_context( + partitions_len: usize, + array_len: usize, + batch_size: usize, +) -> Result>> { + let mut ctx = ExecutionContext::new(); + let provider = create_table_provider(partitions_len, array_len, batch_size)?; + ctx.register_table("t", provider)?; + Ok(Arc::new(Mutex::new(ctx))) +} + +fn criterion_benchmark(c: &mut Criterion) { + let partitions_len = 8; + let array_len = 1024 * 1024; + let batch_size = 8 * 1024; + let ctx = create_context(partitions_len, array_len, batch_size).unwrap(); + + c.bench_function("window empty over, aggregate functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (), \ + MIN(f32) OVER (), \ + SUM(u64_narrow) OVER () \ + FROM t", + ) + }) + }); + + c.bench_function("window empty over, built-in functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (), \ + LAST_VALUE(f32) OVER (), \ + NTH_VALUE(u64_narrow, 50) OVER () \ + FROM t", + ) + }) + }); + + c.bench_function("window order by, aggregate functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (ORDER BY u64_narrow), \ + MIN(f32) OVER (ORDER BY u64_narrow DESC), \ + SUM(u64_narrow) OVER (ORDER BY u64_narrow ASC) \ + FROM t", + ) + }) + }); + + c.bench_function("window order by, built-in functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (ORDER BY u64_narrow), \ + LAST_VALUE(f32) OVER (ORDER BY u64_narrow DESC), \ + NTH_VALUE(u64_narrow, 50) OVER (ORDER BY u64_narrow ASC) \ + FROM t", + ) + }) + }); + + c.bench_function("window partition by, u64_wide, aggregate functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (PARTITION BY u64_wide), \ + MIN(f32) OVER (PARTITION BY u64_wide), \ + SUM(u64_narrow) OVER (PARTITION BY u64_wide) \ + FROM t", + ) + }) + }); + + c.bench_function( + "window partition by, u64_narrow, aggregate functions", + |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (PARTITION BY u64_narrow), \ + MIN(f32) OVER (PARTITION BY u64_narrow), \ + SUM(u64_narrow) OVER (PARTITION BY u64_narrow) \ + FROM t", + ) + }) + }, + ); + + c.bench_function("window partition by, u64_wide, built-in functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (PARTITION BY u64_wide), \ + LAST_VALUE(f32) OVER (PARTITION BY u64_wide), \ + NTH_VALUE(u64_narrow, 50) OVER (PARTITION BY u64_wide) \ + FROM t", + ) + }) + }); + + c.bench_function("window partition by, u64_narrow, built-in functions", |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (PARTITION BY u64_narrow), \ + LAST_VALUE(f32) OVER (PARTITION BY u64_narrow), \ + NTH_VALUE(u64_narrow, 50) OVER (PARTITION BY u64_narrow) \ + FROM t", + ) + }) + }); + + c.bench_function( + "window partition and order by, u64_wide, aggregate functions", + |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (PARTITION BY u64_wide ORDER by f64), \ + MIN(f32) OVER (PARTITION BY u64_wide ORDER 
by f64), \ + SUM(u64_narrow) OVER (PARTITION BY u64_wide ORDER by f64) \ + FROM t", + ) + }) + }, + ); + + c.bench_function( + "window partition and order by, u64_narrow, aggregate functions", + |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + MAX(f64) OVER (PARTITION BY u64_narrow ORDER by f64), \ + MIN(f32) OVER (PARTITION BY u64_narrow ORDER by f64), \ + SUM(u64_narrow) OVER (PARTITION BY u64_narrow ORDER by f64) \ + FROM t", + ) + }) + }, + ); + + c.bench_function( + "window partition and order by, u64_wide, built-in functions", + |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (PARTITION BY u64_wide ORDER by f64), \ + LAST_VALUE(f32) OVER (PARTITION BY u64_wide ORDER by f64), \ + NTH_VALUE(u64_narrow, 50) OVER (PARTITION BY u64_wide ORDER by f64) \ + FROM t", + ) + }) + }, + ); + + c.bench_function( + "window partition and order by, u64_narrow, built-in functions", + |b| { + b.iter(|| { + query( + ctx.clone(), + "SELECT \ + FIRST_VALUE(f64) OVER (PARTITION BY u64_narrow ORDER by f64), \ + LAST_VALUE(f32) OVER (PARTITION BY u64_narrow ORDER by f64), \ + NTH_VALUE(u64_narrow, 50) OVER (PARTITION BY u64_narrow ORDER by f64) \ + FROM t", + ) + }) + }, + ); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From a461e9c1f53a2a89d7bf7fb2e98334c224da13aa Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Tue, 22 Jun 2021 14:02:33 +0200 Subject: [PATCH 196/329] Move ballista standalone mode to client (#589) --- .github/workflows/rust.yml | 4 +- ballista/rust/client/Cargo.toml | 6 ++ ballista/rust/client/src/columnar_batch.rs | 4 - ballista/rust/client/src/context.rs | 70 ++++++++++++++---- ballista/rust/executor/Cargo.toml | 3 +- .../rust/executor/executor_config_spec.toml | 9 --- ballista/rust/executor/src/execution_loop.rs | 3 +- ballista/rust/executor/src/lib.rs | 4 + ballista/rust/executor/src/main.rs | 73 +------------------ ballista/rust/executor/src/standalone.rs | 72 ++++++++++++++++++ ballista/rust/scheduler/Cargo.toml | 3 +- ballista/rust/scheduler/src/lib.rs | 4 + ballista/rust/scheduler/src/standalone.rs | 54 ++++++++++++++ benchmarks/src/bin/tpch.rs | 7 +- dev/docker/ballista-base.dockerfile | 2 +- dev/docker/ballista.dockerfile | 2 +- .../src/distributed/docker-compose.md | 22 ++++-- 17 files changed, 227 insertions(+), 115 deletions(-) create mode 100644 ballista/rust/executor/src/standalone.rs create mode 100644 ballista/rust/scheduler/src/standalone.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7a2890c98b9f4..4a994bfb6b6c9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -69,7 +69,7 @@ jobs: run: | cd ballista/rust # snmalloc requires cmake so build without default features - cargo build --no-default-features + cargo build --no-default-features --features sled env: CARGO_HOME: "/github/home/.cargo" CARGO_TARGET_DIR: "/github/home/target" @@ -131,7 +131,7 @@ jobs: export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data cd ballista/rust # snmalloc requires cmake so build without default features - cargo test --no-default-features + cargo test --no-default-features --features sled env: CARGO_HOME: "/github/home/.cargo" CARGO_TARGET_DIR: "/github/home/target" diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index dd1a57ce14fbe..5c7eb3802a104 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -27,8 +27,14 @@ edition = "2018" [dependencies] ballista-core = { path = "../core" } 
+ballista-executor = { path = "../executor", optional = true } +ballista-scheduler = { path = "../scheduler", optional = true } futures = "0.3" log = "0.4" tokio = "1.0" datafusion = { path = "../../../datafusion" } + +[features] +default = [] +standalone = ["ballista-executor", "ballista-scheduler"] \ No newline at end of file diff --git a/ballista/rust/client/src/columnar_batch.rs b/ballista/rust/client/src/columnar_batch.rs index a40b68ff3ebd7..3431f56128839 100644 --- a/ballista/rust/client/src/columnar_batch.rs +++ b/ballista/rust/client/src/columnar_batch.rs @@ -29,9 +29,7 @@ use datafusion::scalar::ScalarValue; pub type MaybeColumnarBatch = Result>; /// Batch of columnar data. -#[allow(dead_code)] #[derive(Debug, Clone)] - pub struct ColumnarBatch { schema: Arc, columns: HashMap, @@ -112,9 +110,7 @@ impl ColumnarBatch { } /// A columnar value can either be a scalar value or an Arrow array. -#[allow(dead_code)] #[derive(Debug, Clone)] - pub enum ColumnarValue { Scalar(ScalarValue, usize), Columnar(ArrayRef), diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 695045d220d07..aca712e1d8782 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -44,7 +44,6 @@ use futures::future; use futures::StreamExt; use log::{error, info}; -#[allow(dead_code)] struct BallistaContextState { /// Scheduler host scheduler_host: String, @@ -52,26 +51,48 @@ struct BallistaContextState { scheduler_port: u16, /// Tables that have been registered with this context tables: HashMap, - /// General purpose settings - settings: HashMap, } impl BallistaContextState { - pub fn new( - scheduler_host: String, - scheduler_port: u16, - settings: HashMap, - ) -> Self { + pub fn new(scheduler_host: String, scheduler_port: u16) -> Self { Self { scheduler_host, scheduler_port, tables: HashMap::new(), - settings, } } -} -#[allow(dead_code)] + #[cfg(feature = "standalone")] + pub async fn new_standalone( + concurrent_tasks: usize, + ) -> ballista_core::error::Result { + info!("Running in local mode. 
Scheduler will be run in-proc"); + + let addr = ballista_scheduler::new_standalone_scheduler().await?; + + let scheduler = loop { + match SchedulerGrpcClient::connect(format!( + "http://localhost:{}", + addr.port() + )) + .await + { + Err(_) => { + tokio::time::sleep(Duration::from_millis(100)).await; + info!("Attempting to connect to in-proc scheduler..."); + } + Ok(scheduler) => break scheduler, + } + }; + + ballista_executor::new_standalone_executor(scheduler, concurrent_tasks).await?; + Ok(Self { + scheduler_host: "localhost".to_string(), + scheduler_port: addr.port(), + tables: HashMap::new(), + }) + } +} pub struct BallistaContext { state: Arc>, @@ -79,14 +100,25 @@ pub struct BallistaContext { impl BallistaContext { /// Create a context for executing queries against a remote Ballista scheduler instance - pub fn remote(host: &str, port: u16, settings: HashMap) -> Self { - let state = BallistaContextState::new(host.to_owned(), port, settings); + pub fn remote(host: &str, port: u16) -> Self { + let state = BallistaContextState::new(host.to_owned(), port); Self { state: Arc::new(Mutex::new(state)), } } + #[cfg(feature = "standalone")] + pub async fn standalone( + concurrent_tasks: usize, + ) -> ballista_core::error::Result { + let state = BallistaContextState::new_standalone(concurrent_tasks).await?; + + Ok(Self { + state: Arc::new(Mutex::new(state)), + }) + } + /// Create a DataFrame representing a Parquet table scan pub fn read_parquet(&self, path: &str) -> Result> { @@ -268,3 +300,15 @@ impl BallistaContext { } } } + +#[cfg(test)] +mod tests { + #[tokio::test] + #[cfg(feature = "standalone")] + async fn test_standalone_mode() { + use super::*; + let context = BallistaContext::standalone(1).await.unwrap(); + let df = context.sql("SELECT 1;").unwrap(); + context.collect(&df.to_logical_plan()).await.unwrap(); + } +} diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 7574fca82774d..68e4920f3b40b 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -32,7 +32,6 @@ snmalloc = ["snmalloc-rs"] anyhow = "1" async-trait = "0.1.36" ballista-core = { path = "../core" } -ballista-scheduler = { path = "../scheduler" } configure_me = "0.4.0" env_logger = "0.8" futures = "0.3" @@ -40,7 +39,7 @@ log = "0.4" snmalloc-rs = {version = "0.2", features= ["cache-friendly"], optional = true} tempfile = "3" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } -tokio-stream = "0.1" +tokio-stream = { version = "0.1", features = ["net"] } tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } diff --git a/ballista/rust/executor/executor_config_spec.toml b/ballista/rust/executor/executor_config_spec.toml index 3cb168e772416..6f170c85e8234 100644 --- a/ballista/rust/executor/executor_config_spec.toml +++ b/ballista/rust/executor/executor_config_spec.toml @@ -36,10 +36,6 @@ type = "u16" default = "50050" doc = "scheduler port" -[[switch]] -name = "local" -doc = "Running in local mode will launch a standalone scheduler inside the executor process. This will create a single-executor cluster, and is useful for development scenarios." - [[param]] name = "bind_host" type = "String" @@ -69,8 +65,3 @@ name = "concurrent_tasks" type = "usize" default = "4" doc = "Max concurrent tasks." 
- -[[param]] -name = "scheduler_data_path" -type = "String" -doc = "Path for standalone data" diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index afc6f0089b921..6eb4713f5e396 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -29,9 +29,10 @@ use ballista_core::serde::protobuf::{ self, scheduler_grpc_client::SchedulerGrpcClient, task_status, FailedTask, PartitionId, PollWorkParams, PollWorkResult, TaskDefinition, TaskStatus, }; -use ballista_executor::executor::Executor; use protobuf::CompletedTask; +use crate::executor::Executor; + pub async fn poll_loop( mut scheduler: SchedulerGrpcClient, executor: Arc, diff --git a/ballista/rust/executor/src/lib.rs b/ballista/rust/executor/src/lib.rs index 188b9449db927..f3ab7dcf5ae51 100644 --- a/ballista/rust/executor/src/lib.rs +++ b/ballista/rust/executor/src/lib.rs @@ -18,5 +18,9 @@ //! Core executor logic for executing queries and storing results in memory. pub mod collect; +pub mod execution_loop; pub mod executor; pub mod flight_service; + +mod standalone; +pub use standalone::new_standalone_executor; diff --git a/ballista/rust/executor/src/main.rs b/ballista/rust/executor/src/main.rs index 4c63ba89680a1..b411a776f8291 100644 --- a/ballista/rust/executor/src/main.rs +++ b/ballista/rust/executor/src/main.rs @@ -17,14 +17,11 @@ //! Ballista Rust executor binary. -use std::{ - net::{IpAddr, Ipv4Addr}, - sync::Arc, -}; +use std::sync::Arc; use anyhow::{Context, Result}; use arrow_flight::flight_service_server::FlightServiceServer; -use futures::future::MaybeDone; +use ballista_executor::execution_loop; use log::info; use tempfile::TempDir; use tonic::transport::Server; @@ -34,17 +31,11 @@ use ballista_core::serde::protobuf::{ executor_registration, scheduler_grpc_client::SchedulerGrpcClient, ExecutorRegistration, }; -use ballista_core::{ - print_version, serde::protobuf::scheduler_grpc_server::SchedulerGrpcServer, - BALLISTA_VERSION, -}; +use ballista_core::{print_version, BALLISTA_VERSION}; use ballista_executor::executor::Executor; use ballista_executor::flight_service::BallistaFlightService; -use ballista_scheduler::{state::StandaloneClient, SchedulerServer}; use config::prelude::*; -mod execution_loop; - #[macro_use] extern crate configure_me; @@ -82,11 +73,7 @@ async fn main() -> Result<()> { .parse() .with_context(|| format!("Could not parse address: {}", addr))?; - let scheduler_host = if opt.local { - "localhost".to_string() - } else { - opt.scheduler_host - }; + let scheduler_host = opt.scheduler_host; let scheduler_port = opt.scheduler_port; let scheduler_url = format!("http://{}:{}", scheduler_host, scheduler_port); @@ -109,58 +96,6 @@ async fn main() -> Result<()> { port: port as u32, }; - if opt.local { - info!("Running in local mode. 
Scheduler will be run in-proc"); - - let client = match opt.scheduler_data_path { - Some(v) => StandaloneClient::try_new(v) - .context("Could not create standalone config backend")?, - None => StandaloneClient::try_new_temporary() - .context("Could not create standalone config backend")?, - }; - - let server = SchedulerGrpcServer::new(SchedulerServer::new( - Arc::new(client), - "ballista".to_string(), - IpAddr::V4(Ipv4Addr::LOCALHOST), - )); - let addr = format!("localhost:{}", scheduler_port); - let addr = addr - .parse() - .with_context(|| format!("Could not parse {}", addr))?; - info!( - "Ballista v{} Rust Scheduler listening on {:?}", - BALLISTA_VERSION, addr - ); - let scheduler_future = - tokio::spawn(Server::builder().add_service(server).serve(addr)); - let mut scheduler_result = futures::future::maybe_done(scheduler_future); - - // Ensure scheduler is ready to receive connections - while SchedulerGrpcClient::connect(scheduler_url.clone()) - .await - .is_err() - { - let scheduler_future = match scheduler_result { - MaybeDone::Future(f) => f, - MaybeDone::Done(Err(e)) => return Err(e).context("Tokio error"), - MaybeDone::Done(Ok(Err(e))) => { - return Err(e).context("Scheduler failed to initialize correctly") - } - MaybeDone::Done(Ok(Ok(()))) => { - return Err(anyhow::format_err!( - "Scheduler unexpectedly finished successfully" - )) - } - MaybeDone::Gone => { - panic!("Received Gone from recently created MaybeDone") - } - }; - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - scheduler_result = futures::future::maybe_done(scheduler_future); - } - } - let scheduler = SchedulerGrpcClient::connect(scheduler_url) .await .context("Could not connect to scheduler")?; diff --git a/ballista/rust/executor/src/standalone.rs b/ballista/rust/executor/src/standalone.rs new file mode 100644 index 0000000000000..39a899c6c630c --- /dev/null +++ b/ballista/rust/executor/src/standalone.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow_flight::flight_service_server::FlightServiceServer; +use ballista_core::{ + error::Result, + serde::protobuf::{scheduler_grpc_client::SchedulerGrpcClient, ExecutorRegistration}, + BALLISTA_VERSION, +}; +use log::info; +use tempfile::TempDir; +use tokio::net::TcpListener; +use tonic::transport::{Channel, Server}; +use uuid::Uuid; + +use crate::{execution_loop, executor::Executor, flight_service::BallistaFlightService}; + +pub async fn new_standalone_executor( + scheduler: SchedulerGrpcClient, + concurrent_tasks: usize, +) -> Result<()> { + let work_dir = TempDir::new()? 
+ .into_path() + .into_os_string() + .into_string() + .unwrap(); + let executor = Arc::new(Executor::new(&work_dir)); + + let service = BallistaFlightService::new(executor.clone()); + + let server = FlightServiceServer::new(service); + // Let the OS assign a random, free port + let listener = TcpListener::bind("localhost:0").await?; + let addr = listener.local_addr()?; + info!( + "Ballista v{} Rust Executor listening on {:?}", + BALLISTA_VERSION, addr + ); + tokio::spawn( + Server::builder().add_service(server).serve_with_incoming( + tokio_stream::wrappers::TcpListenerStream::new(listener), + ), + ); + let executor_meta = ExecutorRegistration { + id: Uuid::new_v4().to_string(), // assign this executor a unique ID + optional_host: None, + port: addr.port() as u32, + }; + tokio::spawn(execution_loop::poll_loop( + scheduler, + executor, + executor_meta, + concurrent_tasks, + )); + Ok(()) +} diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index c009cc6a12bef..215c58a7fb3fa 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -28,7 +28,7 @@ edition = "2018" [features] default = ["etcd", "sled"] etcd = ["etcd-client"] -sled = ["sled_package"] +sled = ["sled_package", "tokio-stream"] [dependencies] anyhow = "1" @@ -48,6 +48,7 @@ rand = "0.8" serde = {version = "1", features = ["derive"]} sled_package = { package = "sled", version = "0.34", optional = true } tokio = { version = "1.0", features = ["full"] } +tokio-stream = { version = "0.1", features = ["net"], optional = true } tonic = "0.4" tower = { version = "0.4" } warp = "0.3" diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 99c7be66a646c..54cba48db2e54 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -19,7 +19,11 @@ pub mod api; pub mod planner; +#[cfg(feature = "sled")] +mod standalone; pub mod state; +#[cfg(feature = "sled")] +pub use standalone::new_standalone_scheduler; #[cfg(test)] pub mod test_utils; diff --git a/ballista/rust/scheduler/src/standalone.rs b/ballista/rust/scheduler/src/standalone.rs new file mode 100644 index 0000000000000..6ab5bd62a8f03 --- /dev/null +++ b/ballista/rust/scheduler/src/standalone.rs @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
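The standalone executor above and the standalone scheduler that follows both bind to `localhost:0` and read the OS-assigned port back from `local_addr()`, which is how `BallistaContextState::new_standalone` learns where to connect. A minimal sketch of that idiom on its own (assumes `tokio` with its `full` feature set):

```rust
use tokio::net::TcpListener;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Port 0 asks the OS for any free port; the actual port is only known
    // after the bind succeeds, so it is read back from the listener.
    let listener = TcpListener::bind("localhost:0").await?;
    let addr = listener.local_addr()?;
    println!("listening on {}", addr);
    Ok(())
}
```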
+ +use ballista_core::{ + error::Result, serde::protobuf::scheduler_grpc_server::SchedulerGrpcServer, + BALLISTA_VERSION, +}; +use log::info; +use std::{ + net::{IpAddr, Ipv4Addr, SocketAddr}, + sync::Arc, +}; +use tokio::net::TcpListener; +use tonic::transport::Server; + +use crate::{state::StandaloneClient, SchedulerServer}; + +pub async fn new_standalone_scheduler() -> Result { + let client = StandaloneClient::try_new_temporary()?; + + let server = SchedulerGrpcServer::new(SchedulerServer::new( + Arc::new(client), + "ballista".to_string(), + IpAddr::V4(Ipv4Addr::LOCALHOST), + )); + // Let the OS assign a random, free port + let listener = TcpListener::bind("localhost:0").await?; + let addr = listener.local_addr()?; + info!( + "Ballista v{} Rust Scheduler listening on {:?}", + BALLISTA_VERSION, addr + ); + tokio::spawn( + Server::builder().add_service(server).serve_with_incoming( + tokio_stream::wrappers::TcpListenerStream::new(listener), + ), + ); + + Ok(addr) +} diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 34b8d3a27b194..08c47630fa18a 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -18,7 +18,6 @@ //! Benchmark derived from TPC-H. This is not an official TPC-H benchmark. use std::{ - collections::HashMap, fs, iter::Iterator, path::{Path, PathBuf}, @@ -252,11 +251,7 @@ async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result Result<()> { println!("Running benchmarks with the following options: {:?}", opt); - let mut settings = HashMap::new(); - settings.insert("batch.size".to_owned(), format!("{}", opt.batch_size)); - - let ctx = - BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap(), settings); + let ctx = BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap()); // register tables with Ballista context let path = opt.path.to_str().unwrap(); diff --git a/dev/docker/ballista-base.dockerfile b/dev/docker/ballista-base.dockerfile index 31620b38cf39b..e977f5eeff752 100644 --- a/dev/docker/ballista-base.dockerfile +++ b/dev/docker/ballista-base.dockerfile @@ -23,7 +23,7 @@ # Base image extends debian:buster-slim -FROM rust:1.51.0-buster AS builder +FROM rust:1.52.1-buster AS builder RUN apt update && apt -y install musl musl-dev musl-tools libssl-dev openssl diff --git a/dev/docker/ballista.dockerfile b/dev/docker/ballista.dockerfile index 59f21b76d411b..730e86749a63b 100644 --- a/dev/docker/ballista.dockerfile +++ b/dev/docker/ballista.dockerfile @@ -91,4 +91,4 @@ COPY benchmarks/queries/ /queries/ ENV RUST_LOG=info ENV RUST_BACKTRACE=full -CMD ["/executor", "--local"] +CMD ["/executor"] diff --git a/docs/user-guide/src/distributed/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md index 5ea86b5caea4b..14989e58034d0 100644 --- a/docs/user-guide/src/distributed/docker-compose.md +++ b/docs/user-guide/src/distributed/docker-compose.md @@ -24,23 +24,33 @@ demonstrates how to start a cluster using a single process that acts as both a s volume mounted into the container so that Ballista can access the host file system. 
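Once a cluster like the one in the updated compose file below is up, a client reaches the scheduler on port 50050 through the two-argument `BallistaContext::remote` introduced in this patch (the settings map was dropped in the tpch benchmark change above). A rough sketch mirroring the standalone test earlier in this patch; the crate path in the `use` line is an assumption, not taken from the diff:

```rust
// Hypothetical client for the compose cluster: connect to the scheduler,
// run a trivial query, and collect the record batches locally.
use ballista::context::BallistaContext;

#[tokio::main]
async fn main() {
    let ctx = BallistaContext::remote("localhost", 50050);
    let df = ctx.sql("SELECT 1;").unwrap();
    let batches = ctx.collect(&df.to_logical_plan()).await.unwrap();
    println!("{} record batches returned", batches.len());
}
```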
```yaml -version: "2.0" +version: '2.2' services: etcd: image: quay.io/coreos/etcd:v3.4.9 command: "etcd -advertise-client-urls http://etcd:2379 -listen-client-urls http://0.0.0.0:2379" + ballista-scheduler: + image: ballista:0.5.0-SNAPSHOT + command: "/scheduler --config-backend etcd --etcd-urls etcd:2379 --bind-host 0.0.0.0 --bind-port 50050" ports: - - "2379:2379" - ballista-executor: - image: ballistacompute/ballista-rust:0.4.2-SNAPSHOT - command: "/executor --bind-host 0.0.0.0 --bind-port 50051 --local" + - "50050:50050" environment: - RUST_LOG=info + volumes: + - ./data:/data + depends_on: + - etcd + ballista-executor: + image: ballista:0.5.0-SNAPSHOT + command: "/executor --bind-host 0.0.0.0 --bind-port 50051 --scheduler-host ballista-scheduler" ports: - - "50050:50050" - "50051:50051" + environment: + - RUST_LOG=info volumes: - ./data:/data + depends_on: + - ballista-scheduler ``` With the above content saved to a `docker-compose.yaml` file, the following command can be used to start the single From f2c01de7d620081eb370966d928673c8d38ac798 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Tue, 22 Jun 2021 11:06:39 -0700 Subject: [PATCH 197/329] Support qualified columns in queries (#55) * support qualified columns in queries * handle coalesced hash join partition in HashJoinStream * implement Into for &str * add todo for ARROW-10971 * fix cross join handling in production push down optimizer When a projection is pushed down to cross join inputs, fields from resulting plan's schema need to be trimmed to only contain projected fields. * maintain field order during plan optimization using projections * change TableScane name from Option to String * WIP: fix ballista * separate logical and physical expressions in proto, fix ballista build * fix join schema handling in production push down optimizer schema needs to be recalculated based on newly optimized inputs * tpch 7 & 8 are now passing! * fix roundtrip_join test * fix clippy warnings * fix sql planner test error checking with matches `format("{:?}", err)` yields different results between stable and nightly rust. 
* address FIXMEs * honor datafusion field name semantic strip qualifer name in physical field names * add more comment * enable more queries in benchmark/run.sh * use unzip to avoid unnecessary iterators * reduce diff by discarding style related changes * simplify hash_join tests * reduce diff for easier revuew * fix unnecessary reference clippy error * incorporate code review feedback * fix window schema handling in projection pushdown optimizer --- ballista/rust/core/proto/ballista.proto | 175 +++- .../core/src/serde/logical_plan/from_proto.rs | 275 ++---- .../rust/core/src/serde/logical_plan/mod.rs | 31 +- .../core/src/serde/logical_plan/to_proto.rs | 54 +- ballista/rust/core/src/serde/mod.rs | 224 +++++ .../src/serde/physical_plan/from_proto.rs | 364 ++++++-- .../rust/core/src/serde/physical_plan/mod.rs | 32 +- .../core/src/serde/physical_plan/to_proto.rs | 160 ++-- benchmarks/run.sh | 2 +- benchmarks/src/bin/tpch.rs | 10 + datafusion/src/dataframe.rs | 2 + datafusion/src/execution/context.rs | 52 +- datafusion/src/execution/dataframe_impl.rs | 7 +- datafusion/src/logical_plan/builder.rs | 332 ++++--- datafusion/src/logical_plan/dfschema.rs | 139 ++- datafusion/src/logical_plan/expr.rs | 208 ++++- datafusion/src/logical_plan/mod.rs | 22 +- datafusion/src/logical_plan/plan.rs | 86 +- datafusion/src/optimizer/constant_folding.rs | 32 +- datafusion/src/optimizer/eliminate_limit.rs | 2 +- datafusion/src/optimizer/filter_push_down.rs | 234 ++--- .../src/optimizer/hash_build_probe_order.rs | 29 +- datafusion/src/optimizer/limit_push_down.rs | 6 +- .../src/optimizer/projection_push_down.rs | 272 ++++-- .../src/optimizer/simplify_expressions.rs | 8 +- datafusion/src/optimizer/utils.rs | 71 +- datafusion/src/physical_optimizer/pruning.rs | 155 ++-- .../src/physical_plan/expressions/binary.rs | 41 +- .../src/physical_plan/expressions/case.rs | 20 +- .../src/physical_plan/expressions/cast.rs | 12 +- .../src/physical_plan/expressions/column.rs | 33 +- .../src/physical_plan/expressions/in_list.rs | 112 ++- .../physical_plan/expressions/is_not_null.rs | 2 +- .../src/physical_plan/expressions/is_null.rs | 5 +- .../src/physical_plan/expressions/min_max.rs | 2 +- .../src/physical_plan/expressions/mod.rs | 8 +- .../src/physical_plan/expressions/not.rs | 4 +- .../physical_plan/expressions/nth_value.rs | 32 +- .../src/physical_plan/expressions/try_cast.rs | 9 +- datafusion/src/physical_plan/filter.rs | 4 +- datafusion/src/physical_plan/functions.rs | 4 +- .../src/physical_plan/hash_aggregate.rs | 55 +- datafusion/src/physical_plan/hash_join.rs | 419 +++++++-- datafusion/src/physical_plan/hash_utils.rs | 96 +- datafusion/src/physical_plan/mod.rs | 4 +- datafusion/src/physical_plan/parquet.rs | 9 +- datafusion/src/physical_plan/planner.rs | 511 ++++++++--- datafusion/src/physical_plan/projection.rs | 6 +- datafusion/src/physical_plan/repartition.rs | 10 +- datafusion/src/physical_plan/sort.rs | 10 +- .../physical_plan/sort_preserving_merge.rs | 84 +- datafusion/src/physical_plan/type_coercion.rs | 4 +- datafusion/src/physical_plan/windows.rs | 10 +- datafusion/src/prelude.rs | 2 +- datafusion/src/sql/planner.rs | 831 +++++++++--------- datafusion/src/sql/utils.rs | 12 +- datafusion/src/test/mod.rs | 2 +- datafusion/tests/custom_sources.rs | 11 +- datafusion/tests/sql.rs | 118 +-- datafusion/tests/user_defined_plan.rs | 15 +- integration-tests/test_psql_parity.py | 2 +- 61 files changed, 3603 insertions(+), 1880 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto 
b/ballista/rust/core/proto/ballista.proto index 5aafd00cf1b05..d75cbaa73efe0 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -28,11 +28,29 @@ option java_outer_classname = "BallistaProto"; // Ballista Logical Plan /////////////////////////////////////////////////////////////////////////////////////////////////// +message ColumnRelation { + string relation = 1; +} + +message Column { + string name = 1; + ColumnRelation relation = 2; +} + +message DfField{ + Field field = 1; + ColumnRelation qualifier = 2; +} + +message DfSchema { + repeated DfField columns = 1; +} + // logical expressions message LogicalExprNode { oneof ExprType { // column references - string column_name = 1; + Column column = 1; // alias AliasNode alias = 2; @@ -295,7 +313,7 @@ message CreateExternalTableNode{ string location = 2; FileType file_type = 3; bool has_header = 4; - Schema schema = 5; + DfSchema schema = 5; } enum FileType{ @@ -309,11 +327,6 @@ message ExplainNode{ bool verbose = 2; } -message DfField{ - string qualifier = 2; - Field field = 1; -} - message AggregateNode { LogicalPlanNode input = 1; repeated LogicalExprNode group_expr = 2; @@ -369,8 +382,8 @@ message JoinNode { LogicalPlanNode left = 1; LogicalPlanNode right = 2; JoinType join_type = 3; - repeated string left_join_column = 4; - repeated string right_join_column = 5; + repeated Column left_join_column = 4; + repeated Column right_join_column = 5; } message LimitNode { @@ -408,6 +421,119 @@ message PhysicalPlanNode { } } +// physical expressions +message PhysicalExprNode { + oneof ExprType { + // column references + PhysicalColumn column = 1; + + ScalarValue literal = 2; + + // binary expressions + PhysicalBinaryExprNode binary_expr = 3; + + // aggregate expressions + PhysicalAggregateExprNode aggregate_expr = 4; + + // null checks + PhysicalIsNull is_null_expr = 5; + PhysicalIsNotNull is_not_null_expr = 6; + PhysicalNot not_expr = 7; + + PhysicalCaseNode case_ = 8; + PhysicalCastNode cast = 9; + PhysicalSortExprNode sort = 10; + PhysicalNegativeNode negative = 11; + PhysicalInListNode in_list = 12; + PhysicalScalarFunctionNode scalar_function = 13; + PhysicalTryCastNode try_cast = 14; + + // window expressions + PhysicalWindowExprNode window_expr = 15; + } +} + +message PhysicalAggregateExprNode { + AggregateFunction aggr_function = 1; + PhysicalExprNode expr = 2; +} + +message PhysicalWindowExprNode { + oneof window_function { + AggregateFunction aggr_function = 1; + BuiltInWindowFunction built_in_function = 2; + // udaf = 3 + } + PhysicalExprNode expr = 4; +} + +message PhysicalIsNull { + PhysicalExprNode expr = 1; +} + +message PhysicalIsNotNull { + PhysicalExprNode expr = 1; +} + +message PhysicalNot { + PhysicalExprNode expr = 1; +} + +message PhysicalAliasNode { + PhysicalExprNode expr = 1; + string alias = 2; +} + +message PhysicalBinaryExprNode { + PhysicalExprNode l = 1; + PhysicalExprNode r = 2; + string op = 3; +} + +message PhysicalSortExprNode { + PhysicalExprNode expr = 1; + bool asc = 2; + bool nulls_first = 3; +} + +message PhysicalWhenThen { + PhysicalExprNode when_expr = 1; + PhysicalExprNode then_expr = 2; +} + +message PhysicalInListNode { + PhysicalExprNode expr = 1; + repeated PhysicalExprNode list = 2; + bool negated = 3; +} + +message PhysicalCaseNode { + PhysicalExprNode expr = 1; + repeated PhysicalWhenThen when_then_expr = 2; + PhysicalExprNode else_expr = 3; +} + +message PhysicalScalarFunctionNode { + string name = 1; + ScalarFunction fun = 2; + repeated 
PhysicalExprNode args = 3; + ArrowType return_type = 4; +} + +message PhysicalTryCastNode { + PhysicalExprNode expr = 1; + ArrowType arrow_type = 2; +} + +message PhysicalCastNode { + PhysicalExprNode expr = 1; + ArrowType arrow_type = 2; +} + +message PhysicalNegativeNode { + PhysicalExprNode expr = 1; +} + message UnresolvedShuffleExecNode { repeated uint32 query_stage_ids = 1; Schema schema = 2; @@ -416,7 +542,7 @@ message UnresolvedShuffleExecNode { message FilterExecNode { PhysicalPlanNode input = 1; - LogicalExprNode expr = 2; + PhysicalExprNode expr = 2; } message ParquetScanExecNode { @@ -447,11 +573,15 @@ message HashJoinExecNode { } -message JoinOn { - string left = 1; - string right = 2; +message PhysicalColumn { + string name = 1; + uint32 index = 2; } +message JoinOn { + PhysicalColumn left = 1; + PhysicalColumn right = 2; +} message EmptyExecNode { bool produce_one_row = 1; @@ -460,7 +590,7 @@ message EmptyExecNode { message ProjectionExecNode { PhysicalPlanNode input = 1; - repeated LogicalExprNode expr = 2; + repeated PhysicalExprNode expr = 2; repeated string expr_name = 3; } @@ -472,14 +602,14 @@ enum AggregateMode { message WindowAggExecNode { PhysicalPlanNode input = 1; - repeated LogicalExprNode window_expr = 2; + repeated PhysicalExprNode window_expr = 2; repeated string window_expr_name = 3; Schema input_schema = 4; } message HashAggregateExecNode { - repeated LogicalExprNode group_expr = 1; - repeated LogicalExprNode aggr_expr = 2; + repeated PhysicalExprNode group_expr = 1; + repeated PhysicalExprNode aggr_expr = 2; AggregateMode mode = 3; PhysicalPlanNode input = 4; repeated string group_expr_name = 5; @@ -510,7 +640,7 @@ message LocalLimitExecNode { message SortExecNode { PhysicalPlanNode input = 1; - repeated LogicalExprNode expr = 2; + repeated PhysicalExprNode expr = 2; } message CoalesceBatchesExecNode { @@ -522,11 +652,16 @@ message MergeExecNode { PhysicalPlanNode input = 1; } +message PhysicalHashRepartition { + repeated PhysicalExprNode hash_expr = 1; + uint64 partition_count = 2; +} + message RepartitionExecNode{ PhysicalPlanNode input = 1; oneof partition_method { uint64 round_robin = 2; - HashRepartition hash = 3; + PhysicalHashRepartition hash = 3; uint64 unknown = 4; } } @@ -803,7 +938,7 @@ message ScalarListValue{ message ScalarValue{ - oneof value{ + oneof value { bool bool_value = 1; string utf8_value = 2; string large_utf8_value = 3; diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index c2c1001b939c1..1b7deb7b7126c 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -18,7 +18,7 @@ //! Serde code to convert from protocol buffers to Rust data structures. 
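Commit 197 teaches the Ballista serde layer about qualified columns: the new `protobuf::Column` message carries an optional `ColumnRelation`, and the `from_proto` code that follows maps it onto DataFusion's `Column { relation, name }`. A self-contained sketch of that mapping, using stand-in structs rather than the prost-generated types:

```rust
/// Stand-ins for the prost-generated messages (illustrative only).
#[derive(Clone)]
struct ProtoColumnRelation {
    relation: String,
}

#[derive(Clone)]
struct ProtoColumn {
    name: String,
    relation: Option<ProtoColumnRelation>,
}

/// Stand-in for DataFusion's logical_plan::Column.
#[derive(Debug, PartialEq)]
struct Column {
    relation: Option<String>,
    name: String,
}

impl From<&ProtoColumn> for Column {
    fn from(c: &ProtoColumn) -> Self {
        let c = c.clone();
        Column {
            relation: c.relation.map(|r| r.relation),
            name: c.name,
        }
    }
}

fn main() {
    // A "t1.id" style qualified reference versus a bare "id".
    let qualified = ProtoColumn {
        name: "id".to_string(),
        relation: Some(ProtoColumnRelation { relation: "t1".to_string() }),
    };
    let bare = ProtoColumn { name: "id".to_string(), relation: None };
    assert_eq!(Column::from(&qualified).relation.as_deref(), Some("t1"));
    assert_eq!(Column::from(&bare).relation, None);
}
```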
use crate::error::BallistaError; -use crate::serde::{proto_error, protobuf}; +use crate::serde::{from_proto_binary_op, proto_error, protobuf}; use crate::{convert_box_required, convert_required}; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::logical_plan::window_frames::{ @@ -26,7 +26,8 @@ use datafusion::logical_plan::window_frames::{ }; use datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, - sqrt, tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, + sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinType, LogicalPlan, + LogicalPlanBuilder, Operator, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; @@ -36,6 +37,7 @@ use protobuf::logical_plan_node::LogicalPlanType; use protobuf::{logical_expr_node::ExprType, scalar_type}; use std::{ convert::{From, TryInto}, + sync::Arc, unimplemented, }; @@ -115,8 +117,8 @@ impl TryInto for &protobuf::LogicalPlanNode { .has_header(scan.has_header); let mut projection = None; - if let Some(column_names) = &scan.projection { - let column_indices = column_names + if let Some(columns) = &scan.projection { + let column_indices = columns .columns .iter() .map(|name| schema.index_of(name)) @@ -234,10 +236,10 @@ impl TryInto for &protobuf::LogicalPlanNode { .map_err(|e| e.into()) } LogicalPlanType::Join(join) => { - let left_keys: Vec<&str> = - join.left_join_column.iter().map(|i| i.as_str()).collect(); - let right_keys: Vec<&str> = - join.right_join_column.iter().map(|i| i.as_str()).collect(); + let left_keys: Vec = + join.left_join_column.iter().map(|i| i.into()).collect(); + let right_keys: Vec = + join.right_join_column.iter().map(|i| i.into()).collect(); let join_type = protobuf::JoinType::from_i32(join.join_type).ok_or_else(|| { proto_error(format!( @@ -257,8 +259,8 @@ impl TryInto for &protobuf::LogicalPlanNode { .join( &convert_box_required!(join.right)?, join_type, - &left_keys, - &right_keys, + left_keys, + right_keys, )? .build() .map_err(|e| e.into()) @@ -267,22 +269,48 @@ impl TryInto for &protobuf::LogicalPlanNode { } } -impl TryInto for protobuf::Schema { +impl From<&protobuf::Column> for Column { + fn from(c: &protobuf::Column) -> Column { + let c = c.clone(); + Column { + relation: c.relation.map(|r| r.relation), + name: c.name, + } + } +} + +impl TryInto for &protobuf::DfSchema { type Error = BallistaError; - fn try_into(self) -> Result { - let schema: Schema = (&self).try_into()?; - schema.try_into().map_err(BallistaError::DataFusionError) + + fn try_into(self) -> Result { + let fields = self + .columns + .iter() + .map(|c| c.try_into()) + .collect::, _>>()?; + Ok(DFSchema::new(fields)?) 
} } -impl TryInto for protobuf::Schema { +impl TryInto for protobuf::DfSchema { type Error = BallistaError; + fn try_into(self) -> Result { - use datafusion::logical_plan::ToDFSchema; - let schema: Schema = (&self).try_into()?; - schema - .to_dfschema_ref() - .map_err(BallistaError::DataFusionError) + let dfschema: DFSchema = (&self).try_into()?; + Ok(Arc::new(dfschema)) + } +} + +impl TryInto for &protobuf::DfField { + type Error = BallistaError; + + fn try_into(self) -> Result { + let field: Field = convert_required!(self.field)?; + + Ok(match &self.qualifier { + Some(q) => DFField::from_qualified(&q.relation, field), + None => DFField::from(field), + }) } } @@ -339,149 +367,6 @@ impl TryInto for &protobuf::scalar_type::Datatype { } } -impl TryInto for &protobuf::arrow_type::ArrowTypeEnum { - type Error = BallistaError; - fn try_into(self) -> Result { - use protobuf::arrow_type; - Ok(match self { - arrow_type::ArrowTypeEnum::None(_) => DataType::Null, - arrow_type::ArrowTypeEnum::Bool(_) => DataType::Boolean, - arrow_type::ArrowTypeEnum::Uint8(_) => DataType::UInt8, - arrow_type::ArrowTypeEnum::Int8(_) => DataType::Int8, - arrow_type::ArrowTypeEnum::Uint16(_) => DataType::UInt16, - arrow_type::ArrowTypeEnum::Int16(_) => DataType::Int16, - arrow_type::ArrowTypeEnum::Uint32(_) => DataType::UInt32, - arrow_type::ArrowTypeEnum::Int32(_) => DataType::Int32, - arrow_type::ArrowTypeEnum::Uint64(_) => DataType::UInt64, - arrow_type::ArrowTypeEnum::Int64(_) => DataType::Int64, - arrow_type::ArrowTypeEnum::Float16(_) => DataType::Float16, - arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32, - arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64, - arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8, - arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8, - arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary, - arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => { - DataType::FixedSizeBinary(*size) - } - arrow_type::ArrowTypeEnum::LargeBinary(_) => DataType::LargeBinary, - arrow_type::ArrowTypeEnum::Date32(_) => DataType::Date32, - arrow_type::ArrowTypeEnum::Date64(_) => DataType::Date64, - arrow_type::ArrowTypeEnum::Duration(time_unit) => { - DataType::Duration(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) - } - arrow_type::ArrowTypeEnum::Timestamp(protobuf::Timestamp { - time_unit, - timezone, - }) => DataType::Timestamp( - protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?, - match timezone.len() { - 0 => None, - _ => Some(timezone.to_owned()), - }, - ), - arrow_type::ArrowTypeEnum::Time32(time_unit) => { - DataType::Time32(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) - } - arrow_type::ArrowTypeEnum::Time64(time_unit) => { - DataType::Time64(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) - } - arrow_type::ArrowTypeEnum::Interval(interval_unit) => DataType::Interval( - protobuf::IntervalUnit::from_i32_to_arrow(*interval_unit)?, - ), - arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { - whole, - fractional, - }) => DataType::Decimal(*whole as usize, *fractional as usize), - arrow_type::ArrowTypeEnum::List(list) => { - let list_type: &protobuf::Field = list - .as_ref() - .field_type - .as_ref() - .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? 
- .as_ref(); - DataType::List(Box::new(list_type.try_into()?)) - } - arrow_type::ArrowTypeEnum::LargeList(list) => { - let list_type: &protobuf::Field = list - .as_ref() - .field_type - .as_ref() - .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? - .as_ref(); - DataType::LargeList(Box::new(list_type.try_into()?)) - } - arrow_type::ArrowTypeEnum::FixedSizeList(list) => { - let list_type: &protobuf::Field = list - .as_ref() - .field_type - .as_ref() - .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? - .as_ref(); - let list_size = list.list_size; - DataType::FixedSizeList(Box::new(list_type.try_into()?), list_size) - } - arrow_type::ArrowTypeEnum::Struct(strct) => DataType::Struct( - strct - .sub_field_types - .iter() - .map(|field| field.try_into()) - .collect::, _>>()?, - ), - arrow_type::ArrowTypeEnum::Union(union) => DataType::Union( - union - .union_types - .iter() - .map(|field| field.try_into()) - .collect::, _>>()?, - ), - arrow_type::ArrowTypeEnum::Dictionary(dict) => { - let pb_key_datatype = dict - .as_ref() - .key - .as_ref() - .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?; - let pb_value_datatype = dict - .as_ref() - .value - .as_ref() - .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?; - let key_datatype: DataType = pb_key_datatype.as_ref().try_into()?; - let value_datatype: DataType = pb_value_datatype.as_ref().try_into()?; - DataType::Dictionary(Box::new(key_datatype), Box::new(value_datatype)) - } - }) - } -} - -#[allow(clippy::from_over_into)] -impl Into for protobuf::PrimitiveScalarType { - fn into(self) -> DataType { - match self { - protobuf::PrimitiveScalarType::Bool => DataType::Boolean, - protobuf::PrimitiveScalarType::Uint8 => DataType::UInt8, - protobuf::PrimitiveScalarType::Int8 => DataType::Int8, - protobuf::PrimitiveScalarType::Uint16 => DataType::UInt16, - protobuf::PrimitiveScalarType::Int16 => DataType::Int16, - protobuf::PrimitiveScalarType::Uint32 => DataType::UInt32, - protobuf::PrimitiveScalarType::Int32 => DataType::Int32, - protobuf::PrimitiveScalarType::Uint64 => DataType::UInt64, - protobuf::PrimitiveScalarType::Int64 => DataType::Int64, - protobuf::PrimitiveScalarType::Float32 => DataType::Float32, - protobuf::PrimitiveScalarType::Float64 => DataType::Float64, - protobuf::PrimitiveScalarType::Utf8 => DataType::Utf8, - protobuf::PrimitiveScalarType::LargeUtf8 => DataType::LargeUtf8, - protobuf::PrimitiveScalarType::Date32 => DataType::Date32, - protobuf::PrimitiveScalarType::TimeMicrosecond => { - DataType::Time64(TimeUnit::Microsecond) - } - protobuf::PrimitiveScalarType::TimeNanosecond => { - DataType::Time64(TimeUnit::Nanosecond) - } - protobuf::PrimitiveScalarType::Null => DataType::Null, - } - } -} - //Does not typecheck lists fn typechecked_scalar_value_conversion( tested_type: &protobuf::scalar_value::Value, @@ -899,7 +784,7 @@ impl TryInto for &protobuf::LogicalExprNode { op: from_proto_binary_op(&binary_expr.op)?, right: Box::new(parse_required_expr(&binary_expr.r)?), }), - ExprType::ColumnName(column_name) => Ok(Expr::Column(column_name.to_owned())), + ExprType::Column(column) => Ok(Expr::Column(column.into())), ExprType::Literal(literal) => { use datafusion::scalar::ScalarValue; let scalar_value: datafusion::scalar::ScalarValue = literal.try_into()?; @@ -1164,28 +1049,6 @@ impl TryInto for 
&protobuf::LogicalExprNode { } } -fn from_proto_binary_op(op: &str) -> Result { - match op { - "And" => Ok(Operator::And), - "Or" => Ok(Operator::Or), - "Eq" => Ok(Operator::Eq), - "NotEq" => Ok(Operator::NotEq), - "LtEq" => Ok(Operator::LtEq), - "Lt" => Ok(Operator::Lt), - "Gt" => Ok(Operator::Gt), - "GtEq" => Ok(Operator::GtEq), - "Plus" => Ok(Operator::Plus), - "Minus" => Ok(Operator::Minus), - "Multiply" => Ok(Operator::Multiply), - "Divide" => Ok(Operator::Divide), - "Like" => Ok(Operator::Like), - other => Err(proto_error(format!( - "Unsupported binary operator '{:?}'", - other - ))), - } -} - impl TryInto for &protobuf::ScalarType { type Error = BallistaError; fn try_into(self) -> Result { @@ -1361,43 +1224,3 @@ impl TryFrom for WindowFrame { }) } } - -impl From for AggregateFunction { - fn from(aggr_function: protobuf::AggregateFunction) -> Self { - match aggr_function { - protobuf::AggregateFunction::Min => AggregateFunction::Min, - protobuf::AggregateFunction::Max => AggregateFunction::Max, - protobuf::AggregateFunction::Sum => AggregateFunction::Sum, - protobuf::AggregateFunction::Avg => AggregateFunction::Avg, - protobuf::AggregateFunction::Count => AggregateFunction::Count, - } - } -} - -impl From for BuiltInWindowFunction { - fn from(built_in_function: protobuf::BuiltInWindowFunction) -> Self { - match built_in_function { - protobuf::BuiltInWindowFunction::RowNumber => { - BuiltInWindowFunction::RowNumber - } - protobuf::BuiltInWindowFunction::Rank => BuiltInWindowFunction::Rank, - protobuf::BuiltInWindowFunction::PercentRank => { - BuiltInWindowFunction::PercentRank - } - protobuf::BuiltInWindowFunction::DenseRank => { - BuiltInWindowFunction::DenseRank - } - protobuf::BuiltInWindowFunction::Lag => BuiltInWindowFunction::Lag, - protobuf::BuiltInWindowFunction::Lead => BuiltInWindowFunction::Lead, - protobuf::BuiltInWindowFunction::FirstValue => { - BuiltInWindowFunction::FirstValue - } - protobuf::BuiltInWindowFunction::CumeDist => BuiltInWindowFunction::CumeDist, - protobuf::BuiltInWindowFunction::Ntile => BuiltInWindowFunction::Ntile, - protobuf::BuiltInWindowFunction::NthValue => BuiltInWindowFunction::NthValue, - protobuf::BuiltInWindowFunction::LastValue => { - BuiltInWindowFunction::LastValue - } - } - } -} diff --git a/ballista/rust/core/src/serde/logical_plan/mod.rs b/ballista/rust/core/src/serde/logical_plan/mod.rs index d2792b09fa168..0d27c58ac2925 100644 --- a/ballista/rust/core/src/serde/logical_plan/mod.rs +++ b/ballista/rust/core/src/serde/logical_plan/mod.rs @@ -26,7 +26,9 @@ mod roundtrip_tests { use core::panic; use datafusion::{ arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}, - logical_plan::{Expr, LogicalPlan, LogicalPlanBuilder, Partitioning, ToDFSchema}, + logical_plan::{ + col, Expr, LogicalPlan, LogicalPlanBuilder, Partitioning, ToDFSchema, + }, physical_plan::{csv::CsvReadOptions, functions::BuiltinScalarFunction::Sqrt}, prelude::*, scalar::ScalarValue, @@ -61,10 +63,8 @@ mod roundtrip_tests { let test_batch_sizes = [usize::MIN, usize::MAX, 43256]; - let test_expr: Vec = vec![ - Expr::Column("c1".to_string()) + Expr::Column("c2".to_string()), - Expr::Literal((4.0).into()), - ]; + let test_expr: Vec = + vec![col("c1") + col("c2"), Expr::Literal((4.0).into())]; let schema = Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -688,15 +688,20 @@ mod roundtrip_tests { Field::new("salary", DataType::Int32, false), ]); - let scan_plan = LogicalPlanBuilder::empty(false) - .build() - .map_err(BallistaError::DataFusionError)?; + 
let scan_plan = LogicalPlanBuilder::scan_csv( + "employee1", + CsvReadOptions::new().schema(&schema).has_header(true), + Some(vec![0, 3, 4]), + )? + .build() + .map_err(BallistaError::DataFusionError)?; + let plan = LogicalPlanBuilder::scan_csv( - "employee.csv", + "employee2", CsvReadOptions::new().schema(&schema).has_header(true), - Some(vec![3, 4]), + Some(vec![0, 3, 4]), ) - .and_then(|plan| plan.join(&scan_plan, JoinType::Inner, &["id"], &["id"])) + .and_then(|plan| plan.join(&scan_plan, JoinType::Inner, vec!["id"], vec!["id"])) .and_then(|plan| plan.build()) .map_err(BallistaError::DataFusionError)?; @@ -779,7 +784,7 @@ mod roundtrip_tests { #[test] fn roundtrip_is_null() -> Result<()> { - let test_expr = Expr::IsNull(Box::new(Expr::Column("id".into()))); + let test_expr = Expr::IsNull(Box::new(col("id"))); roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr); @@ -788,7 +793,7 @@ mod roundtrip_tests { #[test] fn roundtrip_is_not_null() -> Result<()> { - let test_expr = Expr::IsNotNull(Box::new(Expr::Column("id".into()))); + let test_expr = Expr::IsNotNull(Box::new(col("id"))); roundtrip_test!(test_expr, protobuf::LogicalExprNode, Expr); diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index c454d03257f0a..24e2b56bad862 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -26,7 +26,7 @@ use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUn use datafusion::datasource::CsvFile; use datafusion::logical_plan::{ window_frames::{WindowFrame, WindowFrameBound, WindowFrameUnits}, - Expr, JoinType, LogicalPlan, + Column, Expr, JoinType, LogicalPlan, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::functions::BuiltinScalarFunction; @@ -816,8 +816,8 @@ impl TryInto for &LogicalPlan { JoinType::Semi => protobuf::JoinType::Semi, JoinType::Anti => protobuf::JoinType::Anti, }; - let left_join_column = on.iter().map(|on| on.0.to_owned()).collect(); - let right_join_column = on.iter().map(|on| on.1.to_owned()).collect(); + let (left_join_column, right_join_column) = + on.iter().map(|(l, r)| (l.into(), r.into())).unzip(); Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Join(Box::new( protobuf::JoinNode { @@ -908,13 +908,6 @@ impl TryInto for &LogicalPlan { schema: df_schema, } => { use datafusion::sql::parser::FileType; - let schema: Schema = df_schema.as_ref().clone().into(); - let pb_schema: protobuf::Schema = (&schema).try_into().map_err(|e| { - BallistaError::General(format!( - "Could not convert schema into protobuf: {:?}", - e - )) - })?; let pb_file_type: protobuf::FileType = match file_type { FileType::NdJson => protobuf::FileType::NdJson, @@ -929,7 +922,7 @@ impl TryInto for &LogicalPlan { location: location.clone(), file_type: pb_file_type as i32, has_header: *has_header, - schema: Some(pb_schema), + schema: Some(df_schema.into()), }, )), }) @@ -971,9 +964,9 @@ impl TryInto for &Expr { use datafusion::scalar::ScalarValue; use protobuf::scalar_value::Value; match self { - Expr::Column(name) => { + Expr::Column(c) => { let expr = protobuf::LogicalExprNode { - expr_type: Some(ExprType::ColumnName(name.clone())), + expr_type: Some(ExprType::Column(c.into())), }; Ok(expr) } @@ -1214,6 +1207,23 @@ impl TryInto for &Expr { } } +impl From for protobuf::Column { + fn from(c: Column) -> protobuf::Column { + protobuf::Column { + relation: c + .relation 
+ .map(|relation| protobuf::ColumnRelation { relation }), + name: c.name, + } + } +} + +impl From<&Column> for protobuf::Column { + fn from(c: &Column) -> protobuf::Column { + c.clone().into() + } +} + #[allow(clippy::from_over_into)] impl Into for &Schema { fn into(self) -> protobuf::Schema { @@ -1227,6 +1237,24 @@ impl Into for &Schema { } } +impl From<&datafusion::logical_plan::DFField> for protobuf::DfField { + fn from(f: &datafusion::logical_plan::DFField) -> protobuf::DfField { + protobuf::DfField { + field: Some(f.field().into()), + qualifier: f.qualifier().map(|r| protobuf::ColumnRelation { + relation: r.to_string(), + }), + } + } +} + +impl From<&datafusion::logical_plan::DFSchemaRef> for protobuf::DfSchema { + fn from(s: &datafusion::logical_plan::DFSchemaRef) -> protobuf::DfSchema { + let columns = s.fields().iter().map(|f| f.into()).collect::>(); + protobuf::DfSchema { columns } + } +} + impl From<&AggregateFunction> for protobuf::AggregateFunction { fn from(value: &AggregateFunction) -> Self { match value { diff --git a/ballista/rust/core/src/serde/mod.rs b/ballista/rust/core/src/serde/mod.rs index b96163999f39d..af83660baab56 100644 --- a/ballista/rust/core/src/serde/mod.rs +++ b/ballista/rust/core/src/serde/mod.rs @@ -20,6 +20,10 @@ use std::{convert::TryInto, io::Cursor}; +use datafusion::logical_plan::Operator; +use datafusion::physical_plan::aggregates::AggregateFunction; +use datafusion::physical_plan::window_functions::BuiltInWindowFunction; + use crate::{error::BallistaError, serde::scheduler::Action as BallistaAction}; use prost::Message; @@ -57,6 +61,17 @@ macro_rules! convert_required { }}; } +#[macro_export] +macro_rules! into_required { + ($PB:expr) => {{ + if let Some(field) = $PB.as_ref() { + Ok(field.into()) + } else { + Err(proto_error("Missing required field in protobuf")) + } + }}; +} + #[macro_export] macro_rules! convert_box_required { ($PB:expr) => {{ @@ -67,3 +82,212 @@ macro_rules! 
convert_box_required { } }}; } + +pub(crate) fn from_proto_binary_op(op: &str) -> Result { + match op { + "And" => Ok(Operator::And), + "Or" => Ok(Operator::Or), + "Eq" => Ok(Operator::Eq), + "NotEq" => Ok(Operator::NotEq), + "LtEq" => Ok(Operator::LtEq), + "Lt" => Ok(Operator::Lt), + "Gt" => Ok(Operator::Gt), + "GtEq" => Ok(Operator::GtEq), + "Plus" => Ok(Operator::Plus), + "Minus" => Ok(Operator::Minus), + "Multiply" => Ok(Operator::Multiply), + "Divide" => Ok(Operator::Divide), + "Like" => Ok(Operator::Like), + other => Err(proto_error(format!( + "Unsupported binary operator '{:?}'", + other + ))), + } +} + +impl From for AggregateFunction { + fn from(agg_fun: protobuf::AggregateFunction) -> AggregateFunction { + match agg_fun { + protobuf::AggregateFunction::Min => AggregateFunction::Min, + protobuf::AggregateFunction::Max => AggregateFunction::Max, + protobuf::AggregateFunction::Sum => AggregateFunction::Sum, + protobuf::AggregateFunction::Avg => AggregateFunction::Avg, + protobuf::AggregateFunction::Count => AggregateFunction::Count, + } + } +} + +impl From for BuiltInWindowFunction { + fn from(built_in_function: protobuf::BuiltInWindowFunction) -> Self { + match built_in_function { + protobuf::BuiltInWindowFunction::RowNumber => { + BuiltInWindowFunction::RowNumber + } + protobuf::BuiltInWindowFunction::Rank => BuiltInWindowFunction::Rank, + protobuf::BuiltInWindowFunction::PercentRank => { + BuiltInWindowFunction::PercentRank + } + protobuf::BuiltInWindowFunction::DenseRank => { + BuiltInWindowFunction::DenseRank + } + protobuf::BuiltInWindowFunction::Lag => BuiltInWindowFunction::Lag, + protobuf::BuiltInWindowFunction::Lead => BuiltInWindowFunction::Lead, + protobuf::BuiltInWindowFunction::FirstValue => { + BuiltInWindowFunction::FirstValue + } + protobuf::BuiltInWindowFunction::CumeDist => BuiltInWindowFunction::CumeDist, + protobuf::BuiltInWindowFunction::Ntile => BuiltInWindowFunction::Ntile, + protobuf::BuiltInWindowFunction::NthValue => BuiltInWindowFunction::NthValue, + protobuf::BuiltInWindowFunction::LastValue => { + BuiltInWindowFunction::LastValue + } + } + } +} + +impl TryInto + for &protobuf::arrow_type::ArrowTypeEnum +{ + type Error = BallistaError; + fn try_into(self) -> Result { + use datafusion::arrow::datatypes::DataType; + use protobuf::arrow_type; + Ok(match self { + arrow_type::ArrowTypeEnum::None(_) => DataType::Null, + arrow_type::ArrowTypeEnum::Bool(_) => DataType::Boolean, + arrow_type::ArrowTypeEnum::Uint8(_) => DataType::UInt8, + arrow_type::ArrowTypeEnum::Int8(_) => DataType::Int8, + arrow_type::ArrowTypeEnum::Uint16(_) => DataType::UInt16, + arrow_type::ArrowTypeEnum::Int16(_) => DataType::Int16, + arrow_type::ArrowTypeEnum::Uint32(_) => DataType::UInt32, + arrow_type::ArrowTypeEnum::Int32(_) => DataType::Int32, + arrow_type::ArrowTypeEnum::Uint64(_) => DataType::UInt64, + arrow_type::ArrowTypeEnum::Int64(_) => DataType::Int64, + arrow_type::ArrowTypeEnum::Float16(_) => DataType::Float16, + arrow_type::ArrowTypeEnum::Float32(_) => DataType::Float32, + arrow_type::ArrowTypeEnum::Float64(_) => DataType::Float64, + arrow_type::ArrowTypeEnum::Utf8(_) => DataType::Utf8, + arrow_type::ArrowTypeEnum::LargeUtf8(_) => DataType::LargeUtf8, + arrow_type::ArrowTypeEnum::Binary(_) => DataType::Binary, + arrow_type::ArrowTypeEnum::FixedSizeBinary(size) => { + DataType::FixedSizeBinary(*size) + } + arrow_type::ArrowTypeEnum::LargeBinary(_) => DataType::LargeBinary, + arrow_type::ArrowTypeEnum::Date32(_) => DataType::Date32, + arrow_type::ArrowTypeEnum::Date64(_) => 
DataType::Date64, + arrow_type::ArrowTypeEnum::Duration(time_unit) => { + DataType::Duration(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) + } + arrow_type::ArrowTypeEnum::Timestamp(protobuf::Timestamp { + time_unit, + timezone, + }) => DataType::Timestamp( + protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?, + match timezone.len() { + 0 => None, + _ => Some(timezone.to_owned()), + }, + ), + arrow_type::ArrowTypeEnum::Time32(time_unit) => { + DataType::Time32(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) + } + arrow_type::ArrowTypeEnum::Time64(time_unit) => { + DataType::Time64(protobuf::TimeUnit::from_i32_to_arrow(*time_unit)?) + } + arrow_type::ArrowTypeEnum::Interval(interval_unit) => DataType::Interval( + protobuf::IntervalUnit::from_i32_to_arrow(*interval_unit)?, + ), + arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { + whole, + fractional, + }) => DataType::Decimal(*whole as usize, *fractional as usize), + arrow_type::ArrowTypeEnum::List(list) => { + let list_type: &protobuf::Field = list + .as_ref() + .field_type + .as_ref() + .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? + .as_ref(); + DataType::List(Box::new(list_type.try_into()?)) + } + arrow_type::ArrowTypeEnum::LargeList(list) => { + let list_type: &protobuf::Field = list + .as_ref() + .field_type + .as_ref() + .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? + .as_ref(); + DataType::LargeList(Box::new(list_type.try_into()?)) + } + arrow_type::ArrowTypeEnum::FixedSizeList(list) => { + let list_type: &protobuf::Field = list + .as_ref() + .field_type + .as_ref() + .ok_or_else(|| proto_error("Protobuf deserialization error: List message missing required field 'field_type'"))? 
+ .as_ref(); + let list_size = list.list_size; + DataType::FixedSizeList(Box::new(list_type.try_into()?), list_size) + } + arrow_type::ArrowTypeEnum::Struct(strct) => DataType::Struct( + strct + .sub_field_types + .iter() + .map(|field| field.try_into()) + .collect::, _>>()?, + ), + arrow_type::ArrowTypeEnum::Union(union) => DataType::Union( + union + .union_types + .iter() + .map(|field| field.try_into()) + .collect::, _>>()?, + ), + arrow_type::ArrowTypeEnum::Dictionary(dict) => { + let pb_key_datatype = dict + .as_ref() + .key + .as_ref() + .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?; + let pb_value_datatype = dict + .as_ref() + .value + .as_ref() + .ok_or_else(|| proto_error("Protobuf deserialization error: Dictionary message missing required field 'key'"))?; + let key_datatype: DataType = pb_key_datatype.as_ref().try_into()?; + let value_datatype: DataType = pb_value_datatype.as_ref().try_into()?; + DataType::Dictionary(Box::new(key_datatype), Box::new(value_datatype)) + } + }) + } +} + +#[allow(clippy::from_over_into)] +impl Into for protobuf::PrimitiveScalarType { + fn into(self) -> datafusion::arrow::datatypes::DataType { + use datafusion::arrow::datatypes::{DataType, TimeUnit}; + match self { + protobuf::PrimitiveScalarType::Bool => DataType::Boolean, + protobuf::PrimitiveScalarType::Uint8 => DataType::UInt8, + protobuf::PrimitiveScalarType::Int8 => DataType::Int8, + protobuf::PrimitiveScalarType::Uint16 => DataType::UInt16, + protobuf::PrimitiveScalarType::Int16 => DataType::Int16, + protobuf::PrimitiveScalarType::Uint32 => DataType::UInt32, + protobuf::PrimitiveScalarType::Int32 => DataType::Int32, + protobuf::PrimitiveScalarType::Uint64 => DataType::UInt64, + protobuf::PrimitiveScalarType::Int64 => DataType::Int64, + protobuf::PrimitiveScalarType::Float32 => DataType::Float32, + protobuf::PrimitiveScalarType::Float64 => DataType::Float64, + protobuf::PrimitiveScalarType::Utf8 => DataType::Utf8, + protobuf::PrimitiveScalarType::LargeUtf8 => DataType::LargeUtf8, + protobuf::PrimitiveScalarType::Date32 => DataType::Date32, + protobuf::PrimitiveScalarType::TimeMicrosecond => { + DataType::Time64(TimeUnit::Microsecond) + } + protobuf::PrimitiveScalarType::TimeNanosecond => { + DataType::Time64(TimeUnit::Nanosecond) + } + protobuf::PrimitiveScalarType::Null => DataType::Null, + } + } +} diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index a2c9db9ecafbf..4b87be4105be0 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -18,17 +18,16 @@ //! Serde code to convert from protocol buffers to Rust data structures. 
use std::collections::HashMap; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use std::sync::Arc; use crate::error::BallistaError; use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; -use crate::serde::protobuf::LogicalExprNode; use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; -use crate::serde::{proto_error, protobuf}; -use crate::{convert_box_required, convert_required}; +use crate::serde::{from_proto_binary_op, proto_error, protobuf}; +use crate::{convert_box_required, convert_required, into_required}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::catalog::catalog::{ CatalogList, CatalogProvider, MemoryCatalogList, MemoryCatalogProvider, @@ -36,9 +35,8 @@ use datafusion::catalog::catalog::{ use datafusion::execution::context::{ ExecutionConfig, ExecutionContextState, ExecutionProps, }; -use datafusion::logical_plan::{DFSchema, Expr}; -use datafusion::physical_plan::aggregates::AggregateFunction; -use datafusion::physical_plan::expressions::col; +use datafusion::logical_plan::{window_frames::WindowFrame, DFSchema, Expr}; +use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::PartitionMode; use datafusion::physical_plan::merge::MergeExec; @@ -46,13 +44,18 @@ use datafusion::physical_plan::planner::DefaultPhysicalPlanner; use datafusion::physical_plan::window_functions::{ BuiltInWindowFunction, WindowFunction, }; -use datafusion::physical_plan::windows::WindowAggExec; +use datafusion::physical_plan::windows::{create_window_expr, WindowAggExec}; use datafusion::physical_plan::{ coalesce_batches::CoalesceBatchesExec, csv::CsvExec, empty::EmptyExec, - expressions::{Avg, Column, PhysicalSortExpr}, + expressions::{ + col, Avg, BinaryExpr, CaseExpr, CastExpr, Column, InListExpr, IsNotNullExpr, + IsNullExpr, Literal, NegativeExpr, NotExpr, PhysicalSortExpr, TryCastExpr, + DEFAULT_DATAFUSION_CAST_OPTIONS, + }, filter::FilterExec, + functions::{self, BuiltinScalarFunction, ScalarFunctionExpr}, hash_join::HashJoinExec, hash_utils::JoinType, limit::{GlobalLimitExec, LocalLimitExec}, @@ -65,7 +68,7 @@ use datafusion::physical_plan::{ use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, WindowExpr}; use datafusion::prelude::CsvReadOptions; use log::debug; -use protobuf::logical_expr_node::ExprType; +use protobuf::physical_expr_node::ExprType; use protobuf::physical_plan_node::PhysicalPlanType; impl TryInto> for &protobuf::PhysicalPlanNode { @@ -86,23 +89,23 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .expr .iter() .zip(projection.expr_name.iter()) - .map(|(expr, name)| { - compile_expr(expr, &input.schema()).map(|e| (e, name.to_string())) - }) - .collect::, _>>()?; + .map(|(expr, name)| Ok((expr.try_into()?, name.to_string()))) + .collect::, String)>, Self::Error>>( + )?; Ok(Arc::new(ProjectionExec::try_new(exprs, input)?)) } PhysicalPlanType::Filter(filter) => { let input: Arc = convert_box_required!(filter.input)?; - let predicate = compile_expr( - filter.expr.as_ref().ok_or_else(|| { + let predicate = filter + .expr + .as_ref() + .ok_or_else(|| { BallistaError::General( "filter (FilterExecNode) in PhysicalPlanNode is missing." .to_owned(), ) - })?, - &input.schema(), - )?; + })? 
+ .try_into()?; Ok(Arc::new(FilterExec::try_new(predicate, input)?)) } PhysicalPlanType::CsvScan(scan) => { @@ -153,7 +156,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let expr = hash_part .hash_expr .iter() - .map(|e| compile_expr(e, &input.schema())) + .map(|e| e.try_into()) .collect::>, _>>()?; Ok(Arc::new(RepartitionExec::try_new( @@ -207,25 +210,33 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .clone(); let physical_schema: SchemaRef = SchemaRef::new((&input_schema).try_into()?); - let ctx_state = ExecutionContextState::new(); - let window_agg_expr: Vec<(Expr, String)> = window_agg + + let physical_window_expr: Vec> = window_agg .window_expr .iter() .zip(window_agg.window_expr_name.iter()) - .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) - .collect::, _>>()?; - let df_planner = DefaultPhysicalPlanner::default(); - let physical_window_expr = window_agg_expr - .iter() .map(|(expr, name)| { - df_planner.create_window_expr_with_name( - expr, - name.to_string(), - &physical_schema, - &ctx_state, - ) + let expr_type = expr.expr_type.as_ref().ok_or_else(|| { + proto_error("Unexpected empty window physical expression") + })?; + + match expr_type { + ExprType::WindowExpr(window_node) => Ok(create_window_expr( + &convert_required!(window_node.window_function)?, + name.to_owned(), + &[convert_box_required!(window_node.expr)?], + &[], + &[], + Some(WindowFrame::default()), + &physical_schema, + )?), + _ => Err(BallistaError::General( + "Invalid expression for WindowAggrExec".to_string(), + )), + } }) .collect::, _>>()?; + Ok(Arc::new(WindowAggExec::try_new( physical_window_expr, input, @@ -253,16 +264,10 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .iter() .zip(hash_agg.group_expr_name.iter()) .map(|(expr, name)| { - compile_expr(expr, &input.schema()).map(|e| (e, name.to_string())) + expr.try_into().map(|expr| (expr, name.to_string())) }) .collect::, _>>()?; - let logical_agg_expr: Vec<(Expr, String)> = hash_agg - .aggr_expr - .iter() - .zip(hash_agg.aggr_expr_name.iter()) - .map(|(expr, name)| expr.try_into().map(|expr| (expr, name.clone()))) - .collect::, _>>()?; - let ctx_state = ExecutionContextState::new(); + let input_schema = hash_agg .input_schema .as_ref() @@ -274,18 +279,47 @@ impl TryInto> for &protobuf::PhysicalPlanNode { .clone(); let physical_schema: SchemaRef = SchemaRef::new((&input_schema).try_into()?); - let df_planner = DefaultPhysicalPlanner::default(); - let physical_aggr_expr = logical_agg_expr + + let physical_aggr_expr: Vec> = hash_agg + .aggr_expr .iter() + .zip(hash_agg.aggr_expr_name.iter()) .map(|(expr, name)| { - df_planner.create_aggregate_expr_with_name( - expr, - name.to_string(), - &physical_schema, - &ctx_state, - ) + let expr_type = expr.expr_type.as_ref().ok_or_else(|| { + proto_error("Unexpected empty aggregate physical expression") + })?; + + match expr_type { + ExprType::AggregateExpr(agg_node) => { + let aggr_function = + protobuf::AggregateFunction::from_i32( + agg_node.aggr_function, + ) + .ok_or_else( + || { + proto_error(format!( + "Received an unknown aggregate function: {}", + agg_node.aggr_function + )) + }, + )?; + + Ok(create_aggregate_expr( + &aggr_function.into(), + false, + &[convert_box_required!(agg_node.expr)?], + &physical_schema, + name.to_string(), + )?) 
+ } + _ => Err(BallistaError::General( + "Invalid aggregate expression for HashAggregateExec" + .to_string(), + )), + } }) .collect::, _>>()?; + Ok(Arc::new(HashAggregateExec::try_new( agg_mode, group, @@ -298,11 +332,15 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let left: Arc = convert_box_required!(hashjoin.left)?; let right: Arc = convert_box_required!(hashjoin.right)?; - let on: Vec<(String, String)> = hashjoin + let on: Vec<(Column, Column)> = hashjoin .on .iter() - .map(|col| (col.left.clone(), col.right.clone())) - .collect(); + .map(|col| { + let left = into_required!(col.left)?; + let right = into_required!(col.right)?; + Ok((left, right)) + }) + .collect::>()?; let join_type = protobuf::JoinType::from_i32(hashjoin.join_type) .ok_or_else(|| { proto_error(format!( @@ -321,7 +359,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { Ok(Arc::new(HashJoinExec::try_new( left, right, - &on, + on, &join_type, PartitionMode::CollectLeft, )?)) @@ -358,7 +396,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { self )) })?; - if let protobuf::logical_expr_node::ExprType::Sort(sort_expr) = expr { + if let protobuf::physical_expr_node::ExprType::Sort(sort_expr) = expr { let expr = sort_expr .expr .as_ref() @@ -370,7 +408,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { })? .as_ref(); Ok(PhysicalSortExpr { - expr: compile_expr(expr, &input.schema())?, + expr: expr.try_into()?, options: SortOptions { descending: !sort_expr.asc, nulls_first: sort_expr.nulls_first, @@ -403,14 +441,210 @@ impl TryInto> for &protobuf::PhysicalPlanNode { } } -fn compile_expr( - expr: &protobuf::LogicalExprNode, - schema: &Schema, -) -> Result, BallistaError> { - let df_planner = DefaultPhysicalPlanner::default(); - let state = ExecutionContextState::new(); - let expr: Expr = expr.try_into()?; - df_planner - .create_physical_expr(&expr, schema, &state) - .map_err(|e| BallistaError::General(format!("{:?}", e))) +impl From<&protobuf::PhysicalColumn> for Column { + fn from(c: &protobuf::PhysicalColumn) -> Column { + Column::new(&c.name, c.index as usize) + } +} + +impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { + fn from(f: &protobuf::ScalarFunction) -> BuiltinScalarFunction { + use protobuf::ScalarFunction; + match f { + ScalarFunction::Sqrt => BuiltinScalarFunction::Sqrt, + ScalarFunction::Sin => BuiltinScalarFunction::Sin, + ScalarFunction::Cos => BuiltinScalarFunction::Cos, + ScalarFunction::Tan => BuiltinScalarFunction::Tan, + ScalarFunction::Asin => BuiltinScalarFunction::Asin, + ScalarFunction::Acos => BuiltinScalarFunction::Acos, + ScalarFunction::Atan => BuiltinScalarFunction::Atan, + ScalarFunction::Exp => BuiltinScalarFunction::Exp, + ScalarFunction::Log => BuiltinScalarFunction::Log, + ScalarFunction::Log2 => BuiltinScalarFunction::Log2, + ScalarFunction::Log10 => BuiltinScalarFunction::Log10, + ScalarFunction::Floor => BuiltinScalarFunction::Floor, + ScalarFunction::Ceil => BuiltinScalarFunction::Ceil, + ScalarFunction::Round => BuiltinScalarFunction::Round, + ScalarFunction::Trunc => BuiltinScalarFunction::Trunc, + ScalarFunction::Abs => BuiltinScalarFunction::Abs, + ScalarFunction::Signum => BuiltinScalarFunction::Signum, + ScalarFunction::Octetlength => BuiltinScalarFunction::OctetLength, + ScalarFunction::Concat => BuiltinScalarFunction::Concat, + ScalarFunction::Lower => BuiltinScalarFunction::Lower, + ScalarFunction::Upper => BuiltinScalarFunction::Upper, + ScalarFunction::Trim => BuiltinScalarFunction::Trim, + ScalarFunction::Ltrim => BuiltinScalarFunction::Ltrim, + 
ScalarFunction::Rtrim => BuiltinScalarFunction::Rtrim, + ScalarFunction::Totimestamp => BuiltinScalarFunction::ToTimestamp, + ScalarFunction::Array => BuiltinScalarFunction::Array, + ScalarFunction::Nullif => BuiltinScalarFunction::NullIf, + ScalarFunction::Datetrunc => BuiltinScalarFunction::DateTrunc, + ScalarFunction::Md5 => BuiltinScalarFunction::MD5, + ScalarFunction::Sha224 => BuiltinScalarFunction::SHA224, + ScalarFunction::Sha256 => BuiltinScalarFunction::SHA256, + ScalarFunction::Sha384 => BuiltinScalarFunction::SHA384, + ScalarFunction::Sha512 => BuiltinScalarFunction::SHA512, + ScalarFunction::Ln => BuiltinScalarFunction::Ln, + } + } +} + +impl TryFrom<&protobuf::PhysicalExprNode> for Arc { + type Error = BallistaError; + + fn try_from(expr: &protobuf::PhysicalExprNode) -> Result { + let expr_type = expr + .expr_type + .as_ref() + .ok_or_else(|| proto_error("Unexpected empty physical expression"))?; + + let pexpr: Arc = match expr_type { + ExprType::Column(c) => { + let pcol: Column = c.into(); + Arc::new(pcol) + } + ExprType::Literal(scalar) => { + Arc::new(Literal::new(convert_required!(scalar.value)?)) + } + ExprType::BinaryExpr(binary_expr) => Arc::new(BinaryExpr::new( + convert_box_required!(&binary_expr.l)?, + from_proto_binary_op(&binary_expr.op)?, + convert_box_required!(&binary_expr.r)?, + )), + ExprType::AggregateExpr(_) => { + return Err(BallistaError::General( + "Cannot convert aggregate expr node to physical expression" + .to_owned(), + )); + } + ExprType::WindowExpr(_) => { + return Err(BallistaError::General( + "Cannot convert window expr node to physical expression".to_owned(), + )); + } + ExprType::Sort(_) => { + return Err(BallistaError::General( + "Cannot convert sort expr node to physical expression".to_owned(), + )); + } + ExprType::IsNullExpr(e) => { + Arc::new(IsNullExpr::new(convert_box_required!(e.expr)?)) + } + ExprType::IsNotNullExpr(e) => { + Arc::new(IsNotNullExpr::new(convert_box_required!(e.expr)?)) + } + ExprType::NotExpr(e) => { + Arc::new(NotExpr::new(convert_box_required!(e.expr)?)) + } + ExprType::Negative(e) => { + Arc::new(NegativeExpr::new(convert_box_required!(e.expr)?)) + } + ExprType::InList(e) => Arc::new(InListExpr::new( + convert_box_required!(e.expr)?, + e.list + .iter() + .map(|x| x.try_into()) + .collect::, _>>()?, + e.negated, + )), + ExprType::Case(e) => Arc::new(CaseExpr::try_new( + e.expr.as_ref().map(|e| e.as_ref().try_into()).transpose()?, + e.when_then_expr + .iter() + .map(|e| { + Ok(( + convert_required!(e.when_expr)?, + convert_required!(e.then_expr)?, + )) + }) + .collect::, BallistaError>>()? 
+ .as_slice(), + e.else_expr + .as_ref() + .map(|e| e.as_ref().try_into()) + .transpose()?, + )?), + ExprType::Cast(e) => Arc::new(CastExpr::new( + convert_box_required!(e.expr)?, + convert_required!(e.arrow_type)?, + DEFAULT_DATAFUSION_CAST_OPTIONS, + )), + ExprType::TryCast(e) => Arc::new(TryCastExpr::new( + convert_box_required!(e.expr)?, + convert_required!(e.arrow_type)?, + )), + ExprType::ScalarFunction(e) => { + let scalar_function = protobuf::ScalarFunction::from_i32(e.fun) + .ok_or_else(|| { + proto_error(format!( + "Received an unknown scalar function: {}", + e.fun, + )) + })?; + + let args = e + .args + .iter() + .map(|x| x.try_into()) + .collect::, _>>()?; + + let catalog_list = + Arc::new(MemoryCatalogList::new()) as Arc; + let ctx_state = ExecutionContextState { + catalog_list, + scalar_functions: Default::default(), + var_provider: Default::default(), + aggregate_functions: Default::default(), + config: ExecutionConfig::new(), + execution_props: ExecutionProps::new(), + }; + + let fun_expr = functions::create_physical_fun( + &(&scalar_function).into(), + &ctx_state, + )?; + + Arc::new(ScalarFunctionExpr::new( + &e.name, + fun_expr, + args, + &convert_required!(e.return_type)?, + )) + } + }; + + Ok(pexpr) + } +} + +impl TryFrom<&protobuf::physical_window_expr_node::WindowFunction> for WindowFunction { + type Error = BallistaError; + + fn try_from( + expr: &protobuf::physical_window_expr_node::WindowFunction, + ) -> Result { + match expr { + protobuf::physical_window_expr_node::WindowFunction::AggrFunction(n) => { + let f = protobuf::AggregateFunction::from_i32(*n).ok_or_else(|| { + proto_error(format!( + "Received an unknown window aggregate function: {}", + n + )) + })?; + + Ok(WindowFunction::AggregateFunction(f.into())) + } + protobuf::physical_window_expr_node::WindowFunction::BuiltInFunction(n) => { + let f = + protobuf::BuiltInWindowFunction::from_i32(*n).ok_or_else(|| { + proto_error(format!( + "Received an unknown window builtin function: {}", + n + )) + })?; + + Ok(WindowFunction::BuiltInWindowFunction(f.into())) + } + } + } } diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index fdba2152b7f8d..c0fe81f0ffb91 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -30,7 +30,7 @@ mod roundtrip_tests { logical_plan::Operator, physical_plan::{ empty::EmptyExec, - expressions::{binary, lit, InListExpr, NotExpr}, + expressions::{binary, col, lit, InListExpr, NotExpr}, expressions::{Avg, Column, PhysicalSortExpr}, filter::FilterExec, hash_aggregate::{AggregateMode, HashAggregateExec}, @@ -83,35 +83,35 @@ mod roundtrip_tests { let field_a = Field::new("col", DataType::Int64, false); let schema_left = Schema::new(vec![field_a.clone()]); let schema_right = Schema::new(vec![field_a]); + let on = vec![( + Column::new("col", schema_left.index_of("col")?), + Column::new("col", schema_right.index_of("col")?), + )]; roundtrip_test(Arc::new(HashJoinExec::try_new( Arc::new(EmptyExec::new(false, Arc::new(schema_left))), Arc::new(EmptyExec::new(false, Arc::new(schema_right))), - &[("col".to_string(), "col".to_string())], + on, &JoinType::Inner, PartitionMode::CollectLeft, )?)) } - fn col(name: &str) -> Arc { - Arc::new(Column::new(name)) - } - #[test] fn rountrip_hash_aggregate() -> Result<()> { + let field_a = Field::new("a", DataType::Int64, false); + let field_b = Field::new("b", DataType::Int64, false); + let schema = Arc::new(Schema::new(vec![field_a, 
field_b])); + let groups: Vec<(Arc, String)> = - vec![(col("a"), "unused".to_string())]; + vec![(col("a", &schema)?, "unused".to_string())]; let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b"), + col("b", &schema)?, "AVG(b)".to_string(), DataType::Float64, ))]; - let field_a = Field::new("a", DataType::Int64, false); - let field_b = Field::new("b", DataType::Int64, false); - let schema = Arc::new(Schema::new(vec![field_a, field_b])); - roundtrip_test(Arc::new(HashAggregateExec::try_new( AggregateMode::Final, groups.clone(), @@ -127,9 +127,9 @@ mod roundtrip_tests { let field_b = Field::new("b", DataType::Int64, false); let field_c = Field::new("c", DataType::Int64, false); let schema = Arc::new(Schema::new(vec![field_a, field_b, field_c])); - let not = Arc::new(NotExpr::new(col("a"))); + let not = Arc::new(NotExpr::new(col("a", &schema)?)); let in_list = Arc::new(InListExpr::new( - col("b"), + col("b", &schema)?, vec![ lit(ScalarValue::Int64(Some(1))), lit(ScalarValue::Int64(Some(2))), @@ -150,14 +150,14 @@ mod roundtrip_tests { let schema = Arc::new(Schema::new(vec![field_a, field_b])); let sort_exprs = vec![ PhysicalSortExpr { - expr: col("a"), + expr: col("a", &schema)?, options: SortOptions { descending: true, nulls_first: false, }, }, PhysicalSortExpr { - expr: col("b"), + expr: col("b", &schema)?, options: SortOptions { descending: false, nulls_first: true, diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 15d5d4b931ff2..cf5401b650193 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -125,8 +125,14 @@ impl TryInto for Arc { .on() .iter() .map(|tuple| protobuf::JoinOn { - left: tuple.0.to_owned(), - right: tuple.1.to_owned(), + left: Some(protobuf::PhysicalColumn { + name: tuple.0.name().to_string(), + index: tuple.0.index() as u32, + }), + right: Some(protobuf::PhysicalColumn { + name: tuple.1.name().to_string(), + index: tuple.1.index() as u32, + }), }) .collect(); let join_type = match exec.join_type() { @@ -300,7 +306,7 @@ impl TryInto for Arc { let pb_partition_method = match exec.partitioning() { Partitioning::Hash(exprs, partition_count) => { - PartitionMethod::Hash(protobuf::HashRepartition { + PartitionMethod::Hash(protobuf::PhysicalHashRepartition { hash_expr: exprs .iter() .map(|expr| expr.clone().try_into()) @@ -330,13 +336,13 @@ impl TryInto for Arc { .expr() .iter() .map(|expr| { - let sort_expr = Box::new(protobuf::SortExprNode { + let sort_expr = Box::new(protobuf::PhysicalSortExprNode { expr: Some(Box::new(expr.expr.to_owned().try_into()?)), asc: !expr.options.descending, nulls_first: expr.options.nulls_first, }); - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::Sort( + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Sort( sort_expr, )), }) @@ -373,10 +379,10 @@ impl TryInto for Arc { } } -impl TryInto for Arc { +impl TryInto for Arc { type Error = BallistaError; - fn try_into(self) -> Result { + fn try_into(self) -> Result { let aggr_function = if self.as_any().downcast_ref::().is_some() { Ok(protobuf::AggregateFunction::Avg.into()) } else if self.as_any().downcast_ref::().is_some() { @@ -389,14 +395,14 @@ impl TryInto for Arc { self ))) }?; - let expressions: Vec = self + let expressions: Vec = self .expressions() .iter() .map(|e| e.clone().try_into()) .collect::, BallistaError>>()?; - Ok(protobuf::LogicalExprNode { - 
expr_type: Some(protobuf::logical_expr_node::ExprType::AggregateExpr( - Box::new(protobuf::AggregateExprNode { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::AggregateExpr( + Box::new(protobuf::PhysicalAggregateExprNode { aggr_function, expr: Some(Box::new(expressions[0].clone())), }), @@ -405,90 +411,100 @@ impl TryInto for Arc { } } -impl TryFrom> for protobuf::LogicalExprNode { +impl TryFrom> for protobuf::PhysicalExprNode { type Error = BallistaError; fn try_from(value: Arc) -> Result { let expr = value.as_any(); if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::ColumnName( - expr.name().to_owned(), + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Column( + protobuf::PhysicalColumn { + name: expr.name().to_string(), + index: expr.index() as u32, + }, )), }) } else if let Some(expr) = expr.downcast_ref::() { - let binary_expr = Box::new(protobuf::BinaryExprNode { + let binary_expr = Box::new(protobuf::PhysicalBinaryExprNode { l: Some(Box::new(expr.left().to_owned().try_into()?)), r: Some(Box::new(expr.right().to_owned().try_into()?)), op: format!("{:?}", expr.op()), }); - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::BinaryExpr( + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::BinaryExpr( binary_expr, )), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::Case(Box::new( - protobuf::CaseNode { - expr: expr - .expr() - .as_ref() - .map(|exp| exp.clone().try_into().map(Box::new)) - .transpose()?, - when_then_expr: expr - .when_then_expr() - .iter() - .map(|(when_expr, then_expr)| { - try_parse_when_then_expr(when_expr, then_expr) - }) - .collect::, Self::Error>>()?, - else_expr: expr - .else_expr() - .map(|a| a.clone().try_into().map(Box::new)) - .transpose()?, - }, - ))), + Ok(protobuf::PhysicalExprNode { + expr_type: Some( + protobuf::physical_expr_node::ExprType::Case( + Box::new( + protobuf::PhysicalCaseNode { + expr: expr + .expr() + .as_ref() + .map(|exp| exp.clone().try_into().map(Box::new)) + .transpose()?, + when_then_expr: expr + .when_then_expr() + .iter() + .map(|(when_expr, then_expr)| { + try_parse_when_then_expr(when_expr, then_expr) + }) + .collect::, + Self::Error, + >>()?, + else_expr: expr + .else_expr() + .map(|a| a.clone().try_into().map(Box::new)) + .transpose()?, + }, + ), + ), + ), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::NotExpr( - Box::new(protobuf::Not { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::NotExpr( + Box::new(protobuf::PhysicalNot { expr: Some(Box::new(expr.arg().to_owned().try_into()?)), }), )), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::IsNullExpr( - Box::new(protobuf::IsNull { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::IsNullExpr( + Box::new(protobuf::PhysicalIsNull { expr: Some(Box::new(expr.arg().to_owned().try_into()?)), }), )), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::IsNotNullExpr( - 
Box::new(protobuf::IsNotNull { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::IsNotNullExpr( + Box::new(protobuf::PhysicalIsNotNull { expr: Some(Box::new(expr.arg().to_owned().try_into()?)), }), )), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { + Ok(protobuf::PhysicalExprNode { expr_type: Some( - protobuf::logical_expr_node::ExprType::InList( + protobuf::physical_expr_node::ExprType::InList( Box::new( - protobuf::InListNode { + protobuf::PhysicalInListNode { expr: Some(Box::new(expr.expr().to_owned().try_into()?)), list: expr .list() .iter() .map(|a| a.clone().try_into()) .collect::, + Vec, Self::Error, >>()?, negated: expr.negated(), @@ -498,32 +514,32 @@ impl TryFrom> for protobuf::LogicalExprNode { ), }) } else if let Some(expr) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::Negative( - Box::new(protobuf::NegativeNode { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Negative( + Box::new(protobuf::PhysicalNegativeNode { expr: Some(Box::new(expr.arg().to_owned().try_into()?)), }), )), }) } else if let Some(lit) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::Literal( + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Literal( lit.value().try_into()?, )), }) } else if let Some(cast) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::Cast(Box::new( - protobuf::CastNode { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Cast(Box::new( + protobuf::PhysicalCastNode { expr: Some(Box::new(cast.expr().clone().try_into()?)), arrow_type: Some(cast.cast_type().into()), }, ))), }) } else if let Some(cast) = expr.downcast_ref::() { - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::TryCast( - Box::new(protobuf::TryCastNode { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::TryCast( + Box::new(protobuf::PhysicalTryCastNode { expr: Some(Box::new(cast.expr().clone().try_into()?)), arrow_type: Some(cast.cast_type().into()), }), @@ -533,16 +549,18 @@ impl TryFrom> for protobuf::LogicalExprNode { let fun: BuiltinScalarFunction = BuiltinScalarFunction::from_str(expr.name())?; let fun: protobuf::ScalarFunction = (&fun).try_into()?; - let expr: Vec = expr + let args: Vec = expr .args() .iter() .map(|e| e.to_owned().try_into()) .collect::, _>>()?; - Ok(protobuf::LogicalExprNode { - expr_type: Some(protobuf::logical_expr_node::ExprType::ScalarFunction( - protobuf::ScalarFunctionNode { + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::ScalarFunction( + protobuf::PhysicalScalarFunctionNode { + name: expr.name().to_string(), fun: fun.into(), - expr, + args, + return_type: Some(expr.return_type().into()), }, )), }) @@ -558,8 +576,8 @@ impl TryFrom> for protobuf::LogicalExprNode { fn try_parse_when_then_expr( when_expr: &Arc, then_expr: &Arc, -) -> Result { - Ok(protobuf::WhenThen { +) -> Result { + Ok(protobuf::PhysicalWhenThen { when_expr: Some(when_expr.clone().try_into()?), then_expr: Some(then_expr.clone().try_into()?), }) diff --git a/benchmarks/run.sh b/benchmarks/run.sh index 8e36424da89f0..21633d39c23ad 100755 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -20,7 +20,7 @@ 
set -e # This bash script is meant to be run inside the docker-compose environment. Check the README for instructions cd / -for query in 1 3 5 6 10 12 +for query in 1 3 5 6 7 8 9 10 12 do /tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug done diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 08c47630fa18a..286fe45945104 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -707,6 +707,16 @@ mod tests { run_query(6).await } + #[tokio::test] + async fn run_q7() -> Result<()> { + run_query(7).await + } + + #[tokio::test] + async fn run_q8() -> Result<()> { + run_query(8).await + } + #[tokio::test] async fn run_q9() -> Result<()> { run_query(9).await diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs index 9c7c2ef96d6be..507a79861cd53 100644 --- a/datafusion/src/dataframe.rs +++ b/datafusion/src/dataframe.rs @@ -188,6 +188,8 @@ pub trait DataFrame: Send + Sync { right_cols: &[&str], ) -> Result>; + // TODO: add join_using + /// Repartition a DataFrame based on a logical partitioning scheme. /// /// ``` diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index b42695b0c4c64..926e2db9450a1 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -52,7 +52,7 @@ use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; use crate::execution::dataframe_impl::DataFrameImpl; use crate::logical_plan::{ - FunctionRegistry, LogicalPlan, LogicalPlanBuilder, ToDFSchema, + FunctionRegistry, LogicalPlan, LogicalPlanBuilder, UNNAMED_TABLE, }; use crate::optimizer::constant_folding::ConstantFolding; use crate::optimizer::filter_push_down::FilterPushDown; @@ -297,18 +297,9 @@ impl ExecutionContext { &mut self, provider: Arc, ) -> Result> { - let schema = provider.schema(); - let table_scan = LogicalPlan::TableScan { - table_name: "".to_string(), - source: provider, - projected_schema: schema.to_dfschema_ref()?, - projection: None, - filters: vec![], - limit: None, - }; Ok(Arc::new(DataFrameImpl::new( self.state.clone(), - &LogicalPlanBuilder::from(&table_scan).build()?, + &LogicalPlanBuilder::scan(UNNAMED_TABLE, provider, None)?.build()?, ))) } @@ -410,22 +401,15 @@ impl ExecutionContext { ) -> Result> { let table_ref = table_ref.into(); let schema = self.state.lock().unwrap().schema_for_ref(table_ref)?; - match schema.table(table_ref.table()) { Some(ref provider) => { - let schema = provider.schema(); - let table_scan = LogicalPlan::TableScan { - table_name: table_ref.table().to_owned(), - source: Arc::clone(provider), - projected_schema: schema.to_dfschema_ref()?, - projection: None, - filters: vec![], - limit: None, - }; - Ok(Arc::new(DataFrameImpl::new( - self.state.clone(), - &LogicalPlanBuilder::from(&table_scan).build()?, - ))) + let plan = LogicalPlanBuilder::scan( + table_ref.table(), + Arc::clone(provider), + None, + )? 
+ .build()?; + Ok(Arc::new(DataFrameImpl::new(self.state.clone(), &plan))) } _ => Err(DataFusionError::Plan(format!( "No table named '{}'", @@ -1038,7 +1022,6 @@ mod tests { let logical_plan = ctx.optimize(&logical_plan)?; let physical_plan = ctx.create_physical_plan(&logical_plan)?; - println!("{:?}", physical_plan); let results = collect_partitioned(physical_plan).await?; @@ -1110,7 +1093,7 @@ mod tests { _ => panic!("expect optimized_plan to be projection"), } - let expected = "Projection: #c2\ + let expected = "Projection: #test.c2\ \n TableScan: test projection=Some([1])"; assert_eq!(format!("{:?}", optimized_plan), expected); @@ -1133,7 +1116,7 @@ mod tests { let schema: Schema = ctx.table("test").unwrap().schema().clone().into(); assert!(!schema.field_with_name("c1")?.is_nullable()); - let plan = LogicalPlanBuilder::scan_empty("", &schema, None)? + let plan = LogicalPlanBuilder::scan_empty(None, &schema, None)? .project(vec![col("c1")])? .build()?; @@ -1183,8 +1166,11 @@ mod tests { _ => panic!("expect optimized_plan to be projection"), } - let expected = "Projection: #b\ - \n TableScan: projection=Some([1])"; + let expected = format!( + "Projection: #{}.b\ + \n TableScan: {} projection=Some([1])", + UNNAMED_TABLE, UNNAMED_TABLE + ); assert_eq!(format!("{:?}", optimized_plan), expected); let physical_plan = ctx.create_physical_plan(&optimized_plan)?; @@ -2138,9 +2124,9 @@ mod tests { Field::new("c2", DataType::UInt32, false), ])); - let plan = LogicalPlanBuilder::scan_empty("", schema.as_ref(), None)? + let plan = LogicalPlanBuilder::scan_empty(None, schema.as_ref(), None)? .aggregate(vec![col("c1")], vec![sum(col("c2"))])? - .project(vec![col("c1"), col("SUM(c2)").alias("total_salary")])? + .project(vec![col("c1"), sum(col("c2")).alias("total_salary")])? .build()?; let plan = ctx.optimize(&plan)?; @@ -2590,7 +2576,7 @@ mod tests { assert_eq!( format!("{:?}", plan), - "Projection: #a, #b, my_add(#a, #b)\n TableScan: t projection=None" + "Projection: #t.a, #t.b, my_add(#t.a, #t.b)\n TableScan: t projection=None" ); let plan = ctx.optimize(&plan)?; diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index a674e3cdb0f1b..99eb7f077c96a 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -110,7 +110,12 @@ impl DataFrame for DataFrameImpl { right_cols: &[&str], ) -> Result> { let plan = LogicalPlanBuilder::from(&self.plan) - .join(&right.to_logical_plan(), join_type, left_cols, right_cols)? + .join( + &right.to_logical_plan(), + join_type, + left_cols.to_vec(), + right_cols.to_vec(), + )? 
.build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) } diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 6bd5181050fd6..4b4ed0fb9d413 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -24,19 +24,27 @@ use arrow::{ record_batch::RecordBatch, }; -use super::dfschema::ToDFSchema; -use super::{ - col, exprlist_to_fields, Expr, JoinType, LogicalPlan, PlanType, StringifiedPlan, -}; use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; -use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, Partitioning}; use crate::{ datasource::{empty::EmptyTable, parquet::ParquetTable, CsvFile, MemTable}, prelude::CsvReadOptions, }; + +use super::dfschema::ToDFSchema; +use super::{ + exprlist_to_fields, Expr, JoinConstraint, JoinType, LogicalPlan, PlanType, + StringifiedPlan, +}; +use crate::logical_plan::{ + columnize_expr, normalize_col, normalize_cols, Column, DFField, DFSchema, + DFSchemaRef, Partitioning, +}; use std::collections::HashSet; +/// Default table name for unnamed table +pub const UNNAMED_TABLE: &str = "?table?"; + /// Builder for logical plans /// /// ``` @@ -62,7 +70,7 @@ use std::collections::HashSet; /// // FROM employees /// // WHERE salary < 1000 /// let plan = LogicalPlanBuilder::scan_empty( -/// "employee.csv", +/// Some("employee"), /// &employee_schema(), /// None, /// )? @@ -102,7 +110,7 @@ impl LogicalPlanBuilder { projection: Option>, ) -> Result { let provider = Arc::new(MemTable::try_new(schema, partitions)?); - Self::scan("", provider, projection) + Self::scan(UNNAMED_TABLE, provider, projection) } /// Scan a CSV data source @@ -112,7 +120,7 @@ impl LogicalPlanBuilder { projection: Option>, ) -> Result { let provider = Arc::new(CsvFile::try_new(path, options)?); - Self::scan("", provider, projection) + Self::scan(path, provider, projection) } /// Scan a Parquet data source @@ -122,38 +130,53 @@ impl LogicalPlanBuilder { max_concurrency: usize, ) -> Result { let provider = Arc::new(ParquetTable::try_new(path, max_concurrency)?); - Self::scan("", provider, projection) + Self::scan(path, provider, projection) } /// Scan an empty data source, mainly used in tests pub fn scan_empty( - name: &str, + name: Option<&str>, table_schema: &Schema, projection: Option>, ) -> Result { let table_schema = Arc::new(table_schema.clone()); let provider = Arc::new(EmptyTable::new(table_schema)); - Self::scan(name, provider, projection) + Self::scan(name.unwrap_or(UNNAMED_TABLE), provider, projection) } /// Convert a table provider into a builder with a TableScan pub fn scan( - name: &str, + table_name: &str, provider: Arc, projection: Option>, ) -> Result { + if table_name.is_empty() { + return Err(DataFusionError::Plan( + "table_name cannot be empty".to_string(), + )); + } + let schema = provider.schema(); let projected_schema = projection .as_ref() - .map(|p| Schema::new(p.iter().map(|i| schema.field(*i).clone()).collect())) - .map_or(schema, SchemaRef::new) - .to_dfschema_ref()?; + .map(|p| { + DFSchema::new( + p.iter() + .map(|i| { + DFField::from_qualified(table_name, schema.field(*i).clone()) + }) + .collect(), + ) + }) + .unwrap_or_else(|| { + DFSchema::try_from_qualified_schema(table_name, &schema) + })?; let table_scan = LogicalPlan::TableScan { - table_name: name.to_string(), + table_name: table_name.to_string(), source: provider, - projected_schema, + projected_schema: Arc::new(projected_schema), projection, filters: vec![], limit: None, 
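
Side note (not part of the patch): a minimal usage sketch of the qualifier-aware builder changed in the hunk above. `scan_empty` now takes an `Option<&str>` table name (falling back to `UNNAMED_TABLE`), and projected expressions are normalized into qualified columns. The table name "employee" and its columns below are made up for illustration.

```rust
// Illustrative only: exercises the new scan_empty signature shown above.
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::logical_plan::{col, LogicalPlan, LogicalPlanBuilder};

fn qualified_projection() -> Result<LogicalPlan> {
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("salary", DataType::Int32, false),
    ]);
    // Passing Some("employee") qualifies every projected field, so the plan
    // should print the qualified `#employee.id` rather than a bare `#id`.
    LogicalPlanBuilder::scan_empty(Some("employee"), &schema, None)?
        .project(vec![col("id")])?
        .build()
}
```
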
@@ -170,16 +193,21 @@ impl LogicalPlanBuilder { /// * An invalid expression is used (e.g. a `sort` expression) pub fn project(&self, expr: impl IntoIterator) -> Result { let input_schema = self.plan.schema(); + let all_schemas = self.plan.all_schemas(); let mut projected_expr = vec![]; for e in expr { match e { Expr::Wildcard => { (0..input_schema.fields().len()).for_each(|i| { - projected_expr.push(col(input_schema.field(i).name())) + projected_expr + .push(Expr::Column(input_schema.field(i).qualified_column())) }); } - _ => projected_expr.push(e), - }; + _ => projected_expr.push(columnize_expr( + normalize_col(e, &all_schemas)?, + input_schema, + )), + } } validate_unique_names("Projections", projected_expr.iter(), input_schema)?; @@ -195,6 +223,7 @@ impl LogicalPlanBuilder { /// Apply a filter pub fn filter(&self, expr: Expr) -> Result { + let expr = normalize_col(expr, &self.plan.all_schemas())?; Ok(Self::from(&LogicalPlan::Filter { predicate: expr, input: Arc::new(self.plan.clone()), @@ -210,69 +239,103 @@ impl LogicalPlanBuilder { } /// Apply a sort - pub fn sort(&self, expr: impl IntoIterator) -> Result { + pub fn sort(&self, exprs: impl IntoIterator) -> Result { + let schemas = self.plan.all_schemas(); Ok(Self::from(&LogicalPlan::Sort { - expr: expr.into_iter().collect(), + expr: normalize_cols(exprs, &schemas)?, input: Arc::new(self.plan.clone()), })) } /// Apply a union pub fn union(&self, plan: LogicalPlan) -> Result { - let schema = self.plan.schema(); + Ok(Self::from(&union_with_alias( + self.plan.clone(), + plan, + None, + )?)) + } - if plan.schema() != schema { + /// Apply a join with on constraint + pub fn join( + &self, + right: &LogicalPlan, + join_type: JoinType, + left_keys: Vec>, + right_keys: Vec>, + ) -> Result { + if left_keys.len() != right_keys.len() { return Err(DataFusionError::Plan( - "Schema's for union should be the same ".to_string(), + "left_keys and right_keys were not the same length".to_string(), )); } - // Add plan to existing union if possible - let mut inputs = match &self.plan { - LogicalPlan::Union { inputs, .. } => inputs.clone(), - _ => vec![self.plan.clone()], - }; - inputs.push(plan); - Ok(Self::from(&LogicalPlan::Union { - inputs, - schema: schema.clone(), - alias: None, + let left_keys: Vec = left_keys + .into_iter() + .map(|c| c.into().normalize(&self.plan.all_schemas())) + .collect::>()?; + let right_keys: Vec = right_keys + .into_iter() + .map(|c| c.into().normalize(&right.all_schemas())) + .collect::>()?; + let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); + let join_schema = build_join_schema( + self.plan.schema(), + right.schema(), + &on, + &join_type, + &JoinConstraint::On, + )?; + + Ok(Self::from(&LogicalPlan::Join { + left: Arc::new(self.plan.clone()), + right: Arc::new(right.clone()), + on, + join_type, + join_constraint: JoinConstraint::On, + schema: DFSchemaRef::new(join_schema), })) } - /// Apply a join - pub fn join( + /// Apply a join with using constraint, which duplicates all join columns in output schema. 
+ pub fn join_using( &self, right: &LogicalPlan, join_type: JoinType, - left_keys: &[&str], - right_keys: &[&str], + using_keys: Vec + Clone>, ) -> Result { - if left_keys.len() != right_keys.len() { - Err(DataFusionError::Plan( - "left_keys and right_keys were not the same length".to_string(), - )) - } else { - let on: Vec<_> = left_keys - .iter() - .zip(right_keys.iter()) - .map(|(x, y)| (x.to_string(), y.to_string())) - .collect::>(); - let join_schema = - build_join_schema(self.plan.schema(), right.schema(), &on, &join_type)?; - Ok(Self::from(&LogicalPlan::Join { - left: Arc::new(self.plan.clone()), - right: Arc::new(right.clone()), - on, - join_type, - schema: DFSchemaRef::new(join_schema), - })) - } + let left_keys: Vec = using_keys + .clone() + .into_iter() + .map(|c| c.into().normalize(&self.plan.all_schemas())) + .collect::>()?; + let right_keys: Vec = using_keys + .into_iter() + .map(|c| c.into().normalize(&right.all_schemas())) + .collect::>()?; + + let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); + let join_schema = build_join_schema( + self.plan.schema(), + right.schema(), + &on, + &join_type, + &JoinConstraint::Using, + )?; + + Ok(Self::from(&LogicalPlan::Join { + left: Arc::new(self.plan.clone()), + right: Arc::new(right.clone()), + on, + join_type, + join_constraint: JoinConstraint::Using, + schema: DFSchemaRef::new(join_schema), + })) } + /// Apply a cross join pub fn cross_join(&self, right: &LogicalPlan) -> Result { let schema = self.plan.schema().join(right.schema())?; - Ok(Self::from(&LogicalPlan::CrossJoin { left: Arc::new(self.plan.clone()), right: Arc::new(right.clone()), @@ -320,9 +383,9 @@ impl LogicalPlanBuilder { group_expr: impl IntoIterator, aggr_expr: impl IntoIterator, ) -> Result { - let group_expr = group_expr.into_iter().collect::>(); - let aggr_expr = aggr_expr.into_iter().collect::>(); - + let schemas = self.plan.all_schemas(); + let group_expr = normalize_cols(group_expr, &schemas)?; + let aggr_expr = normalize_cols(aggr_expr, &schemas)?; let all_expr = group_expr.iter().chain(aggr_expr.iter()); validate_unique_names("Aggregations", all_expr.clone(), self.plan.schema())?; @@ -363,27 +426,35 @@ impl LogicalPlanBuilder { /// Creates a schema for a join operation. 
/// The fields from the left side are first -fn build_join_schema( +pub fn build_join_schema( left: &DFSchema, right: &DFSchema, - on: &[(String, String)], + on: &[(Column, Column)], join_type: &JoinType, + join_constraint: &JoinConstraint, ) -> Result { let fields: Vec = match join_type { JoinType::Inner | JoinType::Left | JoinType::Full => { - // remove right-side join keys if they have the same names as the left-side - let duplicate_keys = &on - .iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.to_string()) - .collect::>(); + let duplicate_keys = match join_constraint { + JoinConstraint::On => on + .iter() + .filter(|(l, r)| l == r) + .map(|on| on.1.clone()) + .collect::>(), + // using join requires unique join columns in the output schema, so we mark all + // right join keys as duplicate + JoinConstraint::Using => { + on.iter().map(|on| on.1.clone()).collect::>() + } + }; let left_fields = left.fields().iter(); + // remove right-side join keys if they have the same names as the left-side let right_fields = right .fields() .iter() - .filter(|f| !duplicate_keys.contains(f.name())); + .filter(|f| !duplicate_keys.contains(&f.qualified_column())); // left then right left_fields.chain(right_fields).cloned().collect() @@ -393,17 +464,24 @@ fn build_join_schema( left.fields().clone() } JoinType::Right => { - // remove left-side join keys if they have the same names as the right-side - let duplicate_keys = &on - .iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.to_string()) - .collect::>(); + let duplicate_keys = match join_constraint { + JoinConstraint::On => on + .iter() + .filter(|(l, r)| l == r) + .map(|on| on.1.clone()) + .collect::>(), + // using join requires unique join columns in the output schema, so we mark all + // left join keys as duplicate + JoinConstraint::Using => { + on.iter().map(|on| on.0.clone()).collect::>() + } + }; + // remove left-side join keys if they have the same names as the right-side let left_fields = left .fields() .iter() - .filter(|f| !duplicate_keys.contains(f.name())); + .filter(|f| !duplicate_keys.contains(&f.qualified_column())); let right_fields = right.fields().iter(); @@ -411,6 +489,7 @@ fn build_join_schema( left_fields.chain(right_fields).cloned().collect() } }; + DFSchema::new(fields) } @@ -441,17 +520,56 @@ fn validate_unique_names<'a>( }) } +/// Union two logical plans with an optional alias. +pub fn union_with_alias( + left_plan: LogicalPlan, + right_plan: LogicalPlan, + alias: Option, +) -> Result { + let inputs = vec![left_plan, right_plan] + .into_iter() + .flat_map(|p| match p { + LogicalPlan::Union { inputs, .. 
} => inputs, + x => vec![x], + }) + .collect::>(); + if inputs.is_empty() { + return Err(DataFusionError::Plan("Empty UNION".to_string())); + } + + let union_schema = (**inputs[0].schema()).clone(); + let union_schema = Arc::new(match alias { + Some(ref alias) => union_schema.replace_qualifier(alias.as_str()), + None => union_schema.strip_qualifiers(), + }); + if !inputs.iter().skip(1).all(|input_plan| { + // union changes all qualifers in resulting schema, so we only need to + // match against arrow schema here, which doesn't include qualifiers + union_schema.matches_arrow_schema(&((**input_plan.schema()).clone().into())) + }) { + return Err(DataFusionError::Plan( + "UNION ALL schemas are expected to be the same".to_string(), + )); + } + + Ok(LogicalPlan::Union { + schema: union_schema, + inputs, + alias, + }) +} + #[cfg(test)] mod tests { use arrow::datatypes::{DataType, Field}; - use super::super::{lit, sum}; + use super::super::{col, lit, sum}; use super::*; #[test] fn plan_builder_simple() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), Some(vec![0, 3]), )? @@ -459,9 +577,9 @@ mod tests { .project(vec![col("id")])? .build()?; - let expected = "Projection: #id\ - \n Filter: #state Eq Utf8(\"CO\")\ - \n TableScan: employee.csv projection=Some([0, 3])"; + let expected = "Projection: #employee_csv.id\ + \n Filter: #employee_csv.state Eq Utf8(\"CO\")\ + \n TableScan: employee_csv projection=Some([0, 3])"; assert_eq!(expected, format!("{:?}", plan)); @@ -471,7 +589,7 @@ mod tests { #[test] fn plan_builder_aggregate() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), Some(vec![3, 4]), )? @@ -482,9 +600,9 @@ mod tests { .project(vec![col("state"), col("total_salary")])? .build()?; - let expected = "Projection: #state, #total_salary\ - \n Aggregate: groupBy=[[#state]], aggr=[[SUM(#salary) AS total_salary]]\ - \n TableScan: employee.csv projection=Some([3, 4])"; + let expected = "Projection: #employee_csv.state, #total_salary\ + \n Aggregate: groupBy=[[#employee_csv.state]], aggr=[[SUM(#employee_csv.salary) AS total_salary]]\ + \n TableScan: employee_csv projection=Some([3, 4])"; assert_eq!(expected, format!("{:?}", plan)); @@ -494,7 +612,7 @@ mod tests { #[test] fn plan_builder_sort() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), Some(vec![3, 4]), )? @@ -505,15 +623,15 @@ mod tests { nulls_first: true, }, Expr::Sort { - expr: Box::new(col("total_salary")), + expr: Box::new(col("salary")), asc: false, nulls_first: false, }, ])? 
.build()?; - let expected = "Sort: #state ASC NULLS FIRST, #total_salary DESC NULLS LAST\ - \n TableScan: employee.csv projection=Some([3, 4])"; + let expected = "Sort: #employee_csv.state ASC NULLS FIRST, #employee_csv.salary DESC NULLS LAST\ + \n TableScan: employee_csv projection=Some([3, 4])"; assert_eq!(expected, format!("{:?}", plan)); @@ -523,7 +641,7 @@ mod tests { #[test] fn plan_builder_union_combined_single_union() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), Some(vec![3, 4]), )?; @@ -536,10 +654,10 @@ mod tests { // output has only one union let expected = "Union\ - \n TableScan: employee.csv projection=Some([3, 4])\ - \n TableScan: employee.csv projection=Some([3, 4])\ - \n TableScan: employee.csv projection=Some([3, 4])\ - \n TableScan: employee.csv projection=Some([3, 4])"; + \n TableScan: employee_csv projection=Some([3, 4])\ + \n TableScan: employee_csv projection=Some([3, 4])\ + \n TableScan: employee_csv projection=Some([3, 4])\ + \n TableScan: employee_csv projection=Some([3, 4])"; assert_eq!(expected, format!("{:?}", plan)); @@ -549,9 +667,10 @@ mod tests { #[test] fn projection_non_unique_names() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), - Some(vec![0, 3]), + // project id and first_name by column index + Some(vec![0, 1]), )? // two columns with the same name => error .project(vec![col("id"), col("first_name").alias("id")]); @@ -560,9 +679,8 @@ mod tests { Err(DataFusionError::Plan(e)) => { assert_eq!( e, - "Projections require unique expression names \ - but the expression \"#id\" at position 0 and \"#first_name AS id\" at \ - position 1 have the same name. Consider aliasing (\"AS\") one of them." + "Schema contains qualified field name 'employee_csv.id' \ + and unqualified field name 'id' which would be ambiguous" ); Ok(()) } @@ -575,9 +693,10 @@ mod tests { #[test] fn aggregate_non_unique_names() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), - Some(vec![0, 3]), + // project state and salary by column index + Some(vec![3, 4]), )? // two columns with the same name => error .aggregate(vec![col("state")], vec![sum(col("salary")).alias("state")]); @@ -586,9 +705,8 @@ mod tests { Err(DataFusionError::Plan(e)) => { assert_eq!( e, - "Aggregations require unique expression names \ - but the expression \"#state\" at position 0 and \"SUM(#salary) AS state\" at \ - position 1 have the same name. Consider aliasing (\"AS\") one of them." 
+ "Schema contains qualified field name 'employee_csv.state' and \ + unqualified field name 'state' which would be ambiguous" ); Ok(()) } diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index c5437b3af953c..e754addb9da77 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -23,6 +23,7 @@ use std::convert::TryFrom; use std::sync::Arc; use crate::error::{DataFusionError, Result}; +use crate::logical_plan::Column; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use std::fmt::{Display, Formatter}; @@ -88,7 +89,7 @@ impl DFSchema { } /// Create a `DFSchema` from an Arrow schema - pub fn try_from_qualified(qualifier: &str, schema: &Schema) -> Result { + pub fn try_from_qualified_schema(qualifier: &str, schema: &Schema) -> Result { Self::new( schema .fields() @@ -108,6 +109,21 @@ impl DFSchema { Self::new(fields) } + /// Merge a schema into self + pub fn merge(&mut self, other_schema: &DFSchema) { + for field in other_schema.fields() { + // skip duplicate columns + let duplicated_field = match field.qualifier() { + Some(q) => self.field_with_name(Some(q.as_str()), field.name()).is_ok(), + // for unqualifed columns, check as unqualified name + None => self.field_with_unqualified_name(field.name()).is_ok(), + }; + if !duplicated_field { + self.fields.push(field.clone()); + } + } + } + /// Get a list of fields pub fn fields(&self) -> &Vec { &self.fields @@ -119,7 +135,7 @@ impl DFSchema { &self.fields[i] } - /// Find the index of the column with the given name + /// Find the index of the column with the given unqualifed name pub fn index_of(&self, name: &str) -> Result { for i in 0..self.fields.len() { if self.fields[i].name() == name { @@ -129,6 +145,20 @@ impl DFSchema { Err(DataFusionError::Plan(format!("No field named '{}'", name))) } + /// Find the index of the column with the given qualifer and name + pub fn index_of_column(&self, col: &Column) -> Result { + for i in 0..self.fields.len() { + let field = &self.fields[i]; + if field.qualifier() == col.relation.as_ref() && field.name() == &col.name { + return Ok(i); + } + } + Err(DataFusionError::Plan(format!( + "No field matches column '{}'", + col, + ))) + } + /// Find the field with the given name pub fn field_with_name( &self, @@ -150,7 +180,10 @@ impl DFSchema { .filter(|field| field.name() == name) .collect(); match matches.len() { - 0 => Err(DataFusionError::Plan(format!("No field named '{}'", name))), + 0 => Err(DataFusionError::Plan(format!( + "No field with unqualified name '{}'", + name + ))), 1 => Ok(matches[0].to_owned()), _ => Err(DataFusionError::Plan(format!( "Ambiguous reference to field named '{}'", @@ -184,6 +217,62 @@ impl DFSchema { ))), } } + + /// Find the field with the given qualified column + pub fn field_from_qualified_column(&self, column: &Column) -> Result { + match &column.relation { + Some(r) => self.field_with_qualified_name(r, &column.name), + None => self.field_with_unqualified_name(&column.name), + } + } + + /// Check to see if unqualified field names matches field names in Arrow schema + pub fn matches_arrow_schema(&self, arrow_schema: &Schema) -> bool { + self.fields + .iter() + .zip(arrow_schema.fields().iter()) + .all(|(dffield, arrowfield)| dffield.name() == arrowfield.name()) + } + + /// Strip all field qualifier in schema + pub fn strip_qualifiers(self) -> Self { + DFSchema { + fields: self + .fields + .into_iter() + .map(|f| { + if f.qualifier().is_some() { + DFField::new( + None, + f.name(), + 
f.data_type().to_owned(), + f.is_nullable(), + ) + } else { + f + } + }) + .collect(), + } + } + + /// Replace all field qualifier with new value in schema + pub fn replace_qualifier(self, qualifer: &str) -> Self { + DFSchema { + fields: self + .fields + .into_iter() + .map(|f| { + DFField::new( + Some(qualifer), + f.name(), + f.data_type().to_owned(), + f.is_nullable(), + ) + }) + .collect(), + } + } } impl Into for DFSchema { @@ -195,7 +284,7 @@ impl Into for DFSchema { .map(|f| { if f.qualifier().is_some() { Field::new( - f.qualified_name().as_str(), + f.name().as_str(), f.data_type().to_owned(), f.is_nullable(), ) @@ -208,6 +297,13 @@ impl Into for DFSchema { } } +impl Into for &DFSchema { + /// Convert a schema into a DFSchema + fn into(self) -> Schema { + Schema::new(self.fields.iter().map(|f| f.field.clone()).collect()) + } +} + /// Create a `DFSchema` from an Arrow schema impl TryFrom for DFSchema { type Error = DataFusionError; @@ -340,7 +436,7 @@ impl DFField { self.field.is_nullable() } - /// Returns a reference to the `DFField`'s qualified name + /// Returns a string to the `DFField`'s qualified name pub fn qualified_name(&self) -> String { if let Some(relation_name) = &self.qualifier { format!("{}.{}", relation_name, self.field.name()) @@ -349,10 +445,23 @@ impl DFField { } } + /// Builds a qualified column based on self + pub fn qualified_column(&self) -> Column { + Column { + relation: self.qualifier.clone(), + name: self.field.name().to_string(), + } + } + /// Get the optional qualifier pub fn qualifier(&self) -> Option<&String> { self.qualifier.as_ref() } + + /// Get the arrow field + pub fn field(&self) -> &Field { + &self.field + } } #[cfg(test)] @@ -385,25 +494,25 @@ mod tests { #[test] fn from_qualified_schema() -> Result<()> { - let schema = DFSchema::try_from_qualified("t1", &test_schema_1())?; + let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; assert_eq!("t1.c0, t1.c1", schema.to_string()); Ok(()) } #[test] fn from_qualified_schema_into_arrow_schema() -> Result<()> { - let schema = DFSchema::try_from_qualified("t1", &test_schema_1())?; + let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let arrow_schema: Schema = schema.into(); - let expected = "Field { name: \"t1.c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"t1.c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }"; + let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, \ + Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }"; assert_eq!(expected, arrow_schema.to_string()); Ok(()) } #[test] fn join_qualified() -> Result<()> { - let left = DFSchema::try_from_qualified("t1", &test_schema_1())?; - let right = DFSchema::try_from_qualified("t2", &test_schema_1())?; + let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; + let right = DFSchema::try_from_qualified_schema("t2", &test_schema_1())?; let join = left.join(&right)?; assert_eq!("t1.c0, t1.c1, t2.c0, t2.c1", join.to_string()); // test valid access @@ -418,8 +527,8 @@ mod tests { #[test] fn join_qualified_duplicate() -> Result<()> { - let left = DFSchema::try_from_qualified("t1", &test_schema_1())?; - let right = DFSchema::try_from_qualified("t1", &test_schema_1())?; + let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; + let right = 
DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let join = left.join(&right); assert!(join.is_err()); assert_eq!( @@ -446,7 +555,7 @@ mod tests { #[test] fn join_mixed() -> Result<()> { - let left = DFSchema::try_from_qualified("t1", &test_schema_1())?; + let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from(test_schema_2())?; let join = left.join(&right)?; assert_eq!("t1.c0, t1.c1, c100, c101", join.to_string()); @@ -464,7 +573,7 @@ mod tests { #[test] fn join_mixed_duplicate() -> Result<()> { - let left = DFSchema::try_from_qualified("t1", &test_schema_1())?; + let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from(test_schema_1())?; let join = left.join(&right); assert!(join.is_err()); diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 58dba16f02efe..1c5cc770c94ff 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -20,7 +20,7 @@ pub use super::Operator; use crate::error::{DataFusionError, Result}; -use crate::logical_plan::{window_frames, DFField, DFSchema}; +use crate::logical_plan::{window_frames, DFField, DFSchema, DFSchemaRef}; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, window_functions, @@ -33,6 +33,90 @@ use std::collections::HashSet; use std::fmt; use std::sync::Arc; +/// A named reference to a qualified field in a schema. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Column { + /// relation/table name. + pub relation: Option, + /// field/column name. + pub name: String, +} + +impl Column { + /// Create Column from unqualified name. + pub fn from_name(name: String) -> Self { + Self { + relation: None, + name, + } + } + + /// Deserialize a fully qualified name string into a column + pub fn from_qualified_name(flat_name: &str) -> Self { + use sqlparser::tokenizer::Token; + + let dialect = sqlparser::dialect::GenericDialect {}; + let mut tokenizer = sqlparser::tokenizer::Tokenizer::new(&dialect, flat_name); + if let Ok(tokens) = tokenizer.tokenize() { + if let [Token::Word(relation), Token::Period, Token::Word(name)] = + tokens.as_slice() + { + return Column { + relation: Some(relation.value.clone()), + name: name.value.clone(), + }; + } + } + // any expression that's not in the form of `foo.bar` will be treated as unqualified column + // name + Column { + relation: None, + name: String::from(flat_name), + } + } + + /// Serialize column into a flat name string + pub fn flat_name(&self) -> String { + match &self.relation { + Some(r) => format!("{}.{}", r, self.name), + None => self.name.clone(), + } + } + + /// Normalize Column with qualifier based on provided dataframe schemas. 
+ pub fn normalize(self, schemas: &[&DFSchemaRef]) -> Result { + if self.relation.is_some() { + return Ok(self); + } + + for schema in schemas { + if let Ok(field) = schema.field_with_unqualified_name(&self.name) { + return Ok(field.qualified_column()); + } + } + + Err(DataFusionError::Plan(format!( + "Column {} not found in provided schemas", + self + ))) + } +} + +impl From<&str> for Column { + fn from(c: &str) -> Self { + Self::from_qualified_name(c) + } +} + +impl fmt::Display for Column { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.relation { + Some(r) => write!(f, "#{}.{}", r, self.name), + None => write!(f, "#{}", self.name), + } + } +} + /// `Expr` is a central struct of DataFusion's query API, and /// represent logical expressions such as `A + 1`, or `CAST(c1 AS /// int)`. @@ -47,7 +131,7 @@ use std::sync::Arc; /// ``` /// # use datafusion::logical_plan::*; /// let expr = col("c1"); -/// assert_eq!(expr, Expr::Column("c1".to_string())); +/// assert_eq!(expr, Expr::Column(Column::from_name("c1".to_string()))); /// ``` /// /// ## Create the expression `c1 + c2` to add columns "c1" and "c2" together @@ -81,8 +165,8 @@ use std::sync::Arc; pub enum Expr { /// An expression with a specific name. Alias(Box, String), - /// A named reference to a field in a schema. - Column(String), + /// A named reference to a qualified filed in a schema. + Column(Column), /// A named reference to a variable in a registry. ScalarVariable(Vec), /// A constant value. @@ -232,10 +316,9 @@ impl Expr { pub fn get_type(&self, schema: &DFSchema) -> Result { match self { Expr::Alias(expr, _) => expr.get_type(schema), - Expr::Column(name) => Ok(schema - .field_with_unqualified_name(name)? - .data_type() - .clone()), + Expr::Column(c) => { + Ok(schema.field_from_qualified_column(c)?.data_type().clone()) + } Expr::ScalarVariable(_) => Ok(DataType::Utf8), Expr::Literal(l) => Ok(l.get_datatype()), Expr::Case { when_then_expr, .. } => when_then_expr[0].1.get_type(schema), @@ -307,9 +390,9 @@ impl Expr { pub fn nullable(&self, input_schema: &DFSchema) -> Result { match self { Expr::Alias(expr, _) => expr.nullable(input_schema), - Expr::Column(name) => Ok(input_schema - .field_with_unqualified_name(name)? - .is_nullable()), + Expr::Column(c) => { + Ok(input_schema.field_from_qualified_column(c)?.is_nullable()) + } Expr::Literal(value) => Ok(value.is_null()), Expr::ScalarVariable(_) => Ok(true), Expr::Case { @@ -355,7 +438,7 @@ impl Expr { } } - /// Returns the name of this expression based on [arrow::datatypes::Schema]. + /// Returns the name of this expression based on [crate::logical_plan::DFSchema]. /// /// This represents how a column with this expression is named when no alias is chosen pub fn name(&self, input_schema: &DFSchema) -> Result { @@ -364,12 +447,20 @@ impl Expr { /// Returns a [arrow::datatypes::Field] compatible with this expression. pub fn to_field(&self, input_schema: &DFSchema) -> Result { - Ok(DFField::new( - None, //TODO qualifier - &self.name(input_schema)?, - self.get_type(input_schema)?, - self.nullable(input_schema)?, - )) + match self { + Expr::Column(c) => Ok(DFField::new( + c.relation.as_deref(), + &c.name, + self.get_type(input_schema)?, + self.nullable(input_schema)?, + )), + _ => Ok(DFField::new( + None, + &self.name(input_schema)?, + self.get_type(input_schema)?, + self.nullable(input_schema)?, + )), + } } /// Wraps this expression in a cast to a target [arrow::datatypes::DataType]. 
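A short sketch of the new Column type and the updated col() helper (the identifiers t1 and id are made up; the behaviour follows the definitions in this hunk):

use datafusion::logical_plan::{col, Column, Expr};

fn column_sketch() {
    // col() now parses qualified identifiers into a Column with a relation.
    let expr = col("t1.id");
    assert_eq!(
        expr,
        Expr::Column(Column {
            relation: Some("t1".to_string()),
            name: "id".to_string(),
        })
    );

    // flat_name() serializes back to "table.column"; Display prefixes with '#'
    // as used throughout the plan output.
    let c = Column::from_qualified_name("t1.id");
    assert_eq!(c.flat_name(), "t1.id");
    assert_eq!(format!("{}", c), "#t1.id");

    // Names without a '.' stay unqualified until normalize() resolves them
    // against a schema.
    let unqualified = Column::from_name("id".to_string());
    assert_eq!(format!("{}", unqualified), "#id");
}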
@@ -540,7 +631,7 @@ impl Expr { // recurse (and cover all expression types) let visitor = match self { Expr::Alias(expr, _) => expr.accept(visitor), - Expr::Column(..) => Ok(visitor), + Expr::Column(_) => Ok(visitor), Expr::ScalarVariable(..) => Ok(visitor), Expr::Literal(..) => Ok(visitor), Expr::BinaryExpr { left, right, .. } => { @@ -668,7 +759,7 @@ impl Expr { // recurse into all sub expressions(and cover all expression types) let expr = match self { Expr::Alias(expr, name) => Expr::Alias(rewrite_boxed(expr, rewriter)?, name), - Expr::Column(name) => Expr::Column(name), + Expr::Column(_) => self.clone(), Expr::ScalarVariable(names) => Expr::ScalarVariable(names), Expr::Literal(value) => Expr::Literal(value), Expr::BinaryExpr { left, op, right } => Expr::BinaryExpr { @@ -985,9 +1076,72 @@ pub fn or(left: Expr, right: Expr) -> Expr { } } -/// Create a column expression based on a column name -pub fn col(name: &str) -> Expr { - Expr::Column(name.to_owned()) +/// Create a column expression based on a qualified or unqualified column name +pub fn col(ident: &str) -> Expr { + Expr::Column(ident.into()) +} + +/// Convert an expression into Column expression if it's already provided as input plan. +/// +/// For example, it rewrites: +/// +/// ```ignore +/// .aggregate(vec![col("c1")], vec![sum(col("c2"))])? +/// .project(vec![col("c1"), sum(col("c2"))? +/// ``` +/// +/// Into: +/// +/// ```ignore +/// .aggregate(vec![col("c1")], vec![sum(col("c2"))])? +/// .project(vec![col("c1"), col("SUM(#c2)")? +/// ``` +pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { + match e { + Expr::Column(_) => e, + Expr::Alias(inner_expr, name) => { + Expr::Alias(Box::new(columnize_expr(*inner_expr, input_schema)), name) + } + _ => match e.name(input_schema) { + Ok(name) => match input_schema.field_with_unqualified_name(&name) { + Ok(field) => Expr::Column(field.qualified_column()), + // expression not provided as input, do not convert to a column reference + Err(_) => e, + }, + Err(_) => e, + }, + } +} + +/// Recursively normalize all Column expressions in a given expression tree +pub fn normalize_col(e: Expr, schemas: &[&DFSchemaRef]) -> Result { + struct ColumnNormalizer<'a, 'b> { + schemas: &'a [&'b DFSchemaRef], + } + + impl<'a, 'b> ExprRewriter for ColumnNormalizer<'a, 'b> { + fn mutate(&mut self, expr: Expr) -> Result { + if let Expr::Column(c) = expr { + Ok(Expr::Column(c.normalize(self.schemas)?)) + } else { + Ok(expr) + } + } + } + + e.rewrite(&mut ColumnNormalizer { schemas }) +} + +/// Recursively normalize all Column expressions in a list of expression trees +#[inline] +pub fn normalize_cols( + exprs: impl IntoIterator, + schemas: &[&DFSchemaRef], +) -> Result> { + exprs + .into_iter() + .map(|e| normalize_col(e, schemas)) + .collect() } /// Create an expression to represent the min() aggregate function @@ -1240,7 +1394,7 @@ impl fmt::Debug for Expr { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Expr::Alias(expr, alias) => write!(f, "{:?} AS {}", expr, alias), - Expr::Column(name) => write!(f, "#{}", name), + Expr::Column(c) => write!(f, "{}", c), Expr::ScalarVariable(var_names) => write!(f, "{}", var_names.join(".")), Expr::Literal(v) => write!(f, "{:?}", v), Expr::Case { @@ -1373,7 +1527,7 @@ fn create_function_name( fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { match e { Expr::Alias(_, name) => Ok(name.clone()), - Expr::Column(name) => Ok(name.clone()), + Expr::Column(c) => Ok(c.flat_name()), Expr::ScalarVariable(variable_names) => 
Ok(variable_names.join(".")), Expr::Literal(value) => Ok(format!("{:?}", value)), Expr::BinaryExpr { left, op, right } => { @@ -1524,8 +1678,8 @@ mod tests { #[test] fn filter_is_null_and_is_not_null() { - let col_null = Expr::Column("col1".to_string()); - let col_not_null = Expr::Column("col2".to_string()); + let col_null = col("col1"); + let col_not_null = col("col2"); assert_eq!(format!("{:?}", col_null.is_null()), "#col1 IS NULL"); assert_eq!( format!("{:?}", col_not_null.is_not_null()), diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 4a39e114d53f1..69d03d22bb21a 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -30,22 +30,26 @@ mod operators; mod plan; mod registry; pub mod window_frames; -pub use builder::LogicalPlanBuilder; +pub use builder::{ + build_join_schema, union_with_alias, LogicalPlanBuilder, UNNAMED_TABLE, +}; pub use dfschema::{DFField, DFSchema, DFSchemaRef, ToDFSchema}; pub use display::display_schema; pub use expr::{ abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length, btrim, case, - ceil, character_length, chr, col, combine_filters, concat, concat_ws, cos, count, - count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list, - initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, now, - octet_length, or, random, regexp_match, regexp_replace, repeat, replace, reverse, - right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, - sqrt, starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, upper, - when, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, + ceil, character_length, chr, col, columnize_expr, combine_filters, concat, concat_ws, + cos, count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, + in_list, initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, + min, normalize_col, normalize_cols, now, octet_length, or, random, regexp_match, + regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, + sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, + to_hex, translate, trim, trunc, upper, when, Column, Expr, ExprRewriter, + ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; pub use plan::{ - JoinType, LogicalPlan, Partitioning, PlanType, PlanVisitor, StringifiedPlan, + JoinConstraint, JoinType, LogicalPlan, Partitioning, PlanType, PlanVisitor, + StringifiedPlan, }; pub use registry::FunctionRegistry; diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index a80bc54b4a2f1..256247228213e 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -17,18 +17,14 @@ //! This module contains the `LogicalPlan` enum that describes queries //! via a logical query plan. 
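A rough sketch of how the normalize_col helper exported above resolves unqualified references (the table name t and field a are assumptions; the printed form follows the expected strings in this patch's tests):

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::logical_plan::{col, lit, normalize_col, LogicalPlanBuilder};

fn normalize_col_sketch() -> Result<()> {
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    let plan = LogicalPlanBuilder::scan_empty(Some("t"), &schema, None)?.build()?;

    // "a" carries no qualifier; normalize_col looks it up in the plan's schemas
    // and rewrites it to the qualified column t.a.
    let expr = normalize_col(col("a").gt(lit(5i64)), &plan.all_schemas())?;
    assert_eq!(format!("{:?}", expr), "#t.a Gt Int64(5)");
    Ok(())
}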
-use super::expr::Expr; +use super::display::{GraphvizVisitor, IndentVisitor}; +use super::expr::{Column, Expr}; use super::extension::UserDefinedLogicalNode; -use super::{ - col, - display::{GraphvizVisitor, IndentVisitor}, -}; use crate::datasource::TableProvider; use crate::logical_plan::dfschema::DFSchemaRef; use crate::sql::parser::FileType; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use std::{ - cmp::min, fmt::{self, Display}, sync::Arc, }; @@ -50,6 +46,15 @@ pub enum JoinType { Anti, } +/// Join constraint +#[derive(Debug, Clone, Copy)] +pub enum JoinConstraint { + /// Join ON + On, + /// Join USING + Using, +} + /// A LogicalPlan represents the different types of relational /// operators (such as Projection, Filter, etc) and can be created by /// the SQL query planner and the DataFrame API. @@ -125,9 +130,11 @@ pub enum LogicalPlan { /// Right input right: Arc, /// Equijoin clause expressed as pairs of (left, right) join columns - on: Vec<(String, String)>, + on: Vec<(Column, Column)>, /// Join type join_type: JoinType, + /// Join constraint + join_constraint: JoinConstraint, /// The output schema, containing fields from the left and right inputs schema: DFSchemaRef, }, @@ -312,9 +319,10 @@ impl LogicalPlan { aggr_expr, .. } => group_expr.iter().chain(aggr_expr.iter()).cloned().collect(), - LogicalPlan::Join { on, .. } => { - on.iter().flat_map(|(l, r)| vec![col(l), col(r)]).collect() - } + LogicalPlan::Join { on, .. } => on + .iter() + .flat_map(|(l, r)| vec![Expr::Column(l.clone()), Expr::Column(r.clone())]) + .collect(), LogicalPlan::Sort { expr, .. } => expr.clone(), LogicalPlan::Extension { node } => node.expressions(), // plans without expressions @@ -479,9 +487,9 @@ impl LogicalPlan { /// per node. For example: /// /// ```text - /// Projection: #id - /// Filter: #state Eq Utf8(\"CO\")\ - /// CsvScan: employee.csv projection=Some([0, 3]) + /// Projection: #employee.id + /// Filter: #employee.state Eq Utf8(\"CO\")\ + /// CsvScan: employee projection=Some([0, 3]) /// ``` /// /// ``` @@ -490,15 +498,15 @@ impl LogicalPlan { /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// ]); - /// let plan = LogicalPlanBuilder::scan_empty("foo.csv", &schema, None).unwrap() + /// let plan = LogicalPlanBuilder::scan_empty(Some("foo_csv"), &schema, None).unwrap() /// .filter(col("id").eq(lit(5))).unwrap() /// .build().unwrap(); /// /// // Format using display_indent /// let display_string = format!("{}", plan.display_indent()); /// - /// assert_eq!("Filter: #id Eq Int32(5)\ - /// \n TableScan: foo.csv projection=None", + /// assert_eq!("Filter: #foo_csv.id Eq Int32(5)\ + /// \n TableScan: foo_csv projection=None", /// display_string); /// ``` pub fn display_indent(&self) -> impl fmt::Display + '_ { @@ -520,9 +528,9 @@ impl LogicalPlan { /// per node that includes the output schema. 
For example: /// /// ```text - /// Projection: #id [id:Int32]\ - /// Filter: #state Eq Utf8(\"CO\") [id:Int32, state:Utf8]\ - /// TableScan: employee.csv projection=Some([0, 3]) [id:Int32, state:Utf8]"; + /// Projection: #employee.id [id:Int32]\ + /// Filter: #employee.state Eq Utf8(\"CO\") [id:Int32, state:Utf8]\ + /// TableScan: employee projection=Some([0, 3]) [id:Int32, state:Utf8]"; /// ``` /// /// ``` @@ -531,15 +539,15 @@ impl LogicalPlan { /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// ]); - /// let plan = LogicalPlanBuilder::scan_empty("foo.csv", &schema, None).unwrap() + /// let plan = LogicalPlanBuilder::scan_empty(Some("foo_csv"), &schema, None).unwrap() /// .filter(col("id").eq(lit(5))).unwrap() /// .build().unwrap(); /// /// // Format using display_indent_schema /// let display_string = format!("{}", plan.display_indent_schema()); /// - /// assert_eq!("Filter: #id Eq Int32(5) [id:Int32]\ - /// \n TableScan: foo.csv projection=None [id:Int32]", + /// assert_eq!("Filter: #foo_csv.id Eq Int32(5) [id:Int32]\ + /// \n TableScan: foo_csv projection=None [id:Int32]", /// display_string); /// ``` pub fn display_indent_schema(&self) -> impl fmt::Display + '_ { @@ -571,7 +579,7 @@ impl LogicalPlan { /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// ]); - /// let plan = LogicalPlanBuilder::scan_empty("foo.csv", &schema, None).unwrap() + /// let plan = LogicalPlanBuilder::scan_empty(Some("foo.csv"), &schema, None).unwrap() /// .filter(col("id").eq(lit(5))).unwrap() /// .build().unwrap(); /// @@ -630,7 +638,7 @@ impl LogicalPlan { /// let schema = Schema::new(vec![ /// Field::new("id", DataType::Int32, false), /// ]); - /// let plan = LogicalPlanBuilder::scan_empty("foo.csv", &schema, None).unwrap() + /// let plan = LogicalPlanBuilder::scan_empty(Some("foo.csv"), &schema, None).unwrap() /// .build().unwrap(); /// /// // Format using display @@ -653,11 +661,10 @@ impl LogicalPlan { ref limit, .. 
} => { - let sep = " ".repeat(min(1, table_name.len())); write!( f, - "TableScan: {}{}projection={:?}", - table_name, sep, projection + "TableScan: {} projection={:?}", + table_name, projection )?; if !filters.is_empty() { @@ -826,7 +833,7 @@ mod tests { fn display_plan() -> LogicalPlan { LogicalPlanBuilder::scan_empty( - "employee.csv", + Some("employee_csv"), &employee_schema(), Some(vec![0, 3]), ) @@ -843,9 +850,9 @@ mod tests { fn test_display_indent() { let plan = display_plan(); - let expected = "Projection: #id\ - \n Filter: #state Eq Utf8(\"CO\")\ - \n TableScan: employee.csv projection=Some([0, 3])"; + let expected = "Projection: #employee_csv.id\ + \n Filter: #employee_csv.state Eq Utf8(\"CO\")\ + \n TableScan: employee_csv projection=Some([0, 3])"; assert_eq!(expected, format!("{}", plan.display_indent())); } @@ -854,9 +861,9 @@ mod tests { fn test_display_indent_schema() { let plan = display_plan(); - let expected = "Projection: #id [id:Int32]\ - \n Filter: #state Eq Utf8(\"CO\") [id:Int32, state:Utf8]\ - \n TableScan: employee.csv projection=Some([0, 3]) [id:Int32, state:Utf8]"; + let expected = "Projection: #employee_csv.id [id:Int32]\ + \n Filter: #employee_csv.state Eq Utf8(\"CO\") [id:Int32, state:Utf8]\ + \n TableScan: employee_csv projection=Some([0, 3]) [id:Int32, state:Utf8]"; assert_eq!(expected, format!("{}", plan.display_indent_schema())); } @@ -878,12 +885,12 @@ mod tests { ); assert!( graphviz.contains( - r#"[shape=box label="TableScan: employee.csv projection=Some([0, 3])"]"# + r#"[shape=box label="TableScan: employee_csv projection=Some([0, 3])"]"# ), "\n{}", plan.display_graphviz() ); - assert!(graphviz.contains(r#"[shape=box label="TableScan: employee.csv projection=Some([0, 3])\nSchema: [id:Int32, state:Utf8]"]"#), + assert!(graphviz.contains(r#"[shape=box label="TableScan: employee_csv projection=Some([0, 3])\nSchema: [id:Int32, state:Utf8]"]"#), "\n{}", plan.display_graphviz()); assert!( graphviz.contains(r#"// End DataFusion GraphViz Plan"#), @@ -1128,9 +1135,12 @@ mod tests { } fn test_plan() -> LogicalPlan { - let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("state", DataType::Utf8, false), + ]); - LogicalPlanBuilder::scan_empty("", &schema, Some(vec![0])) + LogicalPlanBuilder::scan_empty(None, &schema, Some(vec![0, 1])) .unwrap() .filter(col("state").eq(lit("CO"))) .unwrap() diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index d2ac5ce2f3837..956f74adc28f7 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -293,7 +293,7 @@ mod tests { Field::new("c", DataType::Boolean, false), Field::new("d", DataType::UInt32, false), ]); - LogicalPlanBuilder::scan_empty("test", &schema, None)?.build() + LogicalPlanBuilder::scan_empty(Some("test"), &schema, None)?.build() } fn expr_test_schema() -> DFSchemaRef { @@ -551,9 +551,9 @@ mod tests { .build()?; let expected = "\ - Projection: #a\ - \n Filter: NOT #c\ - \n Filter: #b\ + Projection: #test.a\ + \n Filter: NOT #test.c\ + \n Filter: #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -571,10 +571,10 @@ mod tests { .build()?; let expected = "\ - Projection: #a\ + Projection: #test.a\ \n Limit: 1\ - \n Filter: #c\ - \n Filter: NOT #b\ + \n Filter: #test.c\ + \n Filter: NOT #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, 
expected); @@ -590,8 +590,8 @@ mod tests { .build()?; let expected = "\ - Projection: #a\ - \n Filter: NOT #b And #c\ + Projection: #test.a\ + \n Filter: NOT #test.b And #test.c\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -607,8 +607,8 @@ mod tests { .build()?; let expected = "\ - Projection: #a\ - \n Filter: NOT #b Or NOT #c\ + Projection: #test.a\ + \n Filter: NOT #test.b Or NOT #test.c\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -624,8 +624,8 @@ mod tests { .build()?; let expected = "\ - Projection: #a\ - \n Filter: #b\ + Projection: #test.a\ + \n Filter: #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -640,7 +640,7 @@ mod tests { .build()?; let expected = "\ - Projection: #a, #d, NOT #b\ + Projection: #test.a, #test.d, NOT #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -659,8 +659,8 @@ mod tests { .build()?; let expected = "\ - Aggregate: groupBy=[[#a, #c]], aggr=[[MAX(#b), MIN(#b)]]\ - \n Projection: #a, #c, #b\ + Aggregate: groupBy=[[#test.a, #test.c]], aggr=[[MAX(#test.b), MIN(#test.b)]]\ + \n Projection: #test.a, #test.c, #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); diff --git a/datafusion/src/optimizer/eliminate_limit.rs b/datafusion/src/optimizer/eliminate_limit.rs index 1b965f1d02e42..4b5a634889a79 100644 --- a/datafusion/src/optimizer/eliminate_limit.rs +++ b/datafusion/src/optimizer/eliminate_limit.rs @@ -122,7 +122,7 @@ mod tests { // Left side is removed let expected = "Union\ \n EmptyRelation\ - \n Aggregate: groupBy=[[#a]], aggr=[[SUM(#b)]]\ + \n Aggregate: groupBy=[[#test.a]], aggr=[[SUM(#test.b)]]\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); } diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index dc4d5e993a380..e5f8dcfbfffd6 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -16,7 +16,7 @@ use crate::datasource::datasource::TableProviderFilterPushDown; use crate::execution::context::ExecutionProps; -use crate::logical_plan::{and, LogicalPlan}; +use crate::logical_plan::{and, Column, LogicalPlan}; use crate::logical_plan::{DFSchema, Expr}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; @@ -56,15 +56,15 @@ pub struct FilterPushDown {} #[derive(Debug, Clone, Default)] struct State { // (predicate, columns on the predicate) - filters: Vec<(Expr, HashSet)>, + filters: Vec<(Expr, HashSet)>, } -type Predicates<'a> = (Vec<&'a Expr>, Vec<&'a HashSet>); +type Predicates<'a> = (Vec<&'a Expr>, Vec<&'a HashSet>); /// returns all predicates in `state` that depend on any of `used_columns` fn get_predicates<'a>( state: &'a State, - used_columns: &HashSet, + used_columns: &HashSet, ) -> Predicates<'a> { state .filters @@ -89,19 +89,19 @@ fn get_join_predicates<'a>( left: &DFSchema, right: &DFSchema, ) -> ( - Vec<&'a HashSet>, - Vec<&'a HashSet>, + Vec<&'a HashSet>, + Vec<&'a HashSet>, Predicates<'a>, ) { let left_columns = &left .fields() .iter() - .map(|f| f.name().clone()) + .map(|f| f.qualified_column()) .collect::>(); let right_columns = &right .fields() .iter() - .map(|f| f.name().clone()) + .map(|f| f.qualified_column()) .collect::>(); let filters = state @@ -173,9 +173,9 @@ fn add_filter(plan: LogicalPlan, predicates: &[&Expr]) -> LogicalPlan { // remove all filters from `filters` that are 
in `predicate_columns` fn remove_filters( - filters: &[(Expr, HashSet)], - predicate_columns: &[&HashSet], -) -> Vec<(Expr, HashSet)> { + filters: &[(Expr, HashSet)], + predicate_columns: &[&HashSet], +) -> Vec<(Expr, HashSet)> { filters .iter() .filter(|(_, columns)| !predicate_columns.contains(&columns)) @@ -185,9 +185,9 @@ fn remove_filters( // keeps all filters from `filters` that are in `predicate_columns` fn keep_filters( - filters: &[(Expr, HashSet)], - predicate_columns: &[&HashSet], -) -> Vec<(Expr, HashSet)> { + filters: &[(Expr, HashSet)], + predicate_columns: &[&HashSet], +) -> Vec<(Expr, HashSet)> { filters .iter() .filter(|(_, columns)| predicate_columns.contains(&columns)) @@ -199,7 +199,7 @@ fn keep_filters( /// in `state` depend on the columns `used_columns`. fn issue_filters( mut state: State, - used_columns: HashSet, + used_columns: HashSet, plan: &LogicalPlan, ) -> Result { let (predicates, predicate_columns) = get_predicates(&state, &used_columns); @@ -248,8 +248,8 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { predicates .into_iter() .try_for_each::<_, Result<()>>(|predicate| { - let mut columns: HashSet = HashSet::new(); - utils::expr_to_column_names(predicate, &mut columns)?; + let mut columns: HashSet = HashSet::new(); + utils::expr_to_columns(predicate, &mut columns)?; if columns.is_empty() { no_col_predicates.push(predicate) } else { @@ -282,7 +282,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { expr => expr.clone(), }; - projection.insert(field.name().clone(), expr); + projection.insert(field.qualified_name(), expr); }); // re-write all filters based on this projection @@ -291,7 +291,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { *predicate = rewrite(predicate, &projection)?; columns.clear(); - utils::expr_to_column_names(predicate, columns)?; + utils::expr_to_columns(predicate, columns)?; } // optimize inner @@ -308,11 +308,11 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { // construct set of columns that `aggr_expr` depends on let mut used_columns = HashSet::new(); - utils::exprlist_to_column_names(aggr_expr, &mut used_columns)?; + utils::exprlist_to_columns(aggr_expr, &mut used_columns)?; let agg_columns = aggr_expr .iter() - .map(|x| x.name(input.schema())) + .map(|x| Ok(Column::from_name(x.name(input.schema())?))) .collect::>>()?; used_columns.extend(agg_columns); @@ -332,7 +332,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { .schema() .fields() .iter() - .map(|f| f.name().clone()) + .map(|f| f.qualified_column()) .collect::>(); issue_filters(state, used_columns, plan) } @@ -415,7 +415,7 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { .schema() .fields() .iter() - .map(|f| f.name().clone()) + .map(|f| f.qualified_column()) .collect::>(); issue_filters(state, used_columns, plan) } @@ -448,8 +448,8 @@ fn rewrite(expr: &Expr, projection: &HashMap) -> Result { .map(|e| rewrite(e, projection)) .collect::>>()?; - if let Expr::Column(name) = expr { - if let Some(expr) = projection.get(name) { + if let Expr::Column(c) = expr { + if let Some(expr) = projection.get(&c.flat_name()) { return Ok(expr.clone()); } } @@ -489,8 +489,8 @@ mod tests { .build()?; // filter is before projection let expected = "\ - Projection: #a, #b\ - \n Filter: #a Eq Int64(1)\ + Projection: #test.a, #test.b\ + \n Filter: #test.a Eq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -506,9 +506,9 @@ mod tests { .build()?; // filter is 
before single projection let expected = "\ - Filter: #a Eq Int64(1)\ + Filter: #test.a Eq Int64(1)\ \n Limit: 10\ - \n Projection: #a, #b\ + \n Projection: #test.a, #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -537,9 +537,9 @@ mod tests { .build()?; // filter is before double projection let expected = "\ - Projection: #c, #b\ - \n Projection: #a, #b, #c\ - \n Filter: #a Eq Int64(1)\ + Projection: #test.c, #test.b\ + \n Projection: #test.a, #test.b, #test.c\ + \n Filter: #test.a Eq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -554,8 +554,8 @@ mod tests { .build()?; // filter of key aggregation is commutative let expected = "\ - Aggregate: groupBy=[[#a]], aggr=[[SUM(#b) AS total_salary]]\ - \n Filter: #a Gt Int64(10)\ + Aggregate: groupBy=[[#test.a]], aggr=[[SUM(#test.b) AS total_salary]]\ + \n Filter: #test.a Gt Int64(10)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -571,7 +571,7 @@ mod tests { // filter of aggregate is after aggregation since they are non-commutative let expected = "\ Filter: #b Gt Int64(10)\ - \n Aggregate: groupBy=[[#a]], aggr=[[SUM(#b) AS b]]\ + \n Aggregate: groupBy=[[#test.a]], aggr=[[SUM(#test.b) AS b]]\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -587,8 +587,8 @@ mod tests { .build()?; // filter is before projection let expected = "\ - Projection: #a AS b, #c\ - \n Filter: #a Eq Int64(1)\ + Projection: #test.a AS b, #test.c\ + \n Filter: #test.a Eq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -627,14 +627,14 @@ mod tests { format!("{:?}", plan), "\ Filter: #b Eq Int64(1)\ - \n Projection: #a Multiply Int32(2) Plus #c AS b, #c\ + \n Projection: #test.a Multiply Int32(2) Plus #test.c AS b, #test.c\ \n TableScan: test projection=None" ); // filter is before projection let expected = "\ - Projection: #a Multiply Int32(2) Plus #c AS b, #c\ - \n Filter: #a Multiply Int32(2) Plus #c Eq Int64(1)\ + Projection: #test.a Multiply Int32(2) Plus #test.c AS b, #test.c\ + \n Filter: #test.a Multiply Int32(2) Plus #test.c Eq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -659,16 +659,16 @@ mod tests { format!("{:?}", plan), "\ Filter: #a Eq Int64(1)\ - \n Projection: #b Multiply Int32(3) AS a, #c\ - \n Projection: #a Multiply Int32(2) Plus #c AS b, #c\ + \n Projection: #b Multiply Int32(3) AS a, #test.c\ + \n Projection: #test.a Multiply Int32(2) Plus #test.c AS b, #test.c\ \n TableScan: test projection=None" ); // filter is before the projections let expected = "\ - Projection: #b Multiply Int32(3) AS a, #c\ - \n Projection: #a Multiply Int32(2) Plus #c AS b, #c\ - \n Filter: #a Multiply Int32(2) Plus #c Multiply Int32(3) Eq Int64(1)\ + Projection: #b Multiply Int32(3) AS a, #test.c\ + \n Projection: #test.a Multiply Int32(2) Plus #test.c AS b, #test.c\ + \n Filter: #test.a Multiply Int32(2) Plus #test.c Multiply Int32(3) Eq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -684,26 +684,26 @@ mod tests { .project(vec![col("a").alias("b"), col("c")])? .aggregate(vec![col("b")], vec![sum(col("c"))])? .filter(col("b").gt(lit(10i64)))? - .filter(col("SUM(c)").gt(lit(10i64)))? + .filter(col("SUM(test.c)").gt(lit(10i64)))? 
.build()?; // not part of the test, just good to know: assert_eq!( format!("{:?}", plan), "\ - Filter: #SUM(c) Gt Int64(10)\ + Filter: #SUM(test.c) Gt Int64(10)\ \n Filter: #b Gt Int64(10)\ - \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#c)]]\ - \n Projection: #a AS b, #c\ + \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#test.c)]]\ + \n Projection: #test.a AS b, #test.c\ \n TableScan: test projection=None" ); // filter is before the projections let expected = "\ - Filter: #SUM(c) Gt Int64(10)\ - \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#c)]]\ - \n Projection: #a AS b, #c\ - \n Filter: #a Gt Int64(10)\ + Filter: #SUM(test.c) Gt Int64(10)\ + \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#test.c)]]\ + \n Projection: #test.a AS b, #test.c\ + \n Filter: #test.a Gt Int64(10)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -720,8 +720,8 @@ mod tests { .project(vec![col("a").alias("b"), col("c")])? .aggregate(vec![col("b")], vec![sum(col("c"))])? .filter(and( - col("SUM(c)").gt(lit(10i64)), - and(col("b").gt(lit(10i64)), col("SUM(c)").lt(lit(20i64))), + col("SUM(test.c)").gt(lit(10i64)), + and(col("b").gt(lit(10i64)), col("SUM(test.c)").lt(lit(20i64))), ))? .build()?; @@ -729,18 +729,18 @@ mod tests { assert_eq!( format!("{:?}", plan), "\ - Filter: #SUM(c) Gt Int64(10) And #b Gt Int64(10) And #SUM(c) Lt Int64(20)\ - \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#c)]]\ - \n Projection: #a AS b, #c\ + Filter: #SUM(test.c) Gt Int64(10) And #b Gt Int64(10) And #SUM(test.c) Lt Int64(20)\ + \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#test.c)]]\ + \n Projection: #test.a AS b, #test.c\ \n TableScan: test projection=None" ); // filter is before the projections let expected = "\ - Filter: #SUM(c) Gt Int64(10) And #SUM(c) Lt Int64(20)\ - \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#c)]]\ - \n Projection: #a AS b, #c\ - \n Filter: #a Gt Int64(10)\ + Filter: #SUM(test.c) Gt Int64(10) And #SUM(test.c) Lt Int64(20)\ + \n Aggregate: groupBy=[[#b]], aggr=[[SUM(#test.c)]]\ + \n Projection: #test.a AS b, #test.c\ + \n Filter: #test.a Gt Int64(10)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -760,11 +760,11 @@ mod tests { .build()?; // filter does not just any of the limits let expected = "\ - Projection: #a, #b\ - \n Filter: #a Eq Int64(1)\ + Projection: #test.a, #test.b\ + \n Filter: #test.a Eq Int64(1)\ \n Limit: 10\ \n Limit: 20\ - \n Projection: #a, #b\ + \n Projection: #test.a, #test.b\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -804,20 +804,20 @@ mod tests { // not part of the test assert_eq!( format!("{:?}", plan), - "Filter: #a GtEq Int64(1)\ - \n Projection: #a\ + "Filter: #test.a GtEq Int64(1)\ + \n Projection: #test.a\ \n Limit: 1\ - \n Filter: #a LtEq Int64(1)\ - \n Projection: #a\ + \n Filter: #test.a LtEq Int64(1)\ + \n Projection: #test.a\ \n TableScan: test projection=None" ); let expected = "\ - Projection: #a\ - \n Filter: #a GtEq Int64(1)\ + Projection: #test.a\ + \n Filter: #test.a GtEq Int64(1)\ \n Limit: 1\ - \n Projection: #a\ - \n Filter: #a LtEq Int64(1)\ + \n Projection: #test.a\ + \n Filter: #test.a LtEq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -838,16 +838,16 @@ mod tests { // not part of the test assert_eq!( format!("{:?}", plan), - "Projection: #a\ - \n Filter: #a GtEq Int64(1)\ - \n Filter: #a LtEq Int64(1)\ + "Projection: #test.a\ + \n Filter: #test.a GtEq Int64(1)\ + \n Filter: #test.a LtEq Int64(1)\ \n Limit: 1\ \n TableScan: test 
projection=None" ); let expected = "\ - Projection: #a\ - \n Filter: #a GtEq Int64(1) And #a LtEq Int64(1)\ + Projection: #test.a\ + \n Filter: #test.a GtEq Int64(1) And #test.a LtEq Int64(1)\ \n Limit: 1\ \n TableScan: test projection=None"; @@ -868,7 +868,7 @@ mod tests { let expected = "\ TestUserDefined\ - \n Filter: #a LtEq Int64(1)\ + \n Filter: #test.a LtEq Int64(1)\ \n TableScan: test projection=None"; // not part of the test @@ -887,7 +887,12 @@ mod tests { .project(vec![col("a")])? .build()?; let plan = LogicalPlanBuilder::from(&left) - .join(&right, JoinType::Inner, &["a"], &["a"])? + .join( + &right, + JoinType::Inner, + vec![Column::from_name("a".to_string())], + vec![Column::from_name("a".to_string())], + )? .filter(col("a").lt_eq(lit(1i64)))? .build()?; @@ -895,20 +900,20 @@ mod tests { assert_eq!( format!("{:?}", plan), "\ - Filter: #a LtEq Int64(1)\ - \n Join: a = a\ + Filter: #test.a LtEq Int64(1)\ + \n Join: #test.a = #test.a\ \n TableScan: test projection=None\ - \n Projection: #a\ + \n Projection: #test.a\ \n TableScan: test projection=None" ); // filter sent to side before the join let expected = "\ - Join: a = a\ - \n Filter: #a LtEq Int64(1)\ + Join: #test.a = #test.a\ + \n Filter: #test.a LtEq Int64(1)\ \n TableScan: test projection=None\ - \n Projection: #a\ - \n Filter: #a LtEq Int64(1)\ + \n Projection: #test.a\ + \n Filter: #test.a LtEq Int64(1)\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -925,7 +930,12 @@ mod tests { .project(vec![col("a"), col("b")])? .build()?; let plan = LogicalPlanBuilder::from(&left) - .join(&right, JoinType::Inner, &["a"], &["a"])? + .join( + &right, + JoinType::Inner, + vec![Column::from_name("a".to_string())], + vec![Column::from_name("a".to_string())], + )? // "b" and "c" are not shared by either side: they are only available together after the join .filter(col("c").lt_eq(col("b")))? .build()?; @@ -934,11 +944,11 @@ mod tests { assert_eq!( format!("{:?}", plan), "\ - Filter: #c LtEq #b\ - \n Join: a = a\ - \n Projection: #a, #c\ + Filter: #test.c LtEq #test.b\ + \n Join: #test.a = #test.a\ + \n Projection: #test.a, #test.c\ \n TableScan: test projection=None\ - \n Projection: #a, #b\ + \n Projection: #test.a, #test.b\ \n TableScan: test projection=None" ); @@ -959,7 +969,12 @@ mod tests { .project(vec![col("a"), col("c")])? .build()?; let plan = LogicalPlanBuilder::from(&left) - .join(&right, JoinType::Inner, &["a"], &["a"])? + .join( + &right, + JoinType::Inner, + vec![Column::from_name("a".to_string())], + vec![Column::from_name("a".to_string())], + )? .filter(col("b").lt_eq(lit(1i64)))? 
.build()?; @@ -967,20 +982,20 @@ mod tests { assert_eq!( format!("{:?}", plan), "\ - Filter: #b LtEq Int64(1)\ - \n Join: a = a\ - \n Projection: #a, #b\ + Filter: #test.b LtEq Int64(1)\ + \n Join: #test.a = #test.a\ + \n Projection: #test.a, #test.b\ \n TableScan: test projection=None\ - \n Projection: #a, #c\ + \n Projection: #test.a, #test.c\ \n TableScan: test projection=None" ); let expected = "\ - Join: a = a\ - \n Projection: #a, #b\ - \n Filter: #b LtEq Int64(1)\ + Join: #test.a = #test.a\ + \n Projection: #test.a, #test.b\ + \n Filter: #test.b LtEq Int64(1)\ \n TableScan: test projection=None\ - \n Projection: #a, #c\ + \n Projection: #test.a, #test.c\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) @@ -1030,14 +1045,15 @@ mod tests { fn table_scan_with_pushdown_provider( filter_support: TableProviderFilterPushDown, ) -> Result { + use std::convert::TryFrom; + let test_provider = PushDownProvider { filter_support }; let table_scan = LogicalPlan::TableScan { - table_name: "".into(), + table_name: "test".to_string(), filters: vec![], - projected_schema: Arc::new(DFSchema::try_from_qualified( - "", - &*test_provider.schema(), + projected_schema: Arc::new(DFSchema::try_from( + (*test_provider.schema()).clone(), )?), projection: None, source: Arc::new(test_provider), @@ -1054,7 +1070,7 @@ mod tests { let plan = table_scan_with_pushdown_provider(TableProviderFilterPushDown::Exact)?; let expected = "\ - TableScan: projection=None, filters=[#a Eq Int64(1)]"; + TableScan: test projection=None, filters=[#a Eq Int64(1)]"; assert_optimized_plan_eq(&plan, expected); Ok(()) } @@ -1066,7 +1082,7 @@ mod tests { let expected = "\ Filter: #a Eq Int64(1)\ - \n TableScan: projection=None, filters=[#a Eq Int64(1)]"; + \n TableScan: test projection=None, filters=[#a Eq Int64(1)]"; assert_optimized_plan_eq(&plan, expected); Ok(()) } @@ -1080,7 +1096,7 @@ mod tests { let expected = "\ Filter: #a Eq Int64(1)\ - \n TableScan: projection=None, filters=[#a Eq Int64(1)]"; + \n TableScan: test projection=None, filters=[#a Eq Int64(1)]"; // Optimizing the same plan multiple times should produce the same plan // each time. 
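Note: the expected plans in these pushdown-provider tests encode three behaviours of the optimizer against a TableProviderFilterPushDown declaration. With Exact support the predicate moves into the scan's filters list and the Filter node is dropped; with Inexact support the predicate is pushed to the scan but the Filter node is kept for correctness; when pushdown is unsupported the Filter node is kept and the scan carries no filters list. A minimal stand-alone sketch of that decision, using hypothetical names rather than DataFusion's actual types:

#[derive(Debug, Clone, Copy, PartialEq)]
enum FilterPushDown {
    Unsupported,
    Inexact,
    Exact,
}

/// Returns (push_predicate_to_scan, keep_filter_node_above_scan).
fn plan_filter(support: FilterPushDown) -> (bool, bool) {
    match support {
        // Provider cannot evaluate the predicate: keep the Filter, push nothing.
        FilterPushDown::Unsupported => (false, true),
        // Provider may return false positives: push down and keep the Filter.
        FilterPushDown::Inexact => (true, true),
        // Provider evaluates the predicate exactly: the Filter node is redundant.
        FilterPushDown::Exact => (true, false),
    }
}

fn main() {
    assert_eq!(plan_filter(FilterPushDown::Exact), (true, false));
    assert_eq!(plan_filter(FilterPushDown::Inexact), (true, true));
    assert_eq!(plan_filter(FilterPushDown::Unsupported), (false, true));
}

This mirrors the expected plan strings above: only the Exact case loses its Filter node, and only the unsupported case leaves the scan without a filters=[...] entry.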
@@ -1095,7 +1111,7 @@ mod tests { let expected = "\ Filter: #a Eq Int64(1)\ - \n TableScan: projection=None"; + \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) } diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index 74d2b00901942..a2a99ae364a70 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -22,7 +22,7 @@ use std::sync::Arc; -use crate::logical_plan::LogicalPlan; +use crate::logical_plan::{Expr, LogicalPlan, LogicalPlanBuilder}; use crate::optimizer::optimizer::OptimizerRule; use crate::{error::Result, prelude::JoinType}; @@ -131,6 +131,7 @@ impl OptimizerRule for HashBuildProbeOrder { right, on, join_type, + join_constraint, schema, } => { let left = self.optimize(left, execution_props)?; @@ -140,11 +141,9 @@ impl OptimizerRule for HashBuildProbeOrder { Ok(LogicalPlan::Join { left: Arc::new(right), right: Arc::new(left), - on: on - .iter() - .map(|(l, r)| (r.to_string(), l.to_string())) - .collect(), + on: on.iter().map(|(l, r)| (r.clone(), l.clone())).collect(), join_type: swap_join_type(*join_type), + join_constraint: *join_constraint, schema: schema.clone(), }) } else { @@ -154,6 +153,7 @@ impl OptimizerRule for HashBuildProbeOrder { right: Arc::new(right), on: on.clone(), join_type: *join_type, + join_constraint: *join_constraint, schema: schema.clone(), }) } @@ -166,12 +166,19 @@ impl OptimizerRule for HashBuildProbeOrder { let left = self.optimize(left, execution_props)?; let right = self.optimize(right, execution_props)?; if should_swap_join_order(&left, &right) { - // Swap left and right - Ok(LogicalPlan::CrossJoin { - left: Arc::new(right), - right: Arc::new(left), - schema: schema.clone(), - }) + let swapped = LogicalPlanBuilder::from(&right).cross_join(&left)?; + // wrap plan with projection to maintain column order + let left_cols = left + .schema() + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())); + let right_cols = right + .schema() + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())); + swapped.project(left_cols.chain(right_cols))?.build() } else { // Keep join as is Ok(LogicalPlan::CrossJoin { diff --git a/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs index e616869d7c4a4..afd993710a5f5 100644 --- a/datafusion/src/optimizer/limit_push_down.rs +++ b/datafusion/src/optimizer/limit_push_down.rs @@ -163,7 +163,7 @@ mod test { // Should push the limit down to table provider // When it has a select let expected = "Limit: 1000\ - \n Projection: #a\ + \n Projection: #test.a\ \n TableScan: test projection=None, limit=1000"; assert_optimized_plan_eq(&plan, expected); @@ -202,7 +202,7 @@ mod test { // Limit should *not* push down aggregate node let expected = "Limit: 1000\ - \n Aggregate: groupBy=[[#a]], aggr=[[MAX(#b)]]\ + \n Aggregate: groupBy=[[#test.a]], aggr=[[MAX(#test.b)]]\ \n TableScan: test projection=None"; assert_optimized_plan_eq(&plan, expected); @@ -244,7 +244,7 @@ mod test { // Limit should use deeper LIMIT 1000, but Limit 10 shouldn't push down aggregation let expected = "Limit: 10\ - \n Aggregate: groupBy=[[#a]], aggr=[[MAX(#b)]]\ + \n Aggregate: groupBy=[[#test.a]], aggr=[[MAX(#test.b)]]\ \n Limit: 1000\ \n TableScan: test projection=None, limit=1000"; diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index ad795f5f5dd52..2544d89d04920 100644 
--- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -20,11 +20,14 @@ use crate::error::Result; use crate::execution::context::ExecutionProps; -use crate::logical_plan::{DFField, DFSchema, DFSchemaRef, LogicalPlan, ToDFSchema}; +use crate::logical_plan::{ + build_join_schema, Column, DFField, DFSchema, DFSchemaRef, LogicalPlan, + LogicalPlanBuilder, ToDFSchema, +}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; use crate::sql::utils::find_sort_exprs; -use arrow::datatypes::Schema; +use arrow::datatypes::{Field, Schema}; use arrow::error::Result as ArrowResult; use std::{collections::HashSet, sync::Arc}; use utils::optimize_explain; @@ -44,8 +47,8 @@ impl OptimizerRule for ProjectionPushDown { .schema() .fields() .iter() - .map(|f| f.name().clone()) - .collect::>(); + .map(|f| f.qualified_column()) + .collect::>(); optimize_plan(self, plan, &required_columns, false, execution_props) } @@ -62,8 +65,9 @@ impl ProjectionPushDown { } fn get_projected_schema( + table_name: Option<&String>, schema: &Schema, - required_columns: &HashSet, + required_columns: &HashSet, has_projection: bool, ) -> Result<(Vec, DFSchemaRef)> { // once we reach the table scan, we can use the accumulated set of column @@ -73,7 +77,8 @@ fn get_projected_schema( // e.g. when the column derives from an aggregation let mut projection: Vec = required_columns .iter() - .map(|name| schema.index_of(name)) + .filter(|c| c.relation.as_ref() == table_name) + .map(|c| schema.index_of(&c.name)) .filter_map(ArrowResult::ok) .collect(); @@ -98,8 +103,20 @@ fn get_projected_schema( // create the projected schema let mut projected_fields: Vec = Vec::with_capacity(projection.len()); - for i in &projection { - projected_fields.push(DFField::from(schema.fields()[*i].clone())); + match table_name { + Some(qualifer) => { + for i in &projection { + projected_fields.push(DFField::from_qualified( + qualifer, + schema.fields()[*i].clone(), + )); + } + } + None => { + for i in &projection { + projected_fields.push(DFField::from(schema.fields()[*i].clone())); + } + } } Ok((projection, projected_fields.to_dfschema_ref()?)) @@ -109,7 +126,7 @@ fn get_projected_schema( fn optimize_plan( optimizer: &ProjectionPushDown, plan: &LogicalPlan, - required_columns: &HashSet, // set of columns required up to this step + required_columns: &HashSet, // set of columns required up to this step has_projection: bool, execution_props: &ExecutionProps, ) -> Result { @@ -133,12 +150,12 @@ fn optimize_plan( .iter() .enumerate() .try_for_each(|(i, field)| { - if required_columns.contains(field.name()) { + if required_columns.contains(&field.qualified_column()) { new_expr.push(expr[i].clone()); new_fields.push(field.clone()); // gather the new set of required columns - utils::expr_to_column_names(&expr[i], &mut new_required_columns) + utils::expr_to_columns(&expr[i], &mut new_required_columns) } else { Ok(()) } @@ -167,31 +184,45 @@ fn optimize_plan( right, on, join_type, - schema, + join_constraint, + .. 
} => { for (l, r) in on { - new_required_columns.insert(l.to_owned()); - new_required_columns.insert(r.to_owned()); + new_required_columns.insert(l.clone()); + new_required_columns.insert(r.clone()); } - Ok(LogicalPlan::Join { - left: Arc::new(optimize_plan( - optimizer, - left, - &new_required_columns, - true, - execution_props, - )?), - right: Arc::new(optimize_plan( - optimizer, - right, - &new_required_columns, - true, - execution_props, - )?), + let optimized_left = Arc::new(optimize_plan( + optimizer, + left, + &new_required_columns, + true, + execution_props, + )?); + + let optimized_right = Arc::new(optimize_plan( + optimizer, + right, + &new_required_columns, + true, + execution_props, + )?); + + let schema = build_join_schema( + optimized_left.schema(), + optimized_right.schema(), + on, + join_type, + join_constraint, + )?; + + Ok(LogicalPlan::Join { + left: optimized_left, + right: optimized_right, join_type: *join_type, + join_constraint: *join_constraint, on: on.clone(), - schema: schema.clone(), + schema: DFSchemaRef::new(schema), }) } LogicalPlan::Window { @@ -205,11 +236,12 @@ fn optimize_plan( { window_expr.iter().try_for_each(|expr| { let name = &expr.name(schema)?; - if required_columns.contains(name) { + let column = Column::from_name(name.to_string()); + if required_columns.contains(&column) { new_window_expr.push(expr.clone()); - new_required_columns.insert(name.clone()); + new_required_columns.insert(column); // add to the new set of required columns - utils::expr_to_column_names(expr, &mut new_required_columns) + utils::expr_to_columns(expr, &mut new_required_columns) } else { Ok(()) } @@ -217,31 +249,20 @@ fn optimize_plan( } // for all the retained window expr, find their sort expressions if any, and retain these - utils::exprlist_to_column_names( + utils::exprlist_to_columns( &find_sort_exprs(&new_window_expr), &mut new_required_columns, )?; - let new_schema = DFSchema::new( - schema - .fields() - .iter() - .filter(|x| new_required_columns.contains(x.name())) - .cloned() - .collect(), - )?; - - Ok(LogicalPlan::Window { - window_expr: new_window_expr, - input: Arc::new(optimize_plan( - optimizer, - input, - &new_required_columns, - true, - execution_props, - )?), - schema: DFSchemaRef::new(new_schema), - }) + LogicalPlanBuilder::from(&optimize_plan( + optimizer, + input, + &new_required_columns, + true, + execution_props, + )?) + .window(new_window_expr)? 
+ .build() } LogicalPlan::Aggregate { schema, @@ -254,19 +275,20 @@ fn optimize_plan( // * remove any aggregate expression that is not required // * construct the new set of required columns - utils::exprlist_to_column_names(group_expr, &mut new_required_columns)?; + utils::exprlist_to_columns(group_expr, &mut new_required_columns)?; // Gather all columns needed for expressions in this Aggregate let mut new_aggr_expr = Vec::new(); aggr_expr.iter().try_for_each(|expr| { let name = &expr.name(schema)?; + let column = Column::from_name(name.to_string()); - if required_columns.contains(name) { + if required_columns.contains(&column) { new_aggr_expr.push(expr.clone()); - new_required_columns.insert(name.clone()); + new_required_columns.insert(column); // add to the new set of required columns - utils::expr_to_column_names(expr, &mut new_required_columns) + utils::expr_to_columns(expr, &mut new_required_columns) } else { Ok(()) } @@ -276,7 +298,7 @@ fn optimize_plan( schema .fields() .iter() - .filter(|x| new_required_columns.contains(x.name())) + .filter(|x| new_required_columns.contains(&x.qualified_column())) .cloned() .collect(), )?; @@ -303,12 +325,15 @@ fn optimize_plan( limit, .. } => { - let (projection, projected_schema) = - get_projected_schema(&source.schema(), required_columns, has_projection)?; - + let (projection, projected_schema) = get_projected_schema( + Some(table_name), + &source.schema(), + required_columns, + has_projection, + )?; // return the table scan with projection Ok(LogicalPlan::TableScan { - table_name: table_name.to_string(), + table_name: table_name.clone(), source: source.clone(), projection: Some(projection), projected_schema, @@ -332,6 +357,48 @@ fn optimize_plan( execution_props, ) } + LogicalPlan::Union { + inputs, + schema, + alias, + } => { + // UNION inputs will reference the same column with different identifiers, so we need + // to populate new_required_columns by unqualified column name based on required fields + // from the resulting UNION output + let union_required_fields = schema + .fields() + .iter() + .filter(|f| new_required_columns.contains(&f.qualified_column())) + .map(|f| f.field()) + .collect::>(); + + let new_inputs = inputs + .iter() + .map(|input_plan| { + input_plan + .schema() + .fields() + .iter() + .filter(|f| union_required_fields.contains(f.field())) + .for_each(|f| { + new_required_columns.insert(f.qualified_column()); + }); + optimize_plan( + optimizer, + input_plan, + &new_required_columns, + has_projection, + execution_props, + ) + }) + .collect::>>()?; + + Ok(LogicalPlan::Union { + inputs: new_inputs, + schema: schema.clone(), + alias: alias.clone(), + }) + } // all other nodes: Add any additional columns used by // expressions in this node to the list of required columns LogicalPlan::Limit { .. } @@ -340,21 +407,20 @@ fn optimize_plan( | LogicalPlan::EmptyRelation { .. } | LogicalPlan::Sort { .. } | LogicalPlan::CreateExternalTable { .. } - | LogicalPlan::Union { .. } | LogicalPlan::CrossJoin { .. } | LogicalPlan::Extension { .. 
} => { let expr = plan.expressions(); // collect all required columns by this plan - utils::exprlist_to_column_names(&expr, &mut new_required_columns)?; + utils::exprlist_to_columns(&expr, &mut new_required_columns)?; // apply the optimization to all inputs of the plan let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| { + .map(|input_plan| { optimize_plan( optimizer, - plan, + input_plan, &new_required_columns, has_projection, execution_props, @@ -371,8 +437,7 @@ fn optimize_plan( mod tests { use super::*; - use crate::logical_plan::{col, lit}; - use crate::logical_plan::{max, min, Expr, LogicalPlanBuilder}; + use crate::logical_plan::{col, lit, max, min, Expr, JoinType, LogicalPlanBuilder}; use crate::test::*; use arrow::datatypes::DataType; @@ -384,7 +449,7 @@ mod tests { .aggregate(vec![], vec![max(col("b"))])? .build()?; - let expected = "Aggregate: groupBy=[[]], aggr=[[MAX(#b)]]\ + let expected = "Aggregate: groupBy=[[]], aggr=[[MAX(#test.b)]]\ \n TableScan: test projection=Some([1])"; assert_optimized_plan_eq(&plan, expected); @@ -400,7 +465,7 @@ mod tests { .aggregate(vec![col("c")], vec![max(col("b"))])? .build()?; - let expected = "Aggregate: groupBy=[[#c]], aggr=[[MAX(#b)]]\ + let expected = "Aggregate: groupBy=[[#test.c]], aggr=[[MAX(#test.b)]]\ \n TableScan: test projection=Some([1, 2])"; assert_optimized_plan_eq(&plan, expected); @@ -417,8 +482,8 @@ mod tests { .aggregate(vec![], vec![max(col("b"))])? .build()?; - let expected = "Aggregate: groupBy=[[]], aggr=[[MAX(#b)]]\ - \n Filter: #c\ + let expected = "Aggregate: groupBy=[[]], aggr=[[MAX(#test.b)]]\ + \n Filter: #test.c\ \n TableScan: test projection=Some([1, 2])"; assert_optimized_plan_eq(&plan, expected); @@ -426,6 +491,43 @@ mod tests { Ok(()) } + #[test] + fn join_schema_trim() -> Result<()> { + let table_scan = test_table_scan()?; + + let schema = Schema::new(vec![Field::new("c1", DataType::UInt32, false)]); + let table2_scan = + LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; + + let plan = LogicalPlanBuilder::from(&table_scan) + .join(&table2_scan, JoinType::Left, vec!["a"], vec!["c1"])? + .project(vec![col("a"), col("b"), col("c1")])? + .build()?; + + // make sure projections are pushed down to table scan + let expected = "Projection: #test.a, #test.b, #test2.c1\ + \n Join: #test.a = #test2.c1\ + \n TableScan: test projection=Some([0, 1])\ + \n TableScan: test2 projection=Some([0])"; + + let optimized_plan = optimize(&plan)?; + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + + // make sure schema for join node doesn't include c1 column + let optimized_join = optimized_plan.inputs()[0]; + assert_eq!( + **optimized_join.schema(), + DFSchema::new(vec![ + DFField::new(Some("test"), "a", DataType::UInt32, false), + DFField::new(Some("test"), "b", DataType::UInt32, false), + DFField::new(Some("test2"), "c1", DataType::UInt32, false), + ])?, + ); + + Ok(()) + } + #[test] fn cast() -> Result<()> { let table_scan = test_table_scan()?; @@ -437,7 +539,7 @@ mod tests { }])? 
.build()?; - let expected = "Projection: CAST(#c AS Float64)\ + let expected = "Projection: CAST(#test.c AS Float64)\ \n TableScan: test projection=Some([2])"; assert_optimized_plan_eq(&projection, expected); @@ -457,7 +559,7 @@ mod tests { assert_fields_eq(&plan, vec!["a", "b"]); - let expected = "Projection: #a, #b\ + let expected = "Projection: #test.a, #test.b\ \n TableScan: test projection=Some([0, 1])"; assert_optimized_plan_eq(&plan, expected); @@ -479,7 +581,7 @@ mod tests { assert_fields_eq(&plan, vec!["c", "a"]); let expected = "Limit: 5\ - \n Projection: #c, #a\ + \n Projection: #test.c, #test.a\ \n TableScan: test projection=Some([0, 2])"; assert_optimized_plan_eq(&plan, expected); @@ -523,12 +625,12 @@ mod tests { .aggregate(vec![col("c")], vec![max(col("a"))])? .build()?; - assert_fields_eq(&plan, vec!["c", "MAX(a)"]); + assert_fields_eq(&plan, vec!["c", "MAX(test.a)"]); let expected = "\ - Aggregate: groupBy=[[#c]], aggr=[[MAX(#a)]]\ - \n Filter: #c Gt Int32(1)\ - \n Projection: #c, #a\ + Aggregate: groupBy=[[#test.c]], aggr=[[MAX(#test.a)]]\ + \n Filter: #test.c Gt Int32(1)\ + \n Projection: #test.c, #test.a\ \n TableScan: test projection=Some([0, 2])"; assert_optimized_plan_eq(&plan, expected); @@ -591,15 +693,15 @@ mod tests { let plan = LogicalPlanBuilder::from(&table_scan) .aggregate(vec![col("a"), col("c")], vec![max(col("b")), min(col("b"))])? .filter(col("c").gt(lit(1)))? - .project(vec![col("c"), col("a"), col("MAX(b)")])? + .project(vec![col("c"), col("a"), col("MAX(test.b)")])? .build()?; - assert_fields_eq(&plan, vec!["c", "a", "MAX(b)"]); + assert_fields_eq(&plan, vec!["c", "a", "MAX(test.b)"]); let expected = "\ - Projection: #c, #a, #MAX(b)\ - \n Filter: #c Gt Int32(1)\ - \n Aggregate: groupBy=[[#a, #c]], aggr=[[MAX(#b)]]\ + Projection: #test.c, #test.a, #MAX(test.b)\ + \n Filter: #test.c Gt Int32(1)\ + \n Aggregate: groupBy=[[#test.a, #test.c]], aggr=[[MAX(#test.b)]]\ \n TableScan: test projection=Some([0, 1, 2])"; assert_optimized_plan_eq(&plan, expected); diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs index 9ad7a94d8bfe2..4253d2fd4f00c 100644 --- a/datafusion/src/optimizer/simplify_expressions.rs +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -510,8 +510,8 @@ mod tests { assert_optimized_plan_eq( &plan, "\ - Filter: #b Gt Int32(1)\ - \n Projection: #a\ + Filter: #test.b Gt Int32(1)\ + \n Projection: #test.a\ \n TableScan: test projection=None", ); Ok(()) @@ -532,8 +532,8 @@ mod tests { assert_optimized_plan_eq( &plan, "\ - Filter: #a Gt Int32(5) And #b Lt Int32(6)\ - \n Projection: #a\ + Filter: #test.a Gt Int32(5) And #test.b Lt Int32(6)\ + \n Projection: #test.a\ \n TableScan: test projection=None", ); Ok(()) diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 014ec74a0bfe1..76f44b84657ca 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -24,8 +24,8 @@ use arrow::datatypes::Schema; use super::optimizer::OptimizerRule; use crate::execution::context::ExecutionProps; use crate::logical_plan::{ - Expr, LogicalPlan, Operator, Partitioning, PlanType, Recursion, StringifiedPlan, - ToDFSchema, + build_join_schema, Column, DFSchemaRef, Expr, LogicalPlan, LogicalPlanBuilder, + Operator, Partitioning, PlanType, Recursion, StringifiedPlan, ToDFSchema, }; use crate::prelude::lit; use crate::scalar::ScalarValue; @@ -39,14 +39,11 @@ const CASE_ELSE_MARKER: &str = "__DATAFUSION_CASE_ELSE__"; const 
WINDOW_PARTITION_MARKER: &str = "__DATAFUSION_WINDOW_PARTITION__"; const WINDOW_SORT_MARKER: &str = "__DATAFUSION_WINDOW_SORT__"; -/// Recursively walk a list of expression trees, collecting the unique set of column -/// names referenced in the expression -pub fn exprlist_to_column_names( - expr: &[Expr], - accum: &mut HashSet, -) -> Result<()> { +/// Recursively walk a list of expression trees, collecting the unique set of columns +/// referenced in the expression +pub fn exprlist_to_columns(expr: &[Expr], accum: &mut HashSet) -> Result<()> { for e in expr { - expr_to_column_names(e, accum)?; + expr_to_columns(e, accum)?; } Ok(()) } @@ -54,17 +51,17 @@ pub fn exprlist_to_column_names( /// Recursively walk an expression tree, collecting the unique set of column names /// referenced in the expression struct ColumnNameVisitor<'a> { - accum: &'a mut HashSet, + accum: &'a mut HashSet, } impl ExpressionVisitor for ColumnNameVisitor<'_> { fn pre_visit(self, expr: &Expr) -> Result> { match expr { - Expr::Column(name) => { - self.accum.insert(name.clone()); + Expr::Column(qc) => { + self.accum.insert(qc.clone()); } Expr::ScalarVariable(var_names) => { - self.accum.insert(var_names.join(".")); + self.accum.insert(Column::from_name(var_names.join("."))); } Expr::Alias(_, _) => {} Expr::Literal(_) => {} @@ -90,9 +87,9 @@ impl ExpressionVisitor for ColumnNameVisitor<'_> { } } -/// Recursively walk an expression tree, collecting the unique set of column names +/// Recursively walk an expression tree, collecting the unique set of columns /// referenced in the expression -pub fn expr_to_column_names(expr: &Expr, accum: &mut HashSet) -> Result<()> { +pub fn expr_to_columns(expr: &Expr, accum: &mut HashSet) -> Result<()> { expr.accept(ColumnNameVisitor { accum })?; Ok(()) } @@ -214,21 +211,31 @@ pub fn from_plan( }), LogicalPlan::Join { join_type, + join_constraint, on, - schema, .. - } => Ok(LogicalPlan::Join { - left: Arc::new(inputs[0].clone()), - right: Arc::new(inputs[1].clone()), - join_type: *join_type, - on: on.clone(), - schema: schema.clone(), - }), - LogicalPlan::CrossJoin { schema, .. } => Ok(LogicalPlan::CrossJoin { - left: Arc::new(inputs[0].clone()), - right: Arc::new(inputs[1].clone()), - schema: schema.clone(), - }), + } => { + let schema = build_join_schema( + inputs[0].schema(), + inputs[1].schema(), + on, + join_type, + join_constraint, + )?; + Ok(LogicalPlan::Join { + left: Arc::new(inputs[0].clone()), + right: Arc::new(inputs[1].clone()), + join_type: *join_type, + join_constraint: *join_constraint, + on: on.clone(), + schema: DFSchemaRef::new(schema), + }) + } + LogicalPlan::CrossJoin { .. } => { + let left = &inputs[0]; + let right = &inputs[1]; + LogicalPlanBuilder::from(left).cross_join(right)?.build() + } LogicalPlan::Limit { n, .. 
} => Ok(LogicalPlan::Limit { n: *n, input: Arc::new(inputs[0].clone()), @@ -493,15 +500,15 @@ mod tests { #[test] fn test_collect_expr() -> Result<()> { - let mut accum: HashSet = HashSet::new(); - expr_to_column_names( + let mut accum: HashSet = HashSet::new(); + expr_to_columns( &Expr::Cast { expr: Box::new(col("a")), data_type: DataType::Float64, }, &mut accum, )?; - expr_to_column_names( + expr_to_columns( &Expr::Cast { expr: Box::new(col("a")), data_type: DataType::Float64, @@ -509,7 +516,7 @@ mod tests { &mut accum, )?; assert_eq!(1, accum.len()); - assert!(accum.contains("a")); + assert!(accum.contains(&Column::from_name("a".to_string()))); Ok(()) } diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 9e8d9fa778583..5585c4d08140a 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -28,6 +28,7 @@ //! https://github.com/apache/arrow-datafusion/issues/363 it will //! be genericized. +use std::convert::TryFrom; use std::{collections::HashSet, sync::Arc}; use arrow::{ @@ -39,7 +40,7 @@ use arrow::{ use crate::{ error::{DataFusionError, Result}, execution::context::ExecutionContextState, - logical_plan::{Expr, Operator}, + logical_plan::{Column, DFSchema, Expr, Operator}, optimizer::utils, physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr}, }; @@ -65,11 +66,11 @@ use crate::{ pub trait PruningStatistics { /// return the minimum values for the named column, if known. /// Note: the returned array must contain `num_containers()` rows - fn min_values(&self, column: &str) -> Option; + fn min_values(&self, column: &Column) -> Option; /// return the maximum values for the named column, if known. /// Note: the returned array must contain `num_containers()` rows. - fn max_values(&self, column: &str) -> Option; + fn max_values(&self, column: &Column) -> Option; /// return the number of containers (e.g. row groups) being /// pruned with these statistics @@ -120,9 +121,11 @@ impl PruningPredicate { .map(|(_, _, f)| f.clone()) .collect::>(); let stat_schema = Schema::new(stat_fields); + let stat_dfschema = DFSchema::try_from(stat_schema.clone())?; let execution_context_state = ExecutionContextState::new(); let predicate_expr = DefaultPhysicalPlanner::default().create_physical_expr( &logical_predicate_expr, + &stat_dfschema, &stat_schema, &execution_context_state, )?; @@ -196,11 +199,11 @@ impl PruningPredicate { #[derive(Debug, Default, Clone)] struct RequiredStatColumns { /// The statistics required to evaluate this predicate: - /// * The column name in the input schema + /// * The unqualified column in the input schema /// * Statistics type (e.g. 
Min or Max) /// * The field the statistics value should be placed in for /// pruning predicate evaluation - columns: Vec<(String, StatisticsType, Field)>, + columns: Vec<(Column, StatisticsType, Field)>, } impl RequiredStatColumns { @@ -210,22 +213,22 @@ impl RequiredStatColumns { /// Retur an iterator over items in columns (see doc on /// `self.columns` for details) - fn iter(&self) -> impl Iterator { + fn iter(&self) -> impl Iterator { self.columns.iter() } fn is_stat_column_missing( &self, - column_name: &str, + column: &Column, statistics_type: StatisticsType, ) -> bool { !self .columns .iter() - .any(|(c, t, _f)| c == column_name && t == &statistics_type) + .any(|(c, t, _f)| c == column && t == &statistics_type) } - /// Rewrites column_expr so that all appearances of column_name + /// Rewrites column_expr so that all appearances of column /// are replaced with a reference to either the min or max /// statistics column, while keeping track that a reference to the statistics /// column is required @@ -235,49 +238,53 @@ impl RequiredStatColumns { /// 5` with the approprate entry noted in self.columns fn stat_column_expr( &mut self, - column_name: &str, + column: &Column, column_expr: &Expr, field: &Field, stat_type: StatisticsType, suffix: &str, ) -> Result { - let stat_column_name = format!("{}_{}", column_name, suffix); + let stat_column = Column { + relation: column.relation.clone(), + name: format!("{}_{}", column.flat_name(), suffix), + }; + let stat_field = Field::new( - stat_column_name.as_str(), + stat_column.flat_name().as_str(), field.data_type().clone(), field.is_nullable(), ); - if self.is_stat_column_missing(column_name, stat_type) { + + if self.is_stat_column_missing(column, stat_type) { // only add statistics column if not previously added - self.columns - .push((column_name.to_string(), stat_type, stat_field)); + self.columns.push((column.clone(), stat_type, stat_field)); } - rewrite_column_expr(column_expr, column_name, stat_column_name.as_str()) + rewrite_column_expr(column_expr, column, &stat_column) } /// rewrite col --> col_min fn min_column_expr( &mut self, - column_name: &str, + column: &Column, column_expr: &Expr, field: &Field, ) -> Result { - self.stat_column_expr(column_name, column_expr, field, StatisticsType::Min, "min") + self.stat_column_expr(column, column_expr, field, StatisticsType::Min, "min") } /// rewrite col --> col_max fn max_column_expr( &mut self, - column_name: &str, + column: &Column, column_expr: &Expr, field: &Field, ) -> Result { - self.stat_column_expr(column_name, column_expr, field, StatisticsType::Max, "max") + self.stat_column_expr(column, column_expr, field, StatisticsType::Max, "max") } } -impl From> for RequiredStatColumns { - fn from(columns: Vec<(String, StatisticsType, Field)>) -> Self { +impl From> for RequiredStatColumns { + fn from(columns: Vec<(Column, StatisticsType, Field)>) -> Self { Self { columns } } } @@ -314,14 +321,14 @@ fn build_statistics_record_batch( let mut fields = Vec::::new(); let mut arrays = Vec::::new(); // For each needed statistics column: - for (column_name, statistics_type, stat_field) in required_columns.iter() { + for (column, statistics_type, stat_field) in required_columns.iter() { let data_type = stat_field.data_type(); let num_containers = statistics.num_containers(); let array = match statistics_type { - StatisticsType::Min => statistics.min_values(column_name), - StatisticsType::Max => statistics.max_values(column_name), + StatisticsType::Min => statistics.min_values(column), + StatisticsType::Max 
=> statistics.max_values(column), }; let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers)); @@ -347,7 +354,7 @@ fn build_statistics_record_batch( } struct PruningExpressionBuilder<'a> { - column_name: String, + column: Column, column_expr: &'a Expr, scalar_expr: &'a Expr, field: &'a Field, @@ -363,11 +370,11 @@ impl<'a> PruningExpressionBuilder<'a> { required_columns: &'a mut RequiredStatColumns, ) -> Result { // find column name; input could be a more complicated expression - let mut left_columns = HashSet::::new(); - utils::expr_to_column_names(left, &mut left_columns)?; - let mut right_columns = HashSet::::new(); - utils::expr_to_column_names(right, &mut right_columns)?; - let (column_expr, scalar_expr, column_names, reverse_operator) = + let mut left_columns = HashSet::::new(); + utils::expr_to_columns(left, &mut left_columns)?; + let mut right_columns = HashSet::::new(); + utils::expr_to_columns(right, &mut right_columns)?; + let (column_expr, scalar_expr, columns, reverse_operator) = match (left_columns.len(), right_columns.len()) { (1, 0) => (left, right, left_columns, false), (0, 1) => (right, left, right_columns, true), @@ -379,8 +386,8 @@ impl<'a> PruningExpressionBuilder<'a> { )); } }; - let column_name = column_names.iter().next().unwrap().clone(); - let field = match schema.column_with_name(&column_name) { + let column = columns.iter().next().unwrap().clone(); + let field = match schema.column_with_name(&column.flat_name()) { Some((_, f)) => f, _ => { return Err(DataFusionError::Plan( @@ -390,7 +397,7 @@ impl<'a> PruningExpressionBuilder<'a> { }; Ok(Self { - column_name, + column, column_expr, scalar_expr, field, @@ -418,63 +425,56 @@ impl<'a> PruningExpressionBuilder<'a> { } fn min_column_expr(&mut self) -> Result { - self.required_columns.min_column_expr( - &self.column_name, - self.column_expr, - self.field, - ) + self.required_columns + .min_column_expr(&self.column, self.column_expr, self.field) } fn max_column_expr(&mut self) -> Result { - self.required_columns.max_column_expr( - &self.column_name, - self.column_expr, - self.field, - ) + self.required_columns + .max_column_expr(&self.column, self.column_expr, self.field) } } /// replaces a column with an old name with a new name in an expression fn rewrite_column_expr( expr: &Expr, - column_old_name: &str, - column_new_name: &str, + column_old: &Column, + column_new: &Column, ) -> Result { let expressions = utils::expr_sub_expressions(expr)?; let expressions = expressions .iter() - .map(|e| rewrite_column_expr(e, column_old_name, column_new_name)) + .map(|e| rewrite_column_expr(e, column_old, column_new)) .collect::>>()?; - if let Expr::Column(name) = expr { - if name == column_old_name { - return Ok(Expr::Column(column_new_name.to_string())); + if let Expr::Column(c) = expr { + if c == column_old { + return Ok(Expr::Column(column_new.clone())); } } utils::rewrite_expression(expr, &expressions) } -/// Given a column reference to `column_name`, returns a pruning +/// Given a column reference to `column`, returns a pruning /// expression in terms of the min and max that will evaluate to true /// if the column may contain values, and false if definitely does not /// contain values fn build_single_column_expr( - column_name: &str, + column: &Column, schema: &Schema, required_columns: &mut RequiredStatColumns, is_not: bool, // if true, treat as !col ) -> Option { - use crate::logical_plan; - let field = schema.field_with_name(column_name).ok()?; + let field = 
schema.field_with_name(&column.name).ok()?; if matches!(field.data_type(), &DataType::Boolean) { - let col_ref = logical_plan::col(column_name); + let col_ref = Expr::Column(column.clone()); let min = required_columns - .min_column_expr(column_name, &col_ref, field) + .min_column_expr(column, &col_ref, field) .ok()?; let max = required_columns - .max_column_expr(column_name, &col_ref, field) + .max_column_expr(column, &col_ref, field) .ok()?; // remember -- we want an expression that is: @@ -514,15 +514,15 @@ fn build_predicate_expression( // predicate expression can only be a binary expression let (left, op, right) = match expr { Expr::BinaryExpr { left, op, right } => (left, *op, right), - Expr::Column(name) => { - let expr = build_single_column_expr(name, schema, required_columns, false) + Expr::Column(col) => { + let expr = build_single_column_expr(col, schema, required_columns, false) .unwrap_or(unhandled); return Ok(expr); } // match !col (don't do so recursively) Expr::Not(input) => { - if let Expr::Column(name) = input.as_ref() { - let expr = build_single_column_expr(name, schema, required_columns, true) + if let Expr::Column(col) = input.as_ref() { + let expr = build_single_column_expr(col, schema, required_columns, true) .unwrap_or(unhandled); return Ok(expr); } else { @@ -674,7 +674,7 @@ mod tests { #[derive(Debug, Default)] struct TestStatistics { // key: column name - stats: HashMap, + stats: HashMap, } impl TestStatistics { @@ -687,20 +687,21 @@ mod tests { name: impl Into, container_stats: ContainerStats, ) -> Self { - self.stats.insert(name.into(), container_stats); + self.stats + .insert(Column::from_name(name.into()), container_stats); self } } impl PruningStatistics for TestStatistics { - fn min_values(&self, column: &str) -> Option { + fn min_values(&self, column: &Column) -> Option { self.stats .get(column) .map(|container_stats| container_stats.min()) .unwrap_or(None) } - fn max_values(&self, column: &str) -> Option { + fn max_values(&self, column: &Column) -> Option { self.stats .get(column) .map(|container_stats| container_stats.max()) @@ -724,11 +725,11 @@ mod tests { } impl PruningStatistics for OneContainerStats { - fn min_values(&self, _column: &str) -> Option { + fn min_values(&self, _column: &Column) -> Option { self.min_values.clone() } - fn max_values(&self, _column: &str) -> Option { + fn max_values(&self, _column: &Column) -> Option { self.max_values.clone() } @@ -743,25 +744,25 @@ mod tests { let required_columns = RequiredStatColumns::from(vec![ // min of original column s1, named s1_min ( - "s1".to_string(), + "s1".into(), StatisticsType::Min, Field::new("s1_min", DataType::Int32, true), ), // max of original column s2, named s2_max ( - "s2".to_string(), + "s2".into(), StatisticsType::Max, Field::new("s2_max", DataType::Int32, true), ), // max of original column s3, named s3_max ( - "s3".to_string(), + "s3".into(), StatisticsType::Max, Field::new("s3_max", DataType::Utf8, true), ), // min of original column s3, named s3_min ( - "s3".to_string(), + "s3".into(), StatisticsType::Min, Field::new("s3_min", DataType::Utf8, true), ), @@ -813,7 +814,7 @@ mod tests { // Request a record batch with of s1_min as a timestamp let required_columns = RequiredStatColumns::from(vec![( - "s1".to_string(), + "s3".into(), StatisticsType::Min, Field::new( "s1_min", @@ -867,7 +868,7 @@ mod tests { // Request a record batch with of s1_min as a timestamp let required_columns = RequiredStatColumns::from(vec![( - "s1".to_string(), + "s3".into(), StatisticsType::Min, 
Field::new("s1_min", DataType::Utf8, true), )]); @@ -896,7 +897,7 @@ mod tests { fn test_build_statistics_inconsistent_length() { // return an inconsistent length to the actual statistics arrays let required_columns = RequiredStatColumns::from(vec![( - "s1".to_string(), + "s1".into(), StatisticsType::Min, Field::new("s1_min", DataType::Int64, true), )]); @@ -1143,18 +1144,18 @@ mod tests { let c1_min_field = Field::new("c1_min", DataType::Int32, false); assert_eq!( required_columns.columns[0], - ("c1".to_owned(), StatisticsType::Min, c1_min_field) + ("c1".into(), StatisticsType::Min, c1_min_field) ); // c2 = 2 should add c2_min and c2_max let c2_min_field = Field::new("c2_min", DataType::Int32, false); assert_eq!( required_columns.columns[1], - ("c2".to_owned(), StatisticsType::Min, c2_min_field) + ("c2".into(), StatisticsType::Min, c2_min_field) ); let c2_max_field = Field::new("c2_max", DataType::Int32, false); assert_eq!( required_columns.columns[2], - ("c2".to_owned(), StatisticsType::Max, c2_max_field) + ("c2".into(), StatisticsType::Max, c2_max_field) ); // c2 = 3 shouldn't add any new statistics fields assert_eq!(required_columns.columns.len(), 3); diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 5ed0c74463a6e..a69b776e74bb4 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -611,11 +611,12 @@ mod tests { ]); let a = Int32Array::from(vec![1, 2, 3, 4, 5]); let b = Int32Array::from(vec![1, 2, 4, 8, 16]); + + // expression: "a < b" + let lt = binary_simple(col("a", &schema)?, Operator::Lt, col("b", &schema)?); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])?; - // expression: "a < b" - let lt = binary_simple(col("a"), Operator::Lt, col("b")); let result = lt.evaluate(&batch)?.into_array(batch.num_rows()); assert_eq!(result.len(), 5); @@ -639,16 +640,17 @@ mod tests { ]); let a = Int32Array::from(vec![2, 4, 6, 8, 10]); let b = Int32Array::from(vec![2, 5, 4, 8, 8]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])?; // expression: "a < b OR a == b" let expr = binary_simple( - binary_simple(col("a"), Operator::Lt, col("b")), + binary_simple(col("a", &schema)?, Operator::Lt, col("b", &schema)?), Operator::Or, - binary_simple(col("a"), Operator::Eq, col("b")), + binary_simple(col("a", &schema)?, Operator::Eq, col("b", &schema)?), ); - assert_eq!("a < b OR a = b", format!("{}", expr)); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])?; + + assert_eq!("a@0 < b@1 OR a@0 = b@1", format!("{}", expr)); let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); assert_eq!(result.len(), 5); @@ -680,14 +682,15 @@ mod tests { ]); let a = $A_ARRAY::from($A_VEC); let b = $B_ARRAY::from($B_VEC); + + // verify that we can construct the expression + let expression = + binary(col("a", &schema)?, $OP, col("b", &schema)?, &schema)?; let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![Arc::new(a), Arc::new(b)], )?; - // verify that we can construct the expression - let expression = binary(col("a"), $OP, col("b"), &schema)?; - // verify that the expression's type is correct assert_eq!(expression.data_type(&schema)?, $C_TYPE); @@ -863,7 +866,12 @@ mod tests { // Test 1: dict = str // verify that we can construct the expression - let expression = binary(col("dict"), Operator::Eq, col("str"), &schema)?; + let expression = binary( + 
col("dict", &schema)?, + Operator::Eq, + col("str", &schema)?, + &schema, + )?; assert_eq!(expression.data_type(&schema)?, DataType::Boolean); // evaluate and verify the result type matched @@ -877,7 +885,12 @@ mod tests { // str = dict // verify that we can construct the expression - let expression = binary(col("str"), Operator::Eq, col("dict"), &schema)?; + let expression = binary( + col("str", &schema)?, + Operator::Eq, + col("dict", &schema)?, + &schema, + )?; assert_eq!(expression.data_type(&schema)?, DataType::Boolean); // evaluate and verify the result type matched @@ -989,7 +1002,7 @@ mod tests { op: Operator, expected: PrimitiveArray, ) -> Result<()> { - let arithmetic_op = binary_simple(col("a"), op, col("b")); + let arithmetic_op = binary_simple(col("a", &schema)?, op, col("b", &schema)?); let batch = RecordBatch::try_new(schema, data)?; let result = arithmetic_op.evaluate(&batch)?.into_array(batch.num_rows()); @@ -1004,7 +1017,7 @@ mod tests { op: Operator, expected: BooleanArray, ) -> Result<()> { - let arithmetic_op = binary_simple(col("a"), op, col("b")); + let arithmetic_op = binary_simple(col("a", &schema)?, op, col("b", &schema)?); let data: Vec = vec![Arc::new(left), Arc::new(right)]; let batch = RecordBatch::try_new(schema, data)?; let result = arithmetic_op.evaluate(&batch)?.into_array(batch.num_rows()); diff --git a/datafusion/src/physical_plan/expressions/case.rs b/datafusion/src/physical_plan/expressions/case.rs index f89ea8d1e2964..a46522d69deb5 100644 --- a/datafusion/src/physical_plan/expressions/case.rs +++ b/datafusion/src/physical_plan/expressions/case.rs @@ -451,6 +451,7 @@ mod tests { #[test] fn case_with_expr() -> Result<()> { let batch = case_test_batch()?; + let schema = batch.schema(); // CASE a WHEN 'foo' THEN 123 WHEN 'bar' THEN 456 END let when1 = lit(ScalarValue::Utf8(Some("foo".to_string()))); @@ -458,7 +459,11 @@ mod tests { let when2 = lit(ScalarValue::Utf8(Some("bar".to_string()))); let then2 = lit(ScalarValue::Int32(Some(456))); - let expr = case(Some(col("a")), &[(when1, then1), (when2, then2)], None)?; + let expr = case( + Some(col("a", &schema)?), + &[(when1, then1), (when2, then2)], + None, + )?; let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); let result = result .as_any() @@ -475,6 +480,7 @@ mod tests { #[test] fn case_with_expr_else() -> Result<()> { let batch = case_test_batch()?; + let schema = batch.schema(); // CASE a WHEN 'foo' THEN 123 WHEN 'bar' THEN 456 ELSE 999 END let when1 = lit(ScalarValue::Utf8(Some("foo".to_string()))); @@ -484,7 +490,7 @@ mod tests { let else_value = lit(ScalarValue::Int32(Some(999))); let expr = case( - Some(col("a")), + Some(col("a", &schema)?), &[(when1, then1), (when2, then2)], Some(else_value), )?; @@ -505,17 +511,18 @@ mod tests { #[test] fn case_without_expr() -> Result<()> { let batch = case_test_batch()?; + let schema = batch.schema(); // CASE WHEN a = 'foo' THEN 123 WHEN a = 'bar' THEN 456 END let when1 = binary( - col("a"), + col("a", &schema)?, Operator::Eq, lit(ScalarValue::Utf8(Some("foo".to_string()))), &batch.schema(), )?; let then1 = lit(ScalarValue::Int32(Some(123))); let when2 = binary( - col("a"), + col("a", &schema)?, Operator::Eq, lit(ScalarValue::Utf8(Some("bar".to_string()))), &batch.schema(), @@ -539,17 +546,18 @@ mod tests { #[test] fn case_without_expr_else() -> Result<()> { let batch = case_test_batch()?; + let schema = batch.schema(); // CASE WHEN a = 'foo' THEN 123 WHEN a = 'bar' THEN 456 ELSE 999 END let when1 = binary( - col("a"), + col("a", &schema)?, 
Operator::Eq, lit(ScalarValue::Utf8(Some("foo".to_string()))), &batch.schema(), )?; let then1 = lit(ScalarValue::Int32(Some(123))); let when2 = binary( - col("a"), + col("a", &schema)?, Operator::Eq, lit(ScalarValue::Utf8(Some("bar".to_string()))), &batch.schema(), diff --git a/datafusion/src/physical_plan/expressions/cast.rs b/datafusion/src/physical_plan/expressions/cast.rs index 558b1e5d7e8b8..bba125ebdcc99 100644 --- a/datafusion/src/physical_plan/expressions/cast.rs +++ b/datafusion/src/physical_plan/expressions/cast.rs @@ -180,10 +180,14 @@ mod tests { RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; // verify that we can construct the expression - let expression = cast_with_options(col("a"), &schema, $TYPE, $CAST_OPTIONS)?; + let expression = + cast_with_options(col("a", &schema)?, &schema, $TYPE, $CAST_OPTIONS)?; // verify that its display is correct - assert_eq!(format!("CAST(a AS {:?})", $TYPE), format!("{}", expression)); + assert_eq!( + format!("CAST(a@0 AS {:?})", $TYPE), + format!("{}", expression) + ); // verify that the expression's type is correct assert_eq!(expression.data_type(&schema)?, $TYPE); @@ -272,7 +276,7 @@ mod tests { // Ensure a useful error happens at plan time if invalid casts are used let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); - let result = cast(col("a"), &schema, DataType::LargeBinary); + let result = cast(col("a", &schema).unwrap(), &schema, DataType::LargeBinary); result.expect_err("expected Invalid CAST"); } @@ -283,7 +287,7 @@ mod tests { let a = StringArray::from(vec!["9.1"]); let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; let expression = cast_with_options( - col("a"), + col("a", &schema)?, &schema, DataType::Int32, DEFAULT_DATAFUSION_CAST_OPTIONS, diff --git a/datafusion/src/physical_plan/expressions/column.rs b/datafusion/src/physical_plan/expressions/column.rs index 7e0304e51fe73..d6eafbb05384a 100644 --- a/datafusion/src/physical_plan/expressions/column.rs +++ b/datafusion/src/physical_plan/expressions/column.rs @@ -28,28 +28,40 @@ use crate::error::Result; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; /// Represents the column at a given index in a RecordBatch -#[derive(Debug)] +#[derive(Debug, Hash, PartialEq, Eq, Clone)] pub struct Column { name: String, + index: usize, } impl Column { /// Create a new column expression - pub fn new(name: &str) -> Self { + pub fn new(name: &str, index: usize) -> Self { Self { name: name.to_owned(), + index, } } + /// Create a new column expression based on column name and schema + pub fn new_with_schema(name: &str, schema: &Schema) -> Result { + Ok(Column::new(name, schema.index_of(name)?)) + } + /// Get the column name pub fn name(&self) -> &str { &self.name } + + /// Get the column index + pub fn index(&self) -> usize { + self.index + } } impl std::fmt::Display for Column { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name) + write!(f, "{}@{}", self.name, self.index) } } @@ -61,26 +73,21 @@ impl PhysicalExpr for Column { /// Get the data type of this expression, given the schema of the input fn data_type(&self, input_schema: &Schema) -> Result { - Ok(input_schema - .field_with_name(&self.name)? 
- .data_type() - .clone()) + Ok(input_schema.field(self.index).data_type().clone()) } /// Decide whehter this expression is nullable, given the schema of the input fn nullable(&self, input_schema: &Schema) -> Result { - Ok(input_schema.field_with_name(&self.name)?.is_nullable()) + Ok(input_schema.field(self.index).is_nullable()) } /// Evaluate the expression fn evaluate(&self, batch: &RecordBatch) -> Result { - Ok(ColumnarValue::Array( - batch.column(batch.schema().index_of(&self.name)?).clone(), - )) + Ok(ColumnarValue::Array(batch.column(self.index).clone())) } } /// Create a column expression -pub fn col(name: &str) -> Arc { - Arc::new(Column::new(name)) +pub fn col(name: &str, schema: &Schema) -> Result> { + Ok(Arc::new(Column::new_with_schema(name, schema)?)) } diff --git a/datafusion/src/physical_plan/expressions/in_list.rs b/datafusion/src/physical_plan/expressions/in_list.rs index 41f111006ea2a..38b2b9d45b9bb 100644 --- a/datafusion/src/physical_plan/expressions/in_list.rs +++ b/datafusion/src/physical_plan/expressions/in_list.rs @@ -296,8 +296,8 @@ mod tests { // applies the in_list expr to an input batch and list macro_rules! in_list { - ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr) => {{ - let expr = in_list(col("a"), $LIST, $NEGATED).unwrap(); + ($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr) => {{ + let expr = in_list($COL, $LIST, $NEGATED).unwrap(); let result = expr.evaluate(&$BATCH)?.into_array($BATCH.num_rows()); let result = result .as_any() @@ -312,6 +312,7 @@ mod tests { fn in_list_utf8() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); let a = StringArray::from(vec![Some("a"), Some("d"), None]); + let col_a = col("a", &schema)?; let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a in ("a", "b")" @@ -319,14 +320,26 @@ mod tests { lit(ScalarValue::Utf8(Some("a".to_string()))), lit(ScalarValue::Utf8(Some("b".to_string()))), ]; - in_list!(batch, list, &false, vec![Some(true), Some(false), None]); + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone() + ); // expression: "a not in ("a", "b")" let list = vec![ lit(ScalarValue::Utf8(Some("a".to_string()))), lit(ScalarValue::Utf8(Some("b".to_string()))), ]; - in_list!(batch, list, &true, vec![Some(false), Some(true), None]); + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone() + ); // expression: "a not in ("a", "b")" let list = vec![ @@ -334,7 +347,13 @@ mod tests { lit(ScalarValue::Utf8(Some("b".to_string()))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &false, vec![Some(true), None, None]); + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone() + ); // expression: "a not in ("a", "b")" let list = vec![ @@ -342,7 +361,13 @@ mod tests { lit(ScalarValue::Utf8(Some("b".to_string()))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &true, vec![Some(false), None, None]); + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone() + ); Ok(()) } @@ -351,6 +376,7 @@ mod tests { fn in_list_int64() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); let a = Int64Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a in (0, 1)" @@ -358,14 +384,26 @@ mod tests { lit(ScalarValue::Int64(Some(0))), lit(ScalarValue::Int64(Some(1))), 
]; - in_list!(batch, list, &false, vec![Some(true), Some(false), None]); + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone() + ); // expression: "a not in (0, 1)" let list = vec![ lit(ScalarValue::Int64(Some(0))), lit(ScalarValue::Int64(Some(1))), ]; - in_list!(batch, list, &true, vec![Some(false), Some(true), None]); + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone() + ); // expression: "a in (0, 1, NULL)" let list = vec![ @@ -373,7 +411,13 @@ mod tests { lit(ScalarValue::Int64(Some(1))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &false, vec![Some(true), None, None]); + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone() + ); // expression: "a not in (0, 1, NULL)" let list = vec![ @@ -381,7 +425,13 @@ mod tests { lit(ScalarValue::Int64(Some(1))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &true, vec![Some(false), None, None]); + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone() + ); Ok(()) } @@ -390,6 +440,7 @@ mod tests { fn in_list_float64() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Float64, true)]); let a = Float64Array::from(vec![Some(0.0), Some(0.2), None]); + let col_a = col("a", &schema)?; let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a in (0.0, 0.2)" @@ -397,14 +448,26 @@ mod tests { lit(ScalarValue::Float64(Some(0.0))), lit(ScalarValue::Float64(Some(0.1))), ]; - in_list!(batch, list, &false, vec![Some(true), Some(false), None]); + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + col_a.clone() + ); // expression: "a not in (0.0, 0.2)" let list = vec![ lit(ScalarValue::Float64(Some(0.0))), lit(ScalarValue::Float64(Some(0.1))), ]; - in_list!(batch, list, &true, vec![Some(false), Some(true), None]); + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + col_a.clone() + ); // expression: "a in (0.0, 0.2, NULL)" let list = vec![ @@ -412,7 +475,13 @@ mod tests { lit(ScalarValue::Float64(Some(0.1))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &false, vec![Some(true), None, None]); + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + col_a.clone() + ); // expression: "a not in (0.0, 0.2, NULL)" let list = vec![ @@ -420,7 +489,13 @@ mod tests { lit(ScalarValue::Float64(Some(0.1))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &true, vec![Some(false), None, None]); + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + col_a.clone() + ); Ok(()) } @@ -429,29 +504,30 @@ mod tests { fn in_list_bool() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Boolean, true)]); let a = BooleanArray::from(vec![Some(true), None]); + let col_a = col("a", &schema)?; let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a in (true)" let list = vec![lit(ScalarValue::Boolean(Some(true)))]; - in_list!(batch, list, &false, vec![Some(true), None]); + in_list!(batch, list, &false, vec![Some(true), None], col_a.clone()); // expression: "a not in (true)" let list = vec![lit(ScalarValue::Boolean(Some(true)))]; - in_list!(batch, list, &true, vec![Some(false), None]); + in_list!(batch, list, &true, vec![Some(false), None], col_a.clone()); // expression: "a in (true, NULL)" let list = vec![ lit(ScalarValue::Boolean(Some(true))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &false, 
vec![Some(true), None]); + in_list!(batch, list, &false, vec![Some(true), None], col_a.clone()); // expression: "a not in (true, NULL)" let list = vec![ lit(ScalarValue::Boolean(Some(true))), lit(ScalarValue::Utf8(None)), ]; - in_list!(batch, list, &true, vec![Some(false), None]); + in_list!(batch, list, &true, vec![Some(false), None], col_a.clone()); Ok(()) } diff --git a/datafusion/src/physical_plan/expressions/is_not_null.rs b/datafusion/src/physical_plan/expressions/is_not_null.rs index 7ac2110b50221..cce27e36a68c0 100644 --- a/datafusion/src/physical_plan/expressions/is_not_null.rs +++ b/datafusion/src/physical_plan/expressions/is_not_null.rs @@ -100,10 +100,10 @@ mod tests { fn is_not_null_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); let a = StringArray::from(vec![Some("foo"), None]); + let expr = is_not_null(col("a", &schema)?).unwrap(); let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a is not null" - let expr = is_not_null(col("a")).unwrap(); let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); let result = result .as_any() diff --git a/datafusion/src/physical_plan/expressions/is_null.rs b/datafusion/src/physical_plan/expressions/is_null.rs index dfa53f3f7d264..dbb57dfa5f8bd 100644 --- a/datafusion/src/physical_plan/expressions/is_null.rs +++ b/datafusion/src/physical_plan/expressions/is_null.rs @@ -100,10 +100,11 @@ mod tests { fn is_null_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); let a = StringArray::from(vec![Some("foo"), None]); - let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; // expression: "a is null" - let expr = is_null(col("a")).unwrap(); + let expr = is_null(col("a", &schema)?).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)])?; + let result = expr.evaluate(&batch)?.into_array(batch.num_rows()); let result = result .as_any() diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index ea917d30d940d..680e739cbf292 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -278,7 +278,7 @@ macro_rules! 
min_max { } e => { return Err(DataFusionError::Internal(format!( - "MIN/MAX is not expected to receive a scalar {:?}", + "MIN/MAX is not expected to receive scalars of incompatible types {:?}", e ))) } diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index f8cb40cbacbdc..0b32dca0467d8 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -66,6 +66,7 @@ pub use nullif::{nullif_func, SUPPORTED_NULLIF_TYPES}; pub use row_number::RowNumber; pub use sum::{sum_return_type, Sum}; pub use try_cast::{try_cast, TryCastExpr}; + /// returns the name of the state pub fn format_state_name(name: &str, state_name: &str) -> String { format!("{}[{}]", name, state_name) @@ -126,8 +127,11 @@ mod tests { let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![$ARRAY])?; - let agg = - Arc::new(<$OP>::new(col("a"), "bla".to_string(), $EXPECTED_DATATYPE)); + let agg = Arc::new(<$OP>::new( + col("a", &schema)?, + "bla".to_string(), + $EXPECTED_DATATYPE, + )); let actual = aggregate(&batch, agg)?; let expected = ScalarValue::from($EXPECTED); diff --git a/datafusion/src/physical_plan/expressions/not.rs b/datafusion/src/physical_plan/expressions/not.rs index 7a997b61b488a..341d38a10aa1c 100644 --- a/datafusion/src/physical_plan/expressions/not.rs +++ b/datafusion/src/physical_plan/expressions/not.rs @@ -127,7 +127,7 @@ mod tests { fn neg_op() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Boolean, true)]); - let expr = not(col("a"), &schema)?; + let expr = not(col("a", &schema)?, &schema)?; assert_eq!(expr.data_type(&schema)?, DataType::Boolean); assert!(expr.nullable(&schema)?); @@ -152,7 +152,7 @@ mod tests { fn neg_op_not_null() { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); - let expr = not(col("a"), &schema); + let expr = not(col("a", &schema).unwrap(), &schema); assert!(expr.is_err()); } } diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index 16897d45119f0..577c19b54ade0 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -148,7 +148,7 @@ impl BuiltInWindowFunctionExpr for NthValue { mod tests { use super::*; use crate::error::Result; - use crate::physical_plan::expressions::col; + use crate::physical_plan::expressions::Column; use arrow::record_batch::RecordBatch; use arrow::{array::*, datatypes::*}; @@ -166,32 +166,46 @@ mod tests { #[test] fn first_value() -> Result<()> { - let first_value = - NthValue::first_value("first_value".to_owned(), col("arr"), DataType::Int32); + let first_value = NthValue::first_value( + "first_value".to_owned(), + Arc::new(Column::new("arr", 0)), + DataType::Int32, + ); test_i32_result(first_value, vec![1; 8])?; Ok(()) } #[test] fn last_value() -> Result<()> { - let last_value = - NthValue::last_value("last_value".to_owned(), col("arr"), DataType::Int32); + let last_value = NthValue::last_value( + "last_value".to_owned(), + Arc::new(Column::new("arr", 0)), + DataType::Int32, + ); test_i32_result(last_value, vec![8; 8])?; Ok(()) } #[test] fn nth_value_1() -> Result<()> { - let nth_value = - NthValue::nth_value("nth_value".to_owned(), col("arr"), DataType::Int32, 1)?; + let nth_value = NthValue::nth_value( + "nth_value".to_owned(), + Arc::new(Column::new("arr", 0)), + DataType::Int32, + 1, + )?; test_i32_result(nth_value, vec![1; 8])?; Ok(()) } 
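Note: these window-function tests now build the physical column reference directly as Column::new("arr", 0), reflecting the earlier change in this patch where the physical Column carries both a name and an index resolved against the schema and is displayed as name@index. A minimal sketch of that pattern, with assumed names rather than DataFusion's actual types:

#[derive(Debug, Clone, PartialEq)]
struct PhysColumn {
    name: String,
    index: usize,
}

impl PhysColumn {
    // Resolve the name against the ordered field names once, at planning time.
    fn new_with_schema(name: &str, fields: &[&str]) -> Option<Self> {
        let index = fields.iter().position(|f| *f == name)?;
        Some(Self { name: name.to_string(), index })
    }
}

impl std::fmt::Display for PhysColumn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Mirrors the "name@index" rendering asserted in the binary/cast tests above.
        write!(f, "{}@{}", self.name, self.index)
    }
}

fn main() {
    let fields = ["a", "b", "c"];
    let b = PhysColumn::new_with_schema("b", &fields).unwrap();
    assert_eq!(b.index, 1);
    assert_eq!(b.to_string(), "b@1");
}

Evaluating by position rather than by a per-batch name lookup avoids re-resolving the column against the schema for every RecordBatch.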
#[test]
    fn nth_value_2() -> Result<()> {
-        let nth_value =
-            NthValue::nth_value("nth_value".to_owned(), col("arr"), DataType::Int32, 2)?;
+        let nth_value = NthValue::nth_value(
+            "nth_value".to_owned(),
+            Arc::new(Column::new("arr", 0)),
+            DataType::Int32,
+            2,
+        )?;
        test_i32_result(nth_value, vec![-2; 8])?;
        Ok(())
    }
diff --git a/datafusion/src/physical_plan/expressions/try_cast.rs b/datafusion/src/physical_plan/expressions/try_cast.rs
index 5e402fdea28ad..1ba4a50260d46 100644
--- a/datafusion/src/physical_plan/expressions/try_cast.rs
+++ b/datafusion/src/physical_plan/expressions/try_cast.rs
@@ -139,10 +139,13 @@ mod tests {
            RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?;
            // verify that we can construct the expression
-            let expression = try_cast(col("a"), &schema, $TYPE)?;
+            let expression = try_cast(col("a", &schema)?, &schema, $TYPE)?;
            // verify that its display is correct
-            assert_eq!(format!("CAST(a AS {:?})", $TYPE), format!("{}", expression));
+            assert_eq!(
+                format!("CAST(a@0 AS {:?})", $TYPE),
+                format!("{}", expression)
+            );
            // verify that the expression's type is correct
            assert_eq!(expression.data_type(&schema)?, $TYPE);
@@ -241,7 +244,7 @@ mod tests {
        // Ensure a useful error happens at plan time if invalid casts are used
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
-        let result = try_cast(col("a"), &schema, DataType::LargeBinary);
+        let result = try_cast(col("a", &schema).unwrap(), &schema, DataType::LargeBinary);
        result.expect_err("expected Invalid CAST");
    }
}
diff --git a/datafusion/src/physical_plan/filter.rs b/datafusion/src/physical_plan/filter.rs
index 0a8c825aba1ae..9e7fa9df9711c 100644
--- a/datafusion/src/physical_plan/filter.rs
+++ b/datafusion/src/physical_plan/filter.rs
@@ -223,14 +223,14 @@ mod tests {
        let predicate: Arc = binary(
            binary(
-                col("c2"),
+                col("c2", &schema)?,
                Operator::Gt,
                lit(ScalarValue::from(1u32)),
                &schema,
            )?,
            Operator::And,
            binary(
-                col("c2"),
+                col("c2", &schema)?,
                Operator::Lt,
                lit(ScalarValue::from(4u32)),
                &schema,
diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs
index 0e2be51d3ebc1..01f7e95a0ee99 100644
--- a/datafusion/src/physical_plan/functions.rs
+++ b/datafusion/src/physical_plan/functions.rs
@@ -3651,7 +3651,7 @@ mod tests {
        let expr = create_physical_expr(
            &BuiltinScalarFunction::Array,
-            &[col("a"), col("b")],
+            &[col("a", &schema)?, col("b", &schema)?],
            &schema,
            &ctx_state,
        )?;
@@ -3718,7 +3718,7 @@ mod tests {
        let columns: Vec = vec![col_value];
        let expr = create_physical_expr(
            &BuiltinScalarFunction::RegexpMatch,
-            &[col("a"), pattern],
+            &[col("a", &schema)?, pattern],
            &schema,
            &ctx_state,
        )?;
diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs
index f1611ebd7a775..250ba2b083062 100644
--- a/datafusion/src/physical_plan/hash_aggregate.rs
+++ b/datafusion/src/physical_plan/hash_aggregate.rs
@@ -663,9 +663,12 @@ async fn compute_grouped_hash_aggregate(
    aggr_expr: Vec>,
    mut input: SendableRecordBatchStream,
 ) -> ArrowResult {
-    // the expressions to evaluate the batch, one vec of expressions per aggregation
-    let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode)
-        .map_err(DataFusionError::into_arrow_external_error)?;
+    // The expressions to evaluate the batch, one vec of expressions per aggregation.
+    // Assume create_schema() always puts group columns in front of aggr columns, so we set
+    // col_idx_base to the group expression count.
+ let aggregate_expressions = + aggregate_expressions(&aggr_expr, &mode, group_expr.len()) + .map_err(DataFusionError::into_arrow_external_error)?; // mapping key -> (set of accumulators, indices of the key in the batch) // * the indexes are updated at each row @@ -794,14 +797,21 @@ fn evaluate_many( .collect::>>() } -/// uses `state_fields` to build a vec of expressions required to merge the AggregateExpr' accumulator's state. +/// uses `state_fields` to build a vec of physical column expressions required to merge the +/// AggregateExpr' accumulator's state. +/// +/// `index_base` is the starting physical column index for the next expanded state field. fn merge_expressions( + index_base: usize, expr: &Arc, ) -> Result>> { Ok(expr .state_fields()? .iter() - .map(|f| Arc::new(Column::new(f.name())) as Arc) + .enumerate() + .map(|(idx, f)| { + Arc::new(Column::new(f.name(), index_base + idx)) as Arc + }) .collect::>()) } @@ -809,22 +819,27 @@ fn merge_expressions( /// The expressions are different depending on `mode`: /// * Partial: AggregateExpr::expressions /// * Final: columns of `AggregateExpr::state_fields()` -/// The return value is to be understood as: -/// * index 0 is the aggregation -/// * index 1 is the expression i of the aggregation fn aggregate_expressions( aggr_expr: &[Arc], mode: &AggregateMode, + col_idx_base: usize, ) -> Result>>> { match mode { AggregateMode::Partial => { Ok(aggr_expr.iter().map(|agg| agg.expressions()).collect()) } // in this mode, we build the merge expressions of the aggregation - AggregateMode::Final | AggregateMode::FinalPartitioned => Ok(aggr_expr - .iter() - .map(|agg| merge_expressions(agg)) - .collect::>>()?), + AggregateMode::Final | AggregateMode::FinalPartitioned => { + let mut col_idx_base = col_idx_base; + Ok(aggr_expr + .iter() + .map(|agg| { + let exprs = merge_expressions(col_idx_base, agg)?; + col_idx_base += exprs.len(); + Ok(exprs) + }) + .collect::>>()?) 
+ } } } @@ -846,10 +861,8 @@ async fn compute_hash_aggregate( ) -> ArrowResult { let mut accumulators = create_accumulators(&aggr_expr) .map_err(DataFusionError::into_arrow_external_error)?; - - let expressions = aggregate_expressions(&aggr_expr, &mode) + let expressions = aggregate_expressions(&aggr_expr, &mode, 0) .map_err(DataFusionError::into_arrow_external_error)?; - let expressions = Arc::new(expressions); // 1 for each batch, update / merge accumulators with the expressions' values @@ -1253,16 +1266,17 @@ mod tests { /// build the aggregates on the data from some_data() and check the results async fn check_aggregates(input: Arc) -> Result<()> { + let input_schema = input.schema(); + let groups: Vec<(Arc, String)> = - vec![(col("a"), "a".to_string())]; + vec![(col("a", &input_schema)?, "a".to_string())]; let aggregates: Vec> = vec![Arc::new(Avg::new( - col("b"), + col("b", &input_schema)?, "AVG(b)".to_string(), DataType::Float64, ))]; - let input_schema = input.schema(); let partial_aggregate = Arc::new(HashAggregateExec::try_new( AggregateMode::Partial, groups.clone(), @@ -1286,8 +1300,9 @@ mod tests { let merge = Arc::new(MergeExec::new(partial_aggregate)); - let final_group: Vec> = - (0..groups.len()).map(|i| col(&groups[i].1)).collect(); + let final_group: Vec> = (0..groups.len()) + .map(|i| col(&groups[i].1, &input_schema)) + .collect::>()?; let merged_aggregate = Arc::new(HashAggregateExec::try_new( AggregateMode::Final, diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 928392a844337..ad356079387a0 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -52,7 +52,7 @@ use arrow::array::{ UInt64Array, UInt8Array, }; -use super::expressions::col; +use super::expressions::Column; use super::{ hash_utils::{build_join_schema, check_join_is_valid, JoinOn, JoinType}, merge::MergeExec, @@ -64,6 +64,7 @@ use super::{ SendableRecordBatchStream, }; use crate::physical_plan::coalesce_batches::concat_batches; +use crate::physical_plan::PhysicalExpr; use log::debug; // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. 
@@ -90,7 +91,7 @@ pub struct HashJoinExec { /// right (probe) side which are filtered by the hash table right: Arc, /// Set of common columns used to join on - on: Vec<(String, String)>, + on: Vec<(Column, Column)>, /// How the join is performed join_type: JoinType, /// The schema once the join is applied @@ -127,26 +128,21 @@ impl HashJoinExec { pub fn try_new( left: Arc, right: Arc, - on: &JoinOn, + on: JoinOn, join_type: &JoinType, partition_mode: PartitionMode, ) -> Result { let left_schema = left.schema(); let right_schema = right.schema(); - check_join_is_valid(&left_schema, &right_schema, on)?; + check_join_is_valid(&left_schema, &right_schema, &on)?; let schema = Arc::new(build_join_schema( &left_schema, &right_schema, - on, + &on, join_type, )); - let on = on - .iter() - .map(|(l, r)| (l.to_string(), r.to_string())) - .collect(); - let random_state = RandomState::with_seeds(0, 0, 0, 0); Ok(HashJoinExec { @@ -172,7 +168,7 @@ impl HashJoinExec { } /// Set of common columns used to join on - pub fn on(&self) -> &[(String, String)] { + pub fn on(&self) -> &[(Column, Column)] { &self.on } @@ -236,7 +232,7 @@ impl ExecutionPlan for HashJoinExec { 2 => Ok(Arc::new(HashJoinExec::try_new( children[0].clone(), children[1].clone(), - &self.on, + self.on.clone(), &self.join_type, self.mode, )?)), @@ -307,10 +303,10 @@ impl ExecutionPlan for HashJoinExec { *build_side = Some(left_side.clone()); debug!( - "Built build-side of hash join containing {} rows in {} ms", - num_rows, - start.elapsed().as_millis() - ); + "Built build-side of hash join containing {} rows in {} ms", + num_rows, + start.elapsed().as_millis() + ); left_side } @@ -372,7 +368,7 @@ impl ExecutionPlan for HashJoinExec { // we have the batches and the hash map with their keys. We can how create a stream // over the right that uses this information to issue new batches. 
- let stream = self.right.execute(partition).await?; + let right_stream = self.right.execute(partition).await?; let on_right = self.on.iter().map(|on| on.1.clone()).collect::>(); let column_indices = self.column_indices_from_schema()?; @@ -383,23 +379,17 @@ impl ExecutionPlan for HashJoinExec { } JoinType::Inner | JoinType::Right => vec![], }; - Ok(Box::pin(HashJoinStream { - schema: self.schema.clone(), + Ok(Box::pin(HashJoinStream::new( + self.schema.clone(), on_left, on_right, - join_type: self.join_type, + self.join_type, left_data, - right: stream, + right_stream, column_indices, - num_input_batches: 0, - num_input_rows: 0, - num_output_batches: 0, - num_output_rows: 0, - join_time: 0, - random_state: self.random_state.clone(), + self.random_state.clone(), visited_left_side, - is_exhausted: false, - })) + ))) } fn fmt_as( @@ -422,7 +412,7 @@ impl ExecutionPlan for HashJoinExec { /// Updates `hash` with new entries from [RecordBatch] evaluated against the expressions `on`, /// assuming that the [RecordBatch] corresponds to the `index`th fn update_hash( - on: &[String], + on: &[Column], batch: &RecordBatch, hash: &mut JoinHashMap, offset: usize, @@ -432,7 +422,7 @@ fn update_hash( // evaluate the keys let keys_values = on .iter() - .map(|name| Ok(col(name).evaluate(batch)?.into_array(batch.num_rows()))) + .map(|c| Ok(c.evaluate(batch)?.into_array(batch.num_rows()))) .collect::>>()?; // calculate the hash values @@ -461,9 +451,9 @@ struct HashJoinStream { /// Input schema schema: Arc, /// columns from the left - on_left: Vec, + on_left: Vec, /// columns from the right used to compute the hash - on_right: Vec, + on_right: Vec, /// type of the join join_type: JoinType, /// information from the left @@ -490,6 +480,39 @@ struct HashJoinStream { is_exhausted: bool, } +#[allow(clippy::too_many_arguments)] +impl HashJoinStream { + fn new( + schema: Arc, + on_left: Vec, + on_right: Vec, + join_type: JoinType, + left_data: JoinLeftData, + right: SendableRecordBatchStream, + column_indices: Vec, + random_state: RandomState, + visited_left_side: Vec, + ) -> Self { + HashJoinStream { + schema, + on_left, + on_right, + join_type, + left_data, + right, + column_indices, + num_input_batches: 0, + num_input_rows: 0, + num_output_batches: 0, + num_output_rows: 0, + join_time: 0, + random_state, + visited_left_side, + is_exhausted: false, + } + } +} + impl RecordBatchStream for HashJoinStream { fn schema(&self) -> SchemaRef { self.schema.clone() @@ -531,8 +554,8 @@ fn build_batch_from_indices( fn build_batch( batch: &RecordBatch, left_data: &JoinLeftData, - on_left: &[String], - on_right: &[String], + on_left: &[Column], + on_right: &[Column], join_type: JoinType, schema: &Schema, column_indices: &[ColumnIndex], @@ -590,21 +613,17 @@ fn build_join_indexes( left_data: &JoinLeftData, right: &RecordBatch, join_type: JoinType, - left_on: &[String], - right_on: &[String], + left_on: &[Column], + right_on: &[Column], random_state: &RandomState, ) -> Result<(UInt64Array, UInt32Array)> { let keys_values = right_on .iter() - .map(|name| Ok(col(name).evaluate(right)?.into_array(right.num_rows()))) + .map(|c| Ok(c.evaluate(right)?.into_array(right.num_rows()))) .collect::>>()?; let left_join_values = left_on .iter() - .map(|name| { - Ok(col(name) - .evaluate(&left_data.1)? 
- .into_array(left_data.1.num_rows())) - }) + .map(|c| Ok(c.evaluate(&left_data.1)?.into_array(left_data.1.num_rows()))) .collect::>>()?; let hashes_buffer = &mut vec![0; keys_values[0].len()]; let hash_values = create_hashes(&keys_values, random_state, hashes_buffer)?; @@ -1250,6 +1269,7 @@ impl Stream for HashJoinStream { | JoinType::Right => {} } + // End of right batch, print stats in debug mode debug!( "Processed {} probe-side input batches containing {} rows and \ produced {} output batches containing {} rows in {} ms", @@ -1269,7 +1289,9 @@ impl Stream for HashJoinStream { mod tests { use crate::{ assert_batches_sorted_eq, - physical_plan::{common, memory::MemoryExec}, + physical_plan::{ + common, expressions::Column, memory::MemoryExec, repartition::RepartitionExec, + }, test::{build_table_i32, columns}, }; @@ -1289,14 +1311,74 @@ mod tests { fn join( left: Arc, right: Arc, - on: &[(&str, &str)], + on: JoinOn, join_type: &JoinType, ) -> Result { - let on: Vec<_> = on + HashJoinExec::try_new(left, right, on, join_type, PartitionMode::CollectLeft) + } + + async fn join_collect( + left: Arc, + right: Arc, + on: JoinOn, + join_type: &JoinType, + ) -> Result<(Vec, Vec)> { + let join = join(left, right, on, join_type)?; + let columns = columns(&join.schema()); + + let stream = join.execute(0).await?; + let batches = common::collect(stream).await?; + + Ok((columns, batches)) + } + + async fn partitioned_join_collect( + left: Arc, + right: Arc, + on: JoinOn, + join_type: &JoinType, + ) -> Result<(Vec, Vec)> { + let partition_count = 4; + + let (left_expr, right_expr) = on .iter() - .map(|(l, r)| (l.to_string(), r.to_string())) - .collect(); - HashJoinExec::try_new(left, right, &on, join_type, PartitionMode::CollectLeft) + .map(|(l, r)| { + ( + Arc::new(l.clone()) as Arc, + Arc::new(r.clone()) as Arc, + ) + }) + .unzip(); + + let join = HashJoinExec::try_new( + Arc::new(RepartitionExec::try_new( + left, + Partitioning::Hash(left_expr, partition_count), + )?), + Arc::new(RepartitionExec::try_new( + right, + Partitioning::Hash(right_expr, partition_count), + )?), + on, + join_type, + PartitionMode::Partitioned, + )?; + + let columns = columns(&join.schema()); + + let mut batches = vec![]; + for i in 0..partition_count { + let stream = join.execute(i).await?; + let more_batches = common::collect(stream).await?; + batches.extend( + more_batches + .into_iter() + .filter(|b| b.num_rows() > 0) + .collect::>(), + ); + } + + Ok((columns, batches)) } #[tokio::test] @@ -1311,15 +1393,58 @@ mod tests { ("b1", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b1")]; - let join = join(left, right, on, &JoinType::Inner)?; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; + + let (columns, batches) = + join_collect(left.clone(), right.clone(), on.clone(), &JoinType::Inner) + .await?; - let columns = columns(&join.schema()); assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; + let expected = vec![ + "+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | c2 |", + "+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 70 |", + "| 2 | 5 | 8 | 20 | 80 |", + "| 3 | 5 | 9 | 20 | 80 |", + "+----+----+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); + + Ok(()) + } + + #[tokio::test] + async fn partitioned_join_inner_one() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 
5]), // this has a repetition + ("c1", &vec![7, 8, 9]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30]), + ("b1", &vec![4, 5, 6]), + ("c2", &vec![70, 80, 90]), + ); + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; + + let (columns, batches) = partitioned_join_collect( + left.clone(), + right.clone(), + on.clone(), + &JoinType::Inner, + ) + .await?; + + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); let expected = vec![ "+----+----+----+----+----+", @@ -1347,16 +1472,15 @@ mod tests { ("b2", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b2")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b2", &right.schema())?, + )]; - let join = join(left, right, on, &JoinType::Inner)?; + let (columns, batches) = join_collect(left, right, on, &JoinType::Inner).await?; - let columns = columns(&join.schema()); assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b2", "c2"]); - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; - let expected = vec![ "+----+----+----+----+----+----+", "| a1 | b1 | c1 | a2 | b2 | c2 |", @@ -1384,15 +1508,21 @@ mod tests { ("b2", &vec![1, 2, 2]), ("c2", &vec![70, 80, 90]), ); - let on = &[("a1", "a1"), ("b2", "b2")]; + let on = vec![ + ( + Column::new_with_schema("a1", &left.schema())?, + Column::new_with_schema("a1", &right.schema())?, + ), + ( + Column::new_with_schema("b2", &left.schema())?, + Column::new_with_schema("b2", &right.schema())?, + ), + ]; - let join = join(left, right, on, &JoinType::Inner)?; + let (columns, batches) = join_collect(left, right, on, &JoinType::Inner).await?; - let columns = columns(&join.schema()); assert_eq!(columns, vec!["a1", "b2", "c1", "c2"]); - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; assert_eq!(batches.len(), 1); let expected = vec![ @@ -1430,15 +1560,21 @@ mod tests { ("b2", &vec![1, 2, 2]), ("c2", &vec![70, 80, 90]), ); - let on = &[("a1", "a1"), ("b2", "b2")]; + let on = vec![ + ( + Column::new_with_schema("a1", &left.schema())?, + Column::new_with_schema("a1", &right.schema())?, + ), + ( + Column::new_with_schema("b2", &left.schema())?, + Column::new_with_schema("b2", &right.schema())?, + ), + ]; - let join = join(left, right, on, &JoinType::Inner)?; + let (columns, batches) = join_collect(left, right, on, &JoinType::Inner).await?; - let columns = columns(&join.schema()); assert_eq!(columns, vec!["a1", "b2", "c1", "c2"]); - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; assert_eq!(batches.len(), 1); let expected = vec![ @@ -1477,7 +1613,10 @@ mod tests { MemoryExec::try_new(&[vec![batch1], vec![batch2]], schema, None).unwrap(), ); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; let join = join(left, right, on, &JoinType::Inner)?; @@ -1540,7 +1679,10 @@ mod tests { ("b1", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema()).unwrap(), + Column::new_with_schema("b1", &right.schema()).unwrap(), + )]; let join = join(left, right, on, &JoinType::Left).unwrap(); @@ -1578,7 +1720,10 @@ mod tests { ("b2", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b2")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema()).unwrap(), + 
Column::new_with_schema("b2", &right.schema()).unwrap(), + )]; let join = join(left, right, on, &JoinType::Full).unwrap(); @@ -1613,7 +1758,10 @@ mod tests { ("c1", &vec![7, 8, 9]), ); let right = build_table_i32(("a2", &vec![]), ("b1", &vec![]), ("c2", &vec![])); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema()).unwrap(), + Column::new_with_schema("b1", &right.schema()).unwrap(), + )]; let schema = right.schema(); let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); let join = join(left, right, on, &JoinType::Left).unwrap(); @@ -1645,7 +1793,10 @@ mod tests { ("c1", &vec![7, 8, 9]), ); let right = build_table_i32(("a2", &vec![]), ("b2", &vec![]), ("c2", &vec![])); - let on = &[("b1", "b2")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema()).unwrap(), + Column::new_with_schema("b2", &right.schema()).unwrap(), + )]; let schema = right.schema(); let right = Arc::new(MemoryExec::try_new(&[vec![right]], schema, None).unwrap()); let join = join(left, right, on, &JoinType::Full).unwrap(); @@ -1681,15 +1832,55 @@ mod tests { ("b1", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; + + let (columns, batches) = + join_collect(left.clone(), right.clone(), on.clone(), &JoinType::Left) + .await?; + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); - let join = join(left, right, on, &JoinType::Left)?; + let expected = vec![ + "+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | c2 |", + "+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 70 |", + "| 2 | 5 | 8 | 20 | 80 |", + "| 3 | 7 | 9 | | |", + "+----+----+----+----+----+", + ]; + assert_batches_sorted_eq!(expected, &batches); - let columns = columns(&join.schema()); - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + Ok(()) + } - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; + #[tokio::test] + async fn partitioned_join_left_one() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), // 7 does not exist on the right + ("c1", &vec![7, 8, 9]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30]), + ("b1", &vec![4, 5, 6]), + ("c2", &vec![70, 80, 90]), + ); + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; + + let (columns, batches) = partitioned_join_collect( + left.clone(), + right.clone(), + on.clone(), + &JoinType::Left, + ) + .await?; + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); let expected = vec![ "+----+----+----+----+----+", @@ -1717,7 +1908,10 @@ mod tests { ("b1", &vec![4, 5, 6, 5]), // 5 is double on the right ("c2", &vec![70, 80, 90, 100]), ); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; let join = join(left, right, on, &JoinType::Semi)?; @@ -1753,7 +1947,10 @@ mod tests { ("b1", &vec![4, 5, 6, 5]), // 5 is double on the right ("c2", &vec![70, 80, 90, 100]), ); - let on = &[("b1", "b1")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; let join = join(left, right, on, &JoinType::Anti)?; @@ -1787,15 +1984,51 @@ mod tests { ("b1", &vec![4, 5, 6]), // 6 does not exist on the left ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b1")]; + 
let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; - let join = join(left, right, on, &JoinType::Right)?; + let (columns, batches) = join_collect(left, right, on, &JoinType::Right).await?; - let columns = columns(&join.schema()); assert_eq!(columns, vec!["a1", "c1", "a2", "b1", "c2"]); - let stream = join.execute(0).await?; - let batches = common::collect(stream).await?; + let expected = vec![ + "+----+----+----+----+----+", + "| a1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+", + "| | | 30 | 6 | 90 |", + "| 1 | 7 | 10 | 4 | 70 |", + "| 2 | 8 | 20 | 5 | 80 |", + "+----+----+----+----+----+", + ]; + + assert_batches_sorted_eq!(expected, &batches); + + Ok(()) + } + + #[tokio::test] + async fn partitioned_join_right_one() -> Result<()> { + let left = build_table( + ("a1", &vec![1, 2, 3]), + ("b1", &vec![4, 5, 7]), + ("c1", &vec![7, 8, 9]), + ); + let right = build_table( + ("a2", &vec![10, 20, 30]), + ("b1", &vec![4, 5, 6]), // 6 does not exist on the left + ("c2", &vec![70, 80, 90]), + ); + let on = vec![( + Column::new_with_schema("b1", &left.schema())?, + Column::new_with_schema("b1", &right.schema())?, + )]; + + let (columns, batches) = + partitioned_join_collect(left, right, on, &JoinType::Right).await?; + + assert_eq!(columns, vec!["a1", "c1", "a2", "b1", "c2"]); let expected = vec![ "+----+----+----+----+----+", @@ -1824,7 +2057,10 @@ mod tests { ("b2", &vec![4, 5, 6]), ("c2", &vec![70, 80, 90]), ); - let on = &[("b1", "b2")]; + let on = vec![( + Column::new_with_schema("b1", &left.schema()).unwrap(), + Column::new_with_schema("b2", &right.schema()).unwrap(), + )]; let join = join(left, right, on, &JoinType::Full)?; @@ -1904,8 +2140,8 @@ mod tests { &left_data, &right, JoinType::Inner, - &["a".to_string()], - &["a".to_string()], + &[Column::new("a", 0)], + &[Column::new("a", 0)], &random_state, )?; @@ -1914,7 +2150,6 @@ mod tests { left_ids.append_value(1)?; let mut right_ids = UInt32Builder::new(0); - right_ids.append_value(0)?; right_ids.append_value(1)?; diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index a48710bfbfc35..0cf0b9212cd21 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -21,6 +21,8 @@ use crate::error::{DataFusionError, Result}; use arrow::datatypes::{Field, Schema}; use std::collections::HashSet; +use crate::physical_plan::expressions::Column; + /// All valid types of joins. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum JoinType { @@ -39,14 +41,25 @@ pub enum JoinType { } /// The on clause of the join, as vector of (left, right) columns. -pub type JoinOn = [(String, String)]; +pub type JoinOn = Vec<(Column, Column)>; +/// Reference for JoinOn. +pub type JoinOnRef<'a> = &'a [(Column, Column)]; /// Checks whether the schemas "left" and "right" and columns "on" represent a valid join. 
/// They are valid whenever their columns' intersection equals the set `on` -pub fn check_join_is_valid(left: &Schema, right: &Schema, on: &JoinOn) -> Result<()> { - let left: HashSet = left.fields().iter().map(|f| f.name().clone()).collect(); - let right: HashSet = - right.fields().iter().map(|f| f.name().clone()).collect(); +pub fn check_join_is_valid(left: &Schema, right: &Schema, on: JoinOnRef) -> Result<()> { + let left: HashSet = left + .fields() + .iter() + .enumerate() + .map(|(idx, f)| Column::new(f.name(), idx)) + .collect(); + let right: HashSet = right + .fields() + .iter() + .enumerate() + .map(|(idx, f)| Column::new(f.name(), idx)) + .collect(); check_join_set_is_valid(&left, &right, on) } @@ -54,14 +67,14 @@ pub fn check_join_is_valid(left: &Schema, right: &Schema, on: &JoinOn) -> Result /// Checks whether the sets left, right and on compose a valid join. /// They are valid whenever their intersection equals the set `on` fn check_join_set_is_valid( - left: &HashSet, - right: &HashSet, - on: &JoinOn, + left: &HashSet, + right: &HashSet, + on: &[(Column, Column)], ) -> Result<()> { - let on_left = &on.iter().map(|on| on.0.to_string()).collect::>(); + let on_left = &on.iter().map(|on| on.0.clone()).collect::>(); let left_missing = on_left.difference(left).collect::>(); - let on_right = &on.iter().map(|on| on.1.to_string()).collect::>(); + let on_right = &on.iter().map(|on| on.1.clone()).collect::>(); let right_missing = on_right.difference(right).collect::>(); if !left_missing.is_empty() | !right_missing.is_empty() { @@ -75,7 +88,7 @@ fn check_join_set_is_valid( let remaining = right .difference(on_right) .cloned() - .collect::>(); + .collect::>(); let collisions = left.intersection(&remaining).collect::>(); @@ -94,7 +107,7 @@ fn check_join_set_is_valid( pub fn build_join_schema( left: &Schema, right: &Schema, - on: &JoinOn, + on: JoinOnRef, join_type: &JoinType, ) -> Schema { let fields: Vec = match join_type { @@ -102,8 +115,8 @@ pub fn build_join_schema( // remove right-side join keys if they have the same names as the left-side let duplicate_keys = &on .iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.to_string()) + .filter(|(l, r)| l.name() == r.name()) + .map(|on| on.1.name()) .collect::>(); let left_fields = left.fields().iter(); @@ -111,7 +124,7 @@ pub fn build_join_schema( let right_fields = right .fields() .iter() - .filter(|f| !duplicate_keys.contains(f.name())); + .filter(|f| !duplicate_keys.contains(f.name().as_str())); // left then right left_fields.chain(right_fields).cloned().collect() @@ -120,14 +133,14 @@ pub fn build_join_schema( // remove left-side join keys if they have the same names as the right-side let duplicate_keys = &on .iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.to_string()) + .filter(|(l, r)| l.name() == r.name()) + .map(|on| on.1.name()) .collect::>(); let left_fields = left .fields() .iter() - .filter(|f| !duplicate_keys.contains(f.name())); + .filter(|f| !duplicate_keys.contains(f.name().as_str())); let right_fields = right.fields().iter(); @@ -141,24 +154,25 @@ pub fn build_join_schema( #[cfg(test)] mod tests { - use super::*; - fn check(left: &[&str], right: &[&str], on: &[(&str, &str)]) -> Result<()> { - let left = left.iter().map(|x| x.to_string()).collect::>(); - let right = right.iter().map(|x| x.to_string()).collect::>(); - let on: Vec<_> = on + fn check(left: &[Column], right: &[Column], on: &[(Column, Column)]) -> Result<()> { + let left = left + .iter() + .map(|x| x.to_owned()) + .collect::>(); + let right = right .iter() - 
.map(|(l, r)| (l.to_string(), r.to_string())) - .collect(); - check_join_set_is_valid(&left, &right, &on) + .map(|x| x.to_owned()) + .collect::>(); + check_join_set_is_valid(&left, &right, on) } #[test] fn check_valid() -> Result<()> { - let left = vec!["a", "b1"]; - let right = vec!["a", "b2"]; - let on = &[("a", "a")]; + let left = vec![Column::new("a", 0), Column::new("b1", 1)]; + let right = vec![Column::new("a", 0), Column::new("b2", 1)]; + let on = &[(Column::new("a", 0), Column::new("a", 0))]; check(&left, &right, on)?; Ok(()) @@ -166,18 +180,18 @@ mod tests { #[test] fn check_not_in_right() { - let left = vec!["a", "b"]; - let right = vec!["b"]; - let on = &[("a", "a")]; + let left = vec![Column::new("a", 0), Column::new("b", 1)]; + let right = vec![Column::new("b", 0)]; + let on = &[(Column::new("a", 0), Column::new("a", 0))]; assert!(check(&left, &right, on).is_err()); } #[test] fn check_not_in_left() { - let left = vec!["b"]; - let right = vec!["a"]; - let on = &[("a", "a")]; + let left = vec![Column::new("b", 0)]; + let right = vec![Column::new("a", 0)]; + let on = &[(Column::new("a", 0), Column::new("a", 0))]; assert!(check(&left, &right, on).is_err()); } @@ -185,18 +199,18 @@ mod tests { #[test] fn check_collision() { // column "a" would appear both in left and right - let left = vec!["a", "c"]; - let right = vec!["a", "b"]; - let on = &[("a", "b")]; + let left = vec![Column::new("a", 0), Column::new("c", 1)]; + let right = vec![Column::new("a", 0), Column::new("b", 1)]; + let on = &[(Column::new("a", 0), Column::new("b", 1))]; assert!(check(&left, &right, on).is_err()); } #[test] fn check_in_right() { - let left = vec!["a", "c"]; - let right = vec!["b"]; - let on = &[("a", "b")]; + let left = vec![Column::new("a", 0), Column::new("c", 1)]; + let right = vec![Column::new("b", 0)]; + let on = &[(Column::new("a", 0), Column::new("b", 0))]; assert!(check(&left, &right, on).is_ok()); } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 50c30a57b5fea..7b26d7b3ab6e8 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -211,9 +211,9 @@ pub trait ExecutionPlan: Debug + Send + Sync { /// let displayable_plan = displayable(physical_plan.as_ref()); /// let plan_string = format!("{}", displayable_plan.indent()); /// -/// assert_eq!("ProjectionExec: expr=[a]\ +/// assert_eq!("ProjectionExec: expr=[a@0 as a]\ /// \n CoalesceBatchesExec: target_batch_size=4096\ -/// \n FilterExec: a < 5\ +/// \n FilterExec: a@0 < 5\ /// \n RepartitionExec: partitioning=RoundRobinBatch(3)\ /// \n CsvExec: source=Path(tests/example.csv: [tests/example.csv]), has_header=true", /// plan_string.trim()); diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 2bea94aee1e5b..3d20a9bf98c19 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -25,7 +25,7 @@ use std::{any::Any, convert::TryInto}; use crate::{ error::{DataFusionError, Result}, - logical_plan::Expr, + logical_plan::{Column, Expr}, physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, physical_plan::{ common, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, @@ -497,7 +497,7 @@ macro_rules! get_statistic { // Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate macro_rules! 
get_min_max_values { ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ - let (column_index, field) = if let Some((v, f)) = $self.parquet_schema.column_with_name($column) { + let (column_index, field) = if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) { (v, f) } else { // Named column was not present @@ -532,11 +532,11 @@ macro_rules! get_min_max_values { } impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { - fn min_values(&self, column: &str) -> Option { + fn min_values(&self, column: &Column) -> Option { get_min_max_values!(self, column, min, min_bytes) } - fn max_values(&self, column: &str) -> Option { + fn max_values(&self, column: &Column) -> Option { get_min_max_values!(self, column, max, max_bytes) } @@ -593,7 +593,6 @@ fn read_files( loop { match batch_reader.next() { Some(Ok(batch)) => { - //println!("ParquetExec got new batch from {}", filename); total_rows += batch.num_rows(); send_result(&response_tx, Ok(batch))?; if limit.map(|l| total_rows >= l).unwrap_or(false) { diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index af0e60f2194ca..a4c20a7f60ebc 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -56,6 +56,121 @@ use expressions::col; use log::debug; use std::sync::Arc; +fn create_function_physical_name( + fun: &str, + distinct: bool, + args: &[Expr], + input_schema: &DFSchema, +) -> Result { + let names: Vec = args + .iter() + .map(|e| physical_name(e, input_schema)) + .collect::>()?; + + let distinct_str = match distinct { + true => "DISTINCT ", + false => "", + }; + Ok(format!("{}({}{})", fun, distinct_str, names.join(","))) +} + +fn physical_name(e: &Expr, input_schema: &DFSchema) -> Result { + match e { + Expr::Column(c) => Ok(c.name.clone()), + Expr::Alias(_, name) => Ok(name.clone()), + Expr::ScalarVariable(variable_names) => Ok(variable_names.join(".")), + Expr::Literal(value) => Ok(format!("{:?}", value)), + Expr::BinaryExpr { left, op, right } => { + let left = physical_name(left, input_schema)?; + let right = physical_name(right, input_schema)?; + Ok(format!("{} {:?} {}", left, op, right)) + } + Expr::Case { + expr, + when_then_expr, + else_expr, + } => { + let mut name = "CASE ".to_string(); + if let Some(e) = expr { + name += &format!("{:?} ", e); + } + for (w, t) in when_then_expr { + name += &format!("WHEN {:?} THEN {:?} ", w, t); + } + if let Some(e) = else_expr { + name += &format!("ELSE {:?} ", e); + } + name += "END"; + Ok(name) + } + Expr::Cast { expr, data_type } => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("CAST({} AS {:?})", expr, data_type)) + } + Expr::TryCast { expr, data_type } => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("TRY_CAST({} AS {:?})", expr, data_type)) + } + Expr::Not(expr) => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("NOT {}", expr)) + } + Expr::Negative(expr) => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("(- {})", expr)) + } + Expr::IsNull(expr) => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("{} IS NULL", expr)) + } + Expr::IsNotNull(expr) => { + let expr = physical_name(expr, input_schema)?; + Ok(format!("{} IS NOT NULL", expr)) + } + Expr::ScalarFunction { fun, args, .. } => { + create_function_physical_name(&fun.to_string(), false, args, input_schema) + } + Expr::ScalarUDF { fun, args, .. 
} => { + create_function_physical_name(&fun.name, false, args, input_schema) + } + Expr::WindowFunction { fun, args, .. } => { + create_function_physical_name(&fun.to_string(), false, args, input_schema) + } + Expr::AggregateFunction { + fun, + distinct, + args, + .. + } => { + create_function_physical_name(&fun.to_string(), *distinct, args, input_schema) + } + Expr::AggregateUDF { fun, args } => { + let mut names = Vec::with_capacity(args.len()); + for e in args { + names.push(physical_name(e, input_schema)?); + } + Ok(format!("{}({})", fun.name, names.join(","))) + } + Expr::InList { + expr, + list, + negated, + } => { + let expr = physical_name(expr, input_schema)?; + let list = list.iter().map(|expr| physical_name(expr, input_schema)); + if *negated { + Ok(format!("{} NOT IN ({:?})", expr, list)) + } else { + Ok(format!("{} IN ({:?})", expr, list)) + } + } + other => Err(DataFusionError::NotImplemented(format!( + "Cannot derive physical field name for logical expression {:?}", + other + ))), + } +} + /// This trait exposes the ability to plan an [`ExecutionPlan`] out of a [`LogicalPlan`]. pub trait ExtensionPlanner { /// Create a physical plan for a [`UserDefinedLogicalNode`]. @@ -150,10 +265,8 @@ impl DefaultPhysicalPlanner { } let input_exec = self.create_initial_plan(input, ctx_state)?; - let input_schema = input_exec.schema(); - + let physical_input_schema = input_exec.schema(); let logical_input_schema = input.as_ref().schema(); - let physical_input_schema = input_exec.as_ref().schema(); let window_expr = window_expr .iter() @@ -170,7 +283,7 @@ impl DefaultPhysicalPlanner { Ok(Arc::new(WindowAggExec::try_new( window_expr, input_exec.clone(), - input_schema, + physical_input_schema, )?)) } LogicalPlan::Aggregate { @@ -181,8 +294,7 @@ impl DefaultPhysicalPlanner { } => { // Initially need to perform the aggregate and then merge the partitions let input_exec = self.create_initial_plan(input, ctx_state)?; - let input_schema = input_exec.schema(); - let physical_input_schema = input_exec.as_ref().schema(); + let physical_input_schema = input_exec.schema(); let logical_input_schema = input.as_ref().schema(); let groups = group_expr @@ -191,10 +303,11 @@ impl DefaultPhysicalPlanner { tuple_err(( self.create_physical_expr( e, + logical_input_schema, &physical_input_schema, ctx_state, ), - e.name(logical_input_schema), + physical_name(e, logical_input_schema), )) }) .collect::>>()?; @@ -215,11 +328,13 @@ impl DefaultPhysicalPlanner { groups.clone(), aggregates.clone(), input_exec, - input_schema.clone(), + physical_input_schema.clone(), )?); - let final_group: Vec> = - (0..groups.len()).map(|i| col(&groups[i].1)).collect(); + // update group column indices based on partial aggregate plan evaluation + let final_group: Vec> = (0..groups.len()) + .map(|i| col(&groups[i].1, &initial_aggr.schema())) + .collect::>()?; // TODO: dictionary type not yet supported in Hash Repartition let contains_dict = groups @@ -261,31 +376,74 @@ impl DefaultPhysicalPlanner { .collect(), aggregates, initial_aggr, - input_schema, + physical_input_schema.clone(), )?)) } LogicalPlan::Projection { input, expr, .. } => { let input_exec = self.create_initial_plan(input, ctx_state)?; let input_schema = input.as_ref().schema(); - let runtime_expr = expr + + let physical_exprs = expr .iter() .map(|e| { + // For projections, SQL planner and logical plan builder may convert user + // provided expressions into logical Column expressions if their results + // are already provided from the input plans. 
Because we work with
+ // qualified columns in the logical plane, derived columns involving operators or
+ // functions will contain qualifiers as well. This will result in logical
+ // columns with names like `SUM(t1.c1)`, `t1.c1 + t1.c2`, etc.
+ //
+ // If we run these logical columns through the physical_name function, we will
+ // get physical names with column qualifiers, which violates DataFusion's
+ // field name semantics. To account for this, we need to derive the
+ // physical name from the physical input instead.
+ //
+ // This depends on the invariant that the logical schema field index MUST match
+ // the physical schema field index.
+ let physical_name = if let Expr::Column(col) = e {
+ match input_schema.index_of_column(col) {
+ Ok(idx) => {
+ // index physical field using logical field index
+ Ok(input_exec.schema().field(idx).name().to_string())
+ }
+ // logical column is not a derived column, safe to pass along to
+ // physical_name
+ Err(_) => physical_name(e, input_schema),
+ }
+ } else {
+ physical_name(e, input_schema)
+ };
+
 tuple_err((
- self.create_physical_expr(e, &input_exec.schema(), ctx_state),
- e.name(input_schema),
+ self.create_physical_expr(
+ e,
+ input_schema,
+ &input_exec.schema(),
+ ctx_state,
+ ),
+ physical_name,
 ))
 })
 .collect::>>()?;
- Ok(Arc::new(ProjectionExec::try_new(runtime_expr, input_exec)?))
+
+ Ok(Arc::new(ProjectionExec::try_new(
+ physical_exprs,
+ input_exec,
+ )?))
 }
 LogicalPlan::Filter { input, predicate, .. } => {
- let input = self.create_initial_plan(input, ctx_state)?;
- let input_schema = input.as_ref().schema();
- let runtime_expr =
- self.create_physical_expr(predicate, &input_schema, ctx_state)?;
- Ok(Arc::new(FilterExec::try_new(runtime_expr, input)?))
+ let physical_input = self.create_initial_plan(input, ctx_state)?;
+ let input_schema = physical_input.as_ref().schema();
+ let input_dfschema = input.as_ref().schema();
+ let runtime_expr = self.create_physical_expr(
+ predicate,
+ input_dfschema,
+ &input_schema,
+ ctx_state,
+ )?;
+ Ok(Arc::new(FilterExec::try_new(runtime_expr, physical_input)?))
 }
 LogicalPlan::Union { inputs, .. } => {
 let physical_plans = inputs
@@ -298,8 +456,9 @@ impl DefaultPhysicalPlanner {
 input,
 partitioning_scheme,
 } => {
- let input = self.create_initial_plan(input, ctx_state)?;
- let input_schema = input.schema();
+ let physical_input = self.create_initial_plan(input, ctx_state)?;
+ let input_schema = physical_input.schema();
+ let input_dfschema = input.as_ref().schema();
 let physical_partitioning = match partitioning_scheme {
 LogicalPartitioning::RoundRobinBatch(n) => {
 Partitioning::RoundRobinBatch(*n)
@@ -308,20 +467,26 @@ impl DefaultPhysicalPlanner {
 let runtime_expr = expr
 .iter()
 .map(|e| {
- self.create_physical_expr(e, &input_schema, ctx_state)
+ self.create_physical_expr(
+ e,
+ input_dfschema,
+ &input_schema,
+ ctx_state,
+ )
 })
 .collect::>>()?;
 Partitioning::Hash(runtime_expr, *n)
 }
 };
 Ok(Arc::new(RepartitionExec::try_new(
- input,
+ physical_input,
 physical_partitioning,
 )?))
 }
 LogicalPlan::Sort { expr, input, .. 
} => { - let input = self.create_initial_plan(input, ctx_state)?; - let input_schema = input.as_ref().schema(); + let physical_input = self.create_initial_plan(input, ctx_state)?; + let input_schema = physical_input.as_ref().schema(); + let input_dfschema = input.as_ref().schema(); let sort_expr = expr .iter() @@ -332,6 +497,7 @@ impl DefaultPhysicalPlanner { nulls_first, } => self.create_physical_sort_expr( expr, + input_dfschema, &input_schema, SortOptions { descending: !*asc, @@ -345,7 +511,7 @@ impl DefaultPhysicalPlanner { }) .collect::>>()?; - Ok(Arc::new(SortExec::try_new(sort_expr, input)?)) + Ok(Arc::new(SortExec::try_new(sort_expr, physical_input)?)) } LogicalPlan::Join { left, @@ -354,8 +520,10 @@ impl DefaultPhysicalPlanner { join_type, .. } => { - let left = self.create_initial_plan(left, ctx_state)?; - let right = self.create_initial_plan(right, ctx_state)?; + let left_df_schema = left.schema(); + let physical_left = self.create_initial_plan(left, ctx_state)?; + let right_df_schema = right.schema(); + let physical_right = self.create_initial_plan(right, ctx_state)?; let physical_join_type = match join_type { JoinType::Inner => hash_utils::JoinType::Inner, JoinType::Left => hash_utils::JoinType::Left, @@ -364,30 +532,47 @@ impl DefaultPhysicalPlanner { JoinType::Semi => hash_utils::JoinType::Semi, JoinType::Anti => hash_utils::JoinType::Anti, }; + let join_on = keys + .iter() + .map(|(l, r)| { + Ok(( + Column::new(&l.name, left_df_schema.index_of_column(l)?), + Column::new(&r.name, right_df_schema.index_of_column(r)?), + )) + }) + .collect::>()?; + if ctx_state.config.concurrency > 1 && ctx_state.config.repartition_joins { - let left_expr = keys.iter().map(|x| col(&x.0)).collect(); - let right_expr = keys.iter().map(|x| col(&x.1)).collect(); + let (left_expr, right_expr) = join_on + .iter() + .map(|(l, r)| { + ( + Arc::new(l.clone()) as Arc, + Arc::new(r.clone()) as Arc, + ) + }) + .unzip(); // Use hash partition by default to parallelize hash joins Ok(Arc::new(HashJoinExec::try_new( Arc::new(RepartitionExec::try_new( - left, + physical_left, Partitioning::Hash(left_expr, ctx_state.config.concurrency), )?), Arc::new(RepartitionExec::try_new( - right, + physical_right, Partitioning::Hash(right_expr, ctx_state.config.concurrency), )?), - keys, + join_on, &physical_join_type, PartitionMode::Partitioned, )?)) } else { Ok(Arc::new(HashJoinExec::try_new( - left, - right, - keys, + physical_left, + physical_right, + join_on, &physical_join_type, PartitionMode::CollectLeft, )?)) @@ -476,10 +661,10 @@ impl DefaultPhysicalPlanner { "No installed planner was able to convert the custom node to an execution plan: {:?}", node )))?; - // Ensure the ExecutionPlan's schema matches the + // Ensure the ExecutionPlan's schema matches the // declared logical schema to catch and warn about // logic errors when creating user defined plans. - if plan.schema() != node.schema().as_ref().to_owned().into() { + if !node.schema().matches_arrow_schema(&plan.schema()) { Err(DataFusionError::Plan(format!( "Extension planner for {:?} created an ExecutionPlan with mismatched schema. \ LogicalPlan schema: {:?}, ExecutionPlan schema: {:?}", @@ -496,17 +681,20 @@ impl DefaultPhysicalPlanner { pub fn create_physical_expr( &self, e: &Expr, + input_dfschema: &DFSchema, input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { match e { - Expr::Alias(expr, ..) => { - Ok(self.create_physical_expr(expr, input_schema, ctx_state)?) 
- } - Expr::Column(name) => { - // check that name exists - input_schema.field_with_name(name)?; - Ok(Arc::new(Column::new(name))) + Expr::Alias(expr, ..) => Ok(self.create_physical_expr( + expr, + input_dfschema, + input_schema, + ctx_state, + )?), + Expr::Column(c) => { + let idx = input_dfschema.index_of_column(c)?; + Ok(Arc::new(Column::new(&c.name, idx))) } Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), Expr::ScalarVariable(variable_names) => { @@ -535,8 +723,18 @@ impl DefaultPhysicalPlanner { } } Expr::BinaryExpr { left, op, right } => { - let lhs = self.create_physical_expr(left, input_schema, ctx_state)?; - let rhs = self.create_physical_expr(right, input_schema, ctx_state)?; + let lhs = self.create_physical_expr( + left, + input_dfschema, + input_schema, + ctx_state, + )?; + let rhs = self.create_physical_expr( + right, + input_dfschema, + input_schema, + ctx_state, + )?; binary(lhs, *op, rhs, input_schema) } Expr::Case { @@ -548,6 +746,7 @@ impl DefaultPhysicalPlanner { let expr: Option> = if let Some(e) = expr { Some(self.create_physical_expr( e.as_ref(), + input_dfschema, input_schema, ctx_state, )?) @@ -557,13 +756,23 @@ impl DefaultPhysicalPlanner { let when_expr = when_then_expr .iter() .map(|(w, _)| { - self.create_physical_expr(w.as_ref(), input_schema, ctx_state) + self.create_physical_expr( + w.as_ref(), + input_dfschema, + input_schema, + ctx_state, + ) }) .collect::>>()?; let then_expr = when_then_expr .iter() .map(|(_, t)| { - self.create_physical_expr(t.as_ref(), input_schema, ctx_state) + self.create_physical_expr( + t.as_ref(), + input_dfschema, + input_schema, + ctx_state, + ) }) .collect::>>()?; let when_then_expr: Vec<(Arc, Arc)> = @@ -576,6 +785,7 @@ impl DefaultPhysicalPlanner { { Some(self.create_physical_expr( e.as_ref(), + input_dfschema, input_schema, ctx_state, )?) 
@@ -589,35 +799,43 @@ impl DefaultPhysicalPlanner { )?)) } Expr::Cast { expr, data_type } => expressions::cast( - self.create_physical_expr(expr, input_schema, ctx_state)?, + self.create_physical_expr(expr, input_dfschema, input_schema, ctx_state)?, input_schema, data_type.clone(), ), Expr::TryCast { expr, data_type } => expressions::try_cast( - self.create_physical_expr(expr, input_schema, ctx_state)?, + self.create_physical_expr(expr, input_dfschema, input_schema, ctx_state)?, input_schema, data_type.clone(), ), Expr::Not(expr) => expressions::not( - self.create_physical_expr(expr, input_schema, ctx_state)?, + self.create_physical_expr(expr, input_dfschema, input_schema, ctx_state)?, input_schema, ), Expr::Negative(expr) => expressions::negative( - self.create_physical_expr(expr, input_schema, ctx_state)?, + self.create_physical_expr(expr, input_dfschema, input_schema, ctx_state)?, input_schema, ), Expr::IsNull(expr) => expressions::is_null(self.create_physical_expr( expr, + input_dfschema, input_schema, ctx_state, )?), Expr::IsNotNull(expr) => expressions::is_not_null( - self.create_physical_expr(expr, input_schema, ctx_state)?, + self.create_physical_expr(expr, input_dfschema, input_schema, ctx_state)?, ), Expr::ScalarFunction { fun, args } => { let physical_args = args .iter() - .map(|e| self.create_physical_expr(e, input_schema, ctx_state)) + .map(|e| { + self.create_physical_expr( + e, + input_dfschema, + input_schema, + ctx_state, + ) + }) .collect::>>()?; functions::create_physical_expr( fun, @@ -631,6 +849,7 @@ impl DefaultPhysicalPlanner { for e in args { physical_args.push(self.create_physical_expr( e, + input_dfschema, input_schema, ctx_state, )?); @@ -648,11 +867,24 @@ impl DefaultPhysicalPlanner { low, high, } => { - let value_expr = - self.create_physical_expr(expr, input_schema, ctx_state)?; - let low_expr = self.create_physical_expr(low, input_schema, ctx_state)?; - let high_expr = - self.create_physical_expr(high, input_schema, ctx_state)?; + let value_expr = self.create_physical_expr( + expr, + input_dfschema, + input_schema, + ctx_state, + )?; + let low_expr = self.create_physical_expr( + low, + input_dfschema, + input_schema, + ctx_state, + )?; + let high_expr = self.create_physical_expr( + high, + input_dfschema, + input_schema, + ctx_state, + )?; // rewrite the between into the two binary operators let binary_expr = binary( @@ -677,44 +909,54 @@ impl DefaultPhysicalPlanner { Ok(expressions::lit(ScalarValue::Boolean(None))) } _ => { - let value_expr = - self.create_physical_expr(expr, input_schema, ctx_state)?; + let value_expr = self.create_physical_expr( + expr, + input_dfschema, + input_schema, + ctx_state, + )?; let value_expr_data_type = value_expr.data_type(input_schema)?; - let list_exprs = - list.iter() - .map(|expr| match expr { - Expr::Literal(ScalarValue::Utf8(None)) => self - .create_physical_expr(expr, input_schema, ctx_state), - _ => { - let list_expr = self.create_physical_expr( - expr, + let list_exprs = list + .iter() + .map(|expr| match expr { + Expr::Literal(ScalarValue::Utf8(None)) => self + .create_physical_expr( + expr, + input_dfschema, + input_schema, + ctx_state, + ), + _ => { + let list_expr = self.create_physical_expr( + expr, + input_dfschema, + input_schema, + ctx_state, + )?; + let list_expr_data_type = + list_expr.data_type(input_schema)?; + + if list_expr_data_type == value_expr_data_type { + Ok(list_expr) + } else if can_cast_types( + &list_expr_data_type, + &value_expr_data_type, + ) { + expressions::cast( + list_expr, input_schema, 
- ctx_state, - )?; - let list_expr_data_type = - list_expr.data_type(input_schema)?; - - if list_expr_data_type == value_expr_data_type { - Ok(list_expr) - } else if can_cast_types( - &list_expr_data_type, - &value_expr_data_type, - ) { - expressions::cast( - list_expr, - input_schema, - value_expr.data_type(input_schema)?, - ) - } else { - Err(DataFusionError::Plan(format!( - "Unsupported CAST from {:?} to {:?}", - list_expr_data_type, value_expr_data_type - ))) - } + value_expr.data_type(input_schema)?, + ) + } else { + Err(DataFusionError::Plan(format!( + "Unsupported CAST from {:?} to {:?}", + list_expr_data_type, value_expr_data_type + ))) } - }) - .collect::>>()?; + } + }) + .collect::>>()?; expressions::in_list(value_expr, list_exprs, negated) } @@ -731,6 +973,7 @@ impl DefaultPhysicalPlanner { &self, e: &Expr, name: String, + logical_input_schema: &DFSchema, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { @@ -745,13 +988,23 @@ impl DefaultPhysicalPlanner { let args = args .iter() .map(|e| { - self.create_physical_expr(e, physical_input_schema, ctx_state) + self.create_physical_expr( + e, + logical_input_schema, + physical_input_schema, + ctx_state, + ) }) .collect::>>()?; let partition_by = partition_by .iter() .map(|e| { - self.create_physical_expr(e, physical_input_schema, ctx_state) + self.create_physical_expr( + e, + logical_input_schema, + physical_input_schema, + ctx_state, + ) }) .collect::>>()?; let order_by = order_by @@ -763,6 +1016,7 @@ impl DefaultPhysicalPlanner { nulls_first, } => self.create_physical_sort_expr( expr, + logical_input_schema, physical_input_schema, SortOptions { descending: !*asc, @@ -809,9 +1063,15 @@ impl DefaultPhysicalPlanner { // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), - _ => (e.name(logical_input_schema)?, e), + _ => (physical_name(e, logical_input_schema)?, e), }; - self.create_window_expr_with_name(e, name, physical_input_schema, ctx_state) + self.create_window_expr_with_name( + e, + name, + logical_input_schema, + physical_input_schema, + ctx_state, + ) } /// Create an aggregate expression with a name from a logical expression @@ -819,6 +1079,7 @@ impl DefaultPhysicalPlanner { &self, e: &Expr, name: String, + logical_input_schema: &DFSchema, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { @@ -832,7 +1093,12 @@ impl DefaultPhysicalPlanner { let args = args .iter() .map(|e| { - self.create_physical_expr(e, physical_input_schema, ctx_state) + self.create_physical_expr( + e, + logical_input_schema, + physical_input_schema, + ctx_state, + ) }) .collect::>>()?; aggregates::create_aggregate_expr( @@ -847,7 +1113,12 @@ impl DefaultPhysicalPlanner { let args = args .iter() .map(|e| { - self.create_physical_expr(e, physical_input_schema, ctx_state) + self.create_physical_expr( + e, + logical_input_schema, + physical_input_schema, + ctx_state, + ) }) .collect::>>()?; @@ -871,21 +1142,34 @@ impl DefaultPhysicalPlanner { // unpack aliased logical expressions, e.g. 
"sum(col) as total" let (name, e) = match e { Expr::Alias(sub_expr, alias) => (alias.clone(), sub_expr.as_ref()), - _ => (e.name(logical_input_schema)?, e), + _ => (physical_name(e, logical_input_schema)?, e), }; - self.create_aggregate_expr_with_name(e, name, physical_input_schema, ctx_state) + + self.create_aggregate_expr_with_name( + e, + name, + logical_input_schema, + physical_input_schema, + ctx_state, + ) } /// Create a physical sort expression from a logical expression pub fn create_physical_sort_expr( &self, e: &Expr, + input_dfschema: &DFSchema, input_schema: &Schema, options: SortOptions, ctx_state: &ExecutionContextState, ) -> Result { Ok(PhysicalSortExpr { - expr: self.create_physical_expr(e, input_schema, ctx_state)?, + expr: self.create_physical_expr( + e, + input_dfschema, + input_schema, + ctx_state, + )?, options, }) } @@ -913,6 +1197,7 @@ mod tests { use arrow::datatypes::{DataType, Field, SchemaRef}; use async_trait::async_trait; use fmt::Debug; + use std::convert::TryFrom; use std::{any::Any, fmt}; fn make_ctx_state() -> ExecutionContextState { @@ -945,7 +1230,7 @@ mod tests { // verify that the plan correctly casts u8 to i64 // the cast here is implicit so has CastOptions with safe=true - let expected = "BinaryExpr { left: Column { name: \"c7\" }, op: Lt, right: TryCastExpr { expr: Literal { value: UInt8(5) }, cast_type: Int64 } }"; + let expected = "BinaryExpr { left: Column { name: \"c7\", index: 6 }, op: Lt, right: TryCastExpr { expr: Literal { value: UInt8(5) }, cast_type: Int64 } }"; assert!(format!("{:?}", plan).contains(expected)); Ok(()) @@ -954,12 +1239,17 @@ mod tests { #[test] fn test_create_not() -> Result<()> { let schema = Schema::new(vec![Field::new("a", DataType::Boolean, true)]); + let dfschema = DFSchema::try_from(schema.clone())?; let planner = DefaultPhysicalPlanner::default(); - let expr = - planner.create_physical_expr(&col("a").not(), &schema, &make_ctx_state())?; - let expected = expressions::not(expressions::col("a"), &schema)?; + let expr = planner.create_physical_expr( + &col("a").not(), + &dfschema, + &schema, + &make_ctx_state(), + )?; + let expected = expressions::not(expressions::col("a", &schema)?, &schema)?; assert_eq!(format!("{:?}", expr), format!("{:?}", expected)); @@ -980,7 +1270,7 @@ mod tests { // c12 is f64, c7 is u8 -> cast c7 to f64 // the cast here is implicit so has CastOptions with safe=true - let expected = "predicate: BinaryExpr { left: TryCastExpr { expr: Column { name: \"c7\" }, cast_type: Float64 }, op: Lt, right: Column { name: \"c12\" } }"; + let expected = "predicate: BinaryExpr { left: TryCastExpr { expr: Column { name: \"c7\", index: 6 }, cast_type: Float64 }, op: Lt, right: Column { name: \"c12\", index: 11 } }"; assert!(format!("{:?}", plan).contains(expected)); Ok(()) } @@ -1105,8 +1395,7 @@ mod tests { .build()?; let execution_plan = plan(&logical_plan)?; // verify that the plan correctly adds cast from Int64(1) to Utf8 - let expected = "InListExpr { expr: Column { name: \"c1\" }, list: [Literal { value: Utf8(\"a\") }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false }"; - println!("{:?}", execution_plan); + let expected = "InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false }"; assert!(format!("{:?}", execution_plan).contains(expected)); // expression: "a in (true, 'a')" 
diff --git a/datafusion/src/physical_plan/projection.rs b/datafusion/src/physical_plan/projection.rs index d4c0459c211be..5110e5b5a8793 100644 --- a/datafusion/src/physical_plan/projection.rs +++ b/datafusion/src/physical_plan/projection.rs @@ -233,8 +233,10 @@ mod tests { )?; // pick column c1 and name it column c1 in the output schema - let projection = - ProjectionExec::try_new(vec![(col("c1"), "c1".to_string())], Arc::new(csv))?; + let projection = ProjectionExec::try_new( + vec![(col("c1", &schema)?, "c1".to_string())], + Arc::new(csv), + )?; let mut partition_count = 0; let mut row_count = 0; diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index a7b17c4161b0a..e67e4c2d44779 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -435,7 +435,7 @@ mod tests { use super::*; use crate::{ assert_batches_sorted_eq, - physical_plan::memory::MemoryExec, + physical_plan::{expressions::col, memory::MemoryExec}, test::exec::{BarrierExec, ErrorExec, MockExec}, }; use arrow::datatypes::{DataType, Field, Schema}; @@ -513,12 +513,7 @@ mod tests { let output_partitions = repartition( &schema, partitions, - Partitioning::Hash( - vec![Arc::new(crate::physical_plan::expressions::Column::new( - "c0", - ))], - 8, - ), + Partitioning::Hash(vec![col("c0", &schema)?], 8), ) .await?; @@ -761,6 +756,7 @@ mod tests { partitioning: Partitioning::Hash( vec![Arc::new(crate::physical_plan::expressions::Column::new( "my_awesome_field", + 0, ))], 2, ), diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 437519a7d2a29..365097822cc7e 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -343,17 +343,17 @@ mod tests { vec![ // c1 string column PhysicalSortExpr { - expr: col("c1"), + expr: col("c1", &schema)?, options: SortOptions::default(), }, // c2 uin32 column PhysicalSortExpr { - expr: col("c2"), + expr: col("c2", &schema)?, options: SortOptions::default(), }, // c7 uin8 column PhysicalSortExpr { - expr: col("c7"), + expr: col("c7", &schema)?, options: SortOptions::default(), }, ], @@ -417,14 +417,14 @@ mod tests { let sort_exec = Arc::new(SortExec::try_new( vec![ PhysicalSortExpr { - expr: col("a"), + expr: col("a", &schema)?, options: SortOptions { descending: true, nulls_first: true, }, }, PhysicalSortExpr { - expr: col("b"), + expr: col("b", &schema)?, options: SortOptions { descending: false, nulls_first: false, diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs index c39acc474d315..b8ca97cc5974f 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -579,21 +579,18 @@ mod tests { let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); let schema = b1.schema(); + let sort = vec![ + PhysicalSortExpr { + expr: col("b", &schema).unwrap(), + options: Default::default(), + }, + PhysicalSortExpr { + expr: col("c", &schema).unwrap(), + options: Default::default(), + }, + ]; let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); - let merge = Arc::new(SortPreservingMergeExec::new( - vec![ - PhysicalSortExpr { - expr: col("b"), - options: Default::default(), - }, - PhysicalSortExpr { - expr: col("c"), - options: Default::default(), - }, - ], - Arc::new(exec), - 1024, - )); + let merge = Arc::new(SortPreservingMergeExec::new(sort, 
Arc::new(exec), 1024)); let collected = collect(merge).await.unwrap(); assert_eq!(collected.len(), 1); @@ -668,18 +665,18 @@ mod tests { let sort = vec![ PhysicalSortExpr { - expr: col("c1"), + expr: col("c1", &schema).unwrap(), options: SortOptions { descending: true, nulls_first: true, }, }, PhysicalSortExpr { - expr: col("c2"), + expr: col("c2", &schema).unwrap(), options: Default::default(), }, PhysicalSortExpr { - expr: col("c7"), + expr: col("c7", &schema).unwrap(), options: SortOptions::default(), }, ]; @@ -744,25 +741,26 @@ mod tests { #[tokio::test] async fn test_partition_sort_streaming_input() { + let schema = test::aggr_test_schema(); let sort = vec![ // uint8 PhysicalSortExpr { - expr: col("c7"), + expr: col("c7", &schema).unwrap(), options: Default::default(), }, // int16 PhysicalSortExpr { - expr: col("c4"), + expr: col("c4", &schema).unwrap(), options: Default::default(), }, // utf-8 PhysicalSortExpr { - expr: col("c1"), + expr: col("c1", &schema).unwrap(), options: SortOptions::default(), }, // utf-8 PhysicalSortExpr { - expr: col("c13"), + expr: col("c13", &schema).unwrap(), options: SortOptions::default(), }, ]; @@ -782,15 +780,17 @@ mod tests { #[tokio::test] async fn test_partition_sort_streaming_input_output() { + let schema = test::aggr_test_schema(); + let sort = vec![ // float64 PhysicalSortExpr { - expr: col("c12"), + expr: col("c12", &schema).unwrap(), options: Default::default(), }, // utf-8 PhysicalSortExpr { - expr: col("c13"), + expr: col("c13", &schema).unwrap(), options: Default::default(), }, ]; @@ -850,27 +850,24 @@ mod tests { let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); let schema = b1.schema(); - let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); - let merge = Arc::new(SortPreservingMergeExec::new( - vec![ - PhysicalSortExpr { - expr: col("b"), - options: SortOptions { - descending: false, - nulls_first: true, - }, + let sort = vec![ + PhysicalSortExpr { + expr: col("b", &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: true, }, - PhysicalSortExpr { - expr: col("c"), - options: SortOptions { - descending: false, - nulls_first: false, - }, + }, + PhysicalSortExpr { + expr: col("c", &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, }, - ], - Arc::new(exec), - 1024, - )); + }, + ]; + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let merge = Arc::new(SortPreservingMergeExec::new(sort, Arc::new(exec), 1024)); let collected = collect(merge).await.unwrap(); assert_eq!(collected.len(), 1); @@ -898,8 +895,9 @@ mod tests { #[tokio::test] async fn test_async() { + let schema = test::aggr_test_schema(); let sort = vec![PhysicalSortExpr { - expr: col("c7"), + expr: col("c7", &schema).unwrap(), options: SortOptions::default(), }]; diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index fe87ecda872cb..ffd8f20064f78 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -267,7 +267,9 @@ mod tests { let expressions = |t: Vec, schema| -> Result> { t.iter() .enumerate() - .map(|(i, t)| try_cast(col(&format!("c{}", i)), &schema, t.clone())) + .map(|(i, t)| { + try_cast(col(&format!("c{}", i), &schema)?, &schema, t.clone()) + }) .collect::>>() }; diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 466cc51b447d0..a214ef17a9f83 100644 --- 
a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -369,7 +369,7 @@ impl WindowAggExec { input: Arc, input_schema: SchemaRef, ) -> Result { - let schema = create_schema(&input.schema(), &window_expr)?; + let schema = create_schema(&input_schema, &window_expr)?; let schema = Arc::new(schema); Ok(WindowAggExec { input, @@ -599,7 +599,7 @@ mod tests { vec![create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Count), "count".to_owned(), - &[col("c3")], + &[col("c3", &schema)?], &[], &[], Some(WindowFrame::default()), @@ -632,7 +632,7 @@ mod tests { create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Count), "count".to_owned(), - &[col("c3")], + &[col("c3", &schema)?], &[], &[], Some(WindowFrame::default()), @@ -641,7 +641,7 @@ mod tests { create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Max), "max".to_owned(), - &[col("c3")], + &[col("c3", &schema)?], &[], &[], Some(WindowFrame::default()), @@ -650,7 +650,7 @@ mod tests { create_window_expr( &WindowFunction::AggregateFunction(AggregateFunction::Min), "min".to_owned(), - &[col("c3")], + &[col("c3", &schema)?], &[], &[], Some(WindowFrame::default()), diff --git a/datafusion/src/prelude.rs b/datafusion/src/prelude.rs index e1f1d7b76047f..e7ad04e74d1a0 100644 --- a/datafusion/src/prelude.rs +++ b/datafusion/src/prelude.rs @@ -32,6 +32,6 @@ pub use crate::logical_plan::{ count, create_udf, in_list, initcap, left, length, lit, lower, lpad, ltrim, max, md5, min, now, octet_length, random, regexp_replace, repeat, replace, reverse, right, rpad, rtrim, sha224, sha256, sha384, sha512, split_part, starts_with, strpos, substr, - sum, to_hex, translate, trim, upper, JoinType, Partitioning, + sum, to_hex, translate, trim, upper, Column, JoinType, Partitioning, }; pub use crate::physical_plan::csv::CsvReadOptions; diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 547e9afd38d91..7912241329a34 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -17,13 +17,18 @@ //! SQL Query Planner (produces logical plan from SQL AST) +use std::collections::HashSet; +use std::str::FromStr; +use std::sync::Arc; +use std::{convert::TryInto, vec}; + use crate::catalog::TableReference; use crate::datasource::TableProvider; use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::logical_plan::Expr::Alias; use crate::logical_plan::{ - and, lit, DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, - StringifiedPlan, ToDFSchema, + and, lit, union_with_alias, Column, DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, + Operator, PlanType, StringifiedPlan, ToDFSchema, }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; @@ -47,9 +52,6 @@ use sqlparser::ast::{ use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{OrderByExpr, Statement}; use sqlparser::parser::ParserError::ParserError; -use std::str::FromStr; -use std::sync::Arc; -use std::{convert::TryInto, vec}; use super::{ parser::DFParser, @@ -163,29 +165,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { (SetOperator::Union, true) => { let left_plan = self.set_expr_to_plan(left.as_ref(), None, ctes)?; let right_plan = self.set_expr_to_plan(right.as_ref(), None, ctes)?; - let inputs = vec![left_plan, right_plan] - .into_iter() - .flat_map(|p| match p { - LogicalPlan::Union { inputs, .. 
} => inputs, - x => vec![x], - }) - .collect::>(); - if inputs.is_empty() { - return Err(DataFusionError::Plan(format!( - "Empty UNION: {}", - set_expr - ))); - } - if !inputs.iter().all(|s| s.schema() == inputs[0].schema()) { - return Err(DataFusionError::Plan( - "UNION ALL schemas are expected to be the same".to_string(), - )); - } - Ok(LogicalPlan::Union { - schema: inputs[0].schema().clone(), - inputs, - alias, - }) + union_with_alias(left_plan, right_plan, alias) } _ => Err(DataFusionError::NotImplemented(format!( "Only UNION ALL is supported, found {}", @@ -382,7 +362,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) -> Result { match constraint { JoinConstraint::On(sql_expr) => { - let mut keys: Vec<(String, String)> = vec![]; + let mut keys: Vec<(Column, Column)> = vec![]; let join_schema = left.schema().join(right.schema())?; // parse ON expression @@ -390,20 +370,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // extract join keys extract_join_keys(&expr, &mut keys)?; - let left_keys: Vec<&str> = - keys.iter().map(|pair| pair.0.as_str()).collect(); - let right_keys: Vec<&str> = - keys.iter().map(|pair| pair.1.as_str()).collect(); + let (left_keys, right_keys): (Vec, Vec) = + keys.into_iter().unzip(); // return the logical plan representing the join LogicalPlanBuilder::from(left) - .join(right, join_type, &left_keys, &right_keys)? + .join(right, join_type, left_keys, right_keys)? .build() } JoinConstraint::Using(idents) => { - let keys: Vec<&str> = idents.iter().map(|x| x.value.as_str()).collect(); + let keys: Vec = idents + .iter() + .map(|x| Column::from_name(x.value.clone())) + .collect(); LogicalPlanBuilder::from(left) - .join(right, join_type, &keys, &keys)? + .join_using(right, join_type, keys)? .build() } JoinConstraint::Natural => { @@ -489,37 +470,38 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut possible_join_keys = vec![]; extract_possible_join_keys(&filter_expr, &mut possible_join_keys)?; - let mut all_join_keys = vec![]; + let mut all_join_keys = HashSet::new(); let mut left = plans[0].clone(); for right in plans.iter().skip(1) { let left_schema = left.schema(); let right_schema = right.schema(); let mut join_keys = vec![]; for (l, r) in &possible_join_keys { - if left_schema.field_with_unqualified_name(l).is_ok() - && right_schema.field_with_unqualified_name(r).is_ok() + if left_schema.field_from_qualified_column(l).is_ok() + && right_schema.field_from_qualified_column(r).is_ok() { - join_keys.push((l.as_str(), r.as_str())); - } else if left_schema.field_with_unqualified_name(r).is_ok() - && right_schema.field_with_unqualified_name(l).is_ok() + join_keys.push((l.clone(), r.clone())); + } else if left_schema.field_from_qualified_column(r).is_ok() + && right_schema.field_from_qualified_column(l).is_ok() { - join_keys.push((r.as_str(), l.as_str())); + join_keys.push((r.clone(), l.clone())); } } if join_keys.is_empty() { left = LogicalPlanBuilder::from(&left).cross_join(right)?.build()?; } else { - let left_keys: Vec<_> = - join_keys.iter().map(|(l, _)| *l).collect(); - let right_keys: Vec<_> = - join_keys.iter().map(|(_, r)| *r).collect(); + let left_keys: Vec = + join_keys.iter().map(|(l, _)| l.clone()).collect(); + let right_keys: Vec = + join_keys.iter().map(|(_, r)| r.clone()).collect(); let builder = LogicalPlanBuilder::from(&left); left = builder - .join(right, JoinType::Inner, &left_keys, &right_keys)? + .join(right, JoinType::Inner, left_keys, right_keys)? 
.build()?; } - all_join_keys.extend_from_slice(&join_keys); + + all_join_keys.extend(join_keys); } // remove join expressions from filter @@ -548,12 +530,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // The SELECT expressions, with wildcards expanded. let select_exprs = self.prepare_select_exprs(&plan, &select.projection)?; + // having and group by clause may reference aliases defined in select projection + let projected_plan = self.project(&plan, select_exprs.clone())?; + let mut combined_schema = (**projected_plan.schema()).clone(); + combined_schema.merge(plan.schema()); + // Optionally the HAVING expression. let having_expr_opt = select .having .as_ref() .map::, _>(|having_expr| { - let having_expr = self.sql_expr_to_logical_expr(having_expr)?; + let having_expr = + self.sql_expr_to_logical_expr(having_expr, &combined_schema)?; // This step "dereferences" any aliases in the HAVING clause. // @@ -582,7 +570,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // The outer expressions we will search through for // aggregates. Aggregates may be sourced from the SELECT... let mut aggr_expr_haystack = select_exprs.clone(); - // ... or from the HAVING. if let Some(having_expr) = &having_expr_opt { aggr_expr_haystack.push(having_expr.clone()); @@ -596,7 +583,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .group_by .iter() .map(|e| { - let group_by_expr = self.sql_expr_to_logical_expr(e)?; + let group_by_expr = self.sql_expr_to_logical_expr(e, &combined_schema)?; let group_by_expr = resolve_aliases_to_exprs(&group_by_expr, &alias_map)?; let group_by_expr = resolve_positions_to_exprs(&group_by_expr, &select_exprs)?; @@ -816,16 +803,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let order_by_rex = order_by .iter() - .map(|e| self.order_by_to_sort_expr(e)) + .map(|e| self.order_by_to_sort_expr(e, plan.schema())) .collect::>>()?; LogicalPlanBuilder::from(plan).sort(order_by_rex)?.build() } /// convert sql OrderByExpr to Expr::Sort - fn order_by_to_sort_expr(&self, e: &OrderByExpr) -> Result { + fn order_by_to_sort_expr(&self, e: &OrderByExpr, schema: &DFSchema) -> Result { Ok(Expr::Sort { - expr: Box::new(self.sql_expr_to_logical_expr(&e.expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(&e.expr, schema)?), // by default asc asc: e.asc.unwrap_or(true), // by default nulls first to be consistent with spark @@ -842,11 +829,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { find_column_exprs(exprs) .iter() .try_for_each(|col| match col { - Expr::Column(name) => { - schema.field_with_unqualified_name(name).map_err(|_| { + Expr::Column(col) => { + match &col.relation { + Some(r) => schema.field_with_qualified_name(r, &col.name), + None => schema.field_with_unqualified_name(&col.name), + } + .map_err(|_| { DataFusionError::Plan(format!( "Invalid identifier '{}' for schema {}", - name, + col, schema.to_string() )) })?; @@ -873,19 +864,25 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Generate a relational expression from a SQL expression pub fn sql_to_rex(&self, sql: &SQLExpr, schema: &DFSchema) -> Result { - let expr = self.sql_expr_to_logical_expr(sql)?; + let expr = self.sql_expr_to_logical_expr(sql, schema)?; self.validate_schema_satisfies_exprs(schema, &[expr.clone()])?; Ok(expr) } - fn sql_fn_arg_to_logical_expr(&self, sql: &FunctionArg) -> Result { + fn sql_fn_arg_to_logical_expr( + &self, + sql: &FunctionArg, + schema: &DFSchema, + ) -> Result { match sql { - FunctionArg::Named { name: _, arg } => self.sql_expr_to_logical_expr(arg), - FunctionArg::Unnamed(value) => 
self.sql_expr_to_logical_expr(value), + FunctionArg::Named { name: _, arg } => { + self.sql_expr_to_logical_expr(arg, schema) + } + FunctionArg::Unnamed(value) => self.sql_expr_to_logical_expr(value, schema), } } - fn sql_expr_to_logical_expr(&self, sql: &SQLExpr) -> Result { + fn sql_expr_to_logical_expr(&self, sql: &SQLExpr, schema: &DFSchema) -> Result { match sql { SQLExpr::Value(Value::Number(n, _)) => match n.parse::() { Ok(n) => Ok(lit(n)), @@ -900,7 +897,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun: functions::BuiltinScalarFunction::DatePart, args: vec![ Expr::Literal(ScalarValue::Utf8(Some(format!("{}", field)))), - self.sql_expr_to_logical_expr(expr)?, + self.sql_expr_to_logical_expr(expr, schema)?, ], }), @@ -923,7 +920,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let var_names = vec![id.value.clone()]; Ok(Expr::ScalarVariable(var_names)) } else { - Ok(Expr::Column(id.value.to_string())) + Ok(Expr::Column( + schema + .field_with_unqualified_name(&id.value)? + .qualified_column(), + )) } } @@ -934,6 +935,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } if &var_names[0][0..1] == "@" { Ok(Expr::ScalarVariable(var_names)) + } else if var_names.len() == 2 { + // table.column identifier + let name = var_names.pop().unwrap(); + let relation = Some(var_names.pop().unwrap()); + Ok(Expr::Column(Column { relation, name })) } else { Err(DataFusionError::NotImplemented(format!( "Unsupported compound identifier '{:?}'", @@ -951,20 +957,20 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { else_result, } => { let expr = if let Some(e) = operand { - Some(Box::new(self.sql_expr_to_logical_expr(e)?)) + Some(Box::new(self.sql_expr_to_logical_expr(e, schema)?)) } else { None }; let when_expr = conditions .iter() - .map(|e| self.sql_expr_to_logical_expr(e)) + .map(|e| self.sql_expr_to_logical_expr(e, schema)) .collect::>>()?; let then_expr = results .iter() - .map(|e| self.sql_expr_to_logical_expr(e)) + .map(|e| self.sql_expr_to_logical_expr(e, schema)) .collect::>>()?; let else_expr = if let Some(e) = else_result { - Some(Box::new(self.sql_expr_to_logical_expr(e)?)) + Some(Box::new(self.sql_expr_to_logical_expr(e, schema)?)) } else { None }; @@ -984,7 +990,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref expr, ref data_type, } => Ok(Expr::Cast { - expr: Box::new(self.sql_expr_to_logical_expr(expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr, schema)?), data_type: convert_data_type(data_type)?, }), @@ -992,7 +998,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref expr, ref data_type, } => Ok(Expr::TryCast { - expr: Box::new(self.sql_expr_to_logical_expr(expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr, schema)?), data_type: convert_data_type(data_type)?, }), @@ -1004,19 +1010,19 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { data_type: convert_data_type(data_type)?, }), - SQLExpr::IsNull(ref expr) => { - Ok(Expr::IsNull(Box::new(self.sql_expr_to_logical_expr(expr)?))) - } + SQLExpr::IsNull(ref expr) => Ok(Expr::IsNull(Box::new( + self.sql_expr_to_logical_expr(expr, schema)?, + ))), SQLExpr::IsNotNull(ref expr) => Ok(Expr::IsNotNull(Box::new( - self.sql_expr_to_logical_expr(expr)?, + self.sql_expr_to_logical_expr(expr, schema)?, ))), SQLExpr::UnaryOp { ref op, ref expr } => match op { - UnaryOperator::Not => { - Ok(Expr::Not(Box::new(self.sql_expr_to_logical_expr(expr)?))) - } - UnaryOperator::Plus => Ok(self.sql_expr_to_logical_expr(expr)?), + UnaryOperator::Not => Ok(Expr::Not(Box::new( + self.sql_expr_to_logical_expr(expr, schema)?, + 
))), + UnaryOperator::Plus => Ok(self.sql_expr_to_logical_expr(expr, schema)?), UnaryOperator::Minus => { match expr.as_ref() { // optimization: if it's a number literal, we apply the negative operator @@ -1032,7 +1038,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { })?)), }, // not a literal, apply negative operator on expression - _ => Ok(Expr::Negative(Box::new(self.sql_expr_to_logical_expr(expr)?))), + _ => Ok(Expr::Negative(Box::new(self.sql_expr_to_logical_expr(expr, schema)?))), } } _ => Err(DataFusionError::NotImplemented(format!( @@ -1047,10 +1053,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ref low, ref high, } => Ok(Expr::Between { - expr: Box::new(self.sql_expr_to_logical_expr(expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr, schema)?), negated: *negated, - low: Box::new(self.sql_expr_to_logical_expr(low)?), - high: Box::new(self.sql_expr_to_logical_expr(high)?), + low: Box::new(self.sql_expr_to_logical_expr(low, schema)?), + high: Box::new(self.sql_expr_to_logical_expr(high, schema)?), }), SQLExpr::InList { @@ -1060,11 +1066,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } => { let list_expr = list .iter() - .map(|e| self.sql_expr_to_logical_expr(e)) + .map(|e| self.sql_expr_to_logical_expr(e, schema)) .collect::>>()?; Ok(Expr::InList { - expr: Box::new(self.sql_expr_to_logical_expr(expr)?), + expr: Box::new(self.sql_expr_to_logical_expr(expr, schema)?), list: list_expr, negated: *negated, }) @@ -1098,9 +1104,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }?; Ok(Expr::BinaryExpr { - left: Box::new(self.sql_expr_to_logical_expr(left)?), + left: Box::new(self.sql_expr_to_logical_expr(left, schema)?), op: operator, - right: Box::new(self.sql_expr_to_logical_expr(right)?), + right: Box::new(self.sql_expr_to_logical_expr(right, schema)?), }) } @@ -1121,7 +1127,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // first, scalar built-in if let Ok(fun) = functions::BuiltinScalarFunction::from_str(&name) { - let args = self.function_args_to_expr(function)?; + let args = self.function_args_to_expr(function, schema)?; return Ok(Expr::ScalarFunction { fun, args }); }; @@ -1131,12 +1137,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let partition_by = window .partition_by .iter() - .map(|e| self.sql_expr_to_logical_expr(e)) + .map(|e| self.sql_expr_to_logical_expr(e, schema)) .collect::>>()?; let order_by = window .order_by .iter() - .map(|e| self.order_by_to_sort_expr(e)) + .map(|e| self.order_by_to_sort_expr(e, schema)) .collect::>>()?; let window_frame = window .window_frame @@ -1163,8 +1169,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun: window_functions::WindowFunction::AggregateFunction( aggregate_fun.clone(), ), - args: self - .aggregate_fn_to_expr(&aggregate_fun, function)?, + args: self.aggregate_fn_to_expr( + &aggregate_fun, + function, + schema, + )?, partition_by, order_by, window_frame, @@ -1177,7 +1186,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun: window_functions::WindowFunction::BuiltInWindowFunction( window_fun, ), - args: self.function_args_to_expr(function)?, + args:self.function_args_to_expr(function, schema)?, partition_by, order_by, window_frame, @@ -1188,7 +1197,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // next, aggregate built-ins if let Ok(fun) = aggregates::AggregateFunction::from_str(&name) { - let args = self.aggregate_fn_to_expr(&fun, function)?; + let args = self.aggregate_fn_to_expr(&fun, function, schema)?; return Ok(Expr::AggregateFunction { fun, distinct: function.distinct, @@ 
-1199,13 +1208,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // finally, user-defined functions (UDF) and UDAF match self.schema_provider.get_function_meta(&name) { Some(fm) => { - let args = self.function_args_to_expr(function)?; + let args = self.function_args_to_expr(function, schema)?; Ok(Expr::ScalarUDF { fun: fm, args }) } None => match self.schema_provider.get_aggregate_meta(&name) { Some(fm) => { - let args = self.function_args_to_expr(function)?; + let args = self.function_args_to_expr(function, schema)?; Ok(Expr::AggregateUDF { fun: fm, args }) } _ => Err(DataFusionError::Plan(format!( @@ -1216,7 +1225,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - SQLExpr::Nested(e) => self.sql_expr_to_logical_expr(e), + SQLExpr::Nested(e) => self.sql_expr_to_logical_expr(e, schema), _ => Err(DataFusionError::NotImplemented(format!( "Unsupported ast node {:?} in sqltorel", @@ -1228,11 +1237,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn function_args_to_expr( &self, function: &sqlparser::ast::Function, + schema: &DFSchema, ) -> Result> { function .args .iter() - .map(|a| self.sql_fn_arg_to_logical_expr(a)) + .map(|a| self.sql_fn_arg_to_logical_expr(a, schema)) .collect::>>() } @@ -1240,6 +1250,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { &self, fun: &aggregates::AggregateFunction, function: &sqlparser::ast::Function, + schema: &DFSchema, ) -> Result> { if *fun == aggregates::AggregateFunction::Count { function @@ -1250,11 +1261,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(lit(1_u8)) } FunctionArg::Unnamed(SQLExpr::Wildcard) => Ok(lit(1_u8)), - _ => self.sql_fn_arg_to_logical_expr(a), + _ => self.sql_fn_arg_to_logical_expr(a, schema), }) .collect::>>() } else { - self.function_args_to_expr(function) + self.function_args_to_expr(function, schema) } } @@ -1519,13 +1530,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Remove join expressions from a filter expression fn remove_join_expressions( expr: &Expr, - join_columns: &[(&str, &str)], + join_columns: &HashSet<(Column, Column)>, ) -> Result> { match expr { Expr::BinaryExpr { left, op, right } => match op { Operator::Eq => match (left.as_ref(), right.as_ref()) { (Expr::Column(l), Expr::Column(r)) => { - if join_columns.contains(&(l, r)) || join_columns.contains(&(r, l)) { + if join_columns.contains(&(l.clone(), r.clone())) + || join_columns.contains(&(r.clone(), l.clone())) + { Ok(None) } else { Ok(Some(expr.clone())) @@ -1556,12 +1569,12 @@ fn remove_join_expressions( /// foo = bar /// foo = bar AND bar = baz AND ... 
/// -fn extract_join_keys(expr: &Expr, accum: &mut Vec<(String, String)>) -> Result<()> { +fn extract_join_keys(expr: &Expr, accum: &mut Vec<(Column, Column)>) -> Result<()> { match expr { Expr::BinaryExpr { left, op, right } => match op { Operator::Eq => match (left.as_ref(), right.as_ref()) { (Expr::Column(l), Expr::Column(r)) => { - accum.push((l.to_owned(), r.to_owned())); + accum.push((l.clone(), r.clone())); Ok(()) } other => Err(DataFusionError::SQL(ParserError(format!( @@ -1588,13 +1601,13 @@ fn extract_join_keys(expr: &Expr, accum: &mut Vec<(String, String)>) -> Result<( /// Extract join keys from a WHERE clause fn extract_possible_join_keys( expr: &Expr, - accum: &mut Vec<(String, String)>, + accum: &mut Vec<(Column, Column)>, ) -> Result<()> { match expr { Expr::BinaryExpr { left, op, right } => match op { Operator::Eq => match (left.as_ref(), right.as_ref()) { (Expr::Column(l), Expr::Column(r)) => { - accum.push((l.to_owned(), r.to_owned())); + accum.push((l.clone(), r.clone())); Ok(()) } _ => Ok(()), @@ -1635,9 +1648,6 @@ mod tests { use crate::{logical_plan::create_udf, sql::parser::DFParser}; use functions::ScalarFunctionImplementation; - const PERSON_COLUMN_NAMES: &str = - "id, first_name, last_name, age, state, salary, birth_date, 😀"; - #[test] fn select_no_relation() { quick_test( @@ -1651,13 +1661,10 @@ mod tests { fn select_column_does_not_exist() { let sql = "SELECT doesnotexist FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + )); } #[test] @@ -1665,7 +1672,7 @@ mod tests { let sql = "SELECT age, age FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#age\" at position 0 and \"#age\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"#person.age\" at position 0 and \"#person.age\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -1675,7 +1682,7 @@ mod tests { let sql = "SELECT *, age FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#age\" at position 3 and \"#age\" at position 8 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"#person.age\" at position 3 and \"#person.age\" at position 8 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -1684,7 +1691,7 @@ mod tests { fn select_wildcard_with_repeated_column_but_is_aliased() { quick_test( "SELECT *, first_name AS fn from person", - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀, #first_name AS fn\ + "Projection: #person.id, #person.first_name, #person.last_name, #person.age, #person.state, #person.salary, #person.birth_date, #person.😀, #person.first_name AS fn\ \n TableScan: person projection=None", ); } @@ -1702,8 +1709,8 @@ mod tests { fn select_simple_filter() { let sql = "SELECT id, first_name, last_name \ FROM person WHERE state = 'CO'"; - let expected = "Projection: #id, #first_name, #last_name\ - \n Filter: #state Eq Utf8(\"CO\")\ + let expected = "Projection: #person.id, #person.first_name, #person.last_name\ + \n Filter: #person.state Eq Utf8(\"CO\")\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1712,34 +1719,28 @@ mod tests { fn select_filter_column_does_not_exist() { let sql = "SELECT first_name FROM person WHERE doesnotexist = 'A'"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + )); } #[test] fn select_filter_cannot_use_alias() { let sql = "SELECT first_name AS x FROM person WHERE x = 'A'"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'x' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'x'", + )); } #[test] fn select_neg_filter() { let sql = "SELECT id, first_name, last_name \ FROM person WHERE NOT state"; - let expected = "Projection: #id, #first_name, #last_name\ - \n Filter: NOT #state\ + let expected = "Projection: #person.id, #person.first_name, #person.last_name\ + \n Filter: NOT #person.state\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1748,8 +1749,8 @@ mod tests { fn select_compound_filter() { let sql = "SELECT id, first_name, last_name \ FROM person WHERE state = 'CO' AND age >= 21 AND age <= 65"; - let expected = "Projection: #id, #first_name, #last_name\ - \n Filter: #state Eq Utf8(\"CO\") And #age GtEq Int64(21) And #age LtEq Int64(65)\ + let expected = "Projection: #person.id, #person.first_name, #person.last_name\ + \n Filter: #person.state Eq Utf8(\"CO\") And #person.age GtEq Int64(21) And #person.age LtEq Int64(65)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1759,8 +1760,8 @@ mod tests { let sql = "SELECT state FROM person WHERE birth_date < CAST (158412331400600000 as timestamp)"; - let expected = "Projection: #state\ - \n Filter: #birth_date Lt CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\ + let expected = "Projection: #person.state\ + \n Filter: #person.birth_date Lt CAST(Int64(158412331400600000) AS Timestamp(Nanosecond, None))\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -1771,8 +1772,8 @@ mod tests { let sql = "SELECT state FROM person WHERE birth_date < CAST ('2020-01-01' as date)"; - let expected = "Projection: #state\ - \n Filter: #birth_date Lt CAST(Utf8(\"2020-01-01\") AS Date32)\ + let expected = "Projection: #person.state\ + 
\n Filter: #person.birth_date Lt CAST(Utf8(\"2020-01-01\") AS Date32)\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -1788,13 +1789,13 @@ mod tests { AND age >= 21 \ AND age < 65 \ AND age <= 65"; - let expected = "Projection: #age, #first_name, #last_name\ - \n Filter: #age Eq Int64(21) \ - And #age NotEq Int64(21) \ - And #age Gt Int64(21) \ - And #age GtEq Int64(21) \ - And #age Lt Int64(65) \ - And #age LtEq Int64(65)\ + let expected = "Projection: #person.age, #person.first_name, #person.last_name\ + \n Filter: #person.age Eq Int64(21) \ + And #person.age NotEq Int64(21) \ + And #person.age Gt Int64(21) \ + And #person.age GtEq Int64(21) \ + And #person.age Lt Int64(65) \ + And #person.age LtEq Int64(65)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1802,8 +1803,8 @@ mod tests { #[test] fn select_between() { let sql = "SELECT state FROM person WHERE age BETWEEN 21 AND 65"; - let expected = "Projection: #state\ - \n Filter: #age BETWEEN Int64(21) AND Int64(65)\ + let expected = "Projection: #person.state\ + \n Filter: #person.age BETWEEN Int64(21) AND Int64(65)\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -1812,8 +1813,8 @@ mod tests { #[test] fn select_between_negated() { let sql = "SELECT state FROM person WHERE age NOT BETWEEN 21 AND 65"; - let expected = "Projection: #state\ - \n Filter: #age NOT BETWEEN Int64(21) AND Int64(65)\ + let expected = "Projection: #person.state\ + \n Filter: #person.age NOT BETWEEN Int64(21) AND Int64(65)\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -1829,9 +1830,9 @@ mod tests { FROM person ) )"; - let expected = "Projection: #fn2, #last_name\ - \n Projection: #fn1 AS fn2, #last_name, #birth_date\ - \n Projection: #first_name AS fn1, #last_name, #birth_date, #age\ + let expected = "Projection: #fn2, #person.last_name\ + \n Projection: #fn1 AS fn2, #person.last_name, #person.birth_date\ + \n Projection: #person.first_name AS fn1, #person.last_name, #person.birth_date, #person.age\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1846,10 +1847,10 @@ mod tests { ) WHERE fn1 = 'X' AND age < 30"; - let expected = "Projection: #fn1, #age\ - \n Filter: #fn1 Eq Utf8(\"X\") And #age Lt Int64(30)\ - \n Projection: #first_name AS fn1, #age\ - \n Filter: #age Gt Int64(20)\ + let expected = "Projection: #fn1, #person.age\ + \n Filter: #fn1 Eq Utf8(\"X\") And #person.age Lt Int64(30)\ + \n Projection: #person.first_name AS fn1, #person.age\ + \n Filter: #person.age Gt Int64(20)\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -1860,8 +1861,8 @@ mod tests { let sql = "SELECT id, age FROM person HAVING age > 100 AND age < 200"; - let expected = "Projection: #id, #age\ - \n Filter: #age Gt Int64(100) And #age Lt Int64(200)\ + let expected = "Projection: #person.id, #person.age\ + \n Filter: #person.age Gt Int64(100) And #person.age Lt Int64(200)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1907,9 +1908,9 @@ mod tests { let sql = "SELECT MAX(age) FROM person HAVING MAX(age) < 30"; - let expected = "Projection: #MAX(age)\ - \n Filter: #MAX(age) Lt Int64(30)\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #MAX(person.age)\ + \n Filter: #MAX(person.age) Lt Int64(30)\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1919,9 +1920,9 @@ mod tests { let sql = "SELECT MAX(age) FROM person HAVING 
MAX(first_name) > 'M'"; - let expected = "Projection: #MAX(age)\ - \n Filter: #MAX(first_name) Gt Utf8(\"M\")\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(#age), MAX(#first_name)]]\ + let expected = "Projection: #MAX(person.age)\ + \n Filter: #MAX(person.first_name) Gt Utf8(\"M\")\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#person.age), MAX(#person.first_name)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1943,9 +1944,10 @@ mod tests { let sql = "SELECT MAX(age) as max_age FROM person HAVING max_age < 30"; - let expected = "Projection: #MAX(age) AS max_age\ - \n Filter: #MAX(age) Lt Int64(30)\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(#age)]]\ + // FIXME: add test for having in execution + let expected = "Projection: #MAX(person.age) AS max_age\ + \n Filter: #MAX(person.age) Lt Int64(30)\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1955,9 +1957,9 @@ mod tests { let sql = "SELECT MAX(age) as max_age FROM person HAVING MAX(age) < 30"; - let expected = "Projection: #MAX(age) AS max_age\ - \n Filter: #MAX(age) Lt Int64(30)\ - \n Aggregate: groupBy=[[]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #MAX(person.age) AS max_age\ + \n Filter: #MAX(person.age) Lt Int64(30)\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1968,9 +1970,9 @@ mod tests { FROM person GROUP BY first_name HAVING first_name = 'M'"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #first_name Eq Utf8(\"M\")\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #person.first_name Eq Utf8(\"M\")\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1982,10 +1984,10 @@ mod tests { WHERE id > 5 GROUP BY first_name HAVING MAX(age) < 100"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Lt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n Filter: #id Gt Int64(5)\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Lt Int64(100)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ + \n Filter: #person.id Gt Int64(5)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -1998,10 +2000,10 @@ mod tests { WHERE id > 5 AND age > 18 GROUP BY first_name HAVING MAX(age) < 100"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Lt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ - \n Filter: #id Gt Int64(5) And #age Gt Int64(18)\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Lt Int64(100)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ + \n Filter: #person.id Gt Int64(5) And #person.age Gt Int64(18)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2012,9 +2014,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 2 AND fn = 'M'"; - let expected = "Projection: #first_name AS fn, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(2) And #first_name Eq Utf8(\"M\")\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name AS fn, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(2) And 
#person.first_name Eq Utf8(\"M\")\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2026,9 +2028,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 2 AND max_age < 5 AND first_name = 'M' AND fn = 'N'"; - let expected = "Projection: #first_name AS fn, #MAX(age) AS max_age\ - \n Filter: #MAX(age) Gt Int64(2) And #MAX(age) Lt Int64(5) And #first_name Eq Utf8(\"M\") And #first_name Eq Utf8(\"N\")\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name AS fn, #MAX(person.age) AS max_age\ + \n Filter: #MAX(person.age) Gt Int64(2) And #MAX(person.age) Lt Int64(5) And #person.first_name Eq Utf8(\"M\") And #person.first_name Eq Utf8(\"N\")\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2039,9 +2041,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(100)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2065,9 +2067,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100 AND MAX(age) < 200"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(100) And #MAX(age) Lt Int64(200)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(100) And #MAX(person.age) Lt Int64(200)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2078,9 +2080,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100 AND MIN(id) < 50"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(100) And #MIN(id) Lt Int64(50)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age), MIN(#id)]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(100) And #MIN(person.id) Lt Int64(50)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age), MIN(#person.id)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2092,9 +2094,9 @@ mod tests { FROM person GROUP BY first_name HAVING max_age > 100"; - let expected = "Projection: #first_name, #MAX(age) AS max_age\ - \n Filter: #MAX(age) Gt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #person.first_name, #MAX(person.age) AS max_age\ + \n Filter: #MAX(person.age) Gt Int64(100)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2107,9 +2109,9 @@ mod tests { GROUP BY first_name HAVING max_age_plus_one > 100"; let expected = - "Projection: #first_name, #MAX(age) Plus Int64(1) AS max_age_plus_one\ - \n Filter: #MAX(age) Plus Int64(1) Gt Int64(100)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age)]]\ + "Projection: #person.first_name, #MAX(person.age) Plus Int64(1) AS max_age_plus_one\ + \n Filter: 
#MAX(person.age) Plus Int64(1) Gt Int64(100)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2121,9 +2123,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100 AND MIN(id - 2) < 50"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(100) And #MIN(id Minus Int64(2)) Lt Int64(50)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age), MIN(#id Minus Int64(2))]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(100) And #MIN(person.id Minus Int64(2)) Lt Int64(50)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age), MIN(#person.id Minus Int64(2))]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2134,9 +2136,9 @@ mod tests { FROM person GROUP BY first_name HAVING MAX(age) > 100 AND COUNT(*) < 50"; - let expected = "Projection: #first_name, #MAX(age)\ - \n Filter: #MAX(age) Gt Int64(100) And #COUNT(UInt8(1)) Lt Int64(50)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#age), COUNT(UInt8(1))]]\ + let expected = "Projection: #person.first_name, #MAX(person.age)\ + \n Filter: #MAX(person.age) Gt Int64(100) And #COUNT(UInt8(1)) Lt Int64(50)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.age), COUNT(UInt8(1))]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2144,7 +2146,7 @@ mod tests { #[test] fn select_binary_expr() { let sql = "SELECT age + salary from person"; - let expected = "Projection: #age Plus #salary\ + let expected = "Projection: #person.age Plus #person.salary\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2152,7 +2154,7 @@ mod tests { #[test] fn select_binary_expr_nested() { let sql = "SELECT (age + salary)/2 from person"; - let expected = "Projection: #age Plus #salary Divide Int64(2)\ + let expected = "Projection: #person.age Plus #person.salary Divide Int64(2)\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2161,15 +2163,15 @@ mod tests { fn select_wildcard_with_groupby() { quick_test( r#"SELECT * FROM person GROUP BY id, first_name, last_name, age, state, salary, birth_date, "😀""#, - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀\ - \n Aggregate: groupBy=[[#id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀]], aggr=[[]]\ + "Projection: #person.id, #person.first_name, #person.last_name, #person.age, #person.state, #person.salary, #person.birth_date, #person.😀\ + \n Aggregate: groupBy=[[#person.id, #person.first_name, #person.last_name, #person.age, #person.state, #person.salary, #person.birth_date, #person.😀]], aggr=[[]]\ \n TableScan: person projection=None", ); quick_test( "SELECT * FROM (SELECT first_name, last_name FROM person) GROUP BY first_name, last_name", - "Projection: #first_name, #last_name\ - \n Aggregate: groupBy=[[#first_name, #last_name]], aggr=[[]]\ - \n Projection: #first_name, #last_name\ + "Projection: #person.first_name, #person.last_name\ + \n Aggregate: groupBy=[[#person.first_name, #person.last_name]], aggr=[[]]\ + \n Projection: #person.first_name, #person.last_name\ \n TableScan: person projection=None", ); } @@ -2178,8 +2180,8 @@ mod tests { fn select_simple_aggregate() { quick_test( "SELECT MIN(age) FROM person", - "Projection: #MIN(age)\ - \n Aggregate: groupBy=[[]], aggr=[[MIN(#age)]]\ + "Projection: #MIN(person.age)\ + \n Aggregate: 
groupBy=[[]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2188,8 +2190,8 @@ mod tests { fn test_sum_aggregate() { quick_test( "SELECT SUM(age) from person", - "Projection: #SUM(age)\ - \n Aggregate: groupBy=[[]], aggr=[[SUM(#age)]]\ + "Projection: #SUM(person.age)\ + \n Aggregate: groupBy=[[]], aggr=[[SUM(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2198,13 +2200,10 @@ mod tests { fn select_simple_aggregate_column_does_not_exist() { let sql = "SELECT MIN(doesnotexist) FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + )); } #[test] @@ -2212,7 +2211,7 @@ mod tests { let sql = "SELECT MIN(age), MIN(age) FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#MIN(age)\" at position 0 and \"#MIN(age)\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"MIN(#person.age)\" at position 0 and \"MIN(#person.age)\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2221,8 +2220,8 @@ mod tests { fn select_simple_aggregate_repeated_aggregate_with_single_alias() { quick_test( "SELECT MIN(age), MIN(age) AS a FROM person", - "Projection: #MIN(age), #MIN(age) AS a\ - \n Aggregate: groupBy=[[]], aggr=[[MIN(#age)]]\ + "Projection: #MIN(person.age), #MIN(person.age) AS a\ + \n Aggregate: groupBy=[[]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2231,8 +2230,8 @@ mod tests { fn select_simple_aggregate_repeated_aggregate_with_unique_aliases() { quick_test( "SELECT MIN(age) AS a, MIN(age) AS b FROM person", - "Projection: #MIN(age) AS a, #MIN(age) AS b\ - \n Aggregate: groupBy=[[]], aggr=[[MIN(#age)]]\ + "Projection: #MIN(person.age) AS a, #MIN(person.age) AS b\ + \n Aggregate: groupBy=[[]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2242,7 +2241,7 @@ mod tests { let sql = "SELECT MIN(age) AS a, MIN(age) AS a FROM person"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#MIN(age) AS a\" at position 0 and \"#MIN(age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"MIN(#person.age) AS a\" at position 0 and \"MIN(#person.age) AS a\" at position 1 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2251,8 +2250,8 @@ mod tests { fn select_simple_aggregate_with_groupby() { quick_test( "SELECT state, MIN(age), MAX(age) FROM person GROUP BY state", - "Projection: #state, #MIN(age), #MAX(age)\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age), MAX(#age)]]\ + "Projection: #person.state, #MIN(person.age), #MAX(person.age)\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age), MAX(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2261,8 +2260,8 @@ mod tests { fn select_simple_aggregate_with_groupby_with_aliases() { quick_test( "SELECT state AS a, MIN(age) AS b FROM person GROUP BY state", - "Projection: #state AS a, #MIN(age) AS b\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age)]]\ + "Projection: #person.state AS a, #MIN(person.age) AS b\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2272,7 +2271,7 @@ mod tests { let sql = "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#state AS a\" at position 0 and \"#MIN(age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"#person.state AS a\" at position 0 and \"MIN(#person.age) AS a\" at position 1 have the same name. Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2281,8 +2280,8 @@ mod tests { fn select_simple_aggregate_with_groupby_column_unselected() { quick_test( "SELECT MIN(age), MAX(age) FROM person GROUP BY state", - "Projection: #MIN(age), #MAX(age)\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age), MAX(#age)]]\ + "Projection: #MIN(person.age), #MAX(person.age)\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age), MAX(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2291,26 +2290,20 @@ mod tests { fn select_simple_aggregate_with_groupby_and_column_in_group_by_does_not_exist() { let sql = "SELECT SUM(age) FROM person GROUP BY doesnotexist"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + )); } #[test] fn select_simple_aggregate_with_groupby_and_column_in_aggregate_does_not_exist() { let sql = "SELECT SUM(doesnotexist) FROM person GROUP BY first_name"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - format!( - r#"Plan("Invalid identifier 'doesnotexist' for schema {}")"#, - PERSON_COLUMN_NAMES - ), - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + )); } #[test] @@ -2327,18 +2320,18 @@ mod tests { fn select_unsupported_complex_interval() { let sql = "SELECT INTERVAL '1 year 1 day'"; let err = logical_plan(sql).expect_err("query should have failed"); - assert_eq!( - r#"NotImplemented("DF does not support intervals that have both a Year/Month part as well as Days/Hours/Mins/Seconds: \"1 year 1 day\". Hint: try breaking the interval into two parts, one with Year/Month and the other with Days/Hours/Mins/Seconds - e.g. 
(NOW() + INTERVAL '1 year') + INTERVAL '1 day'")"#, - format!("{:?}", err) - ); + assert!(matches!( + err, + DataFusionError::NotImplemented(msg) if msg == "DF does not support intervals that have both a Year/Month part as well as Days/Hours/Mins/Seconds: \"1 year 1 day\". Hint: try breaking the interval into two parts, one with Year/Month and the other with Days/Hours/Mins/Seconds - e.g. (NOW() + INTERVAL '1 year') + INTERVAL '1 day'", + )); } #[test] fn select_simple_aggregate_with_groupby_and_column_is_in_aggregate_and_groupby() { quick_test( "SELECT MAX(first_name) FROM person GROUP BY first_name", - "Projection: #MAX(first_name)\ - \n Aggregate: groupBy=[[#first_name]], aggr=[[MAX(#first_name)]]\ + "Projection: #MAX(person.first_name)\ + \n Aggregate: groupBy=[[#person.first_name]], aggr=[[MAX(#person.first_name)]]\ \n TableScan: person projection=None", ); } @@ -2347,14 +2340,14 @@ mod tests { fn select_simple_aggregate_with_groupby_can_use_positions() { quick_test( "SELECT state, age AS b, COUNT(1) FROM person GROUP BY 1, 2", - "Projection: #state, #age AS b, #COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[#state, #age]], aggr=[[COUNT(UInt8(1))]]\ + "Projection: #person.state, #person.age AS b, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#person.state, #person.age]], aggr=[[COUNT(UInt8(1))]]\ \n TableScan: person projection=None", ); quick_test( "SELECT state, age AS b, COUNT(1) FROM person GROUP BY 2, 1", - "Projection: #state, #age AS b, #COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[#age, #state]], aggr=[[COUNT(UInt8(1))]]\ + "Projection: #person.state, #person.age AS b, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#person.age, #person.state]], aggr=[[COUNT(UInt8(1))]]\ \n TableScan: person projection=None", ); } @@ -2380,8 +2373,8 @@ mod tests { fn select_simple_aggregate_with_groupby_can_use_alias() { quick_test( "SELECT state AS a, MIN(age) AS b FROM person GROUP BY a", - "Projection: #state AS a, #MIN(age) AS b\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age)]]\ + "Projection: #person.state AS a, #MIN(person.age) AS b\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2391,7 +2384,7 @@ mod tests { let sql = "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Plan("Projections require unique expression names but the expression \"#MIN(age)\" at position 1 and \"#MIN(age)\" at position 2 have the same name. Consider aliasing (\"AS\") one of them.")"##, + r##"Plan("Projections require unique expression names but the expression \"MIN(#person.age)\" at position 1 and \"MIN(#person.age)\" at position 2 have the same name. 
Consider aliasing (\"AS\") one of them.")"##, format!("{:?}", err) ); } @@ -2400,8 +2393,8 @@ mod tests { fn select_simple_aggregate_with_groupby_aggregate_repeated_and_one_has_alias() { quick_test( "SELECT state, MIN(age), MIN(age) AS ma FROM person GROUP BY state", - "Projection: #state, #MIN(age), #MIN(age) AS ma\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age)]]\ + "Projection: #person.state, #MIN(person.age), #MIN(person.age) AS ma\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ) } @@ -2409,8 +2402,8 @@ mod tests { fn select_simple_aggregate_with_groupby_non_column_expression_unselected() { quick_test( "SELECT MIN(first_name) FROM person GROUP BY age + 1", - "Projection: #MIN(first_name)\ - \n Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ + "Projection: #MIN(person.first_name)\ + \n Aggregate: groupBy=[[#person.age Plus Int64(1)]], aggr=[[MIN(#person.first_name)]]\ \n TableScan: person projection=None", ); } @@ -2420,14 +2413,14 @@ mod tests { ) { quick_test( "SELECT age + 1, MIN(first_name) FROM person GROUP BY age + 1", - "Projection: #age Plus Int64(1), #MIN(first_name)\ - \n Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ + "Projection: #person.age Plus Int64(1), #MIN(person.first_name)\ + \n Aggregate: groupBy=[[#person.age Plus Int64(1)]], aggr=[[MIN(#person.first_name)]]\ \n TableScan: person projection=None", ); quick_test( "SELECT MIN(first_name), age + 1 FROM person GROUP BY age + 1", - "Projection: #MIN(first_name), #age Plus Int64(1)\ - \n Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ + "Projection: #MIN(person.first_name), #person.age Plus Int64(1)\ + \n Aggregate: groupBy=[[#person.age Plus Int64(1)]], aggr=[[MIN(#person.first_name)]]\ \n TableScan: person projection=None", ); } @@ -2437,8 +2430,8 @@ mod tests { { quick_test( "SELECT ((age + 1) / 2) * (age + 1), MIN(first_name) FROM person GROUP BY age + 1", - "Projection: #age Plus Int64(1) Divide Int64(2) Multiply #age Plus Int64(1), #MIN(first_name)\ - \n Aggregate: groupBy=[[#age Plus Int64(1)]], aggr=[[MIN(#first_name)]]\ + "Projection: #person.age Plus Int64(1) Divide Int64(2) Multiply #person.age Plus Int64(1), #MIN(person.first_name)\ + \n Aggregate: groupBy=[[#person.age Plus Int64(1)]], aggr=[[MIN(#person.first_name)]]\ \n TableScan: person projection=None", ); } @@ -2471,8 +2464,8 @@ mod tests { fn select_simple_aggregate_nested_in_binary_expr_with_groupby() { quick_test( "SELECT state, MIN(age) < 10 FROM person GROUP BY state", - "Projection: #state, #MIN(age) Lt Int64(10)\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age)]]\ + "Projection: #person.state, #MIN(person.age) Lt Int64(10)\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age)]]\ \n TableScan: person projection=None", ); } @@ -2481,8 +2474,8 @@ mod tests { fn select_simple_aggregate_and_nested_groupby_column() { quick_test( "SELECT age + 1, MAX(first_name) FROM person GROUP BY age", - "Projection: #age Plus Int64(1), #MAX(first_name)\ - \n Aggregate: groupBy=[[#age]], aggr=[[MAX(#first_name)]]\ + "Projection: #person.age Plus Int64(1), #MAX(person.first_name)\ + \n Aggregate: groupBy=[[#person.age]], aggr=[[MAX(#person.first_name)]]\ \n TableScan: person projection=None", ); } @@ -2491,8 +2484,8 @@ mod tests { fn select_aggregate_compounded_with_groupby_column() { quick_test( "SELECT age + MIN(salary) FROM person GROUP BY age", - "Projection: #age Plus #MIN(salary)\ - \n Aggregate: 
groupBy=[[#age]], aggr=[[MIN(#salary)]]\ + "Projection: #person.age Plus #MIN(person.salary)\ + \n Aggregate: groupBy=[[#person.age]], aggr=[[MIN(#person.salary)]]\ \n TableScan: person projection=None", ); } @@ -2501,8 +2494,8 @@ mod tests { fn select_aggregate_with_non_column_inner_expression_with_groupby() { quick_test( "SELECT state, MIN(age + 1) FROM person GROUP BY state", - "Projection: #state, #MIN(age Plus Int64(1))\ - \n Aggregate: groupBy=[[#state]], aggr=[[MIN(#age Plus Int64(1))]]\ + "Projection: #person.state, #MIN(person.age Plus Int64(1))\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MIN(#person.age Plus Int64(1))]]\ \n TableScan: person projection=None", ); } @@ -2511,7 +2504,7 @@ mod tests { fn test_wildcard() { quick_test( "SELECT * from person", - "Projection: #id, #first_name, #last_name, #age, #state, #salary, #birth_date, #😀\ + "Projection: #person.id, #person.first_name, #person.last_name, #person.age, #person.state, #person.salary, #person.birth_date, #person.😀\ \n TableScan: person projection=None", ); } @@ -2528,8 +2521,8 @@ mod tests { #[test] fn select_count_column() { let sql = "SELECT COUNT(id) FROM person"; - let expected = "Projection: #COUNT(id)\ - \n Aggregate: groupBy=[[]], aggr=[[COUNT(#id)]]\ + let expected = "Projection: #COUNT(person.id)\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(#person.id)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2537,15 +2530,15 @@ mod tests { #[test] fn select_scalar_func() { let sql = "SELECT sqrt(age) FROM person"; - let expected = "Projection: sqrt(#age)\ + let expected = "Projection: sqrt(#person.age)\ \n TableScan: person projection=None"; quick_test(sql, expected); } #[test] fn select_aliased_scalar_func() { - let sql = "SELECT sqrt(age) AS square_people FROM person"; - let expected = "Projection: sqrt(#age) AS square_people\ + let sql = "SELECT sqrt(person.age) AS square_people FROM person"; + let expected = "Projection: sqrt(#person.age) AS square_people\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2554,8 +2547,8 @@ mod tests { fn select_where_nullif_division() { let sql = "SELECT c3/(c4+c5) \ FROM aggregate_test_100 WHERE c3/nullif(c4+c5, 0) > 0.1"; - let expected = "Projection: #c3 Divide #c4 Plus #c5\ - \n Filter: #c3 Divide nullif(#c4 Plus #c5, Int64(0)) Gt Float64(0.1)\ + let expected = "Projection: #aggregate_test_100.c3 Divide #aggregate_test_100.c4 Plus #aggregate_test_100.c5\ + \n Filter: #aggregate_test_100.c3 Divide nullif(#aggregate_test_100.c4 Plus #aggregate_test_100.c5, Int64(0)) Gt Float64(0.1)\ \n TableScan: aggregate_test_100 projection=None"; quick_test(sql, expected); } @@ -2563,8 +2556,8 @@ mod tests { #[test] fn select_where_with_negative_operator() { let sql = "SELECT c3 FROM aggregate_test_100 WHERE c3 > -0.1 AND -c4 > 0"; - let expected = "Projection: #c3\ - \n Filter: #c3 Gt Float64(-0.1) And (- #c4) Gt Int64(0)\ + let expected = "Projection: #aggregate_test_100.c3\ + \n Filter: #aggregate_test_100.c3 Gt Float64(-0.1) And (- #aggregate_test_100.c4) Gt Int64(0)\ \n TableScan: aggregate_test_100 projection=None"; quick_test(sql, expected); } @@ -2572,8 +2565,8 @@ mod tests { #[test] fn select_where_with_positive_operator() { let sql = "SELECT c3 FROM aggregate_test_100 WHERE c3 > +0.1 AND +c4 > 0"; - let expected = "Projection: #c3\ - \n Filter: #c3 Gt Float64(0.1) And #c4 Gt Int64(0)\ + let expected = "Projection: #aggregate_test_100.c3\ + \n Filter: #aggregate_test_100.c3 Gt Float64(0.1) And #aggregate_test_100.c4 Gt Int64(0)\ \n 
TableScan: aggregate_test_100 projection=None"; quick_test(sql, expected); } @@ -2581,8 +2574,8 @@ mod tests { #[test] fn select_order_by() { let sql = "SELECT id FROM person ORDER BY id"; - let expected = "Sort: #id ASC NULLS FIRST\ - \n Projection: #id\ + let expected = "Sort: #person.id ASC NULLS FIRST\ + \n Projection: #person.id\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2590,8 +2583,8 @@ mod tests { #[test] fn select_order_by_desc() { let sql = "SELECT id FROM person ORDER BY id DESC"; - let expected = "Sort: #id DESC NULLS FIRST\ - \n Projection: #id\ + let expected = "Sort: #person.id DESC NULLS FIRST\ + \n Projection: #person.id\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -2600,15 +2593,15 @@ mod tests { fn select_order_by_nulls_last() { quick_test( "SELECT id FROM person ORDER BY id DESC NULLS LAST", - "Sort: #id DESC NULLS LAST\ - \n Projection: #id\ + "Sort: #person.id DESC NULLS LAST\ + \n Projection: #person.id\ \n TableScan: person projection=None", ); quick_test( "SELECT id FROM person ORDER BY id NULLS LAST", - "Sort: #id ASC NULLS LAST\ - \n Projection: #id\ + "Sort: #person.id ASC NULLS LAST\ + \n Projection: #person.id\ \n TableScan: person projection=None", ); } @@ -2616,8 +2609,8 @@ mod tests { #[test] fn select_group_by() { let sql = "SELECT state FROM person GROUP BY state"; - let expected = "Projection: #state\ - \n Aggregate: groupBy=[[#state]], aggr=[[]]\ + let expected = "Projection: #person.state\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[]]\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -2626,8 +2619,8 @@ mod tests { #[test] fn select_group_by_columns_not_in_select() { let sql = "SELECT MAX(age) FROM person GROUP BY state"; - let expected = "Projection: #MAX(age)\ - \n Aggregate: groupBy=[[#state]], aggr=[[MAX(#age)]]\ + let expected = "Projection: #MAX(person.age)\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[MAX(#person.age)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -2636,8 +2629,8 @@ mod tests { #[test] fn select_group_by_count_star() { let sql = "SELECT state, COUNT(*) FROM person GROUP BY state"; - let expected = "Projection: #state, #COUNT(UInt8(1))\ - \n Aggregate: groupBy=[[#state]], aggr=[[COUNT(UInt8(1))]]\ + let expected = "Projection: #person.state, #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[COUNT(UInt8(1))]]\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -2647,8 +2640,8 @@ mod tests { fn select_group_by_needs_projection() { let sql = "SELECT COUNT(state), state FROM person GROUP BY state"; let expected = "\ - Projection: #COUNT(state), #state\ - \n Aggregate: groupBy=[[#state]], aggr=[[COUNT(#state)]]\ + Projection: #COUNT(person.state), #person.state\ + \n Aggregate: groupBy=[[#person.state]], aggr=[[COUNT(#person.state)]]\ \n TableScan: person projection=None"; quick_test(sql, expected); @@ -2657,8 +2650,8 @@ mod tests { #[test] fn select_7480_1() { let sql = "SELECT c1, MIN(c12) FROM aggregate_test_100 GROUP BY c1, c13"; - let expected = "Projection: #c1, #MIN(c12)\ - \n Aggregate: groupBy=[[#c1, #c13]], aggr=[[MIN(#c12)]]\ + let expected = "Projection: #aggregate_test_100.c1, #MIN(aggregate_test_100.c12)\ + \n Aggregate: groupBy=[[#aggregate_test_100.c1, #aggregate_test_100.c13]], aggr=[[MIN(#aggregate_test_100.c12)]]\ \n TableScan: aggregate_test_100 projection=None"; quick_test(sql, expected); } @@ -2714,22 +2707,49 @@ mod tests { FROM person \ JOIN orders \ ON id = 
customer_id"; - let expected = "Projection: #id, #order_id\ - \n Join: id = customer_id\ + let expected = "Projection: #person.id, #orders.order_id\ + \n Join: #person.id = #orders.customer_id\ \n TableScan: person projection=None\ \n TableScan: orders projection=None"; quick_test(sql, expected); } + #[test] + fn join_with_table_name() { + let sql = "SELECT id, order_id \ + FROM person \ + JOIN orders \ + ON person.id = orders.customer_id"; + let expected = "Projection: #person.id, #orders.order_id\ + \n Join: #person.id = #orders.customer_id\ + \n TableScan: person projection=None\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + #[test] + fn join_with_using() { + let sql = "SELECT person.first_name, id \ + FROM person \ + JOIN person as person2 \ + USING (id)"; + let expected = "Projection: #person.first_name, #person.id\ + \n Join: #person.id = #person2.id\ + \n TableScan: person projection=None\ + \n TableScan: person2 projection=None"; + quick_test(sql, expected); + } + #[test] fn equijoin_explicit_syntax_3_tables() { let sql = "SELECT id, order_id, l_description \ FROM person \ JOIN orders ON id = customer_id \ JOIN lineitem ON o_item_id = l_item_id"; - let expected = "Projection: #id, #order_id, #l_description\ - \n Join: o_item_id = l_item_id\ - \n Join: id = customer_id\ + let expected = + "Projection: #person.id, #orders.order_id, #lineitem.l_description\ + \n Join: #orders.o_item_id = #lineitem.l_item_id\ + \n Join: #person.id = #orders.customer_id\ \n TableScan: person projection=None\ \n TableScan: orders projection=None\ \n TableScan: lineitem projection=None"; @@ -2741,8 +2761,8 @@ mod tests { let sql = "SELECT order_id \ FROM orders \ WHERE delivered = false OR delivered = true"; - let expected = "Projection: #order_id\ - \n Filter: #delivered Eq Boolean(false) Or #delivered Eq Boolean(true)\ + let expected = "Projection: #orders.order_id\ + \n Filter: #orders.delivered Eq Boolean(false) Or #orders.delivered Eq Boolean(true)\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2751,9 +2771,9 @@ mod tests { fn union() { let sql = "SELECT order_id from orders UNION ALL SELECT order_id FROM orders"; let expected = "Union\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2765,13 +2785,13 @@ mod tests { UNION ALL SELECT order_id FROM orders UNION ALL SELECT order_id FROM orders"; let expected = "Union\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None\ - \n Projection: #order_id\ + \n Projection: #orders.order_id\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2790,8 +2810,8 @@ mod tests { fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; let expected = "\ - Projection: #order_id, #MAX(order_id)\ - \n WindowAggr: windowExpr=[[MAX(#order_id)]]\ + Projection: #orders.order_id, #MAX(orders.order_id)\ + \n WindowAggr: windowExpr=[[MAX(#orders.order_id)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2800,8 +2820,8 @@ mod tests { fn empty_over_with_alias() { let sql = "SELECT order_id oid, MAX(order_id) OVER () max_oid 
from orders"; let expected = "\ - Projection: #order_id AS oid, #MAX(order_id) AS max_oid\ - \n WindowAggr: windowExpr=[[MAX(#order_id)]]\ + Projection: #orders.order_id AS oid, #MAX(orders.order_id) AS max_oid\ + \n WindowAggr: windowExpr=[[MAX(#orders.order_id)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2810,8 +2830,8 @@ mod tests { fn empty_over_plus() { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; let expected = "\ - Projection: #order_id, #MAX(qty Multiply Float64(1.1))\ - \n WindowAggr: windowExpr=[[MAX(#qty Multiply Float64(1.1))]]\ + Projection: #orders.order_id, #MAX(orders.qty Multiply Float64(1.1))\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty Multiply Float64(1.1))]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2821,8 +2841,8 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (), min(qty) over (), aVg(qty) OVER () from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #MIN(qty), #AVG(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty), MIN(#qty), AVG(#qty)]]\ + Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty), #AVG(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty), MIN(#orders.qty), AVG(#orders.qty)]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2840,9 +2860,9 @@ mod tests { fn over_partition_by() { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2863,11 +2883,11 @@ mod tests { fn over_order_by() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id DESC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2876,11 +2896,11 @@ mod tests { fn over_order_by_with_window_frame_double_end() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id DESC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2889,11 +2909,11 @@ mod tests { fn 
over_order_by_with_window_frame_single_end() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id DESC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2934,11 +2954,11 @@ mod tests { fn over_order_by_with_window_frame_single_end_groups() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id DESC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id DESC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2959,11 +2979,11 @@ mod tests { fn over_order_by_two_sort_keys() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY (order_id + 1)) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id Plus Int64(1) ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id Plus Int64(1) ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2985,12 +3005,12 @@ mod tests { fn over_order_by_sort_keys_sorting() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ \n 
TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3012,12 +3032,12 @@ mod tests { fn over_order_by_sort_keys_sorting_prefix_compacting() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3042,13 +3062,13 @@ mod tests { fn over_order_by_sort_keys_sorting_global_order_compacting() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders ORDER BY order_id"; let expected = "\ - Sort: #order_id ASC NULLS FIRST\ - \n Projection: #order_id, #MAX(qty), #SUM(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[SUM(#qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + Sort: #orders.order_id ASC NULLS FIRST\ + \n Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3067,9 +3087,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3088,9 +3108,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3112,11 +3132,11 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ - \n 
WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #qty ASC NULLS FIRST, #order_id ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3137,11 +3157,11 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders"; let expected = "\ - Projection: #order_id, #MAX(qty), #MIN(qty)\ - \n WindowAggr: windowExpr=[[MAX(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#qty)]]\ - \n Sort: #order_id ASC NULLS FIRST, #qty ASC NULLS FIRST, #price ASC NULLS FIRST\ + Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ + \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST, #orders.price ASC NULLS FIRST\ \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3167,7 +3187,7 @@ mod tests { #[test] fn select_multibyte_column() { let sql = r#"SELECT "😀" FROM person"#; - let expected = "Projection: #😀\ + let expected = "Projection: #person.😀\ \n TableScan: person projection=None"; quick_test(sql, expected); } @@ -3182,7 +3202,7 @@ mod tests { /// Create logical plan, write with formatter, compare to expected output fn quick_test(sql: &str, expected: &str) { let plan = logical_plan(sql).unwrap(); - assert_eq!(expected, format!("{:?}", plan)); + assert_eq!(format!("{:?}", plan), expected); } struct MockContextProvider {} @@ -3218,6 +3238,7 @@ mod tests { "lineitem" => Some(Schema::new(vec![ Field::new("l_item_id", DataType::UInt32, false), Field::new("l_description", DataType::Utf8, false), + Field::new("price", DataType::Float64, false), ])), "aggregate_test_100" => Some(Schema::new(vec![ Field::new("c1", DataType::Utf8, false), diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 82431c2314ab2..7702748df44f1 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -21,7 +21,7 @@ use crate::logical_plan::{DFSchema, Expr, LogicalPlan}; use crate::scalar::ScalarValue; use crate::{ error::{DataFusionError, Result}, - logical_plan::{ExpressionVisitor, Recursion}, + logical_plan::{Column, ExpressionVisitor, Recursion}, }; use std::collections::HashMap; @@ -31,7 +31,7 @@ pub(crate) fn expand_wildcard(expr: &Expr, schema: &DFSchema) -> Vec { Expr::Wildcard => schema .fields() .iter() - .map(|f| Expr::Column(f.name().to_string())) + .map(|f| Expr::Column(f.qualified_column())) .collect::>(), _ => vec![expr.clone()], } @@ -146,7 +146,7 @@ where pub(crate) fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(_) => Ok(expr.clone()), - _ => Ok(Expr::Column(expr.name(plan.schema())?)), + _ => Ok(Expr::Column(Column::from_name(expr.name(plan.schema())?))), } } @@ -376,7 +376,7 @@ where asc: *asc, nulls_first: *nulls_first, }), - Expr::Column(_) | Expr::Literal(_) | Expr::ScalarVariable(_) => { + Expr::Column { .. 
} | Expr::Literal(_) | Expr::ScalarVariable(_) => { Ok(expr.clone()) } Expr::Wildcard => Ok(Expr::Wildcard), @@ -426,8 +426,8 @@ pub(crate) fn resolve_aliases_to_exprs( aliases: &HashMap, ) -> Result { clone_with_replacement(expr, &|nested_expr| match nested_expr { - Expr::Column(name) => { - if let Some(aliased_expr) = aliases.get(name) { + Expr::Column(c) if c.relation.is_none() => { + if let Some(aliased_expr) = aliases.get(&c.name) { Ok(Some(aliased_expr.clone())) } else { Ok(None) diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index 51dfe7f3a0993..7ca7cc12d9efb 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -117,7 +117,7 @@ pub fn test_table_scan() -> Result { Field::new("b", DataType::UInt32, false), Field::new("c", DataType::UInt32, false), ]); - LogicalPlanBuilder::scan_empty("test", &schema, None)?.build() + LogicalPlanBuilder::scan_empty(Some("test"), &schema, None)?.build() } pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { diff --git a/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs index b39f47bba07b1..75fbe8e8eede7 100644 --- a/datafusion/tests/custom_sources.rs +++ b/datafusion/tests/custom_sources.rs @@ -30,7 +30,9 @@ use datafusion::{ }; use datafusion::execution::context::ExecutionContext; -use datafusion::logical_plan::{col, Expr, LogicalPlan, LogicalPlanBuilder}; +use datafusion::logical_plan::{ + col, Expr, LogicalPlan, LogicalPlanBuilder, UNNAMED_TABLE, +}; use datafusion::physical_plan::{ ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, }; @@ -196,8 +198,11 @@ async fn custom_source_dataframe() -> Result<()> { _ => panic!("expect optimized_plan to be projection"), } - let expected = "Projection: #c2\ - \n TableScan: projection=Some([1])"; + let expected = format!( + "Projection: #{}.c2\ + \n TableScan: {} projection=Some([1])", + UNNAMED_TABLE, UNNAMED_TABLE + ); assert_eq!(format!("{:?}", optimized_plan), expected); let physical_plan = ctx.create_physical_plan(&optimized_plan)?; diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index cfdb6f4bc9e4b..c06a4bb1462ee 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1957,12 +1957,12 @@ async fn csv_explain() { register_aggregate_csv_by_sql(&mut ctx).await; let sql = "EXPLAIN SELECT c1 FROM aggregate_test_100 where c2 > 10"; let actual = execute(&mut ctx, sql).await; - let expected = vec![ - vec![ - "logical_plan", - "Projection: #c1\n Filter: #c2 Gt Int64(10)\n TableScan: aggregate_test_100 projection=None" - ] - ]; + let expected = vec![vec![ + "logical_plan", + "Projection: #aggregate_test_100.c1\ + \n Filter: #aggregate_test_100.c2 Gt Int64(10)\ + \n TableScan: aggregate_test_100 projection=None", + ]]; assert_eq!(expected, actual); // Also, expect same result with lowercase explain @@ -1990,8 +1990,8 @@ async fn csv_explain_plans() { // Verify schema let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: #c1 [c1:Utf8]", - " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + " Projection: #aggregate_test_100.c1 [c1:Utf8]", + " Filter: #aggregate_test_100.c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", " TableScan: aggregate_test_100 projection=None [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, 
c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2005,8 +2005,8 @@ async fn csv_explain_plans() { // Verify the text format of the plan let expected = vec![ "Explain", - " Projection: #c1", - " Filter: #c2 Gt Int64(10)", + " Projection: #aggregate_test_100.c1", + " Filter: #aggregate_test_100.c2 Gt Int64(10)", " TableScan: aggregate_test_100 projection=None", ]; let formatted = plan.display_indent().to_string(); @@ -2025,9 +2025,9 @@ async fn csv_explain_plans() { " {", " graph[label=\"LogicalPlan\"]", " 2[shape=box label=\"Explain\"]", - " 3[shape=box label=\"Projection: #c1\"]", + " 3[shape=box label=\"Projection: #aggregate_test_100.c1\"]", " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", - " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 4[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\"]", " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", " 5[shape=box label=\"TableScan: aggregate_test_100 projection=None\"]", " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2036,9 +2036,9 @@ async fn csv_explain_plans() { " {", " graph[label=\"Detailed LogicalPlan\"]", " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", - " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 8[shape=box label=\"Projection: #aggregate_test_100.c1\\nSchema: [c1:Utf8]\"]", " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", - " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 9[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", " 10[shape=box label=\"TableScan: aggregate_test_100 projection=None\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2065,8 +2065,8 @@ async fn csv_explain_plans() { // Verify schema let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: #c1 [c1:Utf8]", - " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32]", + " Projection: #aggregate_test_100.c1 [c1:Utf8]", + " Filter: #aggregate_test_100.c2 Gt Int64(10) [c1:Utf8, c2:Int32]", " TableScan: aggregate_test_100 projection=Some([0, 1]) [c1:Utf8, c2:Int32]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2080,8 +2080,8 @@ async fn csv_explain_plans() { // Verify the text format of the plan let expected = vec![ "Explain", - " Projection: #c1", - " Filter: #c2 Gt Int64(10)", + " Projection: #aggregate_test_100.c1", + " Filter: #aggregate_test_100.c2 Gt Int64(10)", " TableScan: aggregate_test_100 projection=Some([0, 1])", ]; let formatted = plan.display_indent().to_string(); @@ -2100,9 +2100,9 @@ async fn csv_explain_plans() { " {", " graph[label=\"LogicalPlan\"]", " 2[shape=box label=\"Explain\"]", - " 3[shape=box label=\"Projection: #c1\"]", + " 3[shape=box label=\"Projection: #aggregate_test_100.c1\"]", " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", - " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 4[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\"]", " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", " 
5[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\"]", " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2111,9 +2111,9 @@ async fn csv_explain_plans() { " {", " graph[label=\"Detailed LogicalPlan\"]", " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", - " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 8[shape=box label=\"Projection: #aggregate_test_100.c1\\nSchema: [c1:Utf8]\"]", " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", - " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", + " 9[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", " 10[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\\nSchema: [c1:Utf8, c2:Int32]\"]", " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2142,9 +2142,13 @@ async fn csv_explain_plans() { let actual = actual.into_iter().map(|r| r.join("\t")).collect::(); // Since the plan contains path that are environmentally dependant (e.g. full path of the test file), only verify important content assert!(actual.contains("logical_plan"), "Actual: '{}'", actual); - assert!(actual.contains("Projection: #c1"), "Actual: '{}'", actual); assert!( - actual.contains("Filter: #c2 Gt Int64(10)"), + actual.contains("Projection: #aggregate_test_100.c1"), + "Actual: '{}'", + actual + ); + assert!( + actual.contains("Filter: #aggregate_test_100.c2 Gt Int64(10)"), "Actual: '{}'", actual ); @@ -2165,7 +2169,11 @@ async fn csv_explain_verbose() { // pain). Instead just check for a few key pieces. assert!(actual.contains("logical_plan"), "Actual: '{}'", actual); assert!(actual.contains("physical_plan"), "Actual: '{}'", actual); - assert!(actual.contains("#c2 Gt Int64(10)"), "Actual: '{}'", actual); + assert!( + actual.contains("#aggregate_test_100.c2 Gt Int64(10)"), + "Actual: '{}'", + actual + ); } #[tokio::test] @@ -2188,8 +2196,8 @@ async fn csv_explain_verbose_plans() { // Verify schema let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: #c1 [c1:Utf8]", - " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", + " Projection: #aggregate_test_100.c1 [c1:Utf8]", + " Filter: #aggregate_test_100.c2 Gt Int64(10) [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", " TableScan: aggregate_test_100 projection=None [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2203,8 +2211,8 @@ async fn csv_explain_verbose_plans() { // Verify the text format of the plan let expected = vec![ "Explain", - " Projection: #c1", - " Filter: #c2 Gt Int64(10)", + " Projection: #aggregate_test_100.c1", + " Filter: #aggregate_test_100.c2 Gt Int64(10)", " TableScan: aggregate_test_100 projection=None", ]; let formatted = plan.display_indent().to_string(); @@ -2223,9 +2231,9 @@ async fn csv_explain_verbose_plans() { " {", " graph[label=\"LogicalPlan\"]", " 2[shape=box label=\"Explain\"]", - " 3[shape=box label=\"Projection: #c1\"]", + " 3[shape=box label=\"Projection: #aggregate_test_100.c1\"]", " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", - " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 
4[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\"]", " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", " 5[shape=box label=\"TableScan: aggregate_test_100 projection=None\"]", " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2234,9 +2242,9 @@ async fn csv_explain_verbose_plans() { " {", " graph[label=\"Detailed LogicalPlan\"]", " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", - " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 8[shape=box label=\"Projection: #aggregate_test_100.c1\\nSchema: [c1:Utf8]\"]", " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", - " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", + " 9[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", " 10[shape=box label=\"TableScan: aggregate_test_100 projection=None\\nSchema: [c1:Utf8, c2:Int32, c3:Int16, c4:Int16, c5:Int32, c6:Int64, c7:Int16, c8:Int32, c9:Int64, c10:Utf8, c11:Float32, c12:Float64, c13:Utf8]\"]", " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2263,8 +2271,8 @@ async fn csv_explain_verbose_plans() { // Verify schema let expected = vec![ "Explain [plan_type:Utf8, plan:Utf8]", - " Projection: #c1 [c1:Utf8]", - " Filter: #c2 Gt Int64(10) [c1:Utf8, c2:Int32]", + " Projection: #aggregate_test_100.c1 [c1:Utf8]", + " Filter: #aggregate_test_100.c2 Gt Int64(10) [c1:Utf8, c2:Int32]", " TableScan: aggregate_test_100 projection=Some([0, 1]) [c1:Utf8, c2:Int32]", ]; let formatted = plan.display_indent_schema().to_string(); @@ -2278,8 +2286,8 @@ async fn csv_explain_verbose_plans() { // Verify the text format of the plan let expected = vec![ "Explain", - " Projection: #c1", - " Filter: #c2 Gt Int64(10)", + " Projection: #aggregate_test_100.c1", + " Filter: #aggregate_test_100.c2 Gt Int64(10)", " TableScan: aggregate_test_100 projection=Some([0, 1])", ]; let formatted = plan.display_indent().to_string(); @@ -2298,9 +2306,9 @@ async fn csv_explain_verbose_plans() { " {", " graph[label=\"LogicalPlan\"]", " 2[shape=box label=\"Explain\"]", - " 3[shape=box label=\"Projection: #c1\"]", + " 3[shape=box label=\"Projection: #aggregate_test_100.c1\"]", " 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]", - " 4[shape=box label=\"Filter: #c2 Gt Int64(10)\"]", + " 4[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\"]", " 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]", " 5[shape=box label=\"TableScan: aggregate_test_100 projection=Some([0, 1])\"]", " 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2309,9 +2317,9 @@ async fn csv_explain_verbose_plans() { " {", " graph[label=\"Detailed LogicalPlan\"]", " 7[shape=box label=\"Explain\\nSchema: [plan_type:Utf8, plan:Utf8]\"]", - " 8[shape=box label=\"Projection: #c1\\nSchema: [c1:Utf8]\"]", + " 8[shape=box label=\"Projection: #aggregate_test_100.c1\\nSchema: [c1:Utf8]\"]", " 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]", - " 9[shape=box label=\"Filter: #c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", + " 9[shape=box label=\"Filter: #aggregate_test_100.c2 Gt Int64(10)\\nSchema: [c1:Utf8, c2:Int32]\"]", " 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]", " 10[shape=box label=\"TableScan: 
aggregate_test_100 projection=Some([0, 1])\\nSchema: [c1:Utf8, c2:Int32]\"]", " 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]", @@ -2346,12 +2354,12 @@ async fn csv_explain_verbose_plans() { ); assert!(actual.contains("physical_plan"), "Actual: '{}'", actual); assert!( - actual.contains("FilterExec: CAST(c2 AS Int64) > 10"), + actual.contains("FilterExec: CAST(c2@1 AS Int64) > 10"), "Actual: '{}'", actual ); assert!( - actual.contains("ProjectionExec: expr=[c1]"), + actual.contains("ProjectionExec: expr=[c1@0 as c1]"), "Actual: '{}'", actual ); @@ -3793,15 +3801,15 @@ async fn test_physical_plan_display_indent() { let physical_plan = ctx.create_physical_plan(&plan).unwrap(); let expected = vec![ "GlobalLimitExec: limit=10", - " SortExec: [the_min DESC]", + " SortExec: [the_min@2 DESC]", " MergeExec", - " ProjectionExec: expr=[c1, MAX(c12), MIN(c12) as the_min]", - " HashAggregateExec: mode=FinalPartitioned, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(c12), MIN(aggregate_test_100.c12)@2 as the_min]", + " HashAggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(c12), MIN(c12)]", " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c1\" }], 3)", - " HashAggregateExec: mode=Partial, gby=[c1], aggr=[MAX(c12), MIN(c12)]", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 3)", + " HashAggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[MAX(c12), MIN(c12)]", " CoalesceBatchesExec: target_batch_size=4096", - " FilterExec: c12 < CAST(10 AS Float64)", + " FilterExec: c12@1 < CAST(10 AS Float64)", " RepartitionExec: partitioning=RoundRobinBatch(3)", " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", ]; @@ -3840,17 +3848,17 @@ async fn test_physical_plan_display_indent_multi_children() { let physical_plan = ctx.create_physical_plan(&plan).unwrap(); let expected = vec![ - "ProjectionExec: expr=[c1]", + "ProjectionExec: expr=[c1@0 as c1]", " CoalesceBatchesExec: target_batch_size=4096", - " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(\"c1\", \"c2\")]", + " HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: \"c1\", index: 0 }, Column { name: \"c2\", index: 0 })]", " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c1\" }], 3)", - " ProjectionExec: expr=[c1]", + " RepartitionExec: partitioning=Hash([Column { name: \"c1\", index: 0 }], 3)", + " ProjectionExec: expr=[c1@0 as c1]", " RepartitionExec: partitioning=RoundRobinBatch(3)", " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", " CoalesceBatchesExec: target_batch_size=4096", - " RepartitionExec: partitioning=Hash([Column { name: \"c2\" }], 3)", - " ProjectionExec: expr=[c1 as c2]", + " RepartitionExec: partitioning=Hash([Column { name: \"c2\", index: 0 }], 3)", + " ProjectionExec: expr=[c1@0 as c2]", " RepartitionExec: partitioning=RoundRobinBatch(3)", " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", ]; diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index 8914c05e8f88f..22ebec8b9a994 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -164,7 +164,7 @@ async fn topk_plan() -> 
Result<()> { let expected = vec![ "| logical_plan after topk | TopK: k=3 |", - "| | Projection: #customer_id, #revenue |", + "| | Projection: #sales.customer_id, #sales.revenue |", "| | TableScan: sales projection=Some([0, 1]) |", ].join("\n"); @@ -174,7 +174,18 @@ async fn topk_plan() -> Result<()> { // normalize newlines (output on windows uses \r\n) let actual_output = actual_output.replace("\r\n", "\n"); - assert!(actual_output.contains(&expected) , "Expected output not present in actual output\nExpected:\n---------\n{}\nActual:\n--------\n{}", expected, actual_output); + assert!( + actual_output.contains(&expected), + "Expected output not present in actual output\ + \nExpected:\ + \n---------\ + \n{}\ + \nActual:\ + \n--------\ + \n{}", + expected, + actual_output + ); Ok(()) } diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index c4b5a7596ae94..92670bed0c4dd 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -83,7 +83,7 @@ def test_parity(self): psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname))) self.assertTrue( np.allclose(datafusion_output, psql_output), - msg=f"data fusion output={datafusion_output}, psql_output={psql_output}", + msg=f"datafusion output=\n{datafusion_output}, psql_output=\n{psql_output}", ) From d55a10569b3a24195bed2d67cc6414c63b6b2336 Mon Sep 17 00:00:00 2001 From: Gang Liao Date: Tue, 22 Jun 2021 15:42:18 -0700 Subject: [PATCH 198/329] Support modulus op (#577) --- .../src/physical_plan/expressions/binary.rs | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index a69b776e74bb4..102b701633853 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -20,7 +20,7 @@ use std::{any::Any, sync::Arc}; use arrow::array::TimestampMillisecondArray; use arrow::array::*; use arrow::compute::kernels::arithmetic::{ - add, divide, divide_scalar, multiply, subtract, + add, divide, divide_scalar, modulus, modulus_scalar, multiply, subtract, }; use arrow::compute::kernels::boolean::{and_kleene, or_kleene}; use arrow::compute::kernels::comparison::{eq, gt, gt_eq, lt, lt_eq, neq}; @@ -360,14 +360,11 @@ fn common_binary_type( } // for math expressions, the final value of the coercion is also the return type // because coercion favours higher information types - Operator::Plus | Operator::Minus | Operator::Divide | Operator::Multiply => { - numerical_coercion(lhs_type, rhs_type) - } - Operator::Modulus => { - return Err(DataFusionError::NotImplemented( - "Modulus operator is still not supported".to_string(), - )) - } + Operator::Plus + | Operator::Minus + | Operator::Modulus + | Operator::Divide + | Operator::Multiply => numerical_coercion(lhs_type, rhs_type), }; // re-write the error message of failed coercions to include the operator's information @@ -408,12 +405,11 @@ pub fn binary_operator_data_type( | Operator::GtEq | Operator::LtEq => Ok(DataType::Boolean), // math operations return the same value as the common coerced type - Operator::Plus | Operator::Minus | Operator::Divide | Operator::Multiply => { - Ok(common_type) - } - Operator::Modulus => Err(DataFusionError::NotImplemented( - "Modulus operator is still not supported".to_string(), - )), + Operator::Plus + | Operator::Minus + | Operator::Divide + | Operator::Multiply + | Operator::Modulus => Ok(common_type), } 
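(Aside, not part of the patch: the new Operator::Modulus arms above simply delegate to arrow's element-wise remainder kernel, which is what lets `%` expressions evaluate instead of hitting the earlier NotImplemented error. A standalone plain-Rust sketch of the expected semantics, using the same sample values as the modulus_op test added further down in this diff:)

    // Element-wise integer remainder, mirroring what the arrow `modulus` kernel
    // computes for two Int32 columns of equal length.
    fn main() {
        let a = [8i32, 32, 128, 512, 2048];
        let b = [2i32, 4, 7, 14, 32];
        let rem: Vec<i32> = a.iter().zip(b.iter()).map(|(x, y)| x % y).collect();
        // Matches the expected output asserted by the modulus_op test below.
        assert_eq!(rem, vec![0, 0, 2, 8, 0]);
        println!("{:?}", rem);
    }
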
} @@ -473,6 +469,9 @@ impl PhysicalExpr for BinaryExpr { Operator::Divide => { binary_primitive_array_op_scalar!(array, scalar.clone(), divide) } + Operator::Modulus => { + binary_primitive_array_op_scalar!(array, scalar.clone(), modulus) + } // if scalar operation is not supported - fallback to array implementation _ => None, } @@ -522,6 +521,7 @@ impl PhysicalExpr for BinaryExpr { Operator::Minus => binary_primitive_array_op!(left, right, subtract), Operator::Multiply => binary_primitive_array_op!(left, right, multiply), Operator::Divide => binary_primitive_array_op!(left, right, divide), + Operator::Modulus => binary_primitive_array_op!(left, right, modulus), Operator::And => { if left_data_type == DataType::Boolean { boolean_op!(left, right, and_kleene) @@ -544,9 +544,6 @@ impl PhysicalExpr for BinaryExpr { ))); } } - Operator::Modulus => Err(DataFusionError::NotImplemented( - "Modulus operator is still not supported".to_string(), - )), }; result.map(|a| ColumnarValue::Array(a)) } @@ -996,6 +993,25 @@ mod tests { Ok(()) } + #[test] + fn modulus_op() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])); + let a = Arc::new(Int32Array::from(vec![8, 32, 128, 512, 2048])); + let b = Arc::new(Int32Array::from(vec![2, 4, 7, 14, 32])); + + apply_arithmetic::( + schema, + vec![a, b], + Operator::Modulus, + Int32Array::from(vec![0, 0, 2, 8, 0]), + )?; + + Ok(()) + } + fn apply_arithmetic( schema: SchemaRef, data: Vec, From 25f5a1161f855af7a88f4863c6caa99e8ed91767 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Jun 2021 09:09:37 -0400 Subject: [PATCH 199/329] Fix PR labeler (#611) --- .github/workflows/dev_pr.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 7644a3313299a..94396c7327a90 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -18,8 +18,11 @@ name: Labeler on: - push: - pull_request: + pull_request_target: + types: + - opened + - edited + - synchronize jobs: process: From 9cd58e4fb8c37ca1719adb3b655a0566d6067c3c Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 23 Jun 2021 21:21:17 +0800 Subject: [PATCH 200/329] [TEST]Update Rust version to 1.53.0 for integration test (#597) * Use Rust nightly in docker for integration test * Use latest rust stable for docker image --- dev/docker/ballista-base.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/docker/ballista-base.dockerfile b/dev/docker/ballista-base.dockerfile index e977f5eeff752..4dc7a065e6907 100644 --- a/dev/docker/ballista-base.dockerfile +++ b/dev/docker/ballista-base.dockerfile @@ -23,7 +23,7 @@ # Base image extends debian:buster-slim -FROM rust:1.52.1-buster AS builder +FROM rust:1.53.0-buster AS builder RUN apt update && apt -y install musl musl-dev musl-tools libssl-dev openssl From 35a0f30e6e153c1f463a8a6ec9b672c0087ca43c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 23 Jun 2021 21:23:03 +0800 Subject: [PATCH 201/329] update sort partition points (#595) --- datafusion/src/physical_plan/windows.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index a214ef17a9f83..bb255511371e7 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -209,8 +209,9 @@ fn find_ranges_in_range<'a>( ) -> &'a [Range] { let start_idx = 
sort_partition_points .partition_point(|sort_range| sort_range.start < partition_range.start); - let end_idx = sort_partition_points - .partition_point(|sort_range| sort_range.end <= partition_range.end); + let end_idx = start_idx + + sort_partition_points[start_idx..] + .partition_point(|sort_range| sort_range.end <= partition_range.end); &sort_partition_points[start_idx..end_idx] } From 20f6f21ec551f066fd9ff228cc5221f1068c8b03 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Wed, 23 Jun 2021 21:24:05 +0800 Subject: [PATCH 202/329] update stale documentations related to window functions (#598) * update stale documentations related to window functions * update readme --- README.md | 7 ++++--- .../core/src/serde/logical_plan/from_proto.rs | 2 ++ .../rust/core/src/serde/logical_plan/to_proto.rs | 1 - datafusion/src/logical_plan/builder.rs | 15 +++------------ datafusion/src/logical_plan/plan.rs | 6 ------ datafusion/src/physical_plan/windows.rs | 9 --------- 6 files changed, 9 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 195d1a7b3c316..d3160608bd565 100644 --- a/README.md +++ b/README.md @@ -225,10 +225,11 @@ DataFusion also includes a simple command-line interactive SQL utility. See the - [x] FULL JOIN - [x] CROSS JOIN - [ ] Window - - [x] [Empty window](https://github.com/apache/arrow-datafusion/issues/298) + - [x] Empty window - [x] Common window functions - - [ ] [Window with ORDER BY clause](https://github.com/apache/arrow-datafusion/issues/360) - - [ ] [Window with PARTITION BY clause](https://github.com/apache/arrow-datafusion/issues/299) + - [x] Window with PARTITION BY clause + - [x] Window with ORDER BY clause + - [ ] Window with FILTER clause - [ ] [Window with custom WINDOW FRAME](https://github.com/apache/arrow-datafusion/issues/361) - [ ] UDF and UDAF for window functions diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 1b7deb7b7126c..418d60de3e7ae 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -1178,10 +1178,12 @@ impl TryFrom for WindowFrameBound { } protobuf::WindowFrameBoundType::Preceding => { // FIXME implement bound value parsing + // https://github.com/apache/arrow-datafusion/issues/361 Ok(WindowFrameBound::Preceding(Some(1))) } protobuf::WindowFrameBoundType::Following => { // FIXME implement bound value parsing + // https://github.com/apache/arrow-datafusion/issues/361 Ok(WindowFrameBound::Following(Some(1))) } } diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 24e2b56bad862..4049622b83dc5 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -1002,7 +1002,6 @@ impl TryInto for &Expr { ref partition_by, ref order_by, ref window_frame, - .. 
} => { let window_function = match fun { WindowFunction::AggregateFunction(fun) => { diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 4b4ed0fb9d413..147f8322df5d7 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -351,23 +351,14 @@ impl LogicalPlanBuilder { })) } - /// Apply a window - /// - /// NOTE: this feature is under development and this API will be changing - /// - /// - https://github.com/apache/arrow-datafusion/issues/359 basic structure - /// - https://github.com/apache/arrow-datafusion/issues/298 empty over clause - /// - https://github.com/apache/arrow-datafusion/issues/299 with partition clause - /// - https://github.com/apache/arrow-datafusion/issues/360 with order by - /// - https://github.com/apache/arrow-datafusion/issues/361 with window frame - pub fn window(&self, window_expr: Vec) -> Result { + /// Apply a window functions to extend the schema + pub fn window(&self, window_expr: impl IntoIterator) -> Result { + let window_expr = window_expr.into_iter().collect::>(); let all_expr = window_expr.iter(); validate_unique_names("Windows", all_expr.clone(), self.plan.schema())?; - let mut window_fields: Vec = exprlist_to_fields(all_expr, self.plan.schema())?; window_fields.extend_from_slice(self.plan.schema().fields()); - Ok(Self::from(&LogicalPlan::Window { input: Arc::new(self.plan.clone()), window_expr, diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 256247228213e..99f0fa14a2d97 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -95,12 +95,6 @@ pub enum LogicalPlan { input: Arc, /// The window function expression window_expr: Vec, - /// Filter by expressions - // filter_by_expr: Vec, - /// Partition by expressions - // partition_by_expr: Vec, - /// Window Frame - // window_frame: Option, /// The schema description of the window output schema: DFSchemaRef, }, diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index bb255511371e7..2f539057c82f4 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -171,9 +171,6 @@ impl WindowExpr for BuiltInWindowExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - // FIXME, for now we assume all the rows belong to the same partition, which will not be the - // case when partition_by is supported, in which case we'll parallelize the calls. - // See https://github.com/apache/arrow-datafusion/issues/299 let values = self.evaluate_args(batch)?; let partition_points = self.evaluate_partition_points( batch.num_rows(), @@ -309,9 +306,6 @@ impl WindowExpr for AggregateWindowExpr { /// evaluate the window function values against the batch fn evaluate(&self, batch: &RecordBatch) -> Result { - // FIXME, for now we assume all the rows belong to the same partition, which will not be the - // case when partition_by is supported, in which case we'll parallelize the calls. - // See https://github.com/apache/arrow-datafusion/issues/299 match self.evaluation_mode() { WindowFrameUnits::Range => self.peer_based_evaluate(batch), WindowFrameUnits::Rows => self.row_based_evaluate(batch), @@ -477,9 +471,6 @@ fn compute_window_aggregates( window_expr: Vec>, batch: &RecordBatch, ) -> Result> { - // FIXME, for now we assume all the rows belong to the same partition, which will not be the - // case when partition_by is supported, in which case we'll parallelize the calls. 
- // See https://github.com/apache/arrow-datafusion/issues/299 window_expr .iter() .map(|window_expr| window_expr.evaluate(batch)) From c82c29c926e73b3c3b9c5351bcd7b01e4d0aa6a8 Mon Sep 17 00:00:00 2001 From: rdettai Date: Wed, 23 Jun 2021 22:21:54 +0200 Subject: [PATCH 203/329] [fix] select * on empty table (#613) * [fix] select * on empty table * clippy --- datafusion/src/execution/context.rs | 15 ++++++++++++++- datafusion/src/optimizer/projection_push_down.rs | 5 +++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 926e2db9450a1..165263084cc7c 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -893,7 +893,7 @@ mod tests { logical_plan::{col, create_udf, sum, Expr}, }; use crate::{ - datasource::{MemTable, TableType}, + datasource::{empty::EmptyTable, MemTable, TableType}, logical_plan::create_udaf, physical_plan::expressions::AvgAccumulator, }; @@ -3333,6 +3333,19 @@ mod tests { assert_batches_sorted_eq!(expected, &result); } + #[tokio::test] + async fn query_empty_table() { + let mut ctx = ExecutionContext::new(); + let empty_table = Arc::new(EmptyTable::new(Arc::new(Schema::empty()))); + ctx.register_table("test_tbl", empty_table).unwrap(); + let sql = "SELECT * FROM test_tbl"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("Query empty table"); + let expected = vec!["++", "++"]; + assert_batches_sorted_eq!(expected, &result); + } + struct MyPhysicalPlanner {} impl PhysicalPlanner for MyPhysicalPlanner { diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 2544d89d04920..a9e571f3d00bb 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -83,9 +83,10 @@ fn get_projected_schema( .collect(); if projection.is_empty() { - if has_projection { + if has_projection && !schema.fields().is_empty() { // Ensure that we are reading at least one column from the table in case the query - // does not reference any columns directly such as "SELECT COUNT(1) FROM table" + // does not reference any columns directly such as "SELECT COUNT(1) FROM table", + // except when the table is empty (no column) projection.push(0); } else { // for table scan without projection, we default to return all columns From 0d10dcea18d47f9a7ccb560f2a3479ba1f858ac3 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 24 Jun 2021 22:21:34 +0800 Subject: [PATCH 204/329] implement window functions with partition by (#571) --- datafusion/src/logical_plan/expr.rs | 11 ++- datafusion/src/physical_plan/planner.rs | 52 +++++++++++- datafusion/src/sql/planner.rs | 105 +++++++++--------------- datafusion/src/sql/utils.rs | 6 +- 4 files changed, 103 insertions(+), 71 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 1c5cc770c94ff..622b7a4ec4ae4 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1452,11 +1452,18 @@ impl fmt::Debug for Expr { } Expr::WindowFunction { fun, - ref args, + args, + partition_by, + order_by, window_frame, - .. 
} => { fmt_function(f, &fun.to_string(), false, args)?; + if !partition_by.is_empty() { + write!(f, " PARTITION BY {:?}", partition_by)?; + } + if !order_by.is_empty() { + write!(f, " ORDER BY {:?}", order_by)?; + } if let Some(window_frame) = window_frame { write!( f, diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index a4c20a7f60ebc..d59004243533e 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -44,6 +44,7 @@ use crate::physical_plan::{ }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; +use crate::sql::utils::generate_sort_key; use crate::variable::VarType; use crate::{ error::{DataFusionError, Result}, @@ -263,11 +264,56 @@ impl DefaultPhysicalPlanner { "Impossibly got empty window expression".to_owned(), )); } + let get_sort_keys = |expr: &Expr| match expr { + Expr::WindowFunction { + ref partition_by, + ref order_by, + .. + } => generate_sort_key(partition_by, order_by), + _ => unreachable!(), + }; + + let sort_keys = get_sort_keys(&window_expr[0]); + if window_expr.len() > 1 { + debug_assert!( + window_expr[1..] + .iter() + .all(|expr| get_sort_keys(expr) == sort_keys), + "all window expressions shall have the same sort keys, as guaranteed by logical planning" + ); + } let input_exec = self.create_initial_plan(input, ctx_state)?; - let physical_input_schema = input_exec.schema(); - let logical_input_schema = input.as_ref().schema(); + let logical_input_schema = input.schema(); + + let input_exec = if sort_keys.is_empty() { + input_exec + } else { + let physical_input_schema = input_exec.schema(); + let sort_keys = sort_keys + .iter() + .map(|e| match e { + Expr::Sort { + expr, + asc, + nulls_first, + } => self.create_physical_sort_expr( + expr, + logical_input_schema, + &physical_input_schema, + SortOptions { + descending: !*asc, + nulls_first: *nulls_first, + }, + ctx_state, + ), + _ => unreachable!(), + }) + .collect::>>()?; + Arc::new(SortExec::try_new(sort_keys, input_exec)?) + }; + let physical_input_schema = input_exec.schema(); let window_expr = window_expr .iter() .map(|e| { @@ -282,7 +328,7 @@ impl DefaultPhysicalPlanner { Ok(Arc::new(WindowAggExec::try_new( window_expr, - input_exec.clone(), + input_exec, physical_input_schema, )?)) } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 7912241329a34..1974b2681e579 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -695,12 +695,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // if there's an empty over, it'll be at the top level groups.sort_by(|(key_a, _), (key_b, _)| key_a.len().cmp(&key_b.len())); groups.reverse(); - for (sort_keys, exprs) in groups { - if !sort_keys.is_empty() { - let sort_keys: Vec = sort_keys.to_vec(); - plan = LogicalPlanBuilder::from(&plan).sort(sort_keys)?.build()?; - } - let window_exprs: Vec = exprs.into_iter().cloned().collect(); + for (_, exprs) in groups { + let window_exprs = exprs.into_iter().cloned().collect::>(); + // the partition and sort itself is done at physical level, see physical_planner's + // fn create_initial_plan plan = LogicalPlanBuilder::from(&plan) .window(window_exprs)? 
.build()?; @@ -2861,9 +2859,8 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2884,11 +2881,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id DESC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2897,11 +2892,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id DESC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2910,11 +2903,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id DESC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2955,11 +2946,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id DESC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: 
windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -2980,11 +2969,9 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY (order_id + 1)) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id Plus Int64(1) ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id Plus Int64(1) ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3007,11 +2994,9 @@ mod tests { let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3034,11 +3019,9 @@ mod tests { let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3065,11 +3048,9 @@ mod tests { Sort: #orders.order_id ASC NULLS FIRST\ \n Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3088,9 +3069,8 @@ mod tests { "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty) from orders"; let expected = "\ Projection: #orders.order_id, 
#MAX(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3109,9 +3089,8 @@ mod tests { "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3133,11 +3112,9 @@ mod tests { "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) PARTITION BY [#orders.qty] ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } @@ -3158,11 +3135,9 @@ mod tests { "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders"; let expected = "\ Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ - \n WindowAggr: windowExpr=[[MAX(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST\ - \n WindowAggr: windowExpr=[[MIN(#orders.qty)]]\ - \n Sort: #orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST, #orders.price ASC NULLS FIRST\ - \n TableScan: orders projection=None"; + \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ + \n WindowAggr: windowExpr=[[MIN(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.price ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; quick_test(sql, expected); } diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 7702748df44f1..5da1275cddfbb 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -439,7 +439,11 @@ pub(crate) fn resolve_aliases_to_exprs( type WindowSortKey = Vec; -fn generate_sort_key(partition_by: &[Expr], order_by: &[Expr]) -> WindowSortKey { +/// Generate a sort key for a given window expr's partition_by and order_bu expr +pub(crate) fn generate_sort_key( + partition_by: &[Expr], + order_by: &[Expr], +) -> WindowSortKey { let mut sort_key = vec![]; partition_by.iter().for_each(|e| { let e = e.clone().sort(true, true); From aead7f85ae26f24d5548823cbf63b3f08ffe5326 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 24 Jun 2021 22:21:57 +0800 Subject: [PATCH 205/329] reuse alias map in aggregate logical planning and refactor 
position resolving (#606) --- datafusion/src/sql/planner.rs | 15 ++++++--------- datafusion/src/sql/utils.rs | 15 +++++++++------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 1974b2681e579..a2f3240a8a99d 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -535,6 +535,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut combined_schema = (**projected_plan.schema()).clone(); combined_schema.merge(plan.schema()); + // this alias map is resolved and looked up in both having exprs and group by exprs + let alias_map = extract_aliases(&select_exprs); + // Optionally the HAVING expression. let having_expr_opt = select .having @@ -542,7 +545,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .map::, _>(|having_expr| { let having_expr = self.sql_expr_to_logical_expr(having_expr, &combined_schema)?; - // This step "dereferences" any aliases in the HAVING clause. // // This is how we support queries with HAVING expressions that @@ -558,12 +560,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // SELECT c1 AS m FROM t HAVING c1 > 10; // SELECT c1, MAX(c2) AS m FROM t GROUP BY c1 HAVING MAX(c2) > 10; // - let having_expr = resolve_aliases_to_exprs( - &having_expr, - &extract_aliases(&select_exprs), - )?; - - Ok(having_expr) + resolve_aliases_to_exprs(&having_expr, &alias_map) }) .transpose()?; @@ -578,7 +575,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // All of the aggregate expressions (deduplicated). let aggr_exprs = find_aggregate_exprs(&aggr_expr_haystack); - let alias_map = extract_aliases(&select_exprs); let group_by_exprs = select .group_by .iter() @@ -586,7 +582,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let group_by_expr = self.sql_expr_to_logical_expr(e, &combined_schema)?; let group_by_expr = resolve_aliases_to_exprs(&group_by_expr, &alias_map)?; let group_by_expr = - resolve_positions_to_exprs(&group_by_expr, &select_exprs)?; + resolve_positions_to_exprs(&group_by_expr, &select_exprs) + .unwrap_or(group_by_expr); self.validate_schema_satisfies_exprs( plan.schema(), &[group_by_expr.clone()], diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 5da1275cddfbb..080f84ef10ed3 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -398,10 +398,13 @@ pub(crate) fn extract_aliases(exprs: &[Expr]) -> HashMap { .collect::>() } +/// Given an expression that's literal int encoding position, lookup the corresponding expression +/// in the select_exprs list, if the index is within the bounds and it is indeed a position literal; +/// Otherwise, return None pub(crate) fn resolve_positions_to_exprs( expr: &Expr, select_exprs: &[Expr], -) -> Result { +) -> Option { match expr { // sql_expr_to_logical_expr maps number to i64 // https://github.com/apache/arrow-datafusion/blob/8d175c759e17190980f270b5894348dc4cff9bbf/datafusion/src/sql/planner.rs#L882-L887 @@ -410,12 +413,12 @@ pub(crate) fn resolve_positions_to_exprs( { let index = (position - 1) as usize; let select_expr = &select_exprs[index]; - match select_expr { - Expr::Alias(nested_expr, _alias_name) => Ok(*nested_expr.clone()), - _ => Ok(select_expr.clone()), - } + Some(match select_expr { + Expr::Alias(nested_expr, _alias_name) => *nested_expr.clone(), + _ => select_expr.clone(), + }) } - _ => Ok(expr.clone()), + _ => None, } } From 8858d9592d7a0646530d65fa6204276170ee4091 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 24 Jun 2021 22:57:31 +0800 Subject: [PATCH 
206/329] fix 592, support alias in window functions (#607) --- datafusion/src/sql/planner.rs | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a2f3240a8a99d..17181230c26cc 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -635,21 +635,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // window function let window_func_exprs = find_window_exprs(&select_exprs_post_aggr); - let (plan, exprs) = if window_func_exprs.is_empty() { - (plan, select_exprs_post_aggr) + let plan = if window_func_exprs.is_empty() { + plan } else { - self.window(plan, window_func_exprs, &select_exprs_post_aggr)? + self.window(plan, window_func_exprs)? }; let plan = if select.distinct { return LogicalPlanBuilder::from(&plan) - .aggregate(exprs, vec![])? + .aggregate(select_exprs_post_aggr, vec![])? .build(); } else { plan }; - self.project(&plan, exprs) + self.project(&plan, select_exprs_post_aggr) } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. @@ -678,12 +678,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Wrap a plan in a window - fn window( - &self, - input: LogicalPlan, - window_exprs: Vec, - select_exprs: &[Expr], - ) -> Result<(LogicalPlan, Vec)> { + fn window(&self, input: LogicalPlan, window_exprs: Vec) -> Result { let mut plan = input; let mut groups = group_window_expr_by_sort_keys(&window_exprs)?; // sort by sort_key len descending, so that more deeply sorted plans gets nested further @@ -700,11 +695,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .window(window_exprs)? .build()?; } - let select_exprs = select_exprs - .iter() - .map(|expr| rebase_expr(expr, &window_exprs, &plan)) - .collect::>>()?; - Ok((plan, select_exprs)) + + Ok(plan) } /// Wrap a plan in an aggregate @@ -2821,6 +2813,16 @@ mod tests { quick_test(sql, expected); } + #[test] + fn empty_over_dup_with_alias() { + let sql = "SELECT order_id oid, MAX(order_id) OVER () max_oid, MAX(order_id) OVER () max_oid_dup from orders"; + let expected = "\ + Projection: #orders.order_id AS oid, #MAX(orders.order_id) AS max_oid, #MAX(orders.order_id) AS max_oid_dup\ + \n WindowAggr: windowExpr=[[MAX(#orders.order_id)]]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn empty_over_plus() { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; From 50b11208acb5092d61fc8ffd25666d7ad3f6110f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 26 Jun 2021 09:26:09 -0400 Subject: [PATCH 207/329] Do not prune out unecessary columns with unqualified references (#619) --- .../src/optimizer/projection_push_down.rs | 53 +++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index a9e571f3d00bb..2cd7384e24439 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -29,7 +29,10 @@ use crate::optimizer::utils; use crate::sql::utils::find_sort_exprs; use arrow::datatypes::{Field, Schema}; use arrow::error::Result as ArrowResult; -use std::{collections::HashSet, sync::Arc}; +use std::{ + collections::{BTreeSet, HashSet}, + sync::Arc, +}; use utils::optimize_explain; /// Optimizer that removes unused projections and aggregations from plans @@ -75,9 +78,12 @@ fn get_projected_schema( // // we discard non-existing columns because some column 
names are not part of the schema, // e.g. when the column derives from an aggregation - let mut projection: Vec = required_columns + // + // Use BTreeSet to remove potential duplicates (e.g. union) as + // well as to sort the projection to ensure deterministic behavior + let mut projection: BTreeSet = required_columns .iter() - .filter(|c| c.relation.as_ref() == table_name) + .filter(|c| c.relation.is_none() || c.relation.as_ref() == table_name) .map(|c| schema.index_of(&c.name)) .filter_map(ArrowResult::ok) .collect(); @@ -87,7 +93,7 @@ fn get_projected_schema( // Ensure that we are reading at least one column from the table in case the query // does not reference any columns directly such as "SELECT COUNT(1) FROM table", // except when the table is empty (no column) - projection.push(0); + projection.insert(0); } else { // for table scan without projection, we default to return all columns projection = schema @@ -95,13 +101,10 @@ fn get_projected_schema( .iter() .enumerate() .map(|(i, _)| i) - .collect::>(); + .collect::>(); } } - // sort the projection otherwise we get non-deterministic behavior - projection.sort_unstable(); - // create the projected schema let mut projected_fields: Vec = Vec::with_capacity(projection.len()); match table_name { @@ -120,6 +123,7 @@ fn get_projected_schema( } } + let projection = projection.into_iter().collect::>(); Ok((projection, projected_fields.to_dfschema_ref()?)) } @@ -438,7 +442,9 @@ fn optimize_plan( mod tests { use super::*; - use crate::logical_plan::{col, lit, max, min, Expr, JoinType, LogicalPlanBuilder}; + use crate::logical_plan::{ + col, exprlist_to_fields, lit, max, min, Expr, JoinType, LogicalPlanBuilder, + }; use crate::test::*; use arrow::datatypes::DataType; @@ -568,6 +574,35 @@ mod tests { Ok(()) } + #[test] + fn table_scan_projected_schema_non_qualified_relation() -> Result<()> { + let table_scan = test_table_scan()?; + let input_schema = table_scan.schema(); + assert_eq!(3, input_schema.fields().len()); + assert_fields_eq(&table_scan, vec!["a", "b", "c"]); + + // Build the LogicalPlan directly (don't use PlanBuilder), so + // that the Column references are unqualified (e.g. their + // relation is `None`). 
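// Note on this hunk: two things are easy to miss in diff form. First, required columns with
// no relation (unqualified references) now pass the `c.relation.is_none() || ...` filter,
// where previously they were filtered out of the projection. Second, collecting the
// surviving indices into a BTreeSet both deduplicates them (e.g. the same column required
// twice through a union) and yields them in ascending order, which is why the explicit
// `sort_unstable` call above could be removed. A minimal sketch of that second point,
// using only std and an illustrative function name:
use std::collections::BTreeSet;

fn projection_indices(required: &[usize]) -> Vec<usize> {
    let dedup_sorted: BTreeSet<usize> = required.iter().copied().collect();
    dedup_sorted.into_iter().collect()
}

// projection_indices(&[2, 0, 2, 1]) yields [0, 1, 2]: duplicates removed, order deterministic.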
PlanBuilder resolves the expressions + let expr = vec![col("a"), col("b")]; + let projected_fields = exprlist_to_fields(&expr, input_schema).unwrap(); + let projected_schema = DFSchema::new(projected_fields).unwrap(); + let plan = LogicalPlan::Projection { + expr, + input: Arc::new(table_scan), + schema: Arc::new(projected_schema), + }; + + assert_fields_eq(&plan, vec!["a", "b"]); + + let expected = "Projection: #a, #b\ + \n TableScan: test projection=Some([0, 1])"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + #[test] fn table_limit() -> Result<()> { let table_scan = test_table_scan()?; From ccb75200397a89fdc9ebe8294ae1521a3e94485b Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Sat, 26 Jun 2021 15:39:33 +0200 Subject: [PATCH 208/329] Add some resiliency to lost executors (#568) --- ballista/rust/core/proto/ballista.proto | 10 +- ballista/rust/executor/src/execution_loop.rs | 11 +- ballista/rust/scheduler/src/api/handlers.rs | 39 ++---- ballista/rust/scheduler/src/lib.rs | 42 +----- ballista/rust/scheduler/src/state/etcd.rs | 27 +--- ballista/rust/scheduler/src/state/mod.rs | 131 ++++++++++++++---- .../rust/scheduler/src/state/standalone.rs | 24 +--- 7 files changed, 144 insertions(+), 140 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index d75cbaa73efe0..365d8e9fd9a42 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -745,10 +745,10 @@ message ExecutorRegistration { uint32 port = 3; } -message GetExecutorMetadataParams {} - -message GetExecutorMetadataResult { - repeated ExecutorMetadata metadata = 1; +message ExecutorHeartbeat { + ExecutorMetadata meta = 1; + // Unix epoch-based timestamp in seconds + uint64 timestamp = 2; } message RunningTask { @@ -847,8 +847,6 @@ message FilePartitionMetadata { } service SchedulerGrpc { - rpc GetExecutorsMetadata (GetExecutorMetadataParams) returns (GetExecutorMetadataResult) {} - // Executors must poll the scheduler for heartbeat and to receive tasks rpc PollWork (PollWorkParams) returns (PollWorkResult) {} diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index 6eb4713f5e396..17a8d8c2002a8 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -91,10 +91,14 @@ async fn run_received_tasks( task_status_sender: Sender, task: TaskDefinition, ) { - info!("Received task {:?}", task.task_id.as_ref().unwrap()); + let task_id = task.task_id.unwrap(); + let task_id_log = format!( + "{}/{}/{}", + task_id.job_id, task_id.stage_id, task_id.partition_id + ); + info!("Received task {}", task_id_log); available_tasks_slots.fetch_sub(1, Ordering::SeqCst); let plan: Arc = (&task.plan.unwrap()).try_into().unwrap(); - let task_id = task.task_id.unwrap(); tokio::spawn(async move { let execution_result = executor @@ -105,7 +109,8 @@ async fn run_received_tasks( plan, ) .await; - info!("DONE WITH TASK: {:?}", execution_result); + info!("Done with task {}", task_id_log); + debug!("Statistics: {:?}", execution_result); available_tasks_slots.fetch_add(1, Ordering::SeqCst); let _ = task_status_sender.send(as_task_status( execution_result.map(|_| ()), diff --git a/ballista/rust/scheduler/src/api/handlers.rs b/ballista/rust/scheduler/src/api/handlers.rs index 7293558d0cc44..ee0ee73f4ecaf 100644 --- a/ballista/rust/scheduler/src/api/handlers.rs +++ b/ballista/rust/scheduler/src/api/handlers.rs @@ -11,45 +11,32 @@ // limitations under 
the License. use crate::SchedulerServer; -use ballista_core::serde::protobuf::{ - scheduler_grpc_server::SchedulerGrpc, ExecutorMetadata, GetExecutorMetadataParams, - GetExecutorMetadataResult, -}; -use ballista_core::serde::scheduler::ExecutorMeta; -use tonic::{Request, Response}; +use ballista_core::{serde::scheduler::ExecutorMeta, BALLISTA_VERSION}; use warp::Rejection; #[derive(Debug, serde::Serialize)] struct StateResponse { executors: Vec, started: u128, - version: String, + version: &'static str, } pub(crate) async fn scheduler_state( data_server: SchedulerServer, ) -> Result { - let data: Result, tonic::Status> = data_server - .get_executors_metadata(Request::new(GetExecutorMetadataParams {})) - .await; - let metadata: Vec = match data { - Ok(result) => { - let res: &GetExecutorMetadataResult = result.get_ref(); - let vec: &Vec = &res.metadata; - vec.iter() - .map(|v: &ExecutorMetadata| ExecutorMeta { - host: v.host.clone(), - port: v.port as u16, - id: v.id.clone(), - }) - .collect() - } - Err(_) => vec![], - }; + // TODO: Display last seen information in UI + let executors: Vec = data_server + .state + .get_executors_metadata() + .await + .unwrap_or_default() + .into_iter() + .map(|(metadata, _duration)| metadata) + .collect(); let response = StateResponse { - executors: metadata, + executors, started: data_server.start_time, - version: data_server.version.clone(), + version: BALLISTA_VERSION, }; Ok(warp::reply::json(&response)) } diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 54cba48db2e54..3620f79baaa55 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -34,10 +34,10 @@ use std::{fmt, net::IpAddr}; use ballista_core::serde::protobuf::{ execute_query_params::Query, executor_registration::OptionalHost, job_status, scheduler_grpc_server::SchedulerGrpc, ExecuteQueryParams, ExecuteQueryResult, - FailedJob, FilePartitionMetadata, FileType, GetExecutorMetadataParams, - GetExecutorMetadataResult, GetFileMetadataParams, GetFileMetadataResult, - GetJobStatusParams, GetJobStatusResult, JobStatus, PartitionId, PollWorkParams, - PollWorkResult, QueuedJob, RunningJob, TaskDefinition, TaskStatus, + FailedJob, FilePartitionMetadata, FileType, GetFileMetadataParams, + GetFileMetadataResult, GetJobStatusParams, GetJobStatusResult, JobStatus, + PartitionId, PollWorkParams, PollWorkResult, QueuedJob, RunningJob, TaskDefinition, + TaskStatus, }; use ballista_core::serde::scheduler::ExecutorMeta; @@ -76,9 +76,8 @@ use std::time::{Instant, SystemTime, UNIX_EPOCH}; #[derive(Clone)] pub struct SchedulerServer { caller_ip: IpAddr, - state: Arc, + pub(crate) state: Arc, start_time: u128, - version: String, } impl SchedulerServer { @@ -87,7 +86,6 @@ impl SchedulerServer { namespace: String, caller_ip: IpAddr, ) -> Self { - const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); let state = Arc::new(SchedulerState::new(config, namespace)); let state_clone = state.clone(); @@ -101,35 +99,12 @@ impl SchedulerServer { .duration_since(UNIX_EPOCH) .unwrap() .as_millis(), - version: VERSION.unwrap_or("Unknown").to_string(), } } } #[tonic::async_trait] impl SchedulerGrpc for SchedulerServer { - async fn get_executors_metadata( - &self, - _request: Request, - ) -> std::result::Result, tonic::Status> { - info!("Received get_executors_metadata request"); - let result = self - .state - .get_executors_metadata() - .await - .map_err(|e| { - let msg = format!("Error reading executors metadata: {}", e); - error!("{}", 
msg); - tonic::Status::internal(msg) - })? - .into_iter() - .map(|meta| meta.into()) - .collect(); - Ok(Response::new(GetExecutorMetadataResult { - metadata: result, - })) - } - async fn poll_work( &self, request: Request, @@ -279,13 +254,6 @@ impl SchedulerGrpc for SchedulerServer { } }; debug!("Received plan for execution: {:?}", plan); - let executors = self.state.get_executors_metadata().await.map_err(|e| { - let msg = format!("Error reading executors metadata: {}", e); - error!("{}", msg); - tonic::Status::internal(msg) - })?; - debug!("Found executors: {:?}", executors); - let job_id: String = { let mut rng = thread_rng(); std::iter::repeat(()) diff --git a/ballista/rust/scheduler/src/state/etcd.rs b/ballista/rust/scheduler/src/state/etcd.rs index 807477d86995b..d6741a7d83dcc 100644 --- a/ballista/rust/scheduler/src/state/etcd.rs +++ b/ballista/rust/scheduler/src/state/etcd.rs @@ -17,14 +17,12 @@ //! Etcd config backend. -use std::{task::Poll, time::Duration}; +use std::task::Poll; use crate::state::ConfigBackendClient; use ballista_core::error::{ballista_error, Result}; -use etcd_client::{ - GetOptions, LockResponse, PutOptions, WatchOptions, WatchStream, Watcher, -}; +use etcd_client::{GetOptions, LockResponse, WatchOptions, WatchStream, Watcher}; use futures::{Stream, StreamExt}; use log::warn; @@ -70,25 +68,9 @@ impl ConfigBackendClient for EtcdClient { .collect()) } - async fn put( - &self, - key: String, - value: Vec, - lease_time: Option, - ) -> Result<()> { + async fn put(&self, key: String, value: Vec) -> Result<()> { let mut etcd = self.etcd.clone(); - let put_options = if let Some(lease_time) = lease_time { - etcd.lease_grant(lease_time.as_secs() as i64, None) - .await - .map(|lease| Some(PutOptions::new().with_lease(lease.id()))) - .map_err(|e| { - warn!("etcd lease grant failed: {:?}", e.to_string()); - ballista_error("etcd lease grant failed") - })? - } else { - None - }; - etcd.put(key.clone(), value.clone(), put_options) + etcd.put(key.clone(), value.clone(), None) .await .map_err(|e| { warn!("etcd put failed: {}", e); @@ -99,6 +81,7 @@ impl ConfigBackendClient for EtcdClient { async fn lock(&self) -> Result> { let mut etcd = self.etcd.clone(); + // TODO: make this a namespaced-lock let lock = etcd .lock("/ballista_global_lock", None) .await diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index 75f1574ef1257..a17c82d4b7379 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use std::time::{SystemTime, UNIX_EPOCH}; use std::{ any::type_name, collections::HashMap, convert::TryInto, sync::Arc, time::Duration, }; @@ -26,8 +27,9 @@ use prost::Message; use tokio::sync::OwnedMutexGuard; use ballista_core::serde::protobuf::{ - job_status, task_status, CompletedJob, CompletedTask, ExecutorMetadata, FailedJob, - FailedTask, JobStatus, PhysicalPlanNode, RunningJob, RunningTask, TaskStatus, + job_status, task_status, CompletedJob, CompletedTask, ExecutorHeartbeat, + ExecutorMetadata, FailedJob, FailedTask, JobStatus, PhysicalPlanNode, RunningJob, + RunningTask, TaskStatus, }; use ballista_core::serde::scheduler::PartitionStats; use ballista_core::{error::BallistaError, serde::scheduler::ExecutorMeta}; @@ -48,8 +50,6 @@ pub use etcd::EtcdClient; #[cfg(feature = "sled")] pub use standalone::StandaloneClient; -const LEASE_TIME: Duration = Duration::from_secs(60); - /// A trait that contains the necessary methods to save and retrieve the state and configuration of a cluster. #[tonic::async_trait] pub trait ConfigBackendClient: Send + Sync { @@ -62,12 +62,7 @@ pub trait ConfigBackendClient: Send + Sync { async fn get_from_prefix(&self, prefix: &str) -> Result)>>; /// Saves the value into the provided key, overriding any previous data that might have been associated to that key. - async fn put( - &self, - key: String, - value: Vec, - lease_time: Option, - ) -> Result<()>; + async fn put(&self, key: String, value: Vec) -> Result<()>; async fn lock(&self) -> Result>; @@ -104,25 +99,55 @@ impl SchedulerState { } } - pub async fn get_executors_metadata(&self) -> Result> { + pub async fn get_executors_metadata(&self) -> Result> { let mut result = vec![]; let entries = self .config_client .get_from_prefix(&get_executors_prefix(&self.namespace)) .await?; + let now_epoch_ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); for (_key, entry) in entries { - let meta: ExecutorMetadata = decode_protobuf(&entry)?; - result.push(meta.into()); + let heartbeat: ExecutorHeartbeat = decode_protobuf(&entry)?; + let meta = heartbeat.meta.unwrap(); + let ts = Duration::from_secs(heartbeat.timestamp); + let time_since_last_seen = now_epoch_ts + .checked_sub(ts) + .unwrap_or_else(|| Duration::from_secs(0)); + result.push((meta.into(), time_since_last_seen)); } Ok(result) } + pub async fn get_alive_executors_metadata( + &self, + last_seen_threshold: Duration, + ) -> Result> { + Ok(self + .get_executors_metadata() + .await? 
+ .into_iter() + .filter_map(|(exec, last_seen)| { + (last_seen < last_seen_threshold).then(|| exec) + }) + .collect()) + } + pub async fn save_executor_metadata(&self, meta: ExecutorMeta) -> Result<()> { let key = get_executor_key(&self.namespace, &meta.id); let meta: ExecutorMetadata = meta.into(); - let value: Vec = encode_protobuf(&meta)?; - self.config_client.put(key, value, Some(LEASE_TIME)).await + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(); + let heartbeat = ExecutorHeartbeat { + meta: Some(meta), + timestamp, + }; + let value: Vec = encode_protobuf(&heartbeat)?; + self.config_client.put(key, value).await } pub async fn save_job_metadata( @@ -133,7 +158,7 @@ impl SchedulerState { debug!("Saving job metadata: {:?}", status); let key = get_job_key(&self.namespace, job_id); let value = encode_protobuf(status)?; - self.config_client.put(key, value, None).await + self.config_client.put(key, value).await } pub async fn get_job_metadata(&self, job_id: &str) -> Result { @@ -158,7 +183,7 @@ impl SchedulerState { partition_id.partition_id as usize, ); let value = encode_protobuf(status)?; - self.config_client.put(key, value, None).await + self.config_client.put(key, value).await } pub async fn _get_task_status( @@ -191,7 +216,7 @@ impl SchedulerState { let proto: PhysicalPlanNode = plan.try_into()?; encode_protobuf(&proto)? }; - self.config_client.clone().put(key, value, None).await + self.config_client.clone().put(key, value).await } pub async fn get_stage_plan( @@ -211,6 +236,40 @@ impl SchedulerState { Ok((&value).try_into()?) } + /// This function ensures that the task wasn't assigned to an executor that died. + /// If that is the case, then the task is re-scheduled. + /// Returns true if the task was dead, false otherwise. + async fn reschedule_dead_task( + &self, + task_status: &TaskStatus, + executors: &[ExecutorMeta], + ) -> Result { + let executor_id: &str = match &task_status.status { + Some(task_status::Status::Completed(CompletedTask { executor_id })) => { + executor_id + } + Some(task_status::Status::Running(RunningTask { executor_id })) => { + executor_id + } + _ => return Ok(false), + }; + let executor_meta = executors.iter().find(|exec| exec.id == executor_id); + let task_is_dead = executor_meta.is_none(); + if task_is_dead { + info!( + "Executor {} isn't alive. Rescheduling task {:?}", + executor_id, + task_status.partition_id.as_ref().unwrap() + ); + // Task was handled in an executor that isn't alive anymore, so we can't resolve it + // We mark the task as pending again and continue + let mut task_status = task_status.clone(); + task_status.status = None; + self.save_task_status(&task_status).await?; + } + Ok(task_is_dead) + } + pub async fn assign_next_schedulable_task( &self, executor_id: &str, @@ -221,7 +280,10 @@ impl SchedulerState { .await? 
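// Note on the liveness check introduced here: each executor heartbeat is stored with a
// Unix-epoch timestamp, and an executor is treated as alive only while the time since its
// last heartbeat stays below a threshold (hard-coded to 60 seconds above, with a TODO to
// make it configurable). A minimal sketch of that filter, with a generic placeholder `M`
// standing in for the executor metadata type:
use std::time::Duration;

fn alive_executors<M>(executors: Vec<(M, Duration)>, threshold: Duration) -> Vec<M> {
    executors
        .into_iter()
        // keep only executors whose last heartbeat is recent enough
        .filter(|(_, since_last_seen)| *since_last_seen < threshold)
        .map(|(meta, _)| meta)
        .collect()
}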
.into_iter() .collect(); - let executors = self.get_executors_metadata().await?; + // TODO: Make the duration a configurable parameter + let executors = self + .get_alive_executors_metadata(Duration::from_secs(60)) + .await?; 'tasks: for (_key, value) in kvs.iter() { let mut status: TaskStatus = decode_protobuf(value)?; if status.status.is_none() { @@ -249,13 +311,23 @@ impl SchedulerState { .unwrap(); let referenced_task: TaskStatus = decode_protobuf(referenced_task)?; - if let Some(task_status::Status::Completed(CompletedTask { - executor_id, - })) = referenced_task.status + let task_is_dead = self + .reschedule_dead_task(&referenced_task, &executors) + .await?; + if task_is_dead { + continue 'tasks; + } else if let Some(task_status::Status::Completed( + CompletedTask { executor_id }, + )) = referenced_task.status { let empty = vec![]; let locations = partition_locations.entry(stage_id).or_insert(empty); + let executor_meta = executors + .iter() + .find(|exec| exec.id == executor_id) + .unwrap() + .clone(); locations.push(vec![ ballista_core::serde::scheduler::PartitionLocation { partition_id: @@ -264,11 +336,7 @@ impl SchedulerState { stage_id, partition_id, }, - executor_meta: executors - .iter() - .find(|exec| exec.id == executor_id) - .unwrap() - .clone(), + executor_meta, partition_stats: PartitionStats::default(), }, ]); @@ -336,7 +404,7 @@ impl SchedulerState { .get_executors_metadata() .await? .into_iter() - .map(|meta| (meta.id.to_string(), meta)) + .map(|(meta, _)| (meta.id.to_string(), meta)) .collect(); let status: JobStatus = decode_protobuf(&value)?; let new_status = self.get_job_status_from_tasks(job_id, &executors).await?; @@ -553,7 +621,12 @@ mod test { port: 123, }; state.save_executor_metadata(meta.clone()).await?; - let result = state.get_executors_metadata().await?; + let result: Vec<_> = state + .get_executors_metadata() + .await? + .into_iter() + .map(|(meta, _)| meta) + .collect(); assert_eq!(vec![meta], result); Ok(()) } diff --git a/ballista/rust/scheduler/src/state/standalone.rs b/ballista/rust/scheduler/src/state/standalone.rs index 69805c016a105..8514d4cf3e64c 100644 --- a/ballista/rust/scheduler/src/state/standalone.rs +++ b/ballista/rust/scheduler/src/state/standalone.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{sync::Arc, task::Poll, time::Duration}; +use std::{sync::Arc, task::Poll}; use crate::state::ConfigBackendClient; use ballista_core::error::{ballista_error, BallistaError, Result}; @@ -89,13 +89,7 @@ impl ConfigBackendClient for StandaloneClient { .map_err(|e| ballista_error(&format!("sled error {:?}", e)))?) } - // TODO: support lease_time. 
See https://github.com/spacejam/sled/issues/1119 for how to approach this - async fn put( - &self, - key: String, - value: Vec, - _lease_time: Option, - ) -> Result<()> { + async fn put(&self, key: String, value: Vec) -> Result<()> { self.db .insert(key, value) .map_err(|e| { @@ -170,7 +164,7 @@ mod tests { let client = create_instance()?; let key = "key"; let value = "value".as_bytes(); - client.put(key.to_owned(), value.to_vec(), None).await?; + client.put(key.to_owned(), value.to_vec()).await?; assert_eq!(client.get(key).await?, value); Ok(()) } @@ -189,12 +183,8 @@ mod tests { let client = create_instance()?; let key = "key"; let value = "value".as_bytes(); - client - .put(format!("{}/1", key), value.to_vec(), None) - .await?; - client - .put(format!("{}/2", key), value.to_vec(), None) - .await?; + client.put(format!("{}/1", key), value.to_vec()).await?; + client.put(format!("{}/2", key), value.to_vec()).await?; assert_eq!( client.get_from_prefix(key).await?, vec![ @@ -211,13 +201,13 @@ mod tests { let key = "key"; let value = "value".as_bytes(); let mut watch: Box = client.watch(key.to_owned()).await?; - client.put(key.to_owned(), value.to_vec(), None).await?; + client.put(key.to_owned(), value.to_vec()).await?; assert_eq!( watch.next().await, Some(WatchEvent::Put(key.to_owned(), value.to_owned())) ); let value2 = "value2".as_bytes(); - client.put(key.to_owned(), value2.to_vec(), None).await?; + client.put(key.to_owned(), value2.to_vec()).await?; assert_eq!( watch.next().await, Some(WatchEvent::Put(key.to_owned(), value2.to_owned())) From f8aaa4aee53e637870efeadc3bb23e487314d365 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 26 Jun 2021 21:40:11 +0800 Subject: [PATCH 209/329] in ballista also check for UI prettier changes (#578) --- .github/workflows/dev.yml | 7 + ballista/ui/scheduler/index.d.ts | 2 +- ballista/ui/scheduler/package.json | 2 +- ballista/ui/scheduler/react-table-config.d.ts | 229 +++++++++-------- ballista/ui/scheduler/src/App.tsx | 58 ++--- .../ui/scheduler/src/components/DataTable.tsx | 242 +++++++++++------- .../ui/scheduler/src/components/Footer.tsx | 22 +- .../ui/scheduler/src/components/Header.tsx | 123 +++++---- .../ui/scheduler/src/components/NodesList.tsx | 10 +- .../scheduler/src/components/QueriesList.tsx | 180 +++++++------ .../ui/scheduler/src/components/Summary.tsx | 13 +- ballista/ui/scheduler/yarn.lock | 8 +- 12 files changed, 504 insertions(+), 392 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 3f52ccd344505..f9e6b27fdb80e 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -50,3 +50,10 @@ jobs: # if you encounter error, try rerun the command below with --write instead of --check # and commit the changes npx prettier@2.3.0 --check {ballista,datafusion,datafusion-examples,dev,docs,python}/**/*.md README.md DEVELOPERS.md + - name: Prettier check for Ballista UI + run: | + cd ballista/ui/scheduler + npx yarn + # if you encounter error, try rerun the command below with --write instead of --check + # and commit the changes + npx yarn prettier --check **/*.{ts,tsx} diff --git a/ballista/ui/scheduler/index.d.ts b/ballista/ui/scheduler/index.d.ts index 9f715810217d2..be0f38ba098bf 100644 --- a/ballista/ui/scheduler/index.d.ts +++ b/ballista/ui/scheduler/index.d.ts @@ -15,4 +15,4 @@ // specific language governing permissions and limitations // under the License. 
-declare module "@chakra-ui/icons"; \ No newline at end of file +declare module "@chakra-ui/icons"; diff --git a/ballista/ui/scheduler/package.json b/ballista/ui/scheduler/package.json index fe1e72d68f7ae..9a5d2006dcce3 100644 --- a/ballista/ui/scheduler/package.json +++ b/ballista/ui/scheduler/package.json @@ -52,7 +52,7 @@ "devDependencies": { "@types/react-table": "^7.0.28", "@types/react-timeago": "^4.1.2", - "prettier": "^2.2.1" + "prettier": "^2.3.0" }, "proxy": "http://localhost:50050" } diff --git a/ballista/ui/scheduler/react-table-config.d.ts b/ballista/ui/scheduler/react-table-config.d.ts index 4bdce7667ecaa..2c9994f91d2ab 100644 --- a/ballista/ui/scheduler/react-table-config.d.ts +++ b/ballista/ui/scheduler/react-table-config.d.ts @@ -16,122 +16,131 @@ // under the License. import { - UseColumnOrderInstanceProps, - UseColumnOrderState, - UseExpandedHooks, - UseExpandedInstanceProps, - UseExpandedOptions, - UseExpandedRowProps, - UseExpandedState, - UseFiltersColumnOptions, - UseFiltersColumnProps, - UseFiltersInstanceProps, - UseFiltersOptions, - UseFiltersState, - UseGlobalFiltersColumnOptions, - UseGlobalFiltersInstanceProps, - UseGlobalFiltersOptions, - UseGlobalFiltersState, - UseGroupByCellProps, - UseGroupByColumnOptions, - UseGroupByColumnProps, - UseGroupByHooks, - UseGroupByInstanceProps, - UseGroupByOptions, - UseGroupByRowProps, - UseGroupByState, - UsePaginationInstanceProps, - UsePaginationOptions, - UsePaginationState, - UseResizeColumnsColumnOptions, - UseResizeColumnsColumnProps, - UseResizeColumnsOptions, - UseResizeColumnsState, - UseRowSelectHooks, - UseRowSelectInstanceProps, - UseRowSelectOptions, - UseRowSelectRowProps, - UseRowSelectState, - UseRowStateCellProps, - UseRowStateInstanceProps, - UseRowStateOptions, - UseRowStateRowProps, - UseRowStateState, - UseSortByColumnOptions, - UseSortByColumnProps, - UseSortByHooks, - UseSortByInstanceProps, - UseSortByOptions, - UseSortByState -} from 'react-table' + UseColumnOrderInstanceProps, + UseColumnOrderState, + UseExpandedHooks, + UseExpandedInstanceProps, + UseExpandedOptions, + UseExpandedRowProps, + UseExpandedState, + UseFiltersColumnOptions, + UseFiltersColumnProps, + UseFiltersInstanceProps, + UseFiltersOptions, + UseFiltersState, + UseGlobalFiltersColumnOptions, + UseGlobalFiltersInstanceProps, + UseGlobalFiltersOptions, + UseGlobalFiltersState, + UseGroupByCellProps, + UseGroupByColumnOptions, + UseGroupByColumnProps, + UseGroupByHooks, + UseGroupByInstanceProps, + UseGroupByOptions, + UseGroupByRowProps, + UseGroupByState, + UsePaginationInstanceProps, + UsePaginationOptions, + UsePaginationState, + UseResizeColumnsColumnOptions, + UseResizeColumnsColumnProps, + UseResizeColumnsOptions, + UseResizeColumnsState, + UseRowSelectHooks, + UseRowSelectInstanceProps, + UseRowSelectOptions, + UseRowSelectRowProps, + UseRowSelectState, + UseRowStateCellProps, + UseRowStateInstanceProps, + UseRowStateOptions, + UseRowStateRowProps, + UseRowStateState, + UseSortByColumnOptions, + UseSortByColumnProps, + UseSortByHooks, + UseSortByInstanceProps, + UseSortByOptions, + UseSortByState, +} from "react-table"; -declare module 'react-table' { - // take this file as-is, or comment out the sections that don't apply to your plugin configuration +declare module "react-table" { + // take this file as-is, or comment out the sections that don't apply to your plugin configuration - export interface TableOptions> - extends UseExpandedOptions, - UseFiltersOptions, - UseGlobalFiltersOptions, - UseGroupByOptions, - 
UsePaginationOptions, - UseResizeColumnsOptions, - UseRowSelectOptions, - UseRowStateOptions, - UseSortByOptions, - // note that having Record here allows you to add anything to the options, this matches the spirit of the - // underlying js library, but might be cleaner if it's replaced by a more specific type that matches your - // feature set, this is a safe default. - Record {} + export interface TableOptions< + D extends Record + > extends UseExpandedOptions, + UseFiltersOptions, + UseGlobalFiltersOptions, + UseGroupByOptions, + UsePaginationOptions, + UseResizeColumnsOptions, + UseRowSelectOptions, + UseRowStateOptions, + UseSortByOptions, + // note that having Record here allows you to add anything to the options, this matches the spirit of the + // underlying js library, but might be cleaner if it's replaced by a more specific type that matches your + // feature set, this is a safe default. + Record {} - export interface Hooks = Record> - extends UseExpandedHooks, - UseGroupByHooks, - UseRowSelectHooks, - UseSortByHooks {} + export interface Hooks< + D extends Record = Record + > extends UseExpandedHooks, + UseGroupByHooks, + UseRowSelectHooks, + UseSortByHooks {} - export interface TableInstance = Record> - extends UseColumnOrderInstanceProps, - UseExpandedInstanceProps, - UseFiltersInstanceProps, - UseGlobalFiltersInstanceProps, - UseGroupByInstanceProps, - UsePaginationInstanceProps, - UseRowSelectInstanceProps, - UseRowStateInstanceProps, - UseSortByInstanceProps {} + export interface TableInstance< + D extends Record = Record + > extends UseColumnOrderInstanceProps, + UseExpandedInstanceProps, + UseFiltersInstanceProps, + UseGlobalFiltersInstanceProps, + UseGroupByInstanceProps, + UsePaginationInstanceProps, + UseRowSelectInstanceProps, + UseRowStateInstanceProps, + UseSortByInstanceProps {} - export interface TableState = Record> - extends UseColumnOrderState, - UseExpandedState, - UseFiltersState, - UseGlobalFiltersState, - UseGroupByState, - UsePaginationState, - UseResizeColumnsState, - UseRowSelectState, - UseRowStateState, - UseSortByState {} + export interface TableState< + D extends Record = Record + > extends UseColumnOrderState, + UseExpandedState, + UseFiltersState, + UseGlobalFiltersState, + UseGroupByState, + UsePaginationState, + UseResizeColumnsState, + UseRowSelectState, + UseRowStateState, + UseSortByState {} - export interface ColumnInterface = Record> - extends UseFiltersColumnOptions, - UseGlobalFiltersColumnOptions, - UseGroupByColumnOptions, - UseResizeColumnsColumnOptions, - UseSortByColumnOptions {} + export interface ColumnInterface< + D extends Record = Record + > extends UseFiltersColumnOptions, + UseGlobalFiltersColumnOptions, + UseGroupByColumnOptions, + UseResizeColumnsColumnOptions, + UseSortByColumnOptions {} - export interface ColumnInstance = Record> - extends UseFiltersColumnProps, - UseGroupByColumnProps, - UseResizeColumnsColumnProps, - UseSortByColumnProps {} + export interface ColumnInstance< + D extends Record = Record + > extends UseFiltersColumnProps, + UseGroupByColumnProps, + UseResizeColumnsColumnProps, + UseSortByColumnProps {} - export interface Cell = Record, V = any> - extends UseGroupByCellProps, - UseRowStateCellProps {} + export interface Cell< + D extends Record = Record, + V = any + > extends UseGroupByCellProps, + UseRowStateCellProps {} - export interface Row = Record> - extends UseExpandedRowProps, - UseGroupByRowProps, - UseRowSelectRowProps, - UseRowStateRowProps {} -} \ No newline at end of file + export interface Row< 
+ D extends Record = Record + > extends UseExpandedRowProps, + UseGroupByRowProps, + UseRowSelectRowProps, + UseRowStateRowProps {} +} diff --git a/ballista/ui/scheduler/src/App.tsx b/ballista/ui/scheduler/src/App.tsx index 5864a27cdf57a..adb5896a81d15 100644 --- a/ballista/ui/scheduler/src/App.tsx +++ b/ballista/ui/scheduler/src/App.tsx @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. -import React, {useState, useEffect} from "react"; -import {Box, Grid, VStack} from "@chakra-ui/react"; -import {Header} from "./components/Header"; -import { Summary} from "./components/Summary"; -import {QueriesList, Query, QueryStatus} from "./components/QueriesList"; -import {Footer} from "./components/Footer"; +import React, { useState, useEffect } from "react"; +import { Box, Grid, VStack } from "@chakra-ui/react"; +import { Header } from "./components/Header"; +import { Summary } from "./components/Summary"; +import { QueriesList, Query, QueryStatus } from "./components/QueriesList"; +import { Footer } from "./components/Footer"; import "./App.css"; @@ -38,22 +38,23 @@ const getRandomQueries = (num: number): Query[] => { for (let i = 0; i < num; i++) { nodes.push({ started: new Date().toISOString(), - query: "SELECT \n" + - " employee.id,\n" + - " employee.first_name,\n" + - " employee.last_name,\n" + - " SUM(DATEDIFF(\"SECOND\", call.start_time, call.end_time)) AS call_duration_sum\n" + - "FROM call\n" + - "INNER JOIN employee ON call.employee_id = employee.id\n" + - "GROUP BY\n" + - " employee.id,\n" + - " employee.first_name,\n" + - " employee.last_name\n" + - "ORDER BY\n" + - " employee.id ASC;", + query: + "SELECT \n" + + " employee.id,\n" + + " employee.first_name,\n" + + " employee.last_name,\n" + + ' SUM(DATEDIFF("SECOND", call.start_time, call.end_time)) AS call_duration_sum\n' + + "FROM call\n" + + "INNER JOIN employee ON call.employee_id = employee.id\n" + + "GROUP BY\n" + + " employee.id,\n" + + " employee.first_name,\n" + + " employee.last_name\n" + + "ORDER BY\n" + + " employee.id ASC;", status: QueryStatus.RUNNING, progress: Math.round(Math.random() * 100), - uuid: uuidv4() + uuid: uuidv4(), }); } return nodes; @@ -61,19 +62,18 @@ const getRandomQueries = (num: number): Query[] => { const queries = getRandomQueries(17); -const App : React.FunctionComponent = () => { - - const [schedulerState, setSchedulerState] = useState(undefined) +const App: React.FunctionComponent = () => { + const [schedulerState, setSchedulerState] = useState(undefined); function getSchedulerState() { return fetch(`/state`, { - method: 'POST', + method: "POST", headers: { - 'Accept': 'application/json' - } + Accept: "application/json", + }, }) - .then(res => res.json()) - .then(res => setSchedulerState(res)); + .then((res) => res.json()) + .then((res) => setSchedulerState(res)); } useEffect(() => { @@ -92,6 +92,6 @@ const App : React.FunctionComponent = () => { ); -} +}; export default App; diff --git a/ballista/ui/scheduler/src/components/DataTable.tsx b/ballista/ui/scheduler/src/components/DataTable.tsx index 38176d3e34fda..188ddc8c8cc00 100644 --- a/ballista/ui/scheduler/src/components/DataTable.tsx +++ b/ballista/ui/scheduler/src/components/DataTable.tsx @@ -16,116 +16,164 @@ // under the License. 
import React from "react"; -import {Link, Table, Thead, Flex, Tbody, Text, Tr, Th, Td, VStack, chakra} from "@chakra-ui/react"; -import {TriangleDownIcon, TriangleUpIcon} from "@chakra-ui/icons"; -import {useTable, useSortBy, usePagination, Column as RTColumn} from "react-table"; -import {HiChevronLeft, HiChevronRight} from "react-icons/all"; +import { + Link, + Table, + Thead, + Flex, + Tbody, + Text, + Tr, + Th, + Td, + VStack, + chakra, +} from "@chakra-ui/react"; +import { TriangleDownIcon, TriangleUpIcon } from "@chakra-ui/icons"; +import { + useTable, + useSortBy, + usePagination, + Column as RTColumn, +} from "react-table"; +import { HiChevronLeft, HiChevronRight } from "react-icons/all"; import TimeAgo from "react-timeago"; type RenderFn = (props: any) => React.ReactNode; interface Row { - [name: string]: any; + [name: string]: any; } // eslint-disable-next-line -export type Column = RTColumn | { - isNumeric?: boolean; - render?: RenderFn; -}; +export type Column = + | RTColumn + | { + isNumeric?: boolean; + render?: RenderFn; + }; interface DataTableProps { - columns: Column[]; - data: Row[]; - pageSize?: number; - maxW?: number; - pb?: number; + columns: Column[]; + data: Row[]; + pageSize?: number; + maxW?: number; + pb?: number; } -export const DateCell : (props: any) => React.ReactNode = (props: any) => { - return { - if (unit === 'second') return 'just now'; - const plural: string = value !== 1 ? 's' : ''; - return `${value} ${unit}${plural} ${suffix}`; - }} +export const DateCell: (props: any) => React.ReactNode = (props: any) => { + return ( + { + if (unit === "second") return "just now"; + const plural: string = value !== 1 ? "s" : ""; + return `${value} ${unit}${plural} ${suffix}`; + }} /> -} + ); +}; -export const LinkCell : (props: any) => React.ReactNode = (props: any) => { - return ( - - {props.value} - - ) -} +export const LinkCell: (props: any) => React.ReactNode = (props: any) => { + return ( + + {props.value} + + ); +}; -export const DataTable: React.FunctionComponent = ({data, columns, pageSize = 10, maxW, pb}) => { - const { - getTableProps, - getTableBodyProps, - headerGroups, - rows, - prepareRow, - pageOptions, - canNextPage, - nextPage, - canPreviousPage, - previousPage, - state: {pageIndex}, - } = useTable({columns: columns as any, data, initialState: {pageIndex: 0, pageSize},}, useSortBy, usePagination); +export const DataTable: React.FunctionComponent = ({ + data, + columns, + pageSize = 10, + maxW, + pb, +}) => { + const { + getTableProps, + getTableBodyProps, + headerGroups, + rows, + prepareRow, + pageOptions, + canNextPage, + nextPage, + canPreviousPage, + previousPage, + state: { pageIndex }, + } = useTable( + { columns: columns as any, data, initialState: { pageIndex: 0, pageSize } }, + useSortBy, + usePagination + ); - const last = data.length; - const start = (pageIndex * pageSize) + 1; - const end = Math.min((pageIndex + 1) * pageSize, last); + const last = data.length; + const start = pageIndex * pageSize + 1; + const end = Math.min((pageIndex + 1) * pageSize, last); - return ( - - - - {headerGroups.map((headerGroup) => ( - - {headerGroup.headers.map((column: any) => ( - - ))} - - ))} - - - {rows.slice(start - 1, end).map((row: any) => { - prepareRow(row); - return ( - - {row.cells.map((cell: any) => ( - - ))} - - ); - })} - -
- {column.render("Header")} - - {column.isSorted ? ( - column.isSortedDesc ? ( - - ) : ( - - ) - ) : null} - -
- {cell.render("Cell")} -
- {pageOptions.length > 1 ? - ( - Showing {start} to {end} of {last}. - - - ) : null} -
- ); - } -; + return ( + + + + {headerGroups.map((headerGroup) => ( + + {headerGroup.headers.map((column: any) => ( + + ))} + + ))} + + + {rows.slice(start - 1, end).map((row: any) => { + prepareRow(row); + return ( + + {row.cells.map((cell: any) => ( + + ))} + + ); + })} + +
+ {column.render("Header")} + + {column.isSorted ? ( + column.isSortedDesc ? ( + + ) : ( + + ) + ) : null} + +
+ {cell.render("Cell")} +
+ {pageOptions.length > 1 ? ( + + + Showing {start} to {end} of {last}.{" "} + + + + + ) : null} +
+ ); +}; diff --git a/ballista/ui/scheduler/src/components/Footer.tsx b/ballista/ui/scheduler/src/components/Footer.tsx index ab03898f44b31..acab6083dbc54 100644 --- a/ballista/ui/scheduler/src/components/Footer.tsx +++ b/ballista/ui/scheduler/src/components/Footer.tsx @@ -16,13 +16,19 @@ // under the License. import React from "react"; -import {Flex, Text} from "@chakra-ui/react"; - +import { Flex, Text } from "@chakra-ui/react"; export const Footer: React.FunctionComponent = () => { - return ( - - Licensed under the Apache License, Version 2.0. - - ) -} \ No newline at end of file + return ( + + + Licensed under the Apache License, Version 2.0. + + + ); +}; diff --git a/ballista/ui/scheduler/src/components/Header.tsx b/ballista/ui/scheduler/src/components/Header.tsx index 1a0b0f178bd6b..5f8021284c7c0 100644 --- a/ballista/ui/scheduler/src/components/Header.tsx +++ b/ballista/ui/scheduler/src/components/Header.tsx @@ -16,67 +16,80 @@ // under the License. import React from "react"; -import {Box, Flex, Text, Button} from "@chakra-ui/react"; +import { Box, Flex, Text, Button } from "@chakra-ui/react"; import Logo from "./logo.svg"; -import {AiFillGithub, HiDocumentText} from "react-icons/all"; -import {SchedulerState} from "./Summary"; +import { AiFillGithub, HiDocumentText } from "react-icons/all"; +import { SchedulerState } from "./Summary"; -export const NavBarContainer: React.FunctionComponent> = ({children, ...props}) => { - return ( - - {children} - - ); +export const NavBarContainer: React.FunctionComponent< + React.PropsWithChildren +> = ({ children, ...props }) => { + return ( + + {children} + + ); }; interface HeaderProps { - schedulerState?: SchedulerState + schedulerState?: SchedulerState; } -export const Header: React.FunctionComponent = ({schedulerState}) => { - return ( - - - - - {"Ballista - - Version - {schedulerState?.version} - - - -
- - - - - - - - +export const Header: React.FunctionComponent = ({ + schedulerState, +}) => { + return ( + + + + + {"Ballista + + Version - {schedulerState?.version} + + + + + + + + + + - ); + + + ); }; diff --git a/ballista/ui/scheduler/src/components/NodesList.tsx b/ballista/ui/scheduler/src/components/NodesList.tsx index 2690e86b534f1..3ad85fc7fc615 100644 --- a/ballista/ui/scheduler/src/components/NodesList.tsx +++ b/ballista/ui/scheduler/src/components/NodesList.tsx @@ -16,12 +16,12 @@ // under the License. import React from "react"; -import {Box } from "@chakra-ui/react"; -import {Column, DateCell, DataTable} from "./DataTable"; +import { Box } from "@chakra-ui/react"; +import { Column, DateCell, DataTable } from "./DataTable"; export enum NodeStatus { RUNNING = "RUNNING", - TERMINATED = "TERMINATED" + TERMINATED = "TERMINATED", } export interface NodeInfo { @@ -32,7 +32,7 @@ export interface NodeInfo { started: string; } -const columns : Column[] = [ +const columns: Column[] = [ { Header: "Node", accessor: "id", @@ -57,7 +57,7 @@ const columns : Column[] = [ ]; interface NodesListProps { - nodes: NodeInfo[] + nodes: NodeInfo[]; } export const NodesList: React.FunctionComponent = ({ diff --git a/ballista/ui/scheduler/src/components/QueriesList.tsx b/ballista/ui/scheduler/src/components/QueriesList.tsx index 2d7166a28eb21..447a507fdda3d 100644 --- a/ballista/ui/scheduler/src/components/QueriesList.tsx +++ b/ballista/ui/scheduler/src/components/QueriesList.tsx @@ -16,100 +16,126 @@ // under the License. import React from "react"; -import {CircularProgress, CircularProgressLabel, VStack, Skeleton, Stack, Text, Flex, Box} from "@chakra-ui/react"; -import {Column, DateCell, DataTable, LinkCell} from "./DataTable"; -import {FaStop} from "react-icons/fa"; -import {GrPowerReset} from "react-icons/gr"; +import { + CircularProgress, + CircularProgressLabel, + VStack, + Skeleton, + Stack, + Text, + Flex, + Box, +} from "@chakra-ui/react"; +import { Column, DateCell, DataTable, LinkCell } from "./DataTable"; +import { FaStop } from "react-icons/fa"; +import { GrPowerReset } from "react-icons/gr"; export enum QueryStatus { - QUEUED = "QUEUED", - RUNNING = "RUNNING", - FAILED = "FAILED", - COMPLETED = "COMPLETED", + QUEUED = "QUEUED", + RUNNING = "RUNNING", + FAILED = "FAILED", + COMPLETED = "COMPLETED", } export interface Query { - uuid: string; - query: string; - status: QueryStatus; - progress: number; - started: string; + uuid: string; + query: string; + status: QueryStatus; + progress: number; + started: string; } export interface QueriesListProps { - queries?: Query[]; + queries?: Query[]; } export const ActionsCell: (props: any) => React.ReactNode = (props: any) => { - return ( - - - - - - ) -} + return ( + + + + + + ); +}; export const ProgressCell: (props: any) => React.ReactNode = (props: any) => { - return ( - - {props.value}% - - ) -} + return ( + + {props.value}% + + ); +}; const columns: Column[] = [ - { - Header: "UUID", - accessor: "uuid", - Cell: LinkCell - }, - { - Header: "Query", - accessor: "query", - }, - { - Header: "Status", - accessor: "status", - }, - { - Header: "Progress", - accessor: "progress", - Cell: ProgressCell, - }, - { - Header: "Started", - accessor: "started", - Cell: DateCell, - }, - { - Header: "Actions", - accessor: "", - Cell: ActionsCell, - } + { + Header: "UUID", + accessor: "uuid", + Cell: LinkCell, + }, + { + Header: "Query", + accessor: "query", + }, + { + Header: "Status", + accessor: "status", + }, + { + Header: "Progress", + accessor: "progress", + 
Cell: ProgressCell, + }, + { + Header: "Started", + accessor: "started", + Cell: DateCell, + }, + { + Header: "Actions", + accessor: "", + Cell: ActionsCell, + }, ]; const getSkeletion = () => ( - <> - - - - - - - -) + <> + + + + + + + +); -export const QueriesList: React.FunctionComponent = ({queries}) => { - const isLoaded = typeof queries !== "undefined"; +export const QueriesList: React.FunctionComponent = ({ + queries, +}) => { + const isLoaded = typeof queries !== "undefined"; - //TODO: Remove blur once queries api is ready - return ( - - Queries - - {isLoaded ? : getSkeletion()} - - - ) -}; \ No newline at end of file + //TODO: Remove blur once queries api is ready + return ( + + Queries + + {isLoaded ? ( + + ) : ( + getSkeletion() + )} + + + ); +}; diff --git a/ballista/ui/scheduler/src/components/Summary.tsx b/ballista/ui/scheduler/src/components/Summary.tsx index 2e52498296feb..9fe1a232400f5 100644 --- a/ballista/ui/scheduler/src/components/Summary.tsx +++ b/ballista/ui/scheduler/src/components/Summary.tsx @@ -39,13 +39,14 @@ export interface SchedulerState { } export interface SummaryProps { - schedulerState?: SchedulerState + schedulerState?: SchedulerState; } -export const Summary: React.FunctionComponent = ({schedulerState}) => { - +export const Summary: React.FunctionComponent = ({ + schedulerState, +}) => { if (!schedulerState) { - return Scheduler isn't running + return Scheduler isn't running; } return ( @@ -58,7 +59,9 @@ export const Summary: React.FunctionComponent = ({schedulerState}) alignItems={"flex-start"} fontWeight={"normal"} > - General Cluster Info + + General Cluster Info + diff --git a/ballista/ui/scheduler/yarn.lock b/ballista/ui/scheduler/yarn.lock index f2ea84b87bce8..93748027dfed1 100644 --- a/ballista/ui/scheduler/yarn.lock +++ b/ballista/ui/scheduler/yarn.lock @@ -9537,10 +9537,10 @@ prepend-http@^1.0.0: resolved "https://registry.yarnpkg.com/prepend-http/-/prepend-http-1.0.4.tgz#d4f4562b0ce3696e41ac52d0e002e57a635dc6dc" integrity sha1-1PRWKwzjaW5BrFLQ4ALlemNdxtw= -prettier@^2.2.1: - version "2.2.1" - resolved "https://registry.yarnpkg.com/prettier/-/prettier-2.2.1.tgz#795a1a78dd52f073da0cd42b21f9c91381923ff5" - integrity sha512-PqyhM2yCjg/oKkFPtTGUojv7gnZAoG80ttl45O6x2Ug/rMJw4wcc9k6aaf2hibP7BGVCCM33gZoGjyvt9mm16Q== +prettier@^2.3.0: + version "2.3.1" + resolved "https://registry.yarnpkg.com/prettier/-/prettier-2.3.1.tgz#76903c3f8c4449bc9ac597acefa24dc5ad4cbea6" + integrity sha512-p+vNbgpLjif/+D+DwAZAbndtRrR0md0MwfmOVN9N+2RgyACMT+7tfaRnT+WDPkqnuVwleyuBIG2XBxKDme3hPA== pretty-bytes@^5.3.0: version "5.6.0" From 61199b985b17e82941232bbd6fbd355b115b503b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 26 Jun 2021 10:05:46 -0600 Subject: [PATCH 210/329] Ballista: Implement map-side shuffle (#543) --- ballista/rust/core/Cargo.toml | 1 + .../core/src/execution_plans/query_stage.rs | 265 ++++++++++++++++-- ballista/rust/core/src/serde/scheduler/mod.rs | 3 +- datafusion/src/physical_plan/mod.rs | 3 +- 4 files changed, 248 insertions(+), 24 deletions(-) diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 1f23a2a42e2a0..bedc0973e6ad9 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -30,6 +30,7 @@ build = "build.rs" simd = ["datafusion/simd"] [dependencies] +ahash = "0.7" async-trait = "0.1.36" futures = "0.3" log = "0.4" diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index 264c44dc43dca..1c7a7aad8fba2 100644 --- 
a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -20,8 +20,9 @@ //! a query stage either forms the input of another query stage or can be the final result of //! a query. +use std::iter::Iterator; use std::path::PathBuf; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::Instant; use std::{any::Any, pin::Pin}; @@ -29,13 +30,22 @@ use crate::error::BallistaError; use crate::memory_stream::MemoryStream; use crate::utils; +use crate::serde::scheduler::PartitionStats; use async_trait::async_trait; -use datafusion::arrow::array::{ArrayRef, StringBuilder}; +use datafusion::arrow::array::{ + Array, ArrayBuilder, ArrayRef, StringBuilder, StructBuilder, UInt32Builder, + UInt64Builder, +}; +use datafusion::arrow::compute::take; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::ipc::writer::FileWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; +use datafusion::physical_plan::hash_join::create_hashes; use datafusion::physical_plan::{ExecutionPlan, Partitioning, RecordBatchStream}; +use futures::StreamExt; use log::info; +use std::fs::File; use uuid::Uuid; /// QueryStageExec represents a section of a query plan that has consistent partitioning and @@ -133,7 +143,6 @@ impl ExecutionPlan for QueryStageExec { None => { path.push(&format!("{}", partition)); std::fs::create_dir_all(&path)?; - path.push("data.arrow"); let path = path.to_str().unwrap(); info!("Writing results to {}", path); @@ -150,32 +159,150 @@ impl ExecutionPlan for QueryStageExec { stats ); - let schema = Arc::new(Schema::new(vec![ - Field::new("path", DataType::Utf8, false), - stats.arrow_struct_repr(), - ])); + let schema = result_schema(); // build result set with summary of the partition execution status - let mut c0 = StringBuilder::new(1); - c0.append_value(&path).unwrap(); - let path: ArrayRef = Arc::new(c0.finish()); + let mut part_builder = UInt32Builder::new(1); + part_builder.append_value(partition as u32)?; + let part: ArrayRef = Arc::new(part_builder.finish()); + + let mut path_builder = StringBuilder::new(1); + path_builder.append_value(&path)?; + let path: ArrayRef = Arc::new(path_builder.finish()); let stats: ArrayRef = stats .to_arrow_arrayref() .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let batch = RecordBatch::try_new(schema.clone(), vec![path, stats]) + let batch = RecordBatch::try_new(schema.clone(), vec![part, path, stats]) .map_err(DataFusionError::ArrowError)?; Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) } - Some(Partitioning::Hash(_, _)) => { - //TODO re-use code from RepartitionExec to split each batch into - // partitions and write to one IPC file per partition - // See https://github.com/apache/arrow-datafusion/issues/456 - Err(DataFusionError::NotImplemented( - "Shuffle partitioning not implemented yet".to_owned(), - )) + Some(Partitioning::Hash(exprs, n)) => { + let num_output_partitions = *n; + + // we won't necessary produce output for every possible partition, so we + // create writers on demand + let mut writers: Vec>>> = vec![]; + for _ in 0..num_output_partitions { + writers.push(None); + } + + let hashes_buf = &mut vec![]; + let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0); + while let Some(result) = stream.next().await { + let input_batch = result?; + let arrays = exprs + .iter() + .map(|expr| { + Ok(expr + .evaluate(&input_batch)? 
+ .into_array(input_batch.num_rows())) + }) + .collect::>>()?; + hashes_buf.clear(); + hashes_buf.resize(arrays[0].len(), 0); + // Hash arrays and compute buckets based on number of partitions + let hashes = create_hashes(&arrays, &random_state, hashes_buf)?; + let mut indices = vec![vec![]; num_output_partitions]; + for (index, hash) in hashes.iter().enumerate() { + indices[(*hash % num_output_partitions as u64) as usize] + .push(index as u64) + } + for (num_output_partition, partition_indices) in + indices.into_iter().enumerate() + { + let indices = partition_indices.into(); + // Produce batches based on indices + let columns = input_batch + .columns() + .iter() + .map(|c| { + take(c.as_ref(), &indices, None).map_err(|e| { + DataFusionError::Execution(e.to_string()) + }) + }) + .collect::>>>()?; + + let output_batch = + RecordBatch::try_new(input_batch.schema(), columns)?; + + // write batch out + match &writers[num_output_partition] { + Some(w) => { + let mut w = w.lock().unwrap(); + w.write(&output_batch)?; + } + None => { + let mut path = path.clone(); + path.push(&format!("{}", partition)); + std::fs::create_dir_all(&path)?; + + path.push("data.arrow"); + let path = path.to_str().unwrap(); + info!("Writing results to {}", path); + + let mut writer = + ShuffleWriter::new(path, stream.schema().as_ref())?; + + writer.write(&output_batch)?; + writers[num_output_partition] = + Some(Arc::new(Mutex::new(writer))); + } + } + } + } + + // build metadata result batch + let num_writers = writers.iter().filter(|w| w.is_some()).count(); + let mut partition_builder = UInt32Builder::new(num_writers); + let mut path_builder = StringBuilder::new(num_writers); + let mut num_rows_builder = UInt64Builder::new(num_writers); + let mut num_batches_builder = UInt64Builder::new(num_writers); + let mut num_bytes_builder = UInt64Builder::new(num_writers); + + for (i, w) in writers.iter().enumerate() { + match w { + Some(w) => { + let mut w = w.lock().unwrap(); + w.finish()?; + path_builder.append_value(w.path())?; + partition_builder.append_value(i as u32)?; + num_rows_builder.append_value(w.num_rows)?; + num_batches_builder.append_value(w.num_batches)?; + num_bytes_builder.append_value(w.num_bytes)?; + } + None => {} + } + } + + // build arrays + let partition_num: ArrayRef = Arc::new(partition_builder.finish()); + let path: ArrayRef = Arc::new(path_builder.finish()); + let field_builders: Vec> = vec![ + Box::new(num_rows_builder), + Box::new(num_batches_builder), + Box::new(num_bytes_builder), + ]; + let mut stats_builder = StructBuilder::new( + PartitionStats::default().arrow_struct_fields(), + field_builders, + ); + for _ in 0..num_writers { + stats_builder.append(true)?; + } + let stats = Arc::new(stats_builder.finish()); + + // build result batch containing metadata + let schema = result_schema(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![partition_num, path, stats], + ) + .map_err(DataFusionError::ArrowError)?; + + Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) } _ => Err(DataFusionError::Execution( @@ -185,10 +312,69 @@ impl ExecutionPlan for QueryStageExec { } } +fn result_schema() -> SchemaRef { + let stats = PartitionStats::default(); + Arc::new(Schema::new(vec![ + Field::new("partition", DataType::UInt32, false), + Field::new("path", DataType::Utf8, false), + stats.arrow_struct_repr(), + ])) +} + +struct ShuffleWriter { + path: String, + writer: FileWriter, + num_batches: u64, + num_rows: u64, + num_bytes: u64, +} + +impl ShuffleWriter { + fn new(path: &str, 
schema: &Schema) -> Result { + let file = File::create(path) + .map_err(|e| { + BallistaError::General(format!( + "Failed to create partition file at {}: {:?}", + path, e + )) + }) + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + Ok(Self { + num_batches: 0, + num_rows: 0, + num_bytes: 0, + path: path.to_owned(), + writer: FileWriter::try_new(file, schema)?, + }) + } + + fn write(&mut self, batch: &RecordBatch) -> Result<()> { + self.writer.write(batch)?; + self.num_batches += 1; + self.num_rows += batch.num_rows() as u64; + let num_bytes: usize = batch + .columns() + .iter() + .map(|array| array.get_array_memory_size()) + .sum(); + self.num_bytes += num_bytes as u64; + Ok(()) + } + + fn finish(&mut self) -> Result<()> { + self.writer.finish().map_err(DataFusionError::ArrowError) + } + + fn path(&self) -> &str { + &self.path + } +} + #[cfg(test)] mod tests { use super::*; use datafusion::arrow::array::{StringArray, StructArray, UInt32Array, UInt64Array}; + use datafusion::physical_plan::expressions::Column; use datafusion::physical_plan::memory::MemoryExec; use tempfile::TempDir; @@ -207,17 +393,17 @@ mod tests { let batches = utils::collect_stream(&mut stream) .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - assert!(batches.len() == 1); + assert_eq!(1, batches.len()); let batch = &batches[0]; - assert_eq!(2, batch.num_columns()); + assert_eq!(3, batch.num_columns()); assert_eq!(1, batch.num_rows()); - let path = batch.columns()[0] + let path = batch.columns()[1] .as_any() .downcast_ref::() .unwrap(); let file = path.value(0); assert!(file.ends_with("data.arrow")); - let stats = batch.columns()[1] + let stats = batch.columns()[2] .as_any() .downcast_ref::() .unwrap(); @@ -231,6 +417,41 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_partitioned() -> Result<()> { + let input_plan = create_input_plan()?; + let work_dir = TempDir::new()?; + let query_stage = QueryStageExec::try_new( + "jobOne".to_owned(), + 1, + input_plan, + work_dir.into_path().to_str().unwrap().to_owned(), + Some(Partitioning::Hash(vec![Arc::new(Column::new("a"))], 2)), + )?; + let mut stream = query_stage.execute(0).await?; + let batches = utils::collect_stream(&mut stream) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + assert_eq!(1, batches.len()); + let batch = &batches[0]; + assert_eq!(3, batch.num_columns()); + assert_eq!(2, batch.num_rows()); + let stats = batch.columns()[2] + .as_any() + .downcast_ref::() + .unwrap(); + let num_rows = stats + .column_by_name("num_rows") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, num_rows.value(0)); + assert_eq!(2, num_rows.value(1)); + + Ok(()) + } + fn create_input_plan() -> Result> { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::UInt32, true), diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index c9bd1e93db2c4..75e3ac496ff57 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -134,7 +134,8 @@ impl PartitionStats { false, ) } - fn arrow_struct_fields(self) -> Vec { + + pub fn arrow_struct_fields(self) -> Vec { vec![ Field::new("num_rows", DataType::UInt64, false), Field::new("num_batches", DataType::UInt64, false), diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 7b26d7b3ab6e8..7f9f7eace8354 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -396,7 
+396,8 @@ impl ColumnarValue { } } - fn into_array(self, num_rows: usize) -> ArrayRef { + /// Convert a columnar value into an ArrayRef + pub fn into_array(self, num_rows: usize) -> ArrayRef { match self { ColumnarValue::Array(array) => array, ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows), From 2f1d6cbc76f0069e20a741a18cf21f5d42020426 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 26 Jun 2021 10:44:31 -0600 Subject: [PATCH 211/329] Fix build (#627) --- ballista/rust/core/src/execution_plans/query_stage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index 1c7a7aad8fba2..c117110498e66 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -426,7 +426,7 @@ mod tests { 1, input_plan, work_dir.into_path().to_str().unwrap().to_owned(), - Some(Partitioning::Hash(vec![Arc::new(Column::new("a"))], 2)), + Some(Partitioning::Hash(vec![Arc::new(Column::new("a", 0))], 2)), )?; let mut stream = query_stage.execute(0).await?; let batches = utils::collect_stream(&mut stream) From c21106edf2e16ae97c786a95c660bd25efbb6a87 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Sat, 26 Jun 2021 23:43:27 -0700 Subject: [PATCH 212/329] honor table name for csv/parquet scan in ballista plan serde (#629) * honor table name for csv/parquet scan in ballista plan serde * disable query 7,8,9 in ballista integration test --- .../core/src/serde/logical_plan/from_proto.rs | 22 ++++++++++++----- benchmarks/run.sh | 2 +- datafusion/src/logical_plan/builder.rs | 24 +++++++++++++++++-- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 418d60de3e7ae..15ee50733ecaf 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -126,9 +126,14 @@ impl TryInto for &protobuf::LogicalPlanNode { projection = Some(column_indices); } - LogicalPlanBuilder::scan_csv(&scan.path, options, projection)? - .build() - .map_err(|e| e.into()) + LogicalPlanBuilder::scan_csv_with_name( + &scan.path, + options, + projection, + &scan.table_name, + )? + .build() + .map_err(|e| e.into()) } LogicalPlanType::ParquetScan(scan) => { let projection = match scan.projection.as_ref() { @@ -151,9 +156,14 @@ impl TryInto for &protobuf::LogicalPlanNode { Some(r?) } }; - LogicalPlanBuilder::scan_parquet(&scan.path, projection, 24)? //TODO concurrency - .build() - .map_err(|e| e.into()) + LogicalPlanBuilder::scan_parquet_with_name( + &scan.path, + projection, + 24, + &scan.table_name, + )? //TODO concurrency + .build() + .map_err(|e| e.into()) } LogicalPlanType::Sort(sort) => { let input: LogicalPlan = convert_box_required!(sort.input)?; diff --git a/benchmarks/run.sh b/benchmarks/run.sh index 21633d39c23ad..8e36424da89f0 100755 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -20,7 +20,7 @@ set -e # This bash script is meant to be run inside the docker-compose environment. 
Check the README for instructions cd / -for query in 1 3 5 6 7 8 9 10 12 +for query in 1 3 5 6 10 12 do /tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug done diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 147f8322df5d7..ced77ba6c6f68 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -118,9 +118,19 @@ impl LogicalPlanBuilder { path: &str, options: CsvReadOptions, projection: Option>, + ) -> Result { + Self::scan_csv_with_name(path, options, projection, path) + } + + /// Scan a CSV data source and register it with a given table name + pub fn scan_csv_with_name( + path: &str, + options: CsvReadOptions, + projection: Option>, + table_name: &str, ) -> Result { let provider = Arc::new(CsvFile::try_new(path, options)?); - Self::scan(path, provider, projection) + Self::scan(table_name, provider, projection) } /// Scan a Parquet data source @@ -128,9 +138,19 @@ impl LogicalPlanBuilder { path: &str, projection: Option>, max_concurrency: usize, + ) -> Result { + Self::scan_parquet_with_name(path, projection, max_concurrency, path) + } + + /// Scan a Parquet data source and register it with a given table name + pub fn scan_parquet_with_name( + path: &str, + projection: Option>, + max_concurrency: usize, + table_name: &str, ) -> Result { let provider = Arc::new(ParquetTable::try_new(path, max_concurrency)?); - Self::scan(path, provider, projection) + Self::scan(table_name, provider, projection) } /// Scan an empty data source, mainly used in tests From 5bdc880e9e45538a7d86a1b56ac613b2dfca176c Mon Sep 17 00:00:00 2001 From: QP Hou Date: Sun, 27 Jun 2021 00:52:06 -0700 Subject: [PATCH 213/329] round trip TPCH queries in tests (#630) * honor table name for csv/parquet scan in ballista plan serde * disable query 7,8,9 in ballista integration test * add tpch query ballista roundtrip test * also roud trip physical plan * fix clippy * simplify test code --- benchmarks/Cargo.toml | 3 ++ benchmarks/src/bin/tpch.rs | 85 +++++++++++++++++++++++++++++++- datafusion/src/datasource/mod.rs | 1 + 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6a763420c7823..19a67a504e775 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -39,3 +39,6 @@ futures = "0.3" env_logger = "^0.8" mimalloc = { version = "0.1", optional = true, default-features = false } snmalloc-rs = {version = "0.2", optional = true, features= ["cache-friendly"] } + +[dev-dependencies] +ballista-core = { path = "../ballista/rust/core" } diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 286fe45945104..77c69f0ce524f 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -573,7 +573,6 @@ mod tests { use datafusion::arrow::array::*; use datafusion::arrow::util::display::array_value_to_string; - use datafusion::logical_plan::Expr; use datafusion::logical_plan::Expr::Cast; @@ -1042,4 +1041,88 @@ mod tests { Ok(()) } + + mod ballista_round_trip { + use super::*; + use ballista_core::serde::protobuf; + use datafusion::physical_plan::ExecutionPlan; + use std::convert::TryInto; + + fn round_trip_query(n: usize) -> Result<()> { + let config = ExecutionConfig::new() + .with_concurrency(1) + .with_batch_size(10); + let mut ctx = ExecutionContext::with_config(config); + + // set tpch_data_path to dummy value and skip physical plan serde test when TPCH_DATA + // is not 
set. + let tpch_data_path = + env::var("TPCH_DATA").unwrap_or_else(|_| "./".to_string()); + + for &table in TABLES { + let schema = get_schema(table); + let options = CsvReadOptions::new() + .schema(&schema) + .delimiter(b'|') + .has_header(false) + .file_extension(".tbl"); + let provider = CsvFile::try_new( + &format!("{}/{}.tbl", tpch_data_path, table), + options, + )?; + ctx.register_table(table, Arc::new(provider))?; + } + + // test logical plan round trip + let plan = create_logical_plan(&mut ctx, n)?; + let proto: protobuf::LogicalPlanNode = (&plan).try_into().unwrap(); + let round_trip: LogicalPlan = (&proto).try_into().unwrap(); + assert_eq!( + format!("{:?}", plan), + format!("{:?}", round_trip), + "logical plan round trip failed" + ); + + // test optimized logical plan round trip + let plan = ctx.optimize(&plan)?; + let proto: protobuf::LogicalPlanNode = (&plan).try_into().unwrap(); + let round_trip: LogicalPlan = (&proto).try_into().unwrap(); + assert_eq!( + format!("{:?}", plan), + format!("{:?}", round_trip), + "opitmized logical plan round trip failed" + ); + + // test physical plan roundtrip + if env::var("TPCH_DATA").is_ok() { + let physical_plan = ctx.create_physical_plan(&plan)?; + let proto: protobuf::PhysicalPlanNode = + (physical_plan.clone()).try_into().unwrap(); + let round_trip: Arc = (&proto).try_into().unwrap(); + assert_eq!( + format!("{:?}", physical_plan), + format!("{:?}", round_trip), + "physical plan round trip failed" + ); + } + + Ok(()) + } + + macro_rules! test_round_trip { + ($tn:ident, $query:expr) => { + #[test] + fn $tn() -> Result<()> { + round_trip_query($query) + } + }; + } + + test_round_trip!(q1, 1); + test_round_trip!(q3, 3); + test_round_trip!(q5, 5); + test_round_trip!(q6, 6); + test_round_trip!(q10, 10); + test_round_trip!(q12, 12); + } } diff --git a/datafusion/src/datasource/mod.rs b/datafusion/src/datasource/mod.rs index b46b9cc4e8995..9699a997caa11 100644 --- a/datafusion/src/datasource/mod.rs +++ b/datafusion/src/datasource/mod.rs @@ -28,6 +28,7 @@ pub use self::csv::{CsvFile, CsvReadOptions}; pub use self::datasource::{TableProvider, TableType}; pub use self::memory::MemTable; +/// Source for table input data pub(crate) enum Source> { /// Path to a single file or a directory containing one of more files Path(String), From f995de59c0f83156ddd0a1e9ab274cb43225a307 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 27 Jun 2021 04:57:43 -0600 Subject: [PATCH 214/329] Improve field not found error messages (#625) --- datafusion/src/logical_plan/dfschema.rs | 50 ++++++++++++++++++++++--- datafusion/src/sql/planner.rs | 12 +++--- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index e754addb9da77..b46e067a268bf 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -135,14 +135,18 @@ impl DFSchema { &self.fields[i] } - /// Find the index of the column with the given unqualifed name + /// Find the index of the column with the given unqualified name pub fn index_of(&self, name: &str) -> Result { for i in 0..self.fields.len() { if self.fields[i].name() == name { return Ok(i); } } - Err(DataFusionError::Plan(format!("No field named '{}'", name))) + Err(DataFusionError::Plan(format!( + "No field named '{}'. 
Valid fields are {}.", + name, + self.get_field_names() + ))) } /// Find the index of the column with the given qualifer and name @@ -181,8 +185,9 @@ impl DFSchema { .collect(); match matches.len() { 0 => Err(DataFusionError::Plan(format!( - "No field with unqualified name '{}'", - name + "No field with unqualified name '{}'. Valid fields are {}.", + name, + self.get_field_names() ))), 1 => Ok(matches[0].to_owned()), _ => Err(DataFusionError::Plan(format!( @@ -207,8 +212,10 @@ impl DFSchema { .collect(); match matches.len() { 0 => Err(DataFusionError::Plan(format!( - "No field named '{}.{}'", - relation_name, name + "No field named '{}.{}'. Valid fields are {}.", + relation_name, + name, + self.get_field_names() ))), 1 => Ok(matches[0].to_owned()), _ => Err(DataFusionError::Internal(format!( @@ -273,6 +280,15 @@ impl DFSchema { .collect(), } } + + /// Get comma-seperated list of field names for use in error messages + fn get_field_names(&self) -> String { + self.fields + .iter() + .map(|f| format!("'{}'", f.name())) + .collect::>() + .join(", ") + } } impl Into for DFSchema { @@ -585,6 +601,28 @@ mod tests { Ok(()) } + #[test] + fn helpful_error_messages() -> Result<()> { + let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; + let expected_help = "Valid fields are \'c0\', \'c1\'."; + assert!(schema + .field_with_qualified_name("x", "y") + .unwrap_err() + .to_string() + .contains(expected_help)); + assert!(schema + .field_with_unqualified_name("y") + .unwrap_err() + .to_string() + .contains(expected_help)); + assert!(schema + .index_of("y") + .unwrap_err() + .to_string() + .contains(expected_help)); + Ok(()) + } + #[test] fn into() { // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 17181230c26cc..0691ce6c30e60 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1650,7 +1650,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), )); } @@ -1708,7 +1708,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), )); } @@ -1718,7 +1718,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'x'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'x'"), )); } @@ -2189,7 +2189,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), )); } @@ -2279,7 +2279,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), )); } @@ -2289,7 +2289,7 @@ mod tests { 
let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg == "No field with unqualified name 'doesnotexist'", + DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), )); } From ffb195c8634fc537127eff1a082811d28e8fcc2b Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 27 Jun 2021 18:58:07 +0800 Subject: [PATCH 215/329] fix 593 (#610) --- .../core/src/serde/logical_plan/from_proto.rs | 18 +++---- datafusion/src/execution/context.rs | 4 +- datafusion/src/execution/dataframe_impl.rs | 22 ++++---- datafusion/src/logical_plan/builder.rs | 36 ++++++------- datafusion/src/optimizer/constant_folding.rs | 28 +++++----- datafusion/src/optimizer/eliminate_limit.rs | 6 +-- datafusion/src/optimizer/filter_push_down.rs | 54 +++++++++---------- .../src/optimizer/hash_build_probe_order.rs | 3 +- datafusion/src/optimizer/limit_push_down.rs | 12 ++--- .../src/optimizer/projection_push_down.rs | 28 +++++----- .../src/optimizer/simplify_expressions.rs | 4 +- datafusion/src/optimizer/utils.rs | 2 +- datafusion/src/sql/planner.rs | 54 +++++++++---------- datafusion/tests/custom_sources.rs | 2 +- 14 files changed, 137 insertions(+), 136 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 15ee50733ecaf..a1136cf4a7d6e 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -61,14 +61,14 @@ impl TryInto for &protobuf::LogicalPlanNode { .iter() .map(|expr| expr.try_into()) .collect::, _>>()?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .project(x)? .build() .map_err(|e| e.into()) } LogicalPlanType::Selection(selection) => { let input: LogicalPlan = convert_box_required!(selection.input)?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .filter( selection .expr @@ -86,7 +86,7 @@ impl TryInto for &protobuf::LogicalPlanNode { .iter() .map(|expr| expr.try_into()) .collect::, _>>()?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .window(window_expr)? .build() .map_err(|e| e.into()) @@ -103,7 +103,7 @@ impl TryInto for &protobuf::LogicalPlanNode { .iter() .map(|expr| expr.try_into()) .collect::, _>>()?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .aggregate(group_expr, aggr_expr)? .build() .map_err(|e| e.into()) @@ -172,7 +172,7 @@ impl TryInto for &protobuf::LogicalPlanNode { .iter() .map(|expr| expr.try_into()) .collect::, _>>()?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .sort(sort_expr)? .build() .map_err(|e| e.into()) @@ -203,7 +203,7 @@ impl TryInto for &protobuf::LogicalPlanNode { } }; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .repartition(partitioning_scheme)? .build() .map_err(|e| e.into()) @@ -233,14 +233,14 @@ impl TryInto for &protobuf::LogicalPlanNode { } LogicalPlanType::Explain(explain) => { let input: LogicalPlan = convert_box_required!(explain.input)?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .explain(explain.verbose)? .build() .map_err(|e| e.into()) } LogicalPlanType::Limit(limit) => { let input: LogicalPlan = convert_box_required!(limit.input)?; - LogicalPlanBuilder::from(&input) + LogicalPlanBuilder::from(input) .limit(limit.limit as usize)? 
.build() .map_err(|e| e.into()) @@ -265,7 +265,7 @@ impl TryInto for &protobuf::LogicalPlanNode { protobuf::JoinType::Semi => JoinType::Semi, protobuf::JoinType::Anti => JoinType::Anti, }; - LogicalPlanBuilder::from(&convert_box_required!(join.left)?) + LogicalPlanBuilder::from(convert_box_required!(join.left)?) .join( &convert_box_required!(join.right)?, join_type, diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 165263084cc7c..8ce408de86a5b 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1073,7 +1073,7 @@ mod tests { let ctx = create_ctx(&tmp_dir, partition_count)?; let table = ctx.table("test")?; - let logical_plan = LogicalPlanBuilder::from(&table.to_logical_plan()) + let logical_plan = LogicalPlanBuilder::from(table.to_logical_plan()) .project(vec![col("c2")])? .build()?; @@ -2566,7 +2566,7 @@ mod tests { let t = ctx.table("t")?; - let plan = LogicalPlanBuilder::from(&t.to_logical_plan()) + let plan = LogicalPlanBuilder::from(t.to_logical_plan()) .project(vec![ col("a"), col("b"), diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 99eb7f077c96a..7cf779740c473 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -63,7 +63,7 @@ impl DataFrame for DataFrameImpl { /// Create a projection based on arbitrary expressions fn select(&self, expr_list: Vec) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .project(expr_list)? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) @@ -71,7 +71,7 @@ impl DataFrame for DataFrameImpl { /// Create a filter based on a predicate expression fn filter(&self, predicate: Expr) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .filter(predicate)? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) @@ -83,7 +83,7 @@ impl DataFrame for DataFrameImpl { group_expr: Vec, aggr_expr: Vec, ) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .aggregate(group_expr, aggr_expr)? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) @@ -91,13 +91,17 @@ impl DataFrame for DataFrameImpl { /// Limit the number of rows fn limit(&self, n: usize) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan).limit(n)?.build()?; + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) + .limit(n)? + .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) } /// Sort by specified sorting expressions fn sort(&self, expr: Vec) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan).sort(expr)?.build()?; + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) + .sort(expr)? 
+ .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) } @@ -109,7 +113,7 @@ impl DataFrame for DataFrameImpl { left_cols: &[&str], right_cols: &[&str], ) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .join( &right.to_logical_plan(), join_type, @@ -124,7 +128,7 @@ impl DataFrame for DataFrameImpl { &self, partitioning_scheme: Partitioning, ) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .repartition(partitioning_scheme)? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) @@ -161,7 +165,7 @@ impl DataFrame for DataFrameImpl { } fn explain(&self, verbose: bool) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .explain(verbose)? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) @@ -173,7 +177,7 @@ impl DataFrame for DataFrameImpl { } fn union(&self, dataframe: Arc) -> Result> { - let plan = LogicalPlanBuilder::from(&self.plan) + let plan = LogicalPlanBuilder::from(self.to_logical_plan()) .union(dataframe.to_logical_plan())? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index ced77ba6c6f68..17fe6636439c7 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -89,15 +89,15 @@ pub struct LogicalPlanBuilder { impl LogicalPlanBuilder { /// Create a builder from an existing plan - pub fn from(plan: &LogicalPlan) -> Self { - Self { plan: plan.clone() } + pub fn from(plan: LogicalPlan) -> Self { + Self { plan } } /// Create an empty relation. /// /// `produce_one_row` set to true means this empty node needs to produce a placeholder row. pub fn empty(produce_one_row: bool) -> Self { - Self::from(&LogicalPlan::EmptyRelation { + Self::from(LogicalPlan::EmptyRelation { produce_one_row, schema: DFSchemaRef::new(DFSchema::empty()), }) @@ -202,7 +202,7 @@ impl LogicalPlanBuilder { limit: None, }; - Ok(Self::from(&table_scan)) + Ok(Self::from(table_scan)) } /// Apply a projection. 
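// A minimal sketch (not part of this patch) of the by-value builder API this commit
// introduces: `LogicalPlanBuilder::from` now consumes its `LogicalPlan`, so a plan that
// feeds more than one builder is cloned first, exactly as the updated optimizer tests
// below do. `table_scan` is assumed to be any already-built `LogicalPlan` (for example
// the result of `scan_csv`); the column names "a" and "b" are placeholders.
use datafusion::error::Result;
use datafusion::logical_plan::{col, lit, LogicalPlan, LogicalPlanBuilder};

fn split_plan(table_scan: LogicalPlan) -> Result<(LogicalPlan, LogicalPlan)> {
    // the scan is reused below, so the first builder takes a clone
    let filtered = LogicalPlanBuilder::from(table_scan.clone())
        .filter(col("a").eq(lit(1i64)))?
        .build()?;
    // the last use can hand over ownership outright
    let projected = LogicalPlanBuilder::from(table_scan)
        .project(vec![col("a"), col("b")])?
        .build()?;
    Ok((filtered, projected))
}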
@@ -234,7 +234,7 @@ impl LogicalPlanBuilder { let schema = DFSchema::new(exprlist_to_fields(&projected_expr, input_schema)?)?; - Ok(Self::from(&LogicalPlan::Projection { + Ok(Self::from(LogicalPlan::Projection { expr: projected_expr, input: Arc::new(self.plan.clone()), schema: DFSchemaRef::new(schema), @@ -244,7 +244,7 @@ impl LogicalPlanBuilder { /// Apply a filter pub fn filter(&self, expr: Expr) -> Result { let expr = normalize_col(expr, &self.plan.all_schemas())?; - Ok(Self::from(&LogicalPlan::Filter { + Ok(Self::from(LogicalPlan::Filter { predicate: expr, input: Arc::new(self.plan.clone()), })) @@ -252,7 +252,7 @@ impl LogicalPlanBuilder { /// Apply a limit pub fn limit(&self, n: usize) -> Result { - Ok(Self::from(&LogicalPlan::Limit { + Ok(Self::from(LogicalPlan::Limit { n, input: Arc::new(self.plan.clone()), })) @@ -261,7 +261,7 @@ impl LogicalPlanBuilder { /// Apply a sort pub fn sort(&self, exprs: impl IntoIterator) -> Result { let schemas = self.plan.all_schemas(); - Ok(Self::from(&LogicalPlan::Sort { + Ok(Self::from(LogicalPlan::Sort { expr: normalize_cols(exprs, &schemas)?, input: Arc::new(self.plan.clone()), })) @@ -269,11 +269,7 @@ impl LogicalPlanBuilder { /// Apply a union pub fn union(&self, plan: LogicalPlan) -> Result { - Ok(Self::from(&union_with_alias( - self.plan.clone(), - plan, - None, - )?)) + Ok(Self::from(union_with_alias(self.plan.clone(), plan, None)?)) } /// Apply a join with on constraint @@ -307,7 +303,7 @@ impl LogicalPlanBuilder { &JoinConstraint::On, )?; - Ok(Self::from(&LogicalPlan::Join { + Ok(Self::from(LogicalPlan::Join { left: Arc::new(self.plan.clone()), right: Arc::new(right.clone()), on, @@ -343,7 +339,7 @@ impl LogicalPlanBuilder { &JoinConstraint::Using, )?; - Ok(Self::from(&LogicalPlan::Join { + Ok(Self::from(LogicalPlan::Join { left: Arc::new(self.plan.clone()), right: Arc::new(right.clone()), on, @@ -356,7 +352,7 @@ impl LogicalPlanBuilder { /// Apply a cross join pub fn cross_join(&self, right: &LogicalPlan) -> Result { let schema = self.plan.schema().join(right.schema())?; - Ok(Self::from(&LogicalPlan::CrossJoin { + Ok(Self::from(LogicalPlan::CrossJoin { left: Arc::new(self.plan.clone()), right: Arc::new(right.clone()), schema: DFSchemaRef::new(schema), @@ -365,7 +361,7 @@ impl LogicalPlanBuilder { /// Repartition pub fn repartition(&self, partitioning_scheme: Partitioning) -> Result { - Ok(Self::from(&LogicalPlan::Repartition { + Ok(Self::from(LogicalPlan::Repartition { input: Arc::new(self.plan.clone()), partitioning_scheme, })) @@ -379,7 +375,7 @@ impl LogicalPlanBuilder { let mut window_fields: Vec = exprlist_to_fields(all_expr, self.plan.schema())?; window_fields.extend_from_slice(self.plan.schema().fields()); - Ok(Self::from(&LogicalPlan::Window { + Ok(Self::from(LogicalPlan::Window { input: Arc::new(self.plan.clone()), window_expr, schema: Arc::new(DFSchema::new(window_fields)?), @@ -404,7 +400,7 @@ impl LogicalPlanBuilder { let aggr_schema = DFSchema::new(exprlist_to_fields(all_expr, self.plan.schema())?)?; - Ok(Self::from(&LogicalPlan::Aggregate { + Ok(Self::from(LogicalPlan::Aggregate { input: Arc::new(self.plan.clone()), group_expr, aggr_expr, @@ -421,7 +417,7 @@ impl LogicalPlanBuilder { let schema = LogicalPlan::explain_schema(); - Ok(Self::from(&LogicalPlan::Explain { + Ok(Self::from(LogicalPlan::Explain { verbose, plan: Arc::new(self.plan.clone()), stringified_plans, diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 956f74adc28f7..79833df66129d 100644 --- 
a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -544,7 +544,7 @@ mod tests { #[test] fn optimize_plan_eq_expr() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("b").eq(lit(true)))? .filter(col("c").eq(lit(false)))? .project(vec![col("a")])? @@ -563,7 +563,7 @@ mod tests { #[test] fn optimize_plan_not_eq_expr() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("b").not_eq(lit(true)))? .filter(col("c").not_eq(lit(false)))? .limit(1)? @@ -584,7 +584,7 @@ mod tests { #[test] fn optimize_plan_and_expr() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("b").not_eq(lit(true)).and(col("c").eq(lit(true))))? .project(vec![col("a")])? .build()?; @@ -601,7 +601,7 @@ mod tests { #[test] fn optimize_plan_or_expr() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("b").not_eq(lit(true)).or(col("c").eq(lit(false))))? .project(vec![col("a")])? .build()?; @@ -618,7 +618,7 @@ mod tests { #[test] fn optimize_plan_not_expr() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("b").eq(lit(false)).not())? .project(vec![col("a")])? .build()?; @@ -635,7 +635,7 @@ mod tests { #[test] fn optimize_plan_support_projection() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("d"), col("b").eq(lit(false))])? .build()?; @@ -650,7 +650,7 @@ mod tests { #[test] fn optimize_plan_support_aggregate() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("c"), col("b")])? 
.aggregate( vec![col("a"), col("c")], @@ -691,7 +691,7 @@ mod tests { )))], fun: BuiltinScalarFunction::ToTimestamp, }]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -713,7 +713,7 @@ mod tests { )))], fun: BuiltinScalarFunction::ToTimestamp, }]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -732,7 +732,7 @@ mod tests { args: vec![], fun: BuiltinScalarFunction::ToTimestamp, }]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -751,7 +751,7 @@ mod tests { expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some("0".to_string())))), data_type: DataType::Int32, }]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -770,7 +770,7 @@ mod tests { expr: Box::new(Expr::Literal(ScalarValue::Utf8(Some("".to_string())))), data_type: DataType::Int32, }]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -790,7 +790,7 @@ mod tests { fun: BuiltinScalarFunction::Now, }]; let time = chrono::Utc::now(); - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() @@ -823,7 +823,7 @@ mod tests { "t2".to_string(), ), ]; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(proj) .unwrap() .build() diff --git a/datafusion/src/optimizer/eliminate_limit.rs b/datafusion/src/optimizer/eliminate_limit.rs index 4b5a634889a79..bf3f2b3be283e 100644 --- a/datafusion/src/optimizer/eliminate_limit.rs +++ b/datafusion/src/optimizer/eliminate_limit.rs @@ -88,7 +88,7 @@ mod tests { #[test] fn limit_0_root() { let table_scan = test_table_scan().unwrap(); - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a")], vec![sum(col("b"))]) .unwrap() .limit(0) @@ -104,12 +104,12 @@ mod tests { #[test] fn limit_0_nested() { let table_scan = test_table_scan().unwrap(); - let plan1 = LogicalPlanBuilder::from(&table_scan) + let plan1 = LogicalPlanBuilder::from(table_scan.clone()) .aggregate(vec![col("a")], vec![sum(col("b"))]) .unwrap() .build() .unwrap(); - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a")], vec![sum(col("b"))]) .unwrap() .limit(0) diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index e5f8dcfbfffd6..7b1ff326c3c6e 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -483,7 +483,7 @@ mod tests { #[test] fn filter_before_projection() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .filter(col("a").eq(lit(1i64)))? .build()?; @@ -499,7 +499,7 @@ mod tests { #[test] fn filter_after_limit() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .limit(10)? .filter(col("a").eq(lit(1i64)))? 
@@ -517,7 +517,7 @@ mod tests { #[test] fn filter_no_columns() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(lit(0i64).eq(lit(1i64)))? .build()?; let expected = "\ @@ -530,7 +530,7 @@ mod tests { #[test] fn filter_jump_2_plans() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b"), col("c")])? .project(vec![col("c"), col("b")])? .filter(col("a").eq(lit(1i64)))? @@ -548,7 +548,7 @@ mod tests { #[test] fn filter_move_agg() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_salary")])? .filter(col("a").gt(lit(10i64)))? .build()?; @@ -564,7 +564,7 @@ mod tests { #[test] fn filter_keep_agg() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a")], vec![sum(col("b")).alias("b")])? .filter(col("b").gt(lit(10i64)))? .build()?; @@ -581,7 +581,7 @@ mod tests { #[test] fn alias() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a").alias("b"), col("c")])? .filter(col("b").eq(lit(1i64)))? .build()?; @@ -614,7 +614,7 @@ mod tests { #[test] fn complex_expression() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ add(multiply(col("a"), lit(2)), col("c")).alias("b"), col("c"), @@ -644,7 +644,7 @@ mod tests { #[test] fn complex_plan() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ add(multiply(col("a"), lit(2)), col("c")).alias("b"), col("c"), @@ -680,7 +680,7 @@ mod tests { fn multi_filter() -> Result<()> { // the aggregation allows one filter to pass (b), and the other one to not pass (SUM(c)) let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a").alias("b"), col("c")])? .aggregate(vec![col("b")], vec![sum(col("c"))])? .filter(col("b").gt(lit(10i64)))? @@ -716,7 +716,7 @@ mod tests { fn split_filter() -> Result<()> { // the aggregation allows one filter to pass (b), and the other one to not pass (SUM(c)) let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a").alias("b"), col("c")])? .aggregate(vec![col("b")], vec![sum(col("c"))])? .filter(and( @@ -751,7 +751,7 @@ mod tests { #[test] fn double_limit() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .limit(20)? .limit(10)? @@ -773,8 +773,8 @@ mod tests { #[test] fn union_all() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) - .union(LogicalPlanBuilder::from(&table_scan).build()?)? 
+ let plan = LogicalPlanBuilder::from(table_scan.clone()) + .union(LogicalPlanBuilder::from(table_scan).build()?)? .filter(col("a").eq(lit(1i64)))? .build()?; // filter appears below Union @@ -792,7 +792,7 @@ mod tests { #[test] fn filter_2_breaks_limits() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a")])? .filter(col("a").lt_eq(lit(1i64)))? .limit(1)? @@ -828,7 +828,7 @@ mod tests { #[test] fn two_filters_on_same_depth() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .limit(1)? .filter(col("a").lt_eq(lit(1i64)))? .filter(col("a").gt_eq(lit(1i64)))? @@ -860,7 +860,7 @@ mod tests { #[test] fn filters_user_defined_node() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("a").lt_eq(lit(1i64)))? .build()?; @@ -882,11 +882,11 @@ mod tests { #[test] fn filter_join_on_common_independent() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(&table_scan).build()?; - let right = LogicalPlanBuilder::from(&table_scan) + let left = LogicalPlanBuilder::from(table_scan.clone()).build()?; + let right = LogicalPlanBuilder::from(table_scan) .project(vec![col("a")])? .build()?; - let plan = LogicalPlanBuilder::from(&left) + let plan = LogicalPlanBuilder::from(left) .join( &right, JoinType::Inner, @@ -923,13 +923,13 @@ mod tests { #[test] fn filter_join_on_common_dependent() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(&table_scan) + let left = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![col("a"), col("c")])? .build()?; - let right = LogicalPlanBuilder::from(&table_scan) + let right = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .build()?; - let plan = LogicalPlanBuilder::from(&left) + let plan = LogicalPlanBuilder::from(left) .join( &right, JoinType::Inner, @@ -962,13 +962,13 @@ mod tests { #[test] fn filter_join_on_one_side() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(&table_scan) + let left = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![col("a"), col("b")])? .build()?; - let right = LogicalPlanBuilder::from(&table_scan) + let right = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("c")])? .build()?; - let plan = LogicalPlanBuilder::from(&left) + let plan = LogicalPlanBuilder::from(left) .join( &right, JoinType::Inner, @@ -1060,7 +1060,7 @@ mod tests { limit: None, }; - LogicalPlanBuilder::from(&table_scan) + LogicalPlanBuilder::from(table_scan) .filter(col("a").eq(lit(1i64)))? 
.build() } diff --git a/datafusion/src/optimizer/hash_build_probe_order.rs b/datafusion/src/optimizer/hash_build_probe_order.rs index a2a99ae364a70..ecb3b40e32032 100644 --- a/datafusion/src/optimizer/hash_build_probe_order.rs +++ b/datafusion/src/optimizer/hash_build_probe_order.rs @@ -166,7 +166,8 @@ impl OptimizerRule for HashBuildProbeOrder { let left = self.optimize(left, execution_props)?; let right = self.optimize(right, execution_props)?; if should_swap_join_order(&left, &right) { - let swapped = LogicalPlanBuilder::from(&right).cross_join(&left)?; + let swapped = + LogicalPlanBuilder::from(right.clone()).cross_join(&left)?; // wrap plan with projection to maintain column order let left_cols = left .schema() diff --git a/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs index afd993710a5f5..21b82a6c9698a 100644 --- a/datafusion/src/optimizer/limit_push_down.rs +++ b/datafusion/src/optimizer/limit_push_down.rs @@ -155,7 +155,7 @@ mod test { fn limit_pushdown_projection_table_provider() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a")])? .limit(1000)? .build()?; @@ -174,7 +174,7 @@ mod test { fn limit_push_down_take_smaller_limit() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .limit(1000)? .limit(10)? .build()?; @@ -195,7 +195,7 @@ mod test { fn limit_doesnt_push_down_aggregation() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a")], vec![max(col("b"))])? .limit(1000)? .build()?; @@ -214,8 +214,8 @@ mod test { fn limit_should_push_down_union() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) - .union(LogicalPlanBuilder::from(&table_scan).build()?)? + let plan = LogicalPlanBuilder::from(table_scan.clone()) + .union(LogicalPlanBuilder::from(table_scan).build()?)? .limit(1000)? .build()?; @@ -236,7 +236,7 @@ mod test { fn multi_stage_limit_recurses_to_deeper_limit() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .limit(1000)? .aggregate(vec![col("a")], vec![max(col("b"))])? .limit(10)? diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 2cd7384e24439..4bf2b6e797f8e 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -259,7 +259,7 @@ fn optimize_plan( &mut new_required_columns, )?; - LogicalPlanBuilder::from(&optimize_plan( + LogicalPlanBuilder::from(optimize_plan( optimizer, input, &new_required_columns, @@ -452,7 +452,7 @@ mod tests { fn aggregate_no_group_by() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![], vec![max(col("b"))])? .build()?; @@ -468,7 +468,7 @@ mod tests { fn aggregate_group_by() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("c")], vec![max(col("b"))])? 
.build()?; @@ -484,7 +484,7 @@ mod tests { fn aggregate_no_group_by_with_filter() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .filter(col("c"))? .aggregate(vec![], vec![max(col("b"))])? .build()?; @@ -506,7 +506,7 @@ mod tests { let table2_scan = LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .join(&table2_scan, JoinType::Left, vec!["a"], vec!["c1"])? .project(vec![col("a"), col("b"), col("c1")])? .build()?; @@ -539,7 +539,7 @@ mod tests { fn cast() -> Result<()> { let table_scan = test_table_scan()?; - let projection = LogicalPlanBuilder::from(&table_scan) + let projection = LogicalPlanBuilder::from(table_scan) .project(vec![Expr::Cast { expr: Box::new(col("c")), data_type: DataType::Float64, @@ -560,7 +560,7 @@ mod tests { assert_eq!(3, table_scan.schema().fields().len()); assert_fields_eq(&table_scan, vec!["a", "b", "c"]); - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .build()?; @@ -609,7 +609,7 @@ mod tests { assert_eq!(3, table_scan.schema().fields().len()); assert_fields_eq(&table_scan, vec!["a", "b", "c"]); - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("c"), col("a")])? .limit(5)? .build()?; @@ -628,7 +628,7 @@ mod tests { #[test] fn table_scan_without_projection() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan).build()?; + let plan = LogicalPlanBuilder::from(table_scan).build()?; // should expand projection to all columns without projection let expected = "TableScan: test projection=Some([0, 1, 2])"; assert_optimized_plan_eq(&plan, expected); @@ -638,7 +638,7 @@ mod tests { #[test] fn table_scan_with_literal_projection() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![lit(1_i64), lit(2_i64)])? .build()?; let expected = "Projection: Int64(1), Int64(2)\ @@ -655,7 +655,7 @@ mod tests { assert_fields_eq(&table_scan, vec!["a", "b", "c"]); // we never use "b" in the first projection => remove it - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("c"), col("a"), col("b")])? .filter(col("c").gt(lit(1)))? .aggregate(vec![col("c")], vec![max(col("a"))])? @@ -682,7 +682,7 @@ mod tests { assert_fields_eq(&table_scan, vec!["a", "b", "c"]); // there is no need for the first projection - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("b")])? .project(vec![lit(1).alias("a")])? .build()?; @@ -703,7 +703,7 @@ mod tests { fn test_double_optimization() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("b")])? .project(vec![lit(1).alias("a")])? .build()?; @@ -726,7 +726,7 @@ mod tests { assert_fields_eq(&table_scan, vec!["a", "b", "c"]); // we never use "min(b)" => remove it - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("a"), col("c")], vec![max(col("b")), min(col("b"))])? .filter(col("c").gt(lit(1)))? 
.project(vec![col("c"), col("a"), col("MAX(test.b)")])? diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs index 4253d2fd4f00c..0e65de07305ff 100644 --- a/datafusion/src/optimizer/simplify_expressions.rs +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -502,7 +502,7 @@ mod tests { #[test] fn test_simplify_optimized_plan() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a")])? .filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))? .build()?; @@ -521,7 +521,7 @@ mod tests { #[test] fn test_simplify_optimized_plan_with_composed_and() -> Result<()> { let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(&table_scan) + let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("a")])? .filter(and( and(col("a").gt(lit(5)), col("b").lt(lit(6))), diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 76f44b84657ca..394308f5af801 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -232,7 +232,7 @@ pub fn from_plan( }) } LogicalPlan::CrossJoin { .. } => { - let left = &inputs[0]; + let left = inputs[0].clone(); let right = &inputs[1]; LogicalPlanBuilder::from(left).cross_join(right)?.build() } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 0691ce6c30e60..a2ff456ca451c 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -143,9 +143,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } let plan = self.set_expr_to_plan(set_expr, alias, ctes)?; - let plan = self.order_by(&plan, &query.order_by)?; + let plan = self.order_by(plan, &query.order_by)?; - self.limit(&plan, &query.limit) + self.limit(plan, &query.limit) } fn set_expr_to_plan( @@ -309,9 +309,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match t.joins.len() { 0 => Ok(left), n => { - let mut left = self.parse_relation_join(&left, &t.joins[0], ctes)?; + let mut left = self.parse_relation_join(left, &t.joins[0], ctes)?; for i in 1..n { - left = self.parse_relation_join(&left, &t.joins[i], ctes)?; + left = self.parse_relation_join(left, &t.joins[i], ctes)?; } Ok(left) } @@ -320,7 +320,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn parse_relation_join( &self, - left: &LogicalPlan, + left: LogicalPlan, join: &Join, ctes: &mut HashMap, ) -> Result { @@ -347,7 +347,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } fn parse_cross_join( &self, - left: &LogicalPlan, + left: LogicalPlan, right: &LogicalPlan, ) -> Result { LogicalPlanBuilder::from(left).cross_join(right)?.build() @@ -355,7 +355,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn parse_join( &self, - left: &LogicalPlan, + left: LogicalPlan, right: &LogicalPlan, constraint: &JoinConstraint, join_type: JoinType, @@ -489,13 +489,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } if join_keys.is_empty() { left = - LogicalPlanBuilder::from(&left).cross_join(right)?.build()?; + LogicalPlanBuilder::from(left).cross_join(right)?.build()?; } else { let left_keys: Vec = join_keys.iter().map(|(l, _)| l.clone()).collect(); let right_keys: Vec = join_keys.iter().map(|(_, r)| r.clone()).collect(); - let builder = LogicalPlanBuilder::from(&left); + let builder = LogicalPlanBuilder::from(left); left = builder .join(right, JoinType::Inner, left_keys, right_keys)? 
.build()?; @@ -507,7 +507,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // remove join expressions from filter match remove_join_expressions(&filter_expr, &all_join_keys)? { Some(filter_expr) => { - LogicalPlanBuilder::from(&left).filter(filter_expr)?.build() + LogicalPlanBuilder::from(left).filter(filter_expr)?.build() } _ => Ok(left), } @@ -519,7 +519,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut left = plans[0].clone(); for right in plans.iter().skip(1) { left = - LogicalPlanBuilder::from(&left).cross_join(right)?.build()?; + LogicalPlanBuilder::from(left).cross_join(right)?.build()?; } Ok(left) } @@ -531,7 +531,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let select_exprs = self.prepare_select_exprs(&plan, &select.projection)?; // having and group by clause may reference aliases defined in select projection - let projected_plan = self.project(&plan, select_exprs.clone())?; + let projected_plan = self.project(plan.clone(), select_exprs.clone())?; let mut combined_schema = (**projected_plan.schema()).clone(); combined_schema.merge(plan.schema()); @@ -597,7 +597,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { || !aggr_exprs.is_empty() { self.aggregate( - &plan, + plan, &select_exprs, &having_expr_opt, group_by_exprs, @@ -625,7 +625,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }; let plan = if let Some(having_expr_post_aggr) = having_expr_post_aggr_opt { - LogicalPlanBuilder::from(&plan) + LogicalPlanBuilder::from(plan) .filter(having_expr_post_aggr)? .build()? } else { @@ -642,14 +642,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }; let plan = if select.distinct { - return LogicalPlanBuilder::from(&plan) + return LogicalPlanBuilder::from(plan) .aggregate(select_exprs_post_aggr, vec![])? .build(); } else { plan }; - self.project(&plan, select_exprs_post_aggr) + self.project(plan, select_exprs_post_aggr) } /// Returns the `Expr`'s corresponding to a SQL query's SELECT expressions. @@ -672,7 +672,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Wrap a plan in a projection - fn project(&self, input: &LogicalPlan, expr: Vec) -> Result { + fn project(&self, input: LogicalPlan, expr: Vec) -> Result { self.validate_schema_satisfies_exprs(input.schema(), &expr)?; LogicalPlanBuilder::from(input).project(expr)?.build() } @@ -691,7 +691,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let window_exprs = exprs.into_iter().cloned().collect::>(); // the partition and sort itself is done at physical level, see physical_planner's // fn create_initial_plan - plan = LogicalPlanBuilder::from(&plan) + plan = LogicalPlanBuilder::from(plan) .window(window_exprs)? .build()?; } @@ -702,7 +702,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Wrap a plan in an aggregate fn aggregate( &self, - input: &LogicalPlan, + input: LogicalPlan, select_exprs: &[Expr], having_expr_opt: &Option, group_by_exprs: Vec, @@ -714,7 +714,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .cloned() .collect::>(); - let plan = LogicalPlanBuilder::from(input) + let plan = LogicalPlanBuilder::from(input.clone()) .aggregate(group_by_exprs, aggr_exprs)? .build()?; @@ -722,14 +722,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // available to next phases of planning. let column_exprs_post_aggr = aggr_projection_exprs .iter() - .map(|expr| expr_as_column_expr(expr, input)) + .map(|expr| expr_as_column_expr(expr, &input)) .collect::>>()?; // Rewrite the SELECT expression to use the columns produced by the // aggregation. 
let select_exprs_post_aggr = select_exprs .iter() - .map(|expr| rebase_expr(expr, &aggr_projection_exprs, input)) + .map(|expr| rebase_expr(expr, &aggr_projection_exprs, &input)) .collect::>>()?; if !can_columns_satisfy_exprs(&column_exprs_post_aggr, &select_exprs_post_aggr)? { @@ -742,7 +742,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // aggregation. let having_expr_post_aggr_opt = if let Some(having_expr) = having_expr_opt { let having_expr_post_aggr = - rebase_expr(having_expr, &aggr_projection_exprs, input)?; + rebase_expr(having_expr, &aggr_projection_exprs, &input)?; if !can_columns_satisfy_exprs( &column_exprs_post_aggr, @@ -762,7 +762,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } /// Wrap a plan in a limit - fn limit(&self, input: &LogicalPlan, limit: &Option) -> Result { + fn limit(&self, input: LogicalPlan, limit: &Option) -> Result { match *limit { Some(ref limit_expr) => { let n = match self.sql_to_rex(limit_expr, input.schema())? { @@ -774,18 +774,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { LogicalPlanBuilder::from(input).limit(n)?.build() } - _ => Ok(input.clone()), + _ => Ok(input), } } /// Wrap the logical in a sort fn order_by( &self, - plan: &LogicalPlan, + plan: LogicalPlan, order_by: &[OrderByExpr], ) -> Result { if order_by.is_empty() { - return Ok(plan.clone()); + return Ok(plan); } let order_by_rex = order_by diff --git a/datafusion/tests/custom_sources.rs b/datafusion/tests/custom_sources.rs index 75fbe8e8eede7..36adbea1be0eb 100644 --- a/datafusion/tests/custom_sources.rs +++ b/datafusion/tests/custom_sources.rs @@ -178,7 +178,7 @@ async fn custom_source_dataframe() -> Result<()> { let mut ctx = ExecutionContext::new(); let table = ctx.read_table(Arc::new(CustomTableProvider))?; - let logical_plan = LogicalPlanBuilder::from(&table.to_logical_plan()) + let logical_plan = LogicalPlanBuilder::from(table.to_logical_plan()) .project(vec![col("c2")])? .build()?; From 27dc5d6253d4a0770a21e86032b6408eac24c4b9 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 27 Jun 2021 19:22:53 +0800 Subject: [PATCH 216/329] fix 621, where unnamed window functions shall be differentiated by partition and order by clause (#622) --- datafusion/src/logical_plan/expr.rs | 25 +++++++++++++------ datafusion/src/sql/planner.rs | 37 +++++++++++++++++++---------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 622b7a4ec4ae4..d20b1f698238c 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1594,14 +1594,25 @@ fn create_name(e: &Expr, input_schema: &DFSchema) -> Result { fun, args, window_frame, - .. 
+ partition_by, + order_by, } => { - let fun_name = - create_function_name(&fun.to_string(), false, args, input_schema)?; - Ok(match window_frame { - Some(window_frame) => format!("{} {}", fun_name, window_frame), - None => fun_name, - }) + let mut parts: Vec = vec![create_function_name( + &fun.to_string(), + false, + args, + input_schema, + )?]; + if !partition_by.is_empty() { + parts.push(format!("PARTITION BY {:?}", partition_by)); + } + if !order_by.is_empty() { + parts.push(format!("ORDER BY {:?}", order_by)); + } + if let Some(window_frame) = window_frame { + parts.push(format!("{}", window_frame)); + } + Ok(parts.join(" ")) } Expr::AggregateFunction { fun, diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a2ff456ca451c..b86dc0f48c149 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -2823,6 +2823,17 @@ mod tests { quick_test(sql, expected); } + #[test] + fn empty_over_dup_with_different_sort() { + let sql = "SELECT order_id oid, MAX(order_id) OVER (), MAX(order_id) OVER (ORDER BY order_id) from orders"; + let expected = "\ + Projection: #orders.order_id AS oid, #MAX(orders.order_id), #MAX(orders.order_id) ORDER BY [#orders.order_id ASC NULLS FIRST]\ + \n WindowAggr: windowExpr=[[MAX(#orders.order_id)]]\ + \n WindowAggr: windowExpr=[[MAX(#orders.order_id) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn empty_over_plus() { let sql = "SELECT order_id, MAX(qty * 1.1) OVER () from orders"; @@ -2857,7 +2868,7 @@ mod tests { fn over_partition_by() { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) PARTITION BY [#orders.order_id]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id]]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -2879,7 +2890,7 @@ mod tests { fn over_order_by() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST], #MIN(orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -2890,7 +2901,7 @@ mod tests { fn over_order_by_with_window_frame_double_end() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS BETWEEN 3 PRECEDING and 3 FOLLOWING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING, #MIN(orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -2901,7 +2912,7 @@ mod tests { fn over_order_by_with_window_frame_single_end() { 
let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id ROWS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty) ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] ROWS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -2944,7 +2955,7 @@ mod tests { fn over_order_by_with_window_frame_single_end_groups() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id GROUPS 3 PRECEDING), MIN(qty) OVER (ORDER BY order_id DESC) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty) GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW, #MIN(orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST] GROUPS BETWEEN 3 PRECEDING AND CURRENT ROW]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id DESC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -2967,7 +2978,7 @@ mod tests { fn over_order_by_two_sort_keys() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), MIN(qty) OVER (ORDER BY (order_id + 1)) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST], #MIN(orders.qty) ORDER BY [#orders.order_id Plus Int64(1) ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id Plus Int64(1) ASC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -2991,7 +3002,7 @@ mod tests { fn over_order_by_sort_keys_sorting() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST], #SUM(orders.qty), #MIN(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ @@ -3016,7 +3027,7 @@ mod tests { fn over_order_by_sort_keys_sorting_prefix_compacting() { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST], #SUM(orders.qty), #MIN(orders.qty) ORDER BY [#orders.order_id ASC 
NULLS FIRST, #orders.qty ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ @@ -3045,7 +3056,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (ORDER BY qty, order_id), SUM(qty) OVER (), MIN(qty) OVER (ORDER BY order_id, qty) from orders ORDER BY order_id"; let expected = "\ Sort: #orders.order_id ASC NULLS FIRST\ - \n Projection: #orders.order_id, #MAX(orders.qty), #SUM(orders.qty), #MIN(orders.qty)\ + \n Projection: #orders.order_id, #MAX(orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST], #SUM(orders.qty), #MIN(orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[SUM(#orders.qty)]]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) ORDER BY [#orders.qty ASC NULLS FIRST, #orders.order_id ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) ORDER BY [#orders.order_id ASC NULLS FIRST, #orders.qty ASC NULLS FIRST]]]\ @@ -3067,7 +3078,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) PARTITION BY [#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -3087,7 +3098,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ \n TableScan: orders projection=None"; quick_test(sql, expected); @@ -3110,7 +3121,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id, qty ORDER BY qty), MIN(qty) OVER (PARTITION BY qty ORDER BY order_id) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST], #MIN(orders.qty) PARTITION BY [#orders.qty] ORDER BY [#orders.order_id ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) PARTITION BY [#orders.qty] ORDER BY [#orders.order_id ASC NULLS FIRST]]]\ \n TableScan: orders projection=None"; @@ -3133,7 +3144,7 @@ mod tests { let sql = "SELECT order_id, MAX(qty) OVER (PARTITION BY order_id ORDER BY qty), MIN(qty) OVER (PARTITION BY order_id, qty ORDER BY price) from orders"; let expected = "\ - Projection: #orders.order_id, #MAX(orders.qty), #MIN(orders.qty)\ + Projection: #orders.order_id, #MAX(orders.qty) PARTITION BY [#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST], #MIN(orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.price ASC NULLS FIRST]\ \n WindowAggr: windowExpr=[[MAX(#orders.qty) PARTITION BY 
[#orders.order_id] ORDER BY [#orders.qty ASC NULLS FIRST]]]\ \n WindowAggr: windowExpr=[[MIN(#orders.qty) PARTITION BY [#orders.order_id, #orders.qty] ORDER BY [#orders.price ASC NULLS FIRST]]]\ \n TableScan: orders projection=None"; From 4068f8b3a212aff8d7cdf2183fd1834be0dc5e69 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 28 Jun 2021 02:05:00 -0600 Subject: [PATCH 217/329] Rename MergeExec to CoalescePartitionsExec (#635) --- ballista/rust/core/proto/ballista.proto | 4 ++-- .../src/serde/physical_plan/from_proto.rs | 4 ++-- .../core/src/serde/physical_plan/to_proto.rs | 6 +++--- ballista/rust/core/src/utils.rs | 14 +++++++++----- ballista/rust/scheduler/src/planner.rs | 19 ++++++++++++------- ballista/rust/scheduler/src/test_utils.rs | 4 ++-- datafusion/src/execution/context.rs | 4 ++-- .../src/physical_optimizer/merge_exec.rs | 18 +++++++++--------- .../{merge.rs => coalesce_partitions.rs} | 12 ++++++------ datafusion/src/physical_plan/cross_join.rs | 6 ++++-- .../src/physical_plan/hash_aggregate.rs | 4 ++-- datafusion/src/physical_plan/hash_join.rs | 4 ++-- datafusion/src/physical_plan/limit.rs | 5 +++-- datafusion/src/physical_plan/mod.rs | 8 +++++--- datafusion/src/physical_plan/sort.rs | 4 ++-- .../physical_plan/sort_preserving_merge.rs | 4 ++-- 16 files changed, 67 insertions(+), 53 deletions(-) rename datafusion/src/physical_plan/{merge.rs => coalesce_partitions.rs} (95%) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 365d8e9fd9a42..2aa6102c2180d 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -414,7 +414,7 @@ message PhysicalPlanNode { SortExecNode sort = 11; CoalesceBatchesExecNode coalesce_batches = 12; FilterExecNode filter = 13; - MergeExecNode merge = 14; + CoalescePartitionsExecNode merge = 14; UnresolvedShuffleExecNode unresolved = 15; RepartitionExecNode repartition = 16; WindowAggExecNode window = 17; @@ -648,7 +648,7 @@ message CoalesceBatchesExecNode { uint32 target_batch_size = 2; } -message MergeExecNode { +message CoalescePartitionsExecNode { PhysicalPlanNode input = 1; } diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 4b87be4105be0..83cbdb4ccec40 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -37,9 +37,9 @@ use datafusion::execution::context::{ }; use datafusion::logical_plan::{window_frames::WindowFrame, DFSchema, Expr}; use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::PartitionMode; -use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::planner::DefaultPhysicalPlanner; use datafusion::physical_plan::window_functions::{ BuiltInWindowFunction, WindowFunction, @@ -147,7 +147,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { } PhysicalPlanType::Merge(merge) => { let input: Arc = convert_box_required!(merge.input)?; - Ok(Arc::new(MergeExec::new(input))) + Ok(Arc::new(CoalescePartitionsExec::new(input))) } PhysicalPlanType::Repartition(repart) => { let input: Arc = convert_box_required!(repart.input)?; diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs 
b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index cf5401b650193..306abc166ad56 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -59,8 +59,8 @@ use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::scheduler::PartitionLocation; use crate::serde::{protobuf, BallistaError}; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::functions::{BuiltinScalarFunction, ScalarFunctionExpr}; -use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::repartition::RepartitionExec; impl TryInto for Arc { @@ -292,11 +292,11 @@ impl TryInto for Arc { }, )), }) - } else if let Some(exec) = plan.downcast_ref::() { + } else if let Some(exec) = plan.downcast_ref::() { let input: protobuf::PhysicalPlanNode = exec.input().to_owned().try_into()?; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Merge(Box::new( - protobuf::MergeExecNode { + protobuf::CoalescePartitionsExecNode { input: Some(Box::new(input)), }, ))), diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index b58be2800f7b1..26bdb00f34fb4 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -40,15 +40,15 @@ use datafusion::arrow::{ use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use datafusion::logical_plan::Operator; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; -use datafusion::physical_optimizer::merge_exec::AddMergeExec; +use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::csv::CsvExec; use datafusion::physical_plan::expressions::{BinaryExpr, Column, Literal}; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::hash_join::HashJoinExec; -use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::parquet::ParquetExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sort::SortExec; @@ -177,8 +177,12 @@ fn build_exec_plan_diagram( .is_some() { "CoalesceBatchesExec" - } else if plan.as_any().downcast_ref::().is_some() { - "MergeExec" + } else if plan + .as_any() + .downcast_ref::() + .is_some() + { + "CoalescePartitionsExec" } else { println!("Unknown: {:?}", plan); "Unknown" @@ -226,7 +230,7 @@ pub fn create_datafusion_context() -> ExecutionContext { // remove Repartition rule because that isn't supported yet let rules: Vec> = vec![ Arc::new(CoalesceBatches::new()), - Arc::new(AddMergeExec::new()), + Arc::new(AddCoalescePartitionsExec::new()), ]; let config = ExecutionConfig::new() .with_concurrency(1) diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 2ac9f6121e00d..32fc9a9e25ebd 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -30,11 +30,11 @@ use ballista_core::{ }; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; -use 
datafusion::physical_optimizer::merge_exec::AddMergeExec; +use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::hash_join::HashJoinExec; -use datafusion::physical_plan::merge::MergeExec; use datafusion::physical_plan::windows::WindowAggExec; use datafusion::physical_plan::ExecutionPlan; use log::info; @@ -101,12 +101,15 @@ impl DistributedPlanner { // remove Repartition rule because that isn't supported yet let rules: Vec> = vec![ Arc::new(CoalesceBatches::new()), - Arc::new(AddMergeExec::new()), + Arc::new(AddCoalescePartitionsExec::new()), ]; let config = ExecutionConfig::new().with_physical_optimizer_rules(rules); let ctx = ExecutionContext::with_config(config); Ok((ctx.create_physical_plan(&adapter.logical_plan)?, stages)) - } else if let Some(merge) = execution_plan.as_any().downcast_ref::() { + } else if let Some(merge) = execution_plan + .as_any() + .downcast_ref::() + { let query_stage = create_query_stage( job_id, self.next_stage_id(), @@ -244,8 +247,10 @@ mod test { use ballista_core::serde::protobuf; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use datafusion::physical_plan::sort::SortExec; + use datafusion::physical_plan::{ + coalesce_partitions::CoalescePartitionsExec, projection::ProjectionExec, + }; use datafusion::physical_plan::{displayable, ExecutionPlan}; - use datafusion::physical_plan::{merge::MergeExec, projection::ProjectionExec}; use std::convert::TryInto; use std::sync::Arc; use uuid::Uuid; @@ -284,7 +289,7 @@ mod test { HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] CsvExec: testdata/lineitem; partitions=2 QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2 - MergeExec + CoalescePartitionsExec UnresolvedShuffleExec: stages=[1] QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3 SortExec { input: ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_ext @@ -309,7 +314,7 @@ mod test { assert_eq!(unresolved_shuffle.query_stage_ids, vec![2]); let merge_exec = stages[1].children()[0].clone(); - let merge_exec = downcast_exec!(merge_exec, MergeExec); + let merge_exec = downcast_exec!(merge_exec, CoalescePartitionsExec); let unresolved_shuffle = merge_exec.children()[0].clone(); let unresolved_shuffle = diff --git a/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs index 311f9a7a3de0c..becb95f961acf 100644 --- a/ballista/rust/scheduler/src/test_utils.rs +++ b/ballista/rust/scheduler/src/test_utils.rs @@ -22,7 +22,7 @@ use ballista_core::error::Result; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; -use datafusion::physical_optimizer::merge_exec::AddMergeExec; +use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::csv::CsvReadOptions; @@ -33,7 +33,7 @@ pub const TPCH_TABLES: &[&str] = &[ pub fn datafusion_test_context(path: &str) -> Result { // remove Repartition rule because that isn't supported yet 
let rules: Vec> = vec![ - Arc::new(AddMergeExec::new()), + Arc::new(AddCoalescePartitionsExec::new()), Arc::new(CoalesceBatches::new()), ]; let config = ExecutionConfig::new() diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 8ce408de86a5b..17625c92c6962 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -61,7 +61,7 @@ use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::projection_push_down::ProjectionPushDown; use crate::optimizer::simplify_expressions::SimplifyExpressions; use crate::physical_optimizer::coalesce_batches::CoalesceBatches; -use crate::physical_optimizer::merge_exec::AddMergeExec; +use crate::physical_optimizer::merge_exec::AddCoalescePartitionsExec; use crate::physical_optimizer::repartition::Repartition; use crate::physical_plan::csv::CsvReadOptions; @@ -643,7 +643,7 @@ impl Default for ExecutionConfig { physical_optimizers: vec![ Arc::new(CoalesceBatches::new()), Arc::new(Repartition::new()), - Arc::new(AddMergeExec::new()), + Arc::new(AddCoalescePartitionsExec::new()), ], query_planner: Arc::new(DefaultQueryPlanner {}), default_catalog: "datafusion".to_owned(), diff --git a/datafusion/src/physical_optimizer/merge_exec.rs b/datafusion/src/physical_optimizer/merge_exec.rs index 877c0be00e1b8..0127313bb1eb3 100644 --- a/datafusion/src/physical_optimizer/merge_exec.rs +++ b/datafusion/src/physical_optimizer/merge_exec.rs @@ -15,27 +15,27 @@ // specific language governing permissions and limitations // under the License. -//! AddMergeExec adds MergeExec to merge plans -//! with more partitions into one partition when the node -//! needs a single partition +//! AddCoalescePartitionsExec adds CoalescePartitionsExec to plans +//! with more than one partition, to coalesce them into one partition +//! when the node needs a single partition use super::optimizer::PhysicalOptimizerRule; use crate::{ error::Result, - physical_plan::{merge::MergeExec, Distribution}, + physical_plan::{coalesce_partitions::CoalescePartitionsExec, Distribution}, }; use std::sync::Arc; -/// Introduces MergeExec -pub struct AddMergeExec {} +/// Introduces CoalescePartitionsExec +pub struct AddCoalescePartitionsExec {} -impl AddMergeExec { +impl AddCoalescePartitionsExec { #[allow(missing_docs)] pub fn new() -> Self { Self {} } } -impl PhysicalOptimizerRule for AddMergeExec { +impl PhysicalOptimizerRule for AddCoalescePartitionsExec { fn optimize( &self, plan: Arc, @@ -60,7 +60,7 @@ impl PhysicalOptimizerRule for AddMergeExec { if child.output_partitioning().partition_count() == 1 { child.clone() } else { - Arc::new(MergeExec::new(child.clone())) + Arc::new(CoalescePartitionsExec::new(child.clone())) } }) .collect(), diff --git a/datafusion/src/physical_plan/merge.rs b/datafusion/src/physical_plan/coalesce_partitions.rs similarity index 95% rename from datafusion/src/physical_plan/merge.rs rename to datafusion/src/physical_plan/coalesce_partitions.rs index a25f5c7909fdb..94ff230b81259 100644 --- a/datafusion/src/physical_plan/merge.rs +++ b/datafusion/src/physical_plan/coalesce_partitions.rs @@ -40,15 +40,15 @@ use pin_project_lite::pin_project; /// Merge execution plan executes partitions in parallel and combines them into a single /// partition. No guarantees are made about the order of the resulting partition. 
#[derive(Debug)] -pub struct MergeExec { +pub struct CoalescePartitionsExec { /// Input execution plan input: Arc, } -impl MergeExec { +impl CoalescePartitionsExec { /// Create a new MergeExec pub fn new(input: Arc) -> Self { - MergeExec { input } + CoalescePartitionsExec { input } } /// Input execution plan @@ -58,7 +58,7 @@ impl MergeExec { } #[async_trait] -impl ExecutionPlan for MergeExec { +impl ExecutionPlan for CoalescePartitionsExec { /// Return a reference to Any that can be used for downcasting fn as_any(&self) -> &dyn Any { self @@ -82,7 +82,7 @@ impl ExecutionPlan for MergeExec { children: Vec>, ) -> Result> { match children.len() { - 1 => Ok(Arc::new(MergeExec::new(children[0].clone()))), + 1 => Ok(Arc::new(CoalescePartitionsExec::new(children[0].clone()))), _ => Err(DataFusionError::Internal( "MergeExec wrong number of children".to_string(), )), @@ -194,7 +194,7 @@ mod tests { // input should have 4 partitions assert_eq!(csv.output_partitioning().partition_count(), num_partitions); - let merge = MergeExec::new(Arc::new(csv)); + let merge = CoalescePartitionsExec::new(Arc::new(csv)); // output of MergeExec should have a single partition assert_eq!(merge.output_partitioning().partition_count(), 1); diff --git a/datafusion/src/physical_plan/cross_join.rs b/datafusion/src/physical_plan/cross_join.rs index f6f5da4cf8db9..98ad3440aa4ab 100644 --- a/datafusion/src/physical_plan/cross_join.rs +++ b/datafusion/src/physical_plan/cross_join.rs @@ -27,7 +27,9 @@ use arrow::record_batch::RecordBatch; use futures::{Stream, TryStreamExt}; -use super::{hash_utils::check_join_is_valid, merge::MergeExec}; +use super::{ + coalesce_partitions::CoalescePartitionsExec, hash_utils::check_join_is_valid, +}; use crate::{ error::{DataFusionError, Result}, scalar::ScalarValue, @@ -144,7 +146,7 @@ impl ExecutionPlan for CrossJoinExec { let start = Instant::now(); // merge all left parts into a single stream - let merge = MergeExec::new(self.left.clone()); + let merge = CoalescePartitionsExec::new(self.left.clone()); let stream = merge.execute(0).await?; // Load all batches and count the rows diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index 250ba2b083062..e157243dd8c2b 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -1230,7 +1230,7 @@ mod tests { use crate::physical_plan::expressions::{col, Avg}; use crate::{assert_batches_sorted_eq, physical_plan::common}; - use crate::physical_plan::merge::MergeExec; + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; /// some mock data to aggregates fn some_data() -> (Arc, Vec) { @@ -1298,7 +1298,7 @@ mod tests { ]; assert_batches_sorted_eq!(expected, &result); - let merge = Arc::new(MergeExec::new(partial_aggregate)); + let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); let final_group: Vec> = (0..groups.len()) .map(|i| col(&groups[i].1, &input_schema)) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index ad356079387a0..eb5ceaf0d949e 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -54,8 +54,8 @@ use arrow::array::{ use super::expressions::Column; use super::{ + coalesce_partitions::CoalescePartitionsExec, hash_utils::{build_join_schema, check_join_is_valid, JoinOn, JoinType}, - merge::MergeExec, }; use crate::error::{DataFusionError, Result}; @@ -260,7 +260,7 @@ impl ExecutionPlan for 
HashJoinExec { let start = Instant::now(); // merge all left parts into a single stream - let merge = MergeExec::new(self.left.clone()); + let merge = CoalescePartitionsExec::new(self.left.clone()); let stream = merge.execute(0).await?; // This operation performs 2 steps at once: diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index c56dbe141b2d1..361e26e5e94e1 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -295,9 +295,9 @@ mod tests { use common::collect; use super::*; + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::common; use crate::physical_plan::csv::{CsvExec, CsvReadOptions}; - use crate::physical_plan::merge::MergeExec; use crate::test; #[tokio::test] @@ -319,7 +319,8 @@ mod tests { // input should have 4 partitions assert_eq!(csv.output_partitioning().partition_count(), num_partitions); - let limit = GlobalLimitExec::new(Arc::new(MergeExec::new(Arc::new(csv))), 7); + let limit = + GlobalLimitExec::new(Arc::new(CoalescePartitionsExec::new(Arc::new(csv))), 7); // the result should contain 4 batches (one per input partition) let iter = limit.execute(0).await?; diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 7f9f7eace8354..2122751abb604 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,7 +17,9 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. -use self::{display::DisplayableExecutionPlan, merge::MergeExec}; +use self::{ + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, +}; use crate::execution::context::ExecutionContextState; use crate::logical_plan::LogicalPlan; use crate::physical_plan::expressions::PhysicalSortExpr; @@ -315,7 +317,7 @@ pub async fn collect(plan: Arc) -> Result> { } _ => { // merge into a single partition - let plan = MergeExec::new(plan.clone()); + let plan = CoalescePartitionsExec::new(plan.clone()); // MergeExec must produce a single partition assert_eq!(1, plan.output_partitioning().partition_count()); common::collect(plan.execute(0).await?).await @@ -592,6 +594,7 @@ pub trait Accumulator: Send + Sync + Debug { pub mod aggregates; pub mod array_expressions; pub mod coalesce_batches; +pub mod coalesce_partitions; pub mod common; pub mod cross_join; #[cfg(feature = "crypto_expressions")] @@ -613,7 +616,6 @@ pub mod json; pub mod limit; pub mod math_expressions; pub mod memory; -pub mod merge; pub mod parquet; pub mod planner; pub mod projection; diff --git a/datafusion/src/physical_plan/sort.rs b/datafusion/src/physical_plan/sort.rs index 365097822cc7e..faaa10d109361 100644 --- a/datafusion/src/physical_plan/sort.rs +++ b/datafusion/src/physical_plan/sort.rs @@ -315,9 +315,9 @@ impl RecordBatchStream for SortStream { #[cfg(test)] mod tests { use super::*; + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::expressions::col; use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::merge::MergeExec; use crate::physical_plan::{ collect, csv::{CsvExec, CsvReadOptions}, @@ -357,7 +357,7 @@ mod tests { options: SortOptions::default(), }, ], - Arc::new(MergeExec::new(Arc::new(csv))), + Arc::new(CoalescePartitionsExec::new(Arc::new(csv))), )?); let result: Vec = collect(sort_exec).await?; diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs 
b/datafusion/src/physical_plan/sort_preserving_merge.rs index b8ca97cc5974f..316f0509960dd 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -542,10 +542,10 @@ mod tests { use crate::arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; use crate::assert_batches_eq; use crate::datasource::CsvReadOptions; + use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::csv::CsvExec; use crate::physical_plan::expressions::col; use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::merge::MergeExec; use crate::physical_plan::sort::SortExec; use crate::physical_plan::{collect, common}; use crate::test; @@ -639,7 +639,7 @@ mod tests { src: Arc, sort: Vec, ) -> RecordBatch { - let merge = Arc::new(MergeExec::new(src)); + let merge = Arc::new(CoalescePartitionsExec::new(src)); let sort_exec = Arc::new(SortExec::try_new(sort, merge).unwrap()); let mut result = collect(sort_exec).await.unwrap(); assert_eq!(result.len(), 1); From e86f8e92a9adb4bd92170b191db6bfaff83cba38 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 28 Jun 2021 03:53:00 -0700 Subject: [PATCH 218/329] use Into as argument type wherever applicable (#615) * use Into as argument type wherever applicable * switch from Into to AsRef for write_csv and write_parquet --- datafusion/src/datasource/csv.rs | 7 +++-- datafusion/src/datasource/parquet.rs | 7 +++-- datafusion/src/execution/context.rs | 17 +++++++---- datafusion/src/logical_plan/builder.rs | 28 +++++++++++-------- datafusion/src/logical_plan/expr.rs | 6 ++-- datafusion/src/optimizer/filter_push_down.rs | 12 ++++---- .../src/optimizer/projection_push_down.rs | 4 +-- datafusion/src/optimizer/utils.rs | 2 +- datafusion/src/physical_plan/aggregates.rs | 4 +-- .../src/physical_plan/expressions/average.rs | 8 ++++-- .../src/physical_plan/expressions/count.rs | 8 ++++-- .../src/physical_plan/expressions/min_max.rs | 16 ++++++++--- .../physical_plan/expressions/nth_value.rs | 12 ++++---- .../physical_plan/expressions/row_number.rs | 4 +-- .../src/physical_plan/expressions/sum.rs | 8 ++++-- datafusion/src/physical_plan/planner.rs | 15 +++++----- datafusion/src/physical_plan/udaf.rs | 4 +-- 17 files changed, 97 insertions(+), 65 deletions(-) diff --git a/datafusion/src/datasource/csv.rs b/datafusion/src/datasource/csv.rs index 906a1ce415f60..987c4fdb079dd 100644 --- a/datafusion/src/datasource/csv.rs +++ b/datafusion/src/datasource/csv.rs @@ -59,11 +59,12 @@ pub struct CsvFile { impl CsvFile { /// Attempt to initialize a new `CsvFile` from a file path - pub fn try_new(path: &str, options: CsvReadOptions) -> Result { + pub fn try_new(path: impl Into, options: CsvReadOptions) -> Result { + let path = path.into(); let schema = Arc::new(match options.schema { Some(s) => s.clone(), None => { - let filenames = common::build_file_list(path, options.file_extension)?; + let filenames = common::build_file_list(&path, options.file_extension)?; if filenames.is_empty() { return Err(DataFusionError::Plan(format!( "No files found at {path} with file extension {file_extension}", @@ -76,7 +77,7 @@ impl CsvFile { }); Ok(Self { - source: Source::Path(path.to_string()), + source: Source::Path(path), schema, has_header: options.has_header, delimiter: options.delimiter, diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index abfb81d99887d..fd147413059ba 100644 --- a/datafusion/src/datasource/parquet.rs +++ 
b/datafusion/src/datasource/parquet.rs @@ -42,11 +42,12 @@ pub struct ParquetTable { impl ParquetTable { /// Attempt to initialize a new `ParquetTable` from a file path. - pub fn try_new(path: &str, max_concurrency: usize) -> Result<Self> { - let parquet_exec = ParquetExec::try_from_path(path, None, None, 0, 1, None)?; + pub fn try_new(path: impl Into<String>, max_concurrency: usize) -> Result<Self> { + let path = path.into(); + let parquet_exec = ParquetExec::try_from_path(&path, None, None, 0, 1, None)?; let schema = parquet_exec.schema(); Ok(Self { - path: path.to_string(), + path, schema, statistics: parquet_exec.statistics().to_owned(), max_concurrency, diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 17625c92c6962..318ea596939e3 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -270,7 +270,7 @@ impl ExecutionContext { /// Creates a DataFrame for reading a CSV data source. pub fn read_csv( &mut self, - filename: &str, + filename: impl Into<String>, options: CsvReadOptions, ) -> Result<Arc<dyn DataFrame>> { Ok(Arc::new(DataFrameImpl::new( @@ -280,7 +280,10 @@ impl ExecutionContext { } /// Creates a DataFrame for reading a Parquet data source. - pub fn read_parquet(&mut self, filename: &str) -> Result<Arc<dyn DataFrame>> { + pub fn read_parquet( + &mut self, + filename: impl Into<String>, + ) -> Result<Arc<dyn DataFrame>> { Ok(Arc::new(DataFrameImpl::new( self.state.clone(), &LogicalPlanBuilder::scan_parquet( @@ -474,10 +477,11 @@ impl ExecutionContext { pub async fn write_csv( &self, plan: Arc<dyn ExecutionPlan>, - path: String, + path: impl AsRef<str>, ) -> Result<()> { + let path = path.as_ref(); // create directory to contain the CSV files (one per partition) - let fs_path = Path::new(&path); + let fs_path = Path::new(path); match fs::create_dir(fs_path) { Ok(()) => { let mut tasks = vec![]; @@ -511,11 +515,12 @@ impl ExecutionContext { pub async fn write_parquet( &self, plan: Arc<dyn ExecutionPlan>, - path: String, + path: impl AsRef<str>, writer_properties: Option<WriterProperties>, ) -> Result<()> { + let path = path.as_ref(); // create directory to contain the Parquet files (one per partition) - let fs_path = Path::new(&path); + let fs_path = Path::new(path); match fs::create_dir(fs_path) { Ok(()) => { let mut tasks = vec![]; diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 17fe6636439c7..1a53e2185a4bc 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -115,19 +115,20 @@ impl LogicalPlanBuilder { /// Scan a CSV data source pub fn scan_csv( - path: &str, + path: impl Into<String>, options: CsvReadOptions, projection: Option<Vec<usize>>, ) -> Result<Self> { - Self::scan_csv_with_name(path, options, projection, path) + let path = path.into(); + Self::scan_csv_with_name(path.clone(), options, projection, path) } /// Scan a CSV data source and register it with a given table name pub fn scan_csv_with_name( - path: &str, + path: impl Into<String>, options: CsvReadOptions, projection: Option<Vec<usize>>, - table_name: &str, + table_name: impl Into<String>, ) -> Result<Self> { let provider = Arc::new(CsvFile::try_new(path, options)?); Self::scan(table_name, provider, projection) @@ -135,19 +136,20 @@ impl LogicalPlanBuilder { /// Scan a Parquet data source pub fn scan_parquet( - path: &str, + path: impl Into<String>, projection: Option<Vec<usize>>, max_concurrency: usize, ) -> Result<Self> { - Self::scan_parquet_with_name(path, projection, max_concurrency, path) + let path = path.into(); + Self::scan_parquet_with_name(path.clone(), projection, max_concurrency, path) } /// Scan a Parquet data source and register it with a given table name pub fn scan_parquet_with_name( - path: &str, + path: impl Into<String>, projection: Option<Vec<usize>>, max_concurrency: usize, - table_name: &str, + table_name: impl Into<String>, ) -> Result<Self> { let provider = Arc::new(ParquetTable::try_new(path, max_concurrency)?); Self::scan(table_name, provider, projection) @@ -166,10 +168,12 @@ impl LogicalPlanBuilder { /// Convert a table provider into a builder with a TableScan pub fn scan( - table_name: &str, + table_name: impl Into<String>, provider: Arc<dyn TableProvider>, projection: Option<Vec<usize>>, ) -> Result<Self> { + let table_name = table_name.into(); + if table_name.is_empty() { return Err(DataFusionError::Plan( "table_name cannot be empty".to_string(), @@ -184,17 +188,17 @@ impl LogicalPlanBuilder { DFSchema::new( p.iter() .map(|i| { - DFField::from_qualified(table_name, schema.field(*i).clone()) + DFField::from_qualified(&table_name, schema.field(*i).clone()) }) .collect(), ) }) .unwrap_or_else(|| { - DFSchema::try_from_qualified_schema(table_name, &schema) + DFSchema::try_from_qualified_schema(&table_name, &schema) })?; let table_scan = LogicalPlan::TableScan { - table_name: table_name.to_string(), + table_name, source: provider, projected_schema: Arc::new(projected_schema), projection, diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index d20b1f698238c..1bf3b65d9af00 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -44,10 +44,10 @@ pub struct Column { impl Column { /// Create Column from unqualified name. - pub fn from_name(name: String) -> Self { + pub fn from_name(name: impl Into<String>) -> Self { Self { relation: None, - name, + name: name.into(), } } @@ -131,7 +131,7 @@ impl fmt::Display for Column { /// ``` /// # use datafusion::logical_plan::*; /// let expr = col("c1"); -/// assert_eq!(expr, Expr::Column(Column::from_name("c1".to_string()))); +/// assert_eq!(expr, Expr::Column(Column::from_name("c1"))); /// ``` /// /// ## Create the expression `c1 + c2` to add columns "c1" and "c2" together diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 7b1ff326c3c6e..c1d81fe629345 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -890,8 +890,8 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a".to_string())], - vec![Column::from_name("a".to_string())], + vec![Column::from_name("a")], + vec![Column::from_name("a")], )? .filter(col("a").lt_eq(lit(1i64)))? .build()?; @@ -933,8 +933,8 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a".to_string())], - vec![Column::from_name("a".to_string())], + vec![Column::from_name("a")], + vec![Column::from_name("a")], )? // "b" and "c" are not shared by either side: they are only available together after the join .filter(col("c").lt_eq(col("b")))? @@ -972,8 +972,8 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a".to_string())], - vec![Column::from_name("a".to_string())], + vec![Column::from_name("a")], + vec![Column::from_name("a")], )? .filter(col("b").lt_eq(lit(1i64)))?
.build()?; diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 4bf2b6e797f8e..3c8f1ee4ceb58 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -241,7 +241,7 @@ fn optimize_plan( { window_expr.iter().try_for_each(|expr| { let name = &expr.name(schema)?; - let column = Column::from_name(name.to_string()); + let column = Column::from_name(name); if required_columns.contains(&column) { new_window_expr.push(expr.clone()); new_required_columns.insert(column); @@ -286,7 +286,7 @@ fn optimize_plan( let mut new_aggr_expr = Vec::new(); aggr_expr.iter().try_for_each(|expr| { let name = &expr.name(schema)?; - let column = Column::from_name(name.to_string()); + let column = Column::from_name(name); if required_columns.contains(&column) { new_aggr_expr.push(expr.clone()); diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 394308f5af801..ae3e196c22251 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -516,7 +516,7 @@ mod tests { &mut accum, )?; assert_eq!(1, accum.len()); - assert!(accum.contains(&Column::from_name("a".to_string()))); + assert!(accum.contains(&Column::from_name("a"))); Ok(()) } diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index 897c78fd46ff6..c297a959639a5 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -110,9 +110,9 @@ pub fn create_aggregate_expr( distinct: bool, args: &[Arc], input_schema: &Schema, - name: String, + name: impl Into, ) -> Result> { - // coerce + let name = name.into(); let arg = coerce(args, input_schema, &signature(fun))?; if arg.is_empty() { return Err(DataFusionError::Plan(format!( diff --git a/datafusion/src/physical_plan/expressions/average.rs b/datafusion/src/physical_plan/expressions/average.rs index 6a6332042188f..2e218191f6683 100644 --- a/datafusion/src/physical_plan/expressions/average.rs +++ b/datafusion/src/physical_plan/expressions/average.rs @@ -64,9 +64,13 @@ pub fn avg_return_type(arg_type: &DataType) -> Result { impl Avg { /// Create a new AVG aggregate function - pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn new( + expr: Arc, + name: impl Into, + data_type: DataType, + ) -> Self { Self { - name, + name: name.into(), expr, data_type, nullable: true, diff --git a/datafusion/src/physical_plan/expressions/count.rs b/datafusion/src/physical_plan/expressions/count.rs index 4a3fbe4fa7d3d..30c44f1c03b45 100644 --- a/datafusion/src/physical_plan/expressions/count.rs +++ b/datafusion/src/physical_plan/expressions/count.rs @@ -44,9 +44,13 @@ pub struct Count { impl Count { /// Create a new COUNT aggregate function. 
- pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn new( + expr: Arc, + name: impl Into, + data_type: DataType, + ) -> Self { Self { - name, + name: name.into(), expr, data_type, nullable: true, diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index 680e739cbf292..46e41f46a0e53 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -49,9 +49,13 @@ pub struct Max { impl Max { /// Create a new MAX aggregate function - pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn new( + expr: Arc, + name: impl Into, + data_type: DataType, + ) -> Self { Self { - name, + name: name.into(), expr, data_type, nullable: true, @@ -352,9 +356,13 @@ pub struct Min { impl Min { /// Create a new MIN aggregate function - pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn new( + expr: Arc, + name: impl Into, + data_type: DataType, + ) -> Self { Self { - name, + name: name.into(), expr, data_type, nullable: true, diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index 577c19b54ade0..b548f912b2236 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -45,12 +45,12 @@ pub struct NthValue { impl NthValue { /// Create a new FIRST_VALUE window aggregate function pub fn first_value( - name: String, + name: impl Into, expr: Arc, data_type: DataType, ) -> Self { Self { - name, + name: name.into(), expr, data_type, kind: NthValueKind::First, @@ -59,12 +59,12 @@ impl NthValue { /// Create a new LAST_VALUE window aggregate function pub fn last_value( - name: String, + name: impl Into, expr: Arc, data_type: DataType, ) -> Self { Self { - name, + name: name.into(), expr, data_type, kind: NthValueKind::Last, @@ -73,7 +73,7 @@ impl NthValue { /// Create a new NTH_VALUE window aggregate function pub fn nth_value( - name: String, + name: impl Into, expr: Arc, data_type: DataType, n: u32, @@ -83,7 +83,7 @@ impl NthValue { "nth_value expect n to be > 0".to_owned(), )), _ => Ok(Self { - name, + name: name.into(), expr, data_type, kind: NthValueKind::Nth(n), diff --git a/datafusion/src/physical_plan/expressions/row_number.rs b/datafusion/src/physical_plan/expressions/row_number.rs index 0444ee971f40d..6b488cc25af29 100644 --- a/datafusion/src/physical_plan/expressions/row_number.rs +++ b/datafusion/src/physical_plan/expressions/row_number.rs @@ -32,8 +32,8 @@ pub struct RowNumber { impl RowNumber { /// Create a new ROW_NUMBER function - pub fn new(name: String) -> Self { - Self { name } + pub fn new(name: impl Into) -> Self { + Self { name: name.into() } } } diff --git a/datafusion/src/physical_plan/expressions/sum.rs b/datafusion/src/physical_plan/expressions/sum.rs index 7bbbf99fa6598..c3f57e31e0d54 100644 --- a/datafusion/src/physical_plan/expressions/sum.rs +++ b/datafusion/src/physical_plan/expressions/sum.rs @@ -65,9 +65,13 @@ pub fn sum_return_type(arg_type: &DataType) -> Result { impl Sum { /// Create a new SUM aggregate function - pub fn new(expr: Arc, name: String, data_type: DataType) -> Self { + pub fn new( + expr: Arc, + name: impl Into, + data_type: DataType, + ) -> Self { Self { - name, + name: name.into(), expr, data_type, nullable: true, diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 
d59004243533e..c3bb9a80136f1 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -1018,11 +1018,12 @@ impl DefaultPhysicalPlanner { pub fn create_window_expr_with_name( &self, e: &Expr, - name: String, + name: impl Into, logical_input_schema: &DFSchema, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, ) -> Result> { + let name = name.into(); match e { Expr::WindowFunction { fun, @@ -1124,7 +1125,7 @@ impl DefaultPhysicalPlanner { pub fn create_aggregate_expr_with_name( &self, e: &Expr, - name: String, + name: impl Into, logical_input_schema: &DFSchema, physical_input_schema: &Schema, ctx_state: &ExecutionContextState, @@ -1263,7 +1264,7 @@ mod tests { let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); - let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)? // filter clause needs the type coercion rule applied .filter(col("c7").lt(lit(5_u8)))? .project(vec![col("c1"), col("c2")])? @@ -1308,7 +1309,7 @@ mod tests { let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); - let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)? .filter(col("c7").lt(col("c12")))? .build()?; @@ -1449,7 +1450,7 @@ mod tests { Expr::Literal(ScalarValue::Boolean(Some(true))), Expr::Literal(ScalarValue::Utf8(Some("a".to_string()))), ]; - let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)? // filter clause needs the type coercion rule applied .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c12").lt_eq(lit(0.025)).in_list(list, false)])? @@ -1476,7 +1477,7 @@ mod tests { let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); - let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)? .aggregate(vec![col("c1")], vec![sum(col("c2"))])? .build()?; @@ -1499,7 +1500,7 @@ mod tests { let path = format!("{}/csv/aggregate_test_100.csv", testdata); let options = CsvReadOptions::new().schema_infer_max_records(100); - let logical_plan = LogicalPlanBuilder::scan_csv(&path, options, None)? + let logical_plan = LogicalPlanBuilder::scan_csv(path, options, None)? .aggregate(vec![col("c1")], vec![sum(col("c2"))])? 
.build()?; diff --git a/datafusion/src/physical_plan/udaf.rs b/datafusion/src/physical_plan/udaf.rs index f7515d326d0a5..c6d65ad5dd607 100644 --- a/datafusion/src/physical_plan/udaf.rs +++ b/datafusion/src/physical_plan/udaf.rs @@ -105,7 +105,7 @@ pub fn create_aggregate_expr( fun: &AggregateUDF, args: &[Arc], input_schema: &Schema, - name: String, + name: impl Into, ) -> Result> { // coerce let args = coerce(args, input_schema, &fun.signature)?; @@ -119,7 +119,7 @@ pub fn create_aggregate_expr( fun: fun.clone(), args: args.clone(), data_type: (fun.return_type)(&arg_types)?.as_ref().clone(), - name, + name: name.into(), })) } From 62b03493e3dd02814af3e10aac839c2ef4e89f55 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 28 Jun 2021 08:19:47 -0600 Subject: [PATCH 219/329] Remove hard-coded PartitionMode from Ballista serde (#637) --- ballista/rust/core/proto/ballista.proto | 7 ++++- .../src/serde/physical_plan/from_proto.rs | 14 ++++++++- .../rust/core/src/serde/physical_plan/mod.rs | 30 ++++++++++++++----- .../core/src/serde/physical_plan/to_proto.rs | 7 ++++- datafusion/src/physical_plan/hash_join.rs | 5 ++++ 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 2aa6102c2180d..e3788066d33fc 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -565,12 +565,17 @@ message CsvScanExecNode { repeated string filename = 8; } +enum PartitionMode { + COLLECT_LEFT = 0; + PARTITIONED = 1; +} + message HashJoinExecNode { PhysicalPlanNode left = 1; PhysicalPlanNode right = 2; repeated JoinOn on = 3; JoinType join_type = 4; - + PartitionMode partition_mode = 6; } message PhysicalColumn { diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 83cbdb4ccec40..717ee209dbe91 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -356,12 +356,24 @@ impl TryInto> for &protobuf::PhysicalPlanNode { protobuf::JoinType::Semi => JoinType::Semi, protobuf::JoinType::Anti => JoinType::Anti, }; + let partition_mode = + protobuf::PartitionMode::from_i32(hashjoin.partition_mode) + .ok_or_else(|| { + proto_error(format!( + "Received a HashJoinNode message with unknown PartitionMode {}", + hashjoin.partition_mode + )) + })?; + let partition_mode = match partition_mode { + protobuf::PartitionMode::CollectLeft => PartitionMode::CollectLeft, + protobuf::PartitionMode::Partitioned => PartitionMode::Partitioned, + }; Ok(Arc::new(HashJoinExec::try_new( left, right, on, &join_type, - PartitionMode::CollectLeft, + partition_mode, )?)) } PhysicalPlanType::ShuffleReader(shuffle_reader) => { diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index c0fe81f0ffb91..a393d7fdab1f7 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -88,13 +88,29 @@ mod roundtrip_tests { Column::new("col", schema_right.index_of("col")?), )]; - roundtrip_test(Arc::new(HashJoinExec::try_new( - Arc::new(EmptyExec::new(false, Arc::new(schema_left))), - Arc::new(EmptyExec::new(false, Arc::new(schema_right))), - on, - &JoinType::Inner, - PartitionMode::CollectLeft, - )?)) + let schema_left = Arc::new(schema_left); + let schema_right = Arc::new(schema_right); + for join_type in &[ + JoinType::Inner, + JoinType::Left, + 
JoinType::Right, + JoinType::Full, + JoinType::Anti, + JoinType::Semi, + ] { + for partition_mode in + &[PartitionMode::Partitioned, PartitionMode::CollectLeft] + { + roundtrip_test(Arc::new(HashJoinExec::try_new( + Arc::new(EmptyExec::new(false, schema_left.clone())), + Arc::new(EmptyExec::new(false, schema_right.clone())), + on.clone(), + &join_type, + *partition_mode, + )?))?; + } + } + Ok(()) } #[test] diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 306abc166ad56..0fc27850074c3 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -34,7 +34,7 @@ use datafusion::physical_plan::expressions::{ use datafusion::physical_plan::expressions::{CastExpr, TryCastExpr}; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::hash_aggregate::AggregateMode; -use datafusion::physical_plan::hash_join::HashJoinExec; +use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::hash_utils::JoinType; use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion::physical_plan::parquet::ParquetExec; @@ -143,6 +143,10 @@ impl TryInto for Arc { JoinType::Semi => protobuf::JoinType::Semi, JoinType::Anti => protobuf::JoinType::Anti, }; + let partition_mode = match exec.partition_mode() { + PartitionMode::CollectLeft => protobuf::PartitionMode::CollectLeft, + PartitionMode::Partitioned => protobuf::PartitionMode::Partitioned, + }; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( protobuf::HashJoinExecNode { @@ -150,6 +154,7 @@ impl TryInto for Arc { right: Some(Box::new(right)), on, join_type: join_type.into(), + partition_mode: partition_mode.into(), }, ))), }) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index eb5ceaf0d949e..195a19c54070d 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -177,6 +177,11 @@ impl HashJoinExec { &self.join_type } + /// The partitioning mode of this hash join + pub fn partition_mode(&self) -> &PartitionMode { + &self.mode + } + /// Calculates column indices and left/right placement on input / output schemas and jointype fn column_indices_from_schema(&self) -> ArrowResult> { let (primary_is_left, primary_schema, secondary_schema) = match self.join_type { From eba0fcf1c4dcec199e6b7843c04e3be0e7a8261e Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Mon, 28 Jun 2021 16:21:40 +0200 Subject: [PATCH 220/329] Add Keda autoscaling for ballista in k8s (#586) --- ballista/rust/scheduler/Cargo.toml | 1 + ballista/rust/scheduler/build.rs | 7 +- ballista/rust/scheduler/proto/keda.proto | 63 +++++++++++++++++ ballista/rust/scheduler/src/lib.rs | 69 +++++++++++++++++-- ballista/rust/scheduler/src/main.rs | 4 ++ ballista/rust/scheduler/src/state/mod.rs | 28 ++++---- docs/user-guide/src/distributed/kubernetes.md | 50 +++++++++++++- 7 files changed, 201 insertions(+), 21 deletions(-) create mode 100644 ballista/rust/scheduler/proto/keda.proto diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 215c58a7fb3fa..9bca8d9695714 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -61,6 +61,7 @@ uuid = { version = "0.8", features = ["v4"] } [build-dependencies] configure_me_codegen = "0.4.0" +tonic-build = { version = "0.4" 
} [package.metadata.configure_me.bin] scheduler = "scheduler_config_spec.toml" diff --git a/ballista/rust/scheduler/build.rs b/ballista/rust/scheduler/build.rs index bae6a3bfe2e60..e90bd495a9e45 100644 --- a/ballista/rust/scheduler/build.rs +++ b/ballista/rust/scheduler/build.rs @@ -20,5 +20,10 @@ extern crate configure_me_codegen; fn main() -> Result<(), String> { println!("cargo:rerun-if-changed=scheduler_config_spec.toml"); configure_me_codegen::build_script_auto() - .map_err(|e| format!("configure_me code generation failed: {}", e)) + .map_err(|e| format!("configure_me code generation failed: {}", e))?; + + println!("cargo:rerun-if-changed=proto/keda.proto"); + tonic_build::configure() + .compile(&["proto/keda.proto"], &["proto"]) + .map_err(|e| format!("protobuf compilation failed: {}", e)) } diff --git a/ballista/rust/scheduler/proto/keda.proto b/ballista/rust/scheduler/proto/keda.proto new file mode 100644 index 0000000000000..051dd438f41aa --- /dev/null +++ b/ballista/rust/scheduler/proto/keda.proto @@ -0,0 +1,63 @@ +/* + Copyright 2020 The KEDA Authors. + + and others that have contributed code to the public domain. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at. + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +// This file comes from https://github.com/kedacore/keda/blob/main/pkg/scalers/externalscaler/externalscaler.proto +syntax = "proto3"; + +package externalscaler; +option go_package = ".;externalscaler"; + +service ExternalScaler { + rpc IsActive(ScaledObjectRef) returns (IsActiveResponse) {} + // Commented out since we aren't supporting the streaming scaler interface at the moment + // rpc StreamIsActive(ScaledObjectRef) returns (stream IsActiveResponse) {} + rpc GetMetricSpec(ScaledObjectRef) returns (GetMetricSpecResponse) {} + rpc GetMetrics(GetMetricsRequest) returns (GetMetricsResponse) {} +} + +message ScaledObjectRef { + string name = 1; + string namespace = 2; + map scalerMetadata = 3; +} + +message IsActiveResponse { + bool result = 1; +} + +message GetMetricSpecResponse { + repeated MetricSpec metricSpecs = 1; +} + +message MetricSpec { + string metricName = 1; + int64 targetSize = 2; +} + +message GetMetricsRequest { + ScaledObjectRef scaledObjectRef = 1; + string metricName = 2; +} + +message GetMetricsResponse { + repeated MetricValue metricValues = 1; +} + +message MetricValue { + string metricName = 1; + int64 metricValue = 2; +} \ No newline at end of file diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 3620f79baaa55..3bd4c03aa9c33 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -28,16 +28,22 @@ pub use standalone::new_standalone_scheduler; #[cfg(test)] pub mod test_utils; +// include the generated protobuf source as a submodule +#[allow(clippy::all)] +pub mod externalscaler { + include!(concat!(env!("OUT_DIR"), "/externalscaler.rs")); +} + use std::{convert::TryInto, sync::Arc}; use std::{fmt, net::IpAddr}; use ballista_core::serde::protobuf::{ execute_query_params::Query, executor_registration::OptionalHost, job_status, - 
scheduler_grpc_server::SchedulerGrpc, ExecuteQueryParams, ExecuteQueryResult, - FailedJob, FilePartitionMetadata, FileType, GetFileMetadataParams, - GetFileMetadataResult, GetJobStatusParams, GetJobStatusResult, JobStatus, - PartitionId, PollWorkParams, PollWorkResult, QueuedJob, RunningJob, TaskDefinition, - TaskStatus, + scheduler_grpc_server::SchedulerGrpc, task_status, ExecuteQueryParams, + ExecuteQueryResult, FailedJob, FilePartitionMetadata, FileType, + GetFileMetadataParams, GetFileMetadataResult, GetJobStatusParams, GetJobStatusResult, + JobStatus, PartitionId, PollWorkParams, PollWorkResult, QueuedJob, RunningJob, + TaskDefinition, TaskStatus, }; use ballista_core::serde::scheduler::ExecutorMeta; @@ -62,6 +68,10 @@ impl parse_arg::ParseArgFromStr for ConfigBackend { } } +use crate::externalscaler::{ + external_scaler_server::ExternalScaler, GetMetricSpecResponse, GetMetricsRequest, + GetMetricsResponse, IsActiveResponse, MetricSpec, MetricValue, ScaledObjectRef, +}; use crate::planner::DistributedPlanner; use log::{debug, error, info, warn}; @@ -103,6 +113,55 @@ impl SchedulerServer { } } +const INFLIGHT_TASKS_METRIC_NAME: &str = "inflight_tasks"; + +#[tonic::async_trait] +impl ExternalScaler for SchedulerServer { + async fn is_active( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let tasks = self.state.get_all_tasks().await.map_err(|e| { + let msg = format!("Error reading tasks: {}", e); + error!("{}", msg); + tonic::Status::internal(msg) + })?; + let result = tasks.iter().any(|(_key, task)| { + !matches!( + task.status, + Some(task_status::Status::Completed(_)) + | Some(task_status::Status::Failed(_)) + ) + }); + debug!("Are there active tasks? {}", result); + Ok(Response::new(IsActiveResponse { result })) + } + + async fn get_metric_spec( + &self, + _request: Request, + ) -> Result, tonic::Status> { + Ok(Response::new(GetMetricSpecResponse { + metric_specs: vec![MetricSpec { + metric_name: INFLIGHT_TASKS_METRIC_NAME.to_string(), + target_size: 1, + }], + })) + } + + async fn get_metrics( + &self, + _request: Request, + ) -> Result, tonic::Status> { + Ok(Response::new(GetMetricsResponse { + metric_values: vec![MetricValue { + metric_name: INFLIGHT_TASKS_METRIC_NAME.to_string(), + metric_value: 10000000, // A very high number to saturate the HPA + }], + })) + } +} + #[tonic::async_trait] impl SchedulerGrpc for SchedulerServer { async fn poll_work( diff --git a/ballista/rust/scheduler/src/main.rs b/ballista/rust/scheduler/src/main.rs index 34386ca6c5617..7b79eb1b39ac4 100644 --- a/ballista/rust/scheduler/src/main.rs +++ b/ballista/rust/scheduler/src/main.rs @@ -18,6 +18,7 @@ //! Ballista Rust scheduler binary. 
use anyhow::{Context, Result}; +use ballista_scheduler::externalscaler::external_scaler_server::ExternalScalerServer; use futures::future::{self, Either, TryFutureExt}; use hyper::{server::conn::AddrStream, service::make_service_fn, Server}; use std::convert::Infallible; @@ -72,8 +73,11 @@ async fn start_server( let scheduler_grpc_server = SchedulerGrpcServer::new(scheduler_server.clone()); + let keda_scaler = ExternalScalerServer::new(scheduler_server.clone()); + let mut tonic = TonicServer::builder() .add_service(scheduler_grpc_server) + .add_service(keda_scaler) .into_service(); let mut warp = warp::service(get_routes(scheduler_server)); diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index a17c82d4b7379..cbee3f1bef690 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -236,6 +236,15 @@ impl SchedulerState { Ok((&value).try_into()?) } + pub async fn get_all_tasks(&self) -> Result> { + self.config_client + .get_from_prefix(&get_task_prefix(&self.namespace)) + .await? + .into_iter() + .map(|(key, bytes)| Ok((key, decode_protobuf(&bytes)?))) + .collect() + } + /// This function ensures that the task wasn't assigned to an executor that died. /// If that is the case, then the task is re-scheduled. /// Returns true if the task was dead, false otherwise. @@ -274,18 +283,12 @@ impl SchedulerState { &self, executor_id: &str, ) -> Result)>> { - let kvs: HashMap> = self - .config_client - .get_from_prefix(&get_task_prefix(&self.namespace)) - .await? - .into_iter() - .collect(); + let tasks = self.get_all_tasks().await?; // TODO: Make the duration a configurable parameter let executors = self .get_alive_executors_metadata(Duration::from_secs(60)) .await?; - 'tasks: for (_key, value) in kvs.iter() { - let mut status: TaskStatus = decode_protobuf(value)?; + 'tasks: for (_key, status) in tasks.iter() { if status.status.is_none() { let partition = status.partition_id.as_ref().unwrap(); let plan = self @@ -301,7 +304,7 @@ impl SchedulerState { for unresolved_shuffle in unresolved_shuffles { for stage_id in unresolved_shuffle.query_stage_ids { for partition_id in 0..unresolved_shuffle.partition_count { - let referenced_task = kvs + let referenced_task = tasks .get(&get_task_status_key( &self.namespace, &partition.job_id, @@ -309,8 +312,6 @@ impl SchedulerState { partition_id, )) .unwrap(); - let referenced_task: TaskStatus = - decode_protobuf(referenced_task)?; let task_is_dead = self .reschedule_dead_task(&referenced_task, &executors) .await?; @@ -318,14 +319,14 @@ impl SchedulerState { continue 'tasks; } else if let Some(task_status::Status::Completed( CompletedTask { executor_id }, - )) = referenced_task.status + )) = &referenced_task.status { let empty = vec![]; let locations = partition_locations.entry(stage_id).or_insert(empty); let executor_meta = executors .iter() - .find(|exec| exec.id == executor_id) + .find(|exec| exec.id == *executor_id) .unwrap() .clone(); locations.push(vec![ @@ -350,6 +351,7 @@ impl SchedulerState { remove_unresolved_shuffles(plan.as_ref(), &partition_locations)?; // If we get here, there are no more unresolved shuffled and the task can be run + let mut status = status.clone(); status.status = Some(task_status::Status::Running(RunningTask { executor_id: executor_id.to_owned(), })); diff --git a/docs/user-guide/src/distributed/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md index 07b51f7871b6c..4b80d1731943c 100644 --- 
a/docs/user-guide/src/distributed/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -28,6 +28,7 @@ The k8s deployment consists of: - k8s deployment for one or more executor processes - k8s service to route traffic to the schedulers - k8s persistent volume and persistent volume claims to make local data accessible to Ballista +- _(optional)_ a [keda](http://keda.sh) instance for autoscaling the number of executors ## Limitations @@ -163,8 +164,8 @@ spec: image: command: ["/executor"] args: - - "--bind-port=50051", - - "--scheduler-host=ballista-scheduler", + - "--bind-port=50051" + - "--scheduler-host=ballista-scheduler" - "--scheduler-port=50050" ports: - containerPort: 50051 @@ -216,3 +217,48 @@ Run the following kubectl command to delete the cluster. ```bash kubectl delete -f cluster.yaml ``` + +## Adding autoscaling for executors + +Ballista supports autoscaling for executors through [Keda](http://keda.sh). Keda allows scaling a deployment +through custom metrics which are exposed through the Ballista scheduler, and it can even scale the number of +executors down to 0 if there is no activity in the cluster. + +Keda can be installed in your kubernetes cluster through a single command line: + +```bash +kubectl apply -f https://github.com/kedacore/keda/releases/download/v2.3.0/keda-2.3.0.yaml +``` + +Once you have deployed Keda on your cluster, you can now deploy a new kubernetes object called `ScaledObject` +which will let Keda know how to scale your executors. In order to do that, copy the following YAML into a +`scale.yaml` file: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: ballista-executor +spec: + scaleTargetRef: + name: ballista-executor + minReplicaCount: 0 + maxReplicaCount: 5 + triggers: + - type: external + metadata: + # Change this DNS if the scheduler isn't deployed in the "default" namespace + scalerAddress: ballista-scheduler.default.svc.cluster.local:50050 +``` + +And then deploy it into the cluster: + +```bash +kubectl apply -f scale.yaml +``` + +If the cluster is inactive, Keda will now scale the number of executors down to 0, and will scale them up when +you launch a query. Please note that Keda will perform a scan once every 30 seconds, so it might take a bit to +scale the executors. + +Please visit Keda's [documentation page](https://keda.sh/docs/2.3/concepts/scaling-deployments/) for more information. 
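To sanity-check the scaler endpoint outside of Keda, the same `ExternalScaler` gRPC service can be called directly. The sketch below is illustrative only: it assumes the generated client stubs (`external_scaler_client`) are emitted alongside the server stubs by the default `tonic-build` configuration, that the scheduler is reachable on `localhost:50050`, and that `tokio` and `tonic` are available as dependencies.

```rust
use ballista_scheduler::externalscaler::{
    external_scaler_client::ExternalScalerClient, ScaledObjectRef,
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The external scaler is served on the scheduler's regular gRPC port.
    let mut client = ExternalScalerClient::connect("http://localhost:50050").await?;

    // Keda sends a ScaledObjectRef identifying the deployment it is scaling;
    // the scheduler's IsActive implementation ignores it and only checks for
    // incomplete tasks, so placeholder values are fine here.
    let request = tonic::Request::new(ScaledObjectRef {
        name: "ballista-executor".to_string(),
        namespace: "default".to_string(),
        scaler_metadata: Default::default(),
    });

    let response = client.is_active(request).await?.into_inner();
    println!("cluster has in-flight tasks: {}", response.result);
    Ok(())
}
```

If `IsActive` reports `false` while the cluster is idle and `true` while a query is running, Keda will scale the executor deployment accordingly. Note that `GetMetrics` deliberately reports a very large value so that, once the cluster is active, the horizontal pod autoscaler scales straight up toward `maxReplicaCount`.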
\ No newline at end of file From 407de2a60550d1fcb36fe6da2e77dde6ddb3621c Mon Sep 17 00:00:00 2001 From: Ximo Guanter Date: Mon, 28 Jun 2021 16:25:15 +0200 Subject: [PATCH 221/329] Remove unnecessary mutex (#639) --- ballista/rust/core/src/execution_plans/query_stage.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/query_stage.rs index c117110498e66..1e91540a7d898 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/query_stage.rs @@ -184,7 +184,7 @@ impl ExecutionPlan for QueryStageExec { // we won't necessary produce output for every possible partition, so we // create writers on demand - let mut writers: Vec>>> = vec![]; + let mut writers: Vec> = vec![]; for _ in 0..num_output_partitions { writers.push(None); } @@ -229,9 +229,8 @@ impl ExecutionPlan for QueryStageExec { RecordBatch::try_new(input_batch.schema(), columns)?; // write batch out - match &writers[num_output_partition] { + match &mut writers[num_output_partition] { Some(w) => { - let mut w = w.lock().unwrap(); w.write(&output_batch)?; } None => { @@ -247,8 +246,7 @@ impl ExecutionPlan for QueryStageExec { ShuffleWriter::new(path, stream.schema().as_ref())?; writer.write(&output_batch)?; - writers[num_output_partition] = - Some(Arc::new(Mutex::new(writer))); + writers[num_output_partition] = Some(writer); } } } @@ -262,10 +260,9 @@ impl ExecutionPlan for QueryStageExec { let mut num_batches_builder = UInt64Builder::new(num_writers); let mut num_bytes_builder = UInt64Builder::new(num_writers); - for (i, w) in writers.iter().enumerate() { + for (i, w) in writers.iter_mut().enumerate() { match w { Some(w) => { - let mut w = w.lock().unwrap(); w.finish()?; path_builder.append_value(w.path())?; partition_builder.append_value(i as u32)?; From 8e12e482830afcd619fadca237f2c6412883a63d Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 29 Jun 2021 05:54:12 +0800 Subject: [PATCH 222/329] add rank and dense rank and refactor window built in functions (#631) --- .../src/physical_plan/expressions/mod.rs | 2 + .../physical_plan/expressions/nth_value.rs | 59 +++--- .../src/physical_plan/expressions/rank.rs | 172 ++++++++++++++++++ .../physical_plan/expressions/row_number.rs | 32 +++- .../src/physical_plan/window_functions.rs | 63 ++++++- datafusion/src/physical_plan/windows.rs | 55 +++--- 6 files changed, 318 insertions(+), 65 deletions(-) create mode 100644 datafusion/src/physical_plan/expressions/rank.rs diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 0b32dca0467d8..440cb5b4ec67a 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -42,6 +42,7 @@ mod negative; mod not; mod nth_value; mod nullif; +mod rank; mod row_number; mod sum; mod try_cast; @@ -63,6 +64,7 @@ pub use negative::{negative, NegativeExpr}; pub use not::{not, NotExpr}; pub use nth_value::NthValue; pub use nullif::{nullif_func, SUPPORTED_NULLIF_TYPES}; +pub use rank::{dense_rank, rank}; pub use row_number::RowNumber; pub use sum::{sum_return_type, Sum}; pub use try_cast::{try_cast, TryCastExpr}; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index b548f912b2236..3897ae5cb53e0 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ 
b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -18,11 +18,14 @@ //! Defines physical expressions that can evaluated at runtime during query execution use crate::error::{DataFusionError, Result}; +use crate::physical_plan::window_functions::PartitionEvaluator; use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use crate::scalar::ScalarValue; -use arrow::array::{new_empty_array, new_null_array, ArrayRef}; +use arrow::array::{new_null_array, ArrayRef}; use arrow::datatypes::{DataType, Field}; +use arrow::record_batch::RecordBatch; use std::any::Any; +use std::ops::Range; use std::sync::Arc; /// nth_value kind @@ -111,25 +114,34 @@ impl BuiltInWindowFunctionExpr for NthValue { &self.name } - fn evaluate(&self, num_rows: usize, values: &[ArrayRef]) -> Result { - if values.is_empty() { - return Err(DataFusionError::Execution(format!( - "No arguments supplied to {}", - self.name() - ))); - } - let value = &values[0]; - if value.len() != num_rows { - return Err(DataFusionError::Execution(format!( - "Invalid data supplied to {}, expect {} rows, got {} rows", - self.name(), - num_rows, - value.len() - ))); - } - if num_rows == 0 { - return Ok(new_empty_array(value.data_type())); - } + fn create_evaluator( + &self, + batch: &RecordBatch, + ) -> Result> { + let values = self + .expressions() + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + Ok(Box::new(NthValueEvaluator { + kind: self.kind, + values, + })) + } +} + +/// Value evaluator for nth_value functions +pub(crate) struct NthValueEvaluator { + kind: NthValueKind, + values: Vec, +} + +impl PartitionEvaluator for NthValueEvaluator { + fn evaluate_partition(&self, partition: Range) -> Result { + let value = &self.values[0]; + let num_rows = partition.end - partition.start; + let value = value.slice(partition.start, num_rows); let index: usize = match self.kind { NthValueKind::First => 0, NthValueKind::Last => (num_rows as usize) - 1, @@ -138,7 +150,7 @@ impl BuiltInWindowFunctionExpr for NthValue { Ok(if index >= num_rows { new_null_array(value.data_type(), num_rows) } else { - let value = ScalarValue::try_from_array(value, index)?; + let value = ScalarValue::try_from_array(&value, index)?; value.to_array_of_size(num_rows) }) } @@ -157,8 +169,9 @@ mod tests { let values = vec![arr]; let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); let batch = RecordBatch::try_new(Arc::new(schema), values.clone())?; - let result = expr.evaluate(batch.num_rows(), &values)?; - let result = result.as_any().downcast_ref::().unwrap(); + let result = expr.create_evaluator(&batch)?.evaluate(vec![0..8])?; + assert_eq!(1, result.len()); + let result = result[0].as_any().downcast_ref::().unwrap(); let result = result.values(); assert_eq!(expected, result); Ok(()) diff --git a/datafusion/src/physical_plan/expressions/rank.rs b/datafusion/src/physical_plan/expressions/rank.rs new file mode 100644 index 0000000000000..b88dec378c06e --- /dev/null +++ b/datafusion/src/physical_plan/expressions/rank.rs @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines physical expressions that can evaluated at runtime during query execution + +use crate::error::Result; +use crate::physical_plan::window_functions::PartitionEvaluator; +use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; +use arrow::array::ArrayRef; +use arrow::array::UInt64Array; +use arrow::datatypes::{DataType, Field}; +use arrow::record_batch::RecordBatch; +use std::any::Any; +use std::iter; +use std::ops::Range; +use std::sync::Arc; + +/// Rank calculates the rank in the window function with order by +#[derive(Debug)] +pub struct Rank { + name: String, + dense: bool, +} + +/// Create a rank window function +pub fn rank(name: String) -> Rank { + Rank { name, dense: false } +} + +/// Create a dense rank window function +pub fn dense_rank(name: String) -> Rank { + Rank { name, dense: true } +} + +impl BuiltInWindowFunctionExpr for Rank { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = false; + let data_type = DataType::UInt64; + Ok(Field::new(self.name(), data_type, nullable)) + } + + fn expressions(&self) -> Vec> { + vec![] + } + + fn name(&self) -> &str { + &self.name + } + + fn create_evaluator( + &self, + _batch: &RecordBatch, + ) -> Result> { + Ok(Box::new(RankEvaluator { dense: self.dense })) + } +} + +pub(crate) struct RankEvaluator { + dense: bool, +} + +impl PartitionEvaluator for RankEvaluator { + fn include_rank(&self) -> bool { + true + } + + fn evaluate_partition(&self, _partition: Range) -> Result { + unreachable!("rank evaluation must be called with evaluate_partition_with_rank") + } + + fn evaluate_partition_with_rank( + &self, + _partition: Range, + ranks_in_partition: &[Range], + ) -> Result { + let result = if self.dense { + UInt64Array::from_iter_values(ranks_in_partition.iter().zip(1u64..).flat_map( + |(range, rank)| { + let len = range.end - range.start; + iter::repeat(rank).take(len) + }, + )) + } else { + UInt64Array::from_iter_values( + ranks_in_partition + .iter() + .scan(1_u64, |acc, range| { + let len = range.end - range.start; + let result = iter::repeat(*acc).take(len); + *acc += len as u64; + Some(result) + }) + .flatten(), + ) + }; + Ok(Arc::new(result)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::{array::*, datatypes::*}; + + fn test_with_rank(expr: &Rank, expected: Vec) -> Result<()> { + test_i32_result( + expr, + vec![-2, -2, 1, 3, 3, 3, 7, 8], + vec![0..2, 2..3, 3..6, 6..7, 7..8], + expected, + ) + } + + fn test_without_rank(expr: &Rank, expected: Vec) -> Result<()> { + test_i32_result(expr, vec![-2, -2, 1, 3, 3, 3, 7, 8], vec![0..8], expected) + } + + fn test_i32_result( + expr: &Rank, + data: Vec, + ranks: Vec>, + expected: Vec, + ) -> Result<()> { + let arr: ArrayRef = Arc::new(Int32Array::from(data)); + let values = vec![arr]; + let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); + let batch = RecordBatch::try_new(Arc::new(schema), values.clone())?; + let result = expr + .create_evaluator(&batch)? 
+ .evaluate_with_rank(vec![0..8], ranks)?; + assert_eq!(1, result.len()); + let result = result[0].as_any().downcast_ref::().unwrap(); + let result = result.values(); + assert_eq!(expected, result); + Ok(()) + } + + #[test] + fn test_dense_rank() -> Result<()> { + let r = dense_rank("arr".into()); + test_without_rank(&r, vec![1; 8])?; + test_with_rank(&r, vec![1, 1, 2, 3, 3, 3, 4, 5])?; + Ok(()) + } + + #[test] + fn test_rank() -> Result<()> { + let r = rank("arr".into()); + test_without_rank(&r, vec![1; 8])?; + test_with_rank(&r, vec![1, 1, 3, 4, 4, 4, 7, 8])?; + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/expressions/row_number.rs b/datafusion/src/physical_plan/expressions/row_number.rs index 6b488cc25af29..c65945f1ce8ce 100644 --- a/datafusion/src/physical_plan/expressions/row_number.rs +++ b/datafusion/src/physical_plan/expressions/row_number.rs @@ -18,10 +18,13 @@ //! Defines physical expression for `row_number` that can evaluated at runtime during query execution use crate::error::Result; +use crate::physical_plan::window_functions::PartitionEvaluator; use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use arrow::array::{ArrayRef, UInt64Array}; use arrow::datatypes::{DataType, Field}; +use arrow::record_batch::RecordBatch; use std::any::Any; +use std::ops::Range; use std::sync::Arc; /// row_number expression @@ -54,12 +57,25 @@ impl BuiltInWindowFunctionExpr for RowNumber { } fn name(&self) -> &str { - self.name.as_str() + &self.name } - fn evaluate(&self, num_rows: usize, _values: &[ArrayRef]) -> Result { + fn create_evaluator( + &self, + _batch: &RecordBatch, + ) -> Result> { + Ok(Box::new(NumRowsEvaluator::default())) + } +} + +#[derive(Default)] +pub(crate) struct NumRowsEvaluator {} + +impl PartitionEvaluator for NumRowsEvaluator { + fn evaluate_partition(&self, partition: Range) -> Result { + let num_rows = partition.end - partition.start; Ok(Arc::new(UInt64Array::from_iter_values( - (1..num_rows + 1).map(|i| i as u64), + 1..(num_rows as u64) + 1, ))) } } @@ -79,8 +95,9 @@ mod tests { let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; let row_number = RowNumber::new("row_number".to_owned()); - let result = row_number.evaluate(batch.num_rows(), &[])?; - let result = result.as_any().downcast_ref::().unwrap(); + let result = row_number.create_evaluator(&batch)?.evaluate(vec![0..8])?; + assert_eq!(1, result.len()); + let result = result[0].as_any().downcast_ref::().unwrap(); let result = result.values(); assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); Ok(()) @@ -94,8 +111,9 @@ mod tests { let schema = Schema::new(vec![Field::new("arr", DataType::Boolean, false)]); let batch = RecordBatch::try_new(Arc::new(schema), vec![arr])?; let row_number = RowNumber::new("row_number".to_owned()); - let result = row_number.evaluate(batch.num_rows(), &[])?; - let result = result.as_any().downcast_ref::().unwrap(); + let result = row_number.create_evaluator(&batch)?.evaluate(vec![0..8])?; + assert_eq!(1, result.len()); + let result = result[0].as_any().downcast_ref::().unwrap(); let result = result.values(); assert_eq!(vec![1, 2, 3, 4, 5, 6, 7, 8], result); Ok(()) diff --git a/datafusion/src/physical_plan/window_functions.rs b/datafusion/src/physical_plan/window_functions.rs index 4f56aa7d38262..99805b6d29414 100644 --- a/datafusion/src/physical_plan/window_functions.rs +++ b/datafusion/src/physical_plan/window_functions.rs @@ -20,15 +20,17 @@ //! //! 
see also https://www.postgresql.org/docs/current/functions-window.html -use crate::arrow::array::ArrayRef; -use crate::arrow::datatypes::Field; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ aggregates, aggregates::AggregateFunction, functions::Signature, - type_coercion::data_types, PhysicalExpr, + type_coercion::data_types, windows::find_ranges_in_range, PhysicalExpr, }; +use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use arrow::datatypes::Field; +use arrow::record_batch::RecordBatch; use std::any::Any; +use std::ops::Range; use std::sync::Arc; use std::{fmt, str::FromStr}; @@ -208,11 +210,57 @@ pub(super) fn signature_for_built_in(fun: &BuiltInWindowFunction) -> Signature { } } +/// Partition evaluator +pub(crate) trait PartitionEvaluator { + /// Whether the evaluator should be evaluated with rank + fn include_rank(&self) -> bool { + false + } + + /// evaluate the partition evaluator against the partitions + fn evaluate(&self, partition_points: Vec>) -> Result> { + partition_points + .into_iter() + .map(|partition| self.evaluate_partition(partition)) + .collect() + } + + /// evaluate the partition evaluator against the partitions with rank information + fn evaluate_with_rank( + &self, + partition_points: Vec>, + sort_partition_points: Vec>, + ) -> Result> { + partition_points + .into_iter() + .map(|partition| { + let ranks_in_partition = + find_ranges_in_range(&partition, &sort_partition_points); + self.evaluate_partition_with_rank(partition, ranks_in_partition) + }) + .collect() + } + + /// evaluate the partition evaluator against the partition + fn evaluate_partition(&self, _partition: Range) -> Result; + + /// evaluate the partition evaluator against the partition but with rank + fn evaluate_partition_with_rank( + &self, + _partition: Range, + _ranks_in_partition: &[Range], + ) -> Result { + Err(DataFusionError::NotImplemented( + "evaluate_partition_with_rank is not implemented by default".into(), + )) + } +} + /// A window expression that is a built-in window function. /// /// Note that unlike aggregation based window functions, built-in window functions normally ignore /// window frame spec, with the exception of first_value, last_value, and nth_value. -pub trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { +pub(crate) trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be /// downcast to a specific implementation. 
fn as_any(&self) -> &dyn Any; @@ -230,8 +278,11 @@ pub trait BuiltInWindowFunctionExpr: Send + Sync + std::fmt::Debug { "BuiltInWindowFunctionExpr: default name" } - /// Evaluate the built-in window function against the number of rows and the arguments - fn evaluate(&self, num_rows: usize, values: &[ArrayRef]) -> Result; + /// Create built-in window evaluator with a batch + fn create_evaluator( + &self, + batch: &RecordBatch, + ) -> Result>; } #[cfg(test)] diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 2f539057c82f4..89263767c72af 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -21,11 +21,12 @@ use crate::error::{DataFusionError, Result}; use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::physical_plan::{ aggregates, common, - expressions::{Literal, NthValue, PhysicalSortExpr, RowNumber}, + expressions::{dense_rank, rank, Literal, NthValue, PhysicalSortExpr, RowNumber}, type_coercion::coerce, - window_functions::signature_for_built_in, - window_functions::BuiltInWindowFunctionExpr, - window_functions::{BuiltInWindowFunction, WindowFunction}, + window_functions::{ + signature_for_built_in, BuiltInWindowFunction, BuiltInWindowFunctionExpr, + WindowFunction, + }, Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, WindowExpr, }; @@ -84,7 +85,8 @@ pub fn create_window_expr( window_frame, }), WindowFunction::BuiltInWindowFunction(fun) => Arc::new(BuiltInWindowExpr { - window: create_built_in_window_expr(fun, args, input_schema, name)?, + fun: fun.clone(), + expr: create_built_in_window_expr(fun, args, input_schema, name)?, partition_by: partition_by.to_vec(), order_by: order_by.to_vec(), window_frame, @@ -100,6 +102,8 @@ fn create_built_in_window_expr( ) -> Result> { match fun { BuiltInWindowFunction::RowNumber => Ok(Arc::new(RowNumber::new(name))), + BuiltInWindowFunction::Rank => Ok(Arc::new(rank(name))), + BuiltInWindowFunction::DenseRank => Ok(Arc::new(dense_rank(name))), BuiltInWindowFunction::NthValue => { let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; let arg = coerced_args[0].clone(); @@ -138,7 +142,8 @@ fn create_built_in_window_expr( /// A window expr that takes the form of a built in window function #[derive(Debug)] pub struct BuiltInWindowExpr { - window: Arc, + fun: BuiltInWindowFunction, + expr: Arc, partition_by: Vec>, order_by: Vec, window_frame: Option, @@ -151,15 +156,15 @@ impl WindowExpr for BuiltInWindowExpr { } fn name(&self) -> &str { - self.window.name() + self.expr.name() } fn field(&self) -> Result { - self.window.field() + self.expr.field() } fn expressions(&self) -> Vec> { - self.window.expressions() + self.expr.expressions() } fn partition_by(&self) -> &[Arc] { @@ -171,25 +176,17 @@ impl WindowExpr for BuiltInWindowExpr { } fn evaluate(&self, batch: &RecordBatch) -> Result { - let values = self.evaluate_args(batch)?; - let partition_points = self.evaluate_partition_points( - batch.num_rows(), - &self.partition_columns(batch)?, - )?; - let results = partition_points - .iter() - .map(|partition_range| { - let start = partition_range.start; - let len = partition_range.end - start; - let values = values - .iter() - .map(|arr| arr.slice(start, len)) - .collect::>(); - self.window.evaluate(len, &values) - }) - .collect::>>()? 
- .into_iter() - .collect::>(); + let evaluator = self.expr.create_evaluator(batch)?; + let num_rows = batch.num_rows(); + let partition_points = + self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; + let results = if evaluator.include_rank() { + let sort_partition_points = + self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; + evaluator.evaluate_with_rank(partition_points, sort_partition_points)? + } else { + evaluator.evaluate(partition_points)? + }; let results = results.iter().map(|i| i.as_ref()).collect::>(); concat(&results).map_err(DataFusionError::ArrowError) } @@ -200,7 +197,7 @@ impl WindowExpr for BuiltInWindowExpr { /// boundaries would align (what's sorted on [partition columns...] would definitely be sorted /// on finer columns), so this will use binary search to find ranges that are within the /// partition range and return the valid slice. -fn find_ranges_in_range<'a>( +pub(crate) fn find_ranges_in_range<'a>( partition_range: &Range, sort_partition_points: &'a [Range], ) -> &'a [Range] { From 16a3db64cb50a5f6e27a032c270d9de40dd2d5a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 29 Jun 2021 00:09:15 +0200 Subject: [PATCH 223/329] Optimize count(*) with table statistics (#620) * Optimize count(*) with table statistics * Optimize count(*) with table statistics * Fixes, simplification * Alias fix * Add member to table provider to return whether statistics are exact * Fix * Improve test * Naming changes * Add test for non-exact statistics * Generalize solution * Added tests * Fix name --- datafusion/src/datasource/datasource.rs | 5 + datafusion/src/datasource/memory.rs | 4 + datafusion/src/datasource/parquet.rs | 4 + datafusion/src/execution/context.rs | 4 +- .../src/optimizer/aggregate_statistics.rs | 335 ++++++++++++++++++ datafusion/src/optimizer/mod.rs | 1 + 6 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 datafusion/src/optimizer/aggregate_statistics.rs diff --git a/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs index 0349a49e491ba..b83aa4b1ab56b 100644 --- a/datafusion/src/datasource/datasource.rs +++ b/datafusion/src/datasource/datasource.rs @@ -108,6 +108,11 @@ pub trait TableProvider: Sync + Send { /// Statistics should be optional because not all data sources can provide statistics. fn statistics(&self) -> Statistics; + /// Returns whether statistics provided are exact values or estimates + fn has_exact_statistics(&self) -> bool { + false + } + /// Tests whether the table provider can make use of a filter expression /// to optimise data retrieval. 
fn supports_filter_pushdown( diff --git a/datafusion/src/datasource/memory.rs b/datafusion/src/datasource/memory.rs index af40480870287..a4dbfd6c4a24d 100644 --- a/datafusion/src/datasource/memory.rs +++ b/datafusion/src/datasource/memory.rs @@ -216,6 +216,10 @@ impl TableProvider for MemTable { fn statistics(&self) -> Statistics { self.statistics.clone() } + + fn has_exact_statistics(&self) -> bool { + true + } } #[cfg(test)] diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index fd147413059ba..e53fbbdefd2f2 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -102,6 +102,10 @@ impl TableProvider for ParquetTable { fn statistics(&self) -> Statistics { self.statistics.clone() } + + fn has_exact_statistics(&self) -> bool { + true + } } #[cfg(test)] diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 318ea596939e3..5c41ed26eea43 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -22,7 +22,8 @@ use crate::{ information_schema::CatalogWithInformationSchema, }, optimizer::{ - eliminate_limit::EliminateLimit, hash_build_probe_order::HashBuildProbeOrder, + aggregate_statistics::AggregateStatistics, eliminate_limit::EliminateLimit, + hash_build_probe_order::HashBuildProbeOrder, }, physical_optimizer::optimizer::PhysicalOptimizerRule, }; @@ -639,6 +640,7 @@ impl Default for ExecutionConfig { optimizers: vec![ Arc::new(ConstantFolding::new()), Arc::new(EliminateLimit::new()), + Arc::new(AggregateStatistics::new()), Arc::new(ProjectionPushDown::new()), Arc::new(FilterPushDown::new()), Arc::new(SimplifyExpressions::new()), diff --git a/datafusion/src/optimizer/aggregate_statistics.rs b/datafusion/src/optimizer/aggregate_statistics.rs new file mode 100644 index 0000000000000..a20eafc688b8d --- /dev/null +++ b/datafusion/src/optimizer/aggregate_statistics.rs @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Utilizing exact statistics from sources to avoid scanning data +use std::{sync::Arc, vec}; + +use crate::{ + execution::context::ExecutionProps, + logical_plan::{col, DFField, DFSchema, Expr, LogicalPlan}, + physical_plan::aggregates::AggregateFunction, + scalar::ScalarValue, +}; + +use super::{optimizer::OptimizerRule, utils}; +use crate::error::Result; + +/// Optimizer that uses available statistics for aggregate functions +pub struct AggregateStatistics {} + +impl AggregateStatistics { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for AggregateStatistics { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> crate::error::Result { + match plan { + // match only select count(*) from table_scan + LogicalPlan::Aggregate { + input, + group_expr, + aggr_expr, + schema, + } if group_expr.is_empty() => { + // aggregations that can not be replaced + // using statistics + let mut agg = vec![]; + // expressions that can be replaced by constants + let mut projections = vec![]; + if let Some(num_rows) = match input.as_ref() { + LogicalPlan::TableScan { source, .. } + if source.has_exact_statistics() => + { + source.statistics().num_rows + } + _ => None, + } { + for expr in aggr_expr { + match expr { + Expr::AggregateFunction { + fun: AggregateFunction::Count, + args, + distinct: false, + } if args + == &[Expr::Literal(ScalarValue::UInt8(Some(1)))] => + { + projections.push(Expr::Alias( + Box::new(Expr::Literal(ScalarValue::UInt64(Some( + num_rows as u64, + )))), + "COUNT(Uint8(1))".to_string(), + )); + } + _ => { + agg.push(expr.clone()); + } + } + } + + return Ok(if agg.is_empty() { + // table scan can be entirely removed + + LogicalPlan::Projection { + expr: projections, + input: Arc::new(LogicalPlan::EmptyRelation { + produce_one_row: true, + schema: Arc::new(DFSchema::empty()), + }), + schema: schema.clone(), + } + } else if projections.is_empty() { + // no replacements -> return original plan + plan.clone() + } else { + // Split into parts that can be supported and part that should stay in aggregate + let agg_fields = agg + .iter() + .map(|x| x.to_field(input.schema())) + .collect::>>()?; + let agg_schema = DFSchema::new(agg_fields)?; + let cols = agg + .iter() + .map(|e| e.name(&agg_schema)) + .collect::>>()?; + projections.extend(cols.iter().map(|x| col(x))); + LogicalPlan::Projection { + expr: projections, + schema: schema.clone(), + input: Arc::new(LogicalPlan::Aggregate { + input: input.clone(), + group_expr: vec![], + aggr_expr: agg, + schema: Arc::new(agg_schema), + }), + } + }); + } + Ok(plan.clone()) + } + // Rest: recurse and find possible statistics + _ => { + let expr = plan.expressions(); + + // apply the optimization to all inputs of the plan + let inputs = plan.inputs(); + let new_inputs = inputs + .iter() + .map(|plan| self.optimize(plan, execution_props)) + .collect::>>()?; + + utils::from_plan(plan, &expr, &new_inputs) + } + } + } + + fn name(&self) -> &str { + "aggregate_statistics" + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field, Schema}; + + use crate::error::Result; + use crate::execution::context::ExecutionProps; + use crate::logical_plan::LogicalPlan; + use crate::optimizer::aggregate_statistics::AggregateStatistics; + use crate::optimizer::optimizer::OptimizerRule; + use crate::{ + datasource::{datasource::Statistics, TableProvider}, + logical_plan::Expr, + }; + + struct TestTableProvider { + num_rows: usize, + is_exact: bool, + } + + impl 
TableProvider for TestTableProvider { + fn as_any(&self) -> &dyn std::any::Any { + unimplemented!() + } + fn schema(&self) -> arrow::datatypes::SchemaRef { + Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)])) + } + + fn scan( + &self, + _projection: &Option>, + _batch_size: usize, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + unimplemented!() + } + fn statistics(&self) -> crate::datasource::datasource::Statistics { + Statistics { + num_rows: Some(self.num_rows), + total_byte_size: None, + column_statistics: None, + } + } + fn has_exact_statistics(&self) -> bool { + self.is_exact + } + } + + #[test] + fn optimize_count_using_statistics() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("select count(*) from test") + .unwrap(); + let expected = "\ + Projection: #COUNT(UInt8(1))\ + \n Projection: UInt64(100) AS COUNT(Uint8(1))\ + \n EmptyRelation"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + #[test] + fn optimize_count_not_exact() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + is_exact: false, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("select count(*) from test") + .unwrap(); + let expected = "\ + Projection: #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: test projection=None"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + #[test] + fn optimize_count_sum() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("select sum(a)/count(*) from test") + .unwrap(); + let expected = "\ + Projection: #SUM(test.a) Divide #COUNT(UInt8(1))\ + \n Projection: UInt64(100) AS COUNT(Uint8(1)), #SUM(test.a)\ + \n Aggregate: groupBy=[[]], aggr=[[SUM(#test.a)]]\ + \n TableScan: test projection=None"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + #[test] + fn optimize_count_group_by() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("SELECT count(*), a FROM test GROUP BY a") + .unwrap(); + let expected = "\ + Projection: #COUNT(UInt8(1)), #test.a\ + \n Aggregate: groupBy=[[#test.a]], aggr=[[COUNT(UInt8(1))]]\ + \n TableScan: test projection=None"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + #[test] + fn optimize_count_filter() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("SELECT count(*) FROM test WHERE a < 5") + .unwrap(); + let expected = "\ + Projection: #COUNT(UInt8(1))\ + \n Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1))]]\ + \n Filter: #test.a Lt Int64(5)\ + \n TableScan: test projection=None"; + + 
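+        // Why this plan stays unchanged (descriptive note, derivable from the
+        // rule above): the exact num_rows statistic describes the unfiltered
+        // table, and AggregateStatistics only rewrites COUNT(*) when the
+        // aggregate's input is a bare TableScan with exact statistics, so the
+        // Filter node here keeps the full Aggregate over the scan in place.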
assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { + let opt = AggregateStatistics::new(); + let optimized_plan = opt.optimize(plan, &ExecutionProps::new()).unwrap(); + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + assert_eq!(plan.schema(), plan.schema()); + } +} diff --git a/datafusion/src/optimizer/mod.rs b/datafusion/src/optimizer/mod.rs index e360a54f2a965..68758474d594a 100644 --- a/datafusion/src/optimizer/mod.rs +++ b/datafusion/src/optimizer/mod.rs @@ -18,6 +18,7 @@ //! This module contains a query optimizer that operates against a logical plan and applies //! some simple rules to a logical plan, such as "Projection Push Down" and "Type Coercion". +pub mod aggregate_statistics; pub mod constant_folding; pub mod eliminate_limit; pub mod filter_push_down; From c7c0968f931f8f7e29eae887b623427584a8506c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 29 Jun 2021 21:22:36 +0800 Subject: [PATCH 224/329] remove dev/ (#580) --- .github/workflows/comment_bot.yml | 76 +- .github/workflows/dev.yml | 24 +- dev/.gitignore | 21 - dev/README.md | 199 -- dev/archery/MANIFEST.in | 4 - dev/archery/archery/__init__.py | 16 - dev/archery/archery/benchmark/__init__.py | 16 - dev/archery/archery/benchmark/codec.py | 97 - dev/archery/archery/benchmark/compare.py | 173 -- dev/archery/archery/benchmark/core.py | 57 - dev/archery/archery/benchmark/google.py | 174 -- dev/archery/archery/benchmark/runner.py | 212 -- dev/archery/archery/bot.py | 261 --- dev/archery/archery/cli.py | 1092 ---------- dev/archery/archery/compat.py | 51 - dev/archery/archery/crossbow/__init__.py | 19 - dev/archery/archery/crossbow/cli.py | 352 --- dev/archery/archery/crossbow/core.py | 1162 ---------- dev/archery/archery/crossbow/reports.py | 302 --- .../crossbow/tests/fixtures/crossbow-job.yaml | 51 - .../fixtures/crossbow-success-message.md | 10 - .../archery/crossbow/tests/test_core.py | 25 - .../crossbow/tests/test_crossbow_cli.py | 43 - .../archery/crossbow/tests/test_reports.py | 35 - dev/archery/archery/docker.py | 402 ---- dev/archery/archery/integration/__init__.py | 16 - dev/archery/archery/integration/datagen.py | 1604 -------------- dev/archery/archery/integration/runner.py | 419 ---- dev/archery/archery/integration/scenario.py | 29 - dev/archery/archery/integration/tester.py | 62 - dev/archery/archery/integration/tester_cpp.py | 116 - dev/archery/archery/integration/tester_go.py | 67 - .../archery/integration/tester_java.py | 140 -- dev/archery/archery/integration/tester_js.py | 73 - .../archery/integration/tester_rust.py | 115 - dev/archery/archery/integration/util.py | 166 -- dev/archery/archery/lang/__init__.py | 16 - dev/archery/archery/lang/cpp.py | 295 --- dev/archery/archery/lang/java.py | 30 - dev/archery/archery/lang/python.py | 218 -- dev/archery/archery/lang/rust.py | 23 - dev/archery/archery/release.py | 535 ----- .../archery/templates/release_changelog.md.j2 | 29 - .../archery/templates/release_curation.txt.j2 | 41 - dev/archery/archery/testing.py | 83 - .../archery-benchmark-diff-empty-lines.jsonl | 6 - .../fixtures/archery-benchmark-diff.jsonl | 4 - .../event-issue-comment-build-command.json | 212 -- ...-issue-comment-by-non-authorized-user.json | 212 -- .../event-issue-comment-by-ursabot.json | 212 -- ...-issue-comment-not-mentioning-ursabot.json | 212 -- ...vent-issue-comment-with-empty-command.json | 217 -- ...nt-issue-comment-without-pull-request.json | 206 -- 
.../fixtures/event-pull-request-opened.json | 445 ---- .../archery/tests/fixtures/issue-19.json | 64 - .../archery/tests/fixtures/issue-26.json | 70 - .../fixtures/issue-comment-480243811.json | 31 - .../fixtures/issue-comment-480248726.json | 31 - .../fixtures/pull-request-26-commit.json | 158 -- .../tests/fixtures/pull-request-26-files.json | 170 -- .../tests/fixtures/pull-request-26.json | 329 --- dev/archery/archery/tests/test_benchmarks.py | 383 ---- dev/archery/archery/tests/test_bot.py | 201 -- dev/archery/archery/tests/test_cli.py | 162 -- dev/archery/archery/tests/test_docker.py | 512 ----- dev/archery/archery/tests/test_release.py | 333 --- dev/archery/archery/tests/test_testing.py | 62 - dev/archery/archery/utils/__init__.py | 16 - dev/archery/archery/utils/cache.py | 80 - dev/archery/archery/utils/cmake.py | 215 -- dev/archery/archery/utils/command.py | 97 - dev/archery/archery/utils/git.py | 100 - dev/archery/archery/utils/lint.py | 387 ---- dev/archery/archery/utils/logger.py | 29 - dev/archery/archery/utils/rat.py | 70 - dev/archery/archery/utils/report.py | 64 - dev/archery/archery/utils/source.py | 205 -- dev/archery/archery/utils/tmpdir.py | 28 - dev/archery/conftest.py | 70 - dev/archery/generate_files_for_endian_test.sh | 43 - dev/archery/requirements-lint.txt | 3 - dev/archery/requirements.txt | 4 - dev/archery/setup.py | 62 - dev/benchmarking/.env | 18 - dev/benchmarking/.gitignore | 1 - dev/benchmarking/Dockerfile | 23 - dev/benchmarking/README.md | 256 --- dev/benchmarking/data_model.dot | 219 -- dev/benchmarking/data_model.rst | 373 ---- dev/benchmarking/ddl/0_setup.sql | 23 - .../ddl/1_00_table_public_project.sql | 45 - .../ddl/1_01_table_public_cpu.sql | 63 - .../ddl/1_02_table_public_gpu.sql | 43 - dev/benchmarking/ddl/1_03_table_public_os.sql | 57 - .../1_04_table_public_benchmark_language.sql | 35 - .../ddl/1_05_table_public_dependencies.sql | 31 - ...public_language_implementation_version.sql | 46 - .../ddl/1_07_table_public_benchmark_type.sql | 39 - .../ddl/1_08_table_public_machine.sql | 69 - .../ddl/1_09_table_public_unit.sql | 37 - .../ddl/1_10_table_public_environment.sql | 51 - .../ddl/1_11_table_public_benchmark.sql | 54 - .../ddl/1_12_table_public_benchmark_run.sql | 112 - dev/benchmarking/ddl/2_00_views.sql | 324 --- .../ddl/3_00_functions_helpers.sql | 643 ------ .../ddl/3_01_functions_triggers.sql | 574 ----- .../ddl/3_02_functions_ingestion.sql | 323 --- .../ddl/3_10_functions_documentation.sql | 395 ---- dev/benchmarking/ddl/4_00_triggers.sql | 61 - dev/benchmarking/ddl/5_00_permissions.sql | 73 - dev/benchmarking/docker-compose.yml | 43 - .../examples/benchmark_example.json | 32 - .../examples/benchmark_run_example.csv | 6 - .../examples/benchmark_run_example.json | 97 - .../benchmark_with_context_example.json | 73 - dev/benchmarking/examples/example.sql | 232 -- .../examples/example_graphql_mutation.json | 12 - .../graphql_query_environment_view.json | 3 - dev/benchmarking/examples/machine.json | 22 - dev/benchmarking/graphql_submit.sh | 75 - dev/benchmarking/make_data_model_rst.sh | 69 - dev/benchmarking/make_dotfile.sh | 70 - dev/benchmarking/make_machine_json.sh | 55 - dev/build-ballista-docker-arm64.sh | 34 - dev/build-ballista-docker.sh | 24 - dev/build-set-env.sh | 20 - dev/build-ui.sh | 23 - dev/merge.conf.sample | 25 - dev/merge_arrow_pr.py | 610 ------ dev/merge_arrow_pr.sh | 56 - dev/release/.env.example | 48 - dev/release/.gitignore | 21 - dev/release/01-prepare-test.rb | 665 ------ dev/release/01-prepare.sh | 291 --- 
dev/release/02-source-test.rb | 146 -- dev/release/02-source.sh | 162 -- dev/release/03-binary-submit.sh | 46 - dev/release/04-binary-download.sh | 38 - dev/release/05-binary-upload.sh | 137 -- dev/release/README.md | 24 - dev/release/Rakefile | 37 - dev/release/VERIFY.md | 76 - dev/release/binary-common.sh | 86 - dev/release/binary-task.rb | 1909 ----------------- dev/release/binary/.dockerignore | 18 - dev/release/binary/Dockerfile | 68 - dev/release/binary/runner.sh | 36 - dev/release/check-rat-report.py | 59 - dev/release/download_rc_binaries.py | 173 -- dev/release/post-01-upload.sh | 71 - dev/release/post-02-binary.sh | 95 - dev/release/post-03-website.sh | 266 --- dev/release/post-04-ruby.sh | 55 - dev/release/post-05-js.sh | 48 - dev/release/post-06-csharp.sh | 59 - dev/release/post-07-rust.sh | 74 - dev/release/post-08-remove-rc.sh | 50 - dev/release/post-09-docs.sh | 68 - dev/release/post-10-python.sh | 44 - dev/release/post-11-java.sh | 69 - dev/release/rat_exclude_files.txt | 108 - dev/release/run-rat.sh | 43 - dev/release/run-test.rb | 31 - dev/release/setup-gpg-agent.sh | 24 - dev/release/test-helper.rb | 96 - dev/release/verify-apt.sh | 150 -- .../verify-release-candidate-wheels.bat | 107 - dev/release/verify-release-candidate.bat | 130 -- dev/release/verify-release-candidate.sh | 808 ------- dev/release/verify-yum.sh | 154 -- dev/requirements_merge_arrow_pr.txt | 3 - dev/tasks/README.md | 19 - ...ion10.2numpy1.17python3.6.____cpython.yaml | 70 - ...ion10.2numpy1.17python3.7.____cpython.yaml | 70 - ...ion10.2numpy1.17python3.8.____cpython.yaml | 70 - ...ion10.2numpy1.19python3.9.____cpython.yaml | 70 - ...ionNonenumpy1.17python3.6.____cpython.yaml | 70 - ...ionNonenumpy1.17python3.7.____cpython.yaml | 70 - ...ionNonenumpy1.17python3.8.____cpython.yaml | 70 - ...ionNonenumpy1.19python3.9.____cpython.yaml | 70 - .../linux_aarch64_python3.6.____cpython.yaml | 71 - .../linux_aarch64_python3.7.____cpython.yaml | 71 - .../linux_aarch64_python3.8.____cpython.yaml | 71 - .../linux_aarch64_python3.9.____cpython.yaml | 71 - ...osx_64_numpy1.17python3.6.____cpython.yaml | 65 - ...osx_64_numpy1.17python3.7.____cpython.yaml | 65 - ...osx_64_numpy1.17python3.8.____cpython.yaml | 65 - ...osx_64_numpy1.19python3.9.____cpython.yaml | 65 - .../osx_arm64_python3.8.____cpython.yaml | 65 - .../osx_arm64_python3.9.____cpython.yaml | 65 - .../.ci_support/r/linux_64_r_base3.6.yaml | 22 - .../.ci_support/r/linux_64_r_base4.0.yaml | 22 - .../.ci_support/r/osx_64_r_base3.6.yaml | 26 - .../.ci_support/r/osx_64_r_base4.0.yaml | 26 - .../.ci_support/r/win_64_r_base3.6.yaml | 12 - .../.ci_support/r/win_64_r_base4.0.yaml | 12 - ...ionNonenumpy1.17python3.6.____cpython.yaml | 55 - ...ionNonenumpy1.17python3.7.____cpython.yaml | 55 - ...ionNonenumpy1.17python3.8.____cpython.yaml | 55 - ...ionNonenumpy1.19python3.9.____cpython.yaml | 55 - .../conda-recipes/.scripts/logging_utils.sh | 30 - dev/tasks/conda-recipes/README.md | 65 - .../conda-recipes/arrow-cpp/LLVM_LICENSE.txt | 68 - .../conda-recipes/arrow-cpp/bld-arrow.bat | 54 - .../conda-recipes/arrow-cpp/bld-pyarrow.bat | 44 - .../conda-recipes/arrow-cpp/build-arrow.sh | 95 - .../conda-recipes/arrow-cpp/build-pyarrow.sh | 49 - dev/tasks/conda-recipes/arrow-cpp/meta.yaml | 302 --- dev/tasks/conda-recipes/azure.clean.yml | 28 - dev/tasks/conda-recipes/azure.linux.yml | 38 - dev/tasks/conda-recipes/azure.osx.yml | 80 - dev/tasks/conda-recipes/azure.win.yml | 77 - dev/tasks/conda-recipes/build_steps.sh | 55 - dev/tasks/conda-recipes/clean.py | 80 - 
dev/tasks/conda-recipes/conda-forge.yml | 1 - dev/tasks/conda-recipes/drone-steps.sh | 29 - dev/tasks/conda-recipes/drone.yml | 43 - dev/tasks/conda-recipes/parquet-cpp/meta.yaml | 51 - dev/tasks/conda-recipes/r-arrow/bld.bat | 9 - dev/tasks/conda-recipes/r-arrow/build.sh | 3 - dev/tasks/conda-recipes/r-arrow/build_win.sh | 8 - dev/tasks/conda-recipes/r-arrow/configure.win | 8 - .../conda-recipes/r-arrow/install.libs.R | 5 - dev/tasks/conda-recipes/r-arrow/meta.yaml | 66 - dev/tasks/conda-recipes/run_docker_build.sh | 77 - dev/tasks/cpp-examples/github.linux.yml | 46 - dev/tasks/docker-tests/azure.linux.yml | 52 - dev/tasks/docker-tests/circle.linux.yml | 51 - dev/tasks/docker-tests/github.linux.yml | 42 - dev/tasks/gandiva-jars/README.md | 29 - dev/tasks/gandiva-jars/build-cpp-linux.sh | 73 - dev/tasks/gandiva-jars/build-cpp-osx.sh | 49 - dev/tasks/gandiva-jars/build-java.sh | 64 - dev/tasks/gandiva-jars/github.linux.yml | 47 - dev/tasks/gandiva-jars/github.osx.yml | 46 - dev/tasks/homebrew-formulae/apache-arrow.rb | 69 - .../autobrew/apache-arrow.rb | 88 - dev/tasks/homebrew-formulae/github.macos.yml | 56 - dev/tasks/linux-packages/.gitignore | 28 - dev/tasks/linux-packages/README.md | 40 - dev/tasks/linux-packages/Rakefile | 234 -- .../apache-arrow-apt-source/Rakefile | 64 - .../apt/debian-bullseye/Dockerfile | 40 - .../apt/debian-buster/Dockerfile | 41 - .../apt/ubuntu-bionic/Dockerfile | 41 - .../apt/ubuntu-focal/Dockerfile | 41 - .../apt/ubuntu-groovy/Dockerfile | 41 - .../apt/ubuntu-xenial/Dockerfile | 41 - .../debian/apache-arrow-apt-source.install | 2 - .../apache-arrow-apt-source/debian/changelog | 0 .../apache-arrow-apt-source/debian/compat | 1 - .../apache-arrow-apt-source/debian/control | 23 - .../apache-arrow-apt-source/debian/copyright | 26 - .../apache-arrow-apt-source/debian/rules | 37 - .../debian/source/format | 1 - .../apache-arrow-release/Rakefile | 66 - .../yum/Apache-Arrow.repo | 37 - .../yum/apache-arrow-release.spec.in | 110 - .../yum/centos-7/Dockerfile | 28 - .../yum/centos-8/Dockerfile | 28 - .../linux-packages/apache-arrow/Rakefile | 120 -- .../apt/debian-bullseye-arm64/from | 18 - .../apt/debian-bullseye/Dockerfile | 81 - .../apt/debian-bullseye/qemu-dummy-static | 33 - .../apache-arrow/apt/debian-buster-arm64/from | 18 - .../apache-arrow/apt/debian-buster/Dockerfile | 85 - .../apt/debian-buster/qemu-dummy-static | 33 - .../apache-arrow/apt/ubuntu-bionic-arm64/from | 18 - .../apache-arrow/apt/ubuntu-bionic/Dockerfile | 88 - .../apt/ubuntu-bionic/qemu-dummy-static | 33 - .../apache-arrow/apt/ubuntu-focal-arm64/from | 18 - .../apache-arrow/apt/ubuntu-focal/Dockerfile | 78 - .../apt/ubuntu-focal/qemu-dummy-static | 33 - .../apache-arrow/apt/ubuntu-groovy-arm64/from | 18 - .../apache-arrow/apt/ubuntu-groovy/Dockerfile | 79 - .../apt/ubuntu-groovy/qemu-dummy-static | 33 - .../apache-arrow/debian/changelog | 111 - .../linux-packages/apache-arrow/debian/compat | 1 - .../apache-arrow/debian/control.in | 583 ----- .../apache-arrow/debian/copyright | 193 -- .../debian/gir1.2-arrow-1.0.install | 1 - .../debian/gir1.2-arrow-cuda-1.0.install | 1 - .../debian/gir1.2-arrow-dataset-1.0.install | 1 - .../debian/gir1.2-gandiva-1.0.install | 1 - .../debian/gir1.2-parquet-1.0.install | 1 - .../debian/gir1.2-plasma-1.0.install | 1 - .../debian/libarrow-cuda-dev.install | 6 - .../debian/libarrow-cuda-glib-dev.install | 4 - .../debian/libarrow-cuda-glib400.install | 1 - .../debian/libarrow-cuda400.install | 1 - .../debian/libarrow-dataset-dev.install | 6 - 
.../debian/libarrow-dataset-glib-dev.install | 4 - .../debian/libarrow-dataset-glib-doc.doc-base | 9 - .../debian/libarrow-dataset-glib-doc.install | 1 - .../debian/libarrow-dataset-glib-doc.links | 3 - .../debian/libarrow-dataset-glib400.install | 1 - .../debian/libarrow-dataset400.install | 1 - .../apache-arrow/debian/libarrow-dev.install | 21 - .../debian/libarrow-flight-dev.install | 6 - .../debian/libarrow-flight400.install | 1 - .../debian/libarrow-glib-dev.install | 6 - .../debian/libarrow-glib-doc.doc-base | 9 - .../debian/libarrow-glib-doc.install | 2 - .../debian/libarrow-glib-doc.links | 3 - .../debian/libarrow-glib400.install | 1 - .../debian/libarrow-python-dev.install | 6 - .../debian/libarrow-python-flight-dev.install | 6 - .../debian/libarrow-python-flight400.install | 1 - .../debian/libarrow-python400.install | 1 - .../apache-arrow/debian/libarrow400.install | 1 - .../debian/libgandiva-dev.install | 7 - .../debian/libgandiva-glib-dev.install | 4 - .../debian/libgandiva-glib-doc.doc-base | 9 - .../debian/libgandiva-glib-doc.install | 1 - .../debian/libgandiva-glib-doc.links | 3 - .../debian/libgandiva-glib400.install | 1 - .../apache-arrow/debian/libgandiva400.install | 1 - .../debian/libparquet-dev.install | 7 - .../debian/libparquet-glib-dev.install | 4 - .../debian/libparquet-glib-doc.doc-base | 9 - .../debian/libparquet-glib-doc.install | 1 - .../debian/libparquet-glib-doc.links | 3 - .../debian/libparquet-glib400.install | 1 - .../apache-arrow/debian/libparquet400.install | 1 - .../apache-arrow/debian/libplasma-dev.install | 7 - .../debian/libplasma-glib-dev.install | 4 - .../debian/libplasma-glib-doc.doc-base | 9 - .../debian/libplasma-glib-doc.install | 1 - .../debian/libplasma-glib-doc.links | 3 - .../debian/libplasma-glib400.install | 1 - .../apache-arrow/debian/libplasma400.install | 1 - .../apache-arrow/debian/patches/series | 0 .../debian/plasma-store-server.install | 1 - .../linux-packages/apache-arrow/debian/rules | 103 - .../apache-arrow/debian/source/format | 1 - .../linux-packages/apache-arrow/debian/watch | 2 - .../apache-arrow/yum/arrow.spec.in | 802 ------- .../apache-arrow/yum/centos-7/Dockerfile | 62 - .../yum/centos-7/qemu-dummy-static | 33 - .../apache-arrow/yum/centos-8-aarch64/from | 18 - .../apache-arrow/yum/centos-8/Dockerfile | 65 - .../yum/centos-8/qemu-dummy-static | 33 - dev/tasks/linux-packages/apt/build.sh | 115 - .../linux-packages/github.linux.amd64.yml | 100 - dev/tasks/linux-packages/helper.rb | 70 - dev/tasks/linux-packages/package-task.rb | 622 ------ .../linux-packages/travis.linux.arm64.yml | 149 -- dev/tasks/linux-packages/yum/build.sh | 157 -- dev/tasks/macros.jinja | 198 -- dev/tasks/nightlies.sample.yml | 68 - dev/tasks/nuget-packages/github.linux.yml | 43 - dev/tasks/python-sdist/github.yml | 45 - .../python-wheels/github.linux.amd64.yml | 48 - dev/tasks/python-wheels/github.osx.yml | 133 -- dev/tasks/python-wheels/github.windows.yml | 53 - .../python-wheels/travis.linux.arm64.yml | 73 - dev/tasks/r/azure.linux.yml | 74 - dev/tasks/r/github.devdocs.yml | 92 - dev/tasks/r/github.linux.cran.yml | 79 - .../r/github.linux.version.compatibility.yml | 109 - dev/tasks/r/github.linux.versions.yml | 80 - dev/tasks/r/github.macos-linux.local.yml | 88 - dev/tasks/r/github.macos.autobrew.yml | 78 - dev/tasks/tasks.yml | 1703 --------------- dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat | 90 - dev/tasks/vcpkg-tests/github.windows.yml | 63 - dev/tasks/verify-rc/github.linux.yml | 75 - dev/tasks/verify-rc/github.macos.yml | 50 - 
dev/tasks/verify-rc/github.win.yml | 45 - dev/test_merge_arrow_pr.py | 317 --- dev/update_arrow_deps.py | 83 - 371 files changed, 15 insertions(+), 39322 deletions(-) delete mode 100644 dev/.gitignore delete mode 100644 dev/README.md delete mode 100644 dev/archery/MANIFEST.in delete mode 100644 dev/archery/archery/__init__.py delete mode 100644 dev/archery/archery/benchmark/__init__.py delete mode 100644 dev/archery/archery/benchmark/codec.py delete mode 100644 dev/archery/archery/benchmark/compare.py delete mode 100644 dev/archery/archery/benchmark/core.py delete mode 100644 dev/archery/archery/benchmark/google.py delete mode 100644 dev/archery/archery/benchmark/runner.py delete mode 100644 dev/archery/archery/bot.py delete mode 100644 dev/archery/archery/cli.py delete mode 100644 dev/archery/archery/compat.py delete mode 100644 dev/archery/archery/crossbow/__init__.py delete mode 100644 dev/archery/archery/crossbow/cli.py delete mode 100644 dev/archery/archery/crossbow/core.py delete mode 100644 dev/archery/archery/crossbow/reports.py delete mode 100644 dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml delete mode 100644 dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md delete mode 100644 dev/archery/archery/crossbow/tests/test_core.py delete mode 100644 dev/archery/archery/crossbow/tests/test_crossbow_cli.py delete mode 100644 dev/archery/archery/crossbow/tests/test_reports.py delete mode 100644 dev/archery/archery/docker.py delete mode 100644 dev/archery/archery/integration/__init__.py delete mode 100644 dev/archery/archery/integration/datagen.py delete mode 100644 dev/archery/archery/integration/runner.py delete mode 100644 dev/archery/archery/integration/scenario.py delete mode 100644 dev/archery/archery/integration/tester.py delete mode 100644 dev/archery/archery/integration/tester_cpp.py delete mode 100644 dev/archery/archery/integration/tester_go.py delete mode 100644 dev/archery/archery/integration/tester_java.py delete mode 100644 dev/archery/archery/integration/tester_js.py delete mode 100644 dev/archery/archery/integration/tester_rust.py delete mode 100644 dev/archery/archery/integration/util.py delete mode 100644 dev/archery/archery/lang/__init__.py delete mode 100644 dev/archery/archery/lang/cpp.py delete mode 100644 dev/archery/archery/lang/java.py delete mode 100644 dev/archery/archery/lang/python.py delete mode 100644 dev/archery/archery/lang/rust.py delete mode 100644 dev/archery/archery/release.py delete mode 100644 dev/archery/archery/templates/release_changelog.md.j2 delete mode 100644 dev/archery/archery/templates/release_curation.txt.j2 delete mode 100644 dev/archery/archery/testing.py delete mode 100644 dev/archery/archery/tests/fixtures/archery-benchmark-diff-empty-lines.jsonl delete mode 100644 dev/archery/archery/tests/fixtures/archery-benchmark-diff.jsonl delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json delete mode 100644 dev/archery/archery/tests/fixtures/event-issue-comment-without-pull-request.json delete mode 100644 dev/archery/archery/tests/fixtures/event-pull-request-opened.json delete mode 
100644 dev/archery/archery/tests/fixtures/issue-19.json delete mode 100644 dev/archery/archery/tests/fixtures/issue-26.json delete mode 100644 dev/archery/archery/tests/fixtures/issue-comment-480243811.json delete mode 100644 dev/archery/archery/tests/fixtures/issue-comment-480248726.json delete mode 100644 dev/archery/archery/tests/fixtures/pull-request-26-commit.json delete mode 100644 dev/archery/archery/tests/fixtures/pull-request-26-files.json delete mode 100644 dev/archery/archery/tests/fixtures/pull-request-26.json delete mode 100644 dev/archery/archery/tests/test_benchmarks.py delete mode 100644 dev/archery/archery/tests/test_bot.py delete mode 100644 dev/archery/archery/tests/test_cli.py delete mode 100644 dev/archery/archery/tests/test_docker.py delete mode 100644 dev/archery/archery/tests/test_release.py delete mode 100644 dev/archery/archery/tests/test_testing.py delete mode 100644 dev/archery/archery/utils/__init__.py delete mode 100644 dev/archery/archery/utils/cache.py delete mode 100644 dev/archery/archery/utils/cmake.py delete mode 100644 dev/archery/archery/utils/command.py delete mode 100644 dev/archery/archery/utils/git.py delete mode 100644 dev/archery/archery/utils/lint.py delete mode 100644 dev/archery/archery/utils/logger.py delete mode 100644 dev/archery/archery/utils/rat.py delete mode 100644 dev/archery/archery/utils/report.py delete mode 100644 dev/archery/archery/utils/source.py delete mode 100644 dev/archery/archery/utils/tmpdir.py delete mode 100644 dev/archery/conftest.py delete mode 100755 dev/archery/generate_files_for_endian_test.sh delete mode 100644 dev/archery/requirements-lint.txt delete mode 100644 dev/archery/requirements.txt delete mode 100755 dev/archery/setup.py delete mode 100644 dev/benchmarking/.env delete mode 100644 dev/benchmarking/.gitignore delete mode 100644 dev/benchmarking/Dockerfile delete mode 100644 dev/benchmarking/README.md delete mode 100644 dev/benchmarking/data_model.dot delete mode 100644 dev/benchmarking/data_model.rst delete mode 100644 dev/benchmarking/ddl/0_setup.sql delete mode 100644 dev/benchmarking/ddl/1_00_table_public_project.sql delete mode 100644 dev/benchmarking/ddl/1_01_table_public_cpu.sql delete mode 100644 dev/benchmarking/ddl/1_02_table_public_gpu.sql delete mode 100644 dev/benchmarking/ddl/1_03_table_public_os.sql delete mode 100644 dev/benchmarking/ddl/1_04_table_public_benchmark_language.sql delete mode 100644 dev/benchmarking/ddl/1_05_table_public_dependencies.sql delete mode 100644 dev/benchmarking/ddl/1_06_table_public_language_implementation_version.sql delete mode 100644 dev/benchmarking/ddl/1_07_table_public_benchmark_type.sql delete mode 100644 dev/benchmarking/ddl/1_08_table_public_machine.sql delete mode 100644 dev/benchmarking/ddl/1_09_table_public_unit.sql delete mode 100644 dev/benchmarking/ddl/1_10_table_public_environment.sql delete mode 100644 dev/benchmarking/ddl/1_11_table_public_benchmark.sql delete mode 100644 dev/benchmarking/ddl/1_12_table_public_benchmark_run.sql delete mode 100644 dev/benchmarking/ddl/2_00_views.sql delete mode 100644 dev/benchmarking/ddl/3_00_functions_helpers.sql delete mode 100644 dev/benchmarking/ddl/3_01_functions_triggers.sql delete mode 100644 dev/benchmarking/ddl/3_02_functions_ingestion.sql delete mode 100644 dev/benchmarking/ddl/3_10_functions_documentation.sql delete mode 100644 dev/benchmarking/ddl/4_00_triggers.sql delete mode 100644 dev/benchmarking/ddl/5_00_permissions.sql delete mode 100644 dev/benchmarking/docker-compose.yml delete mode 100644 
dev/benchmarking/examples/benchmark_example.json delete mode 100644 dev/benchmarking/examples/benchmark_run_example.csv delete mode 100644 dev/benchmarking/examples/benchmark_run_example.json delete mode 100644 dev/benchmarking/examples/benchmark_with_context_example.json delete mode 100644 dev/benchmarking/examples/example.sql delete mode 100644 dev/benchmarking/examples/example_graphql_mutation.json delete mode 100644 dev/benchmarking/examples/graphql_query_environment_view.json delete mode 100644 dev/benchmarking/examples/machine.json delete mode 100755 dev/benchmarking/graphql_submit.sh delete mode 100755 dev/benchmarking/make_data_model_rst.sh delete mode 100755 dev/benchmarking/make_dotfile.sh delete mode 100755 dev/benchmarking/make_machine_json.sh delete mode 100755 dev/build-ballista-docker-arm64.sh delete mode 100755 dev/build-ballista-docker.sh delete mode 100755 dev/build-set-env.sh delete mode 100755 dev/build-ui.sh delete mode 100644 dev/merge.conf.sample delete mode 100755 dev/merge_arrow_pr.py delete mode 100755 dev/merge_arrow_pr.sh delete mode 100644 dev/release/.env.example delete mode 100644 dev/release/.gitignore delete mode 100644 dev/release/01-prepare-test.rb delete mode 100755 dev/release/01-prepare.sh delete mode 100644 dev/release/02-source-test.rb delete mode 100755 dev/release/02-source.sh delete mode 100755 dev/release/03-binary-submit.sh delete mode 100755 dev/release/04-binary-download.sh delete mode 100755 dev/release/05-binary-upload.sh delete mode 100644 dev/release/README.md delete mode 100644 dev/release/Rakefile delete mode 100644 dev/release/VERIFY.md delete mode 100644 dev/release/binary-common.sh delete mode 100644 dev/release/binary-task.rb delete mode 100644 dev/release/binary/.dockerignore delete mode 100644 dev/release/binary/Dockerfile delete mode 100755 dev/release/binary/runner.sh delete mode 100644 dev/release/check-rat-report.py delete mode 100755 dev/release/download_rc_binaries.py delete mode 100755 dev/release/post-01-upload.sh delete mode 100755 dev/release/post-02-binary.sh delete mode 100755 dev/release/post-03-website.sh delete mode 100755 dev/release/post-04-ruby.sh delete mode 100755 dev/release/post-05-js.sh delete mode 100755 dev/release/post-06-csharp.sh delete mode 100755 dev/release/post-07-rust.sh delete mode 100755 dev/release/post-08-remove-rc.sh delete mode 100755 dev/release/post-09-docs.sh delete mode 100755 dev/release/post-10-python.sh delete mode 100755 dev/release/post-11-java.sh delete mode 100644 dev/release/rat_exclude_files.txt delete mode 100755 dev/release/run-rat.sh delete mode 100755 dev/release/run-test.rb delete mode 100644 dev/release/setup-gpg-agent.sh delete mode 100644 dev/release/test-helper.rb delete mode 100755 dev/release/verify-apt.sh delete mode 100644 dev/release/verify-release-candidate-wheels.bat delete mode 100644 dev/release/verify-release-candidate.bat delete mode 100755 dev/release/verify-release-candidate.sh delete mode 100755 dev/release/verify-yum.sh delete mode 100644 dev/requirements_merge_arrow_pr.txt delete mode 100644 dev/tasks/README.md delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.6.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.7.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.8.____cpython.yaml delete mode 100644 
dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.19python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.6.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.7.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.8.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.6.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.7.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.8.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.19python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.8.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base3.6.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.0.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base3.6.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.0.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/win_64_r_base3.6.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/r/win_64_r_base4.0.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml delete mode 100644 dev/tasks/conda-recipes/.scripts/logging_utils.sh delete mode 100644 dev/tasks/conda-recipes/README.md delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/LLVM_LICENSE.txt delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh delete mode 100644 dev/tasks/conda-recipes/arrow-cpp/meta.yaml delete mode 100644 dev/tasks/conda-recipes/azure.clean.yml delete mode 100755 dev/tasks/conda-recipes/azure.linux.yml delete mode 100755 dev/tasks/conda-recipes/azure.osx.yml delete mode 100755 dev/tasks/conda-recipes/azure.win.yml delete mode 100755 dev/tasks/conda-recipes/build_steps.sh delete mode 100644 dev/tasks/conda-recipes/clean.py delete mode 100644 dev/tasks/conda-recipes/conda-forge.yml delete mode 100755 dev/tasks/conda-recipes/drone-steps.sh delete mode 100644 
dev/tasks/conda-recipes/drone.yml delete mode 100644 dev/tasks/conda-recipes/parquet-cpp/meta.yaml delete mode 100644 dev/tasks/conda-recipes/r-arrow/bld.bat delete mode 100644 dev/tasks/conda-recipes/r-arrow/build.sh delete mode 100755 dev/tasks/conda-recipes/r-arrow/build_win.sh delete mode 100755 dev/tasks/conda-recipes/r-arrow/configure.win delete mode 100644 dev/tasks/conda-recipes/r-arrow/install.libs.R delete mode 100644 dev/tasks/conda-recipes/r-arrow/meta.yaml delete mode 100755 dev/tasks/conda-recipes/run_docker_build.sh delete mode 100644 dev/tasks/cpp-examples/github.linux.yml delete mode 100644 dev/tasks/docker-tests/azure.linux.yml delete mode 100644 dev/tasks/docker-tests/circle.linux.yml delete mode 100644 dev/tasks/docker-tests/github.linux.yml delete mode 100644 dev/tasks/gandiva-jars/README.md delete mode 100755 dev/tasks/gandiva-jars/build-cpp-linux.sh delete mode 100755 dev/tasks/gandiva-jars/build-cpp-osx.sh delete mode 100755 dev/tasks/gandiva-jars/build-java.sh delete mode 100644 dev/tasks/gandiva-jars/github.linux.yml delete mode 100644 dev/tasks/gandiva-jars/github.osx.yml delete mode 100644 dev/tasks/homebrew-formulae/apache-arrow.rb delete mode 100644 dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb delete mode 100644 dev/tasks/homebrew-formulae/github.macos.yml delete mode 100644 dev/tasks/linux-packages/.gitignore delete mode 100644 dev/tasks/linux-packages/README.md delete mode 100644 dev/tasks/linux-packages/Rakefile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/Rakefile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-buster/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-bionic/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-focal/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-groovy/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-xenial/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/control delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/copyright delete mode 100755 dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules delete mode 100644 dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format delete mode 100644 dev/tasks/linux-packages/apache-arrow-release/Rakefile delete mode 100644 dev/tasks/linux-packages/apache-arrow-release/yum/Apache-Arrow.repo delete mode 100644 dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in delete mode 100644 dev/tasks/linux-packages/apache-arrow-release/yum/centos-7/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow-release/yum/centos-8/Dockerfile delete mode 100644 dev/tasks/linux-packages/apache-arrow/Rakefile delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/qemu-dummy-static delete mode 100644 
dev/tasks/linux-packages/apache-arrow/apt/debian-buster-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/apt/debian-buster/qemu-dummy-static delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/qemu-dummy-static delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/qemu-dummy-static delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy-arm64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/qemu-dummy-static delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/changelog delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/compat delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/control.in delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/copyright delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links 
delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/patches/series delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install delete mode 100755 dev/tasks/linux-packages/apache-arrow/debian/rules delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/source/format delete mode 100644 dev/tasks/linux-packages/apache-arrow/debian/watch delete mode 100644 dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in delete mode 100644 dev/tasks/linux-packages/apache-arrow/yum/centos-7/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/yum/centos-7/qemu-dummy-static delete mode 100644 dev/tasks/linux-packages/apache-arrow/yum/centos-8-aarch64/from delete mode 100644 dev/tasks/linux-packages/apache-arrow/yum/centos-8/Dockerfile delete mode 100755 dev/tasks/linux-packages/apache-arrow/yum/centos-8/qemu-dummy-static delete mode 100755 dev/tasks/linux-packages/apt/build.sh delete mode 100644 dev/tasks/linux-packages/github.linux.amd64.yml delete mode 100644 dev/tasks/linux-packages/helper.rb delete mode 100644 
dev/tasks/linux-packages/package-task.rb delete mode 100644 dev/tasks/linux-packages/travis.linux.arm64.yml delete mode 100755 dev/tasks/linux-packages/yum/build.sh delete mode 100644 dev/tasks/macros.jinja delete mode 100644 dev/tasks/nightlies.sample.yml delete mode 100644 dev/tasks/nuget-packages/github.linux.yml delete mode 100644 dev/tasks/python-sdist/github.yml delete mode 100644 dev/tasks/python-wheels/github.linux.amd64.yml delete mode 100644 dev/tasks/python-wheels/github.osx.yml delete mode 100644 dev/tasks/python-wheels/github.windows.yml delete mode 100644 dev/tasks/python-wheels/travis.linux.arm64.yml delete mode 100644 dev/tasks/r/azure.linux.yml delete mode 100644 dev/tasks/r/github.devdocs.yml delete mode 100644 dev/tasks/r/github.linux.cran.yml delete mode 100644 dev/tasks/r/github.linux.version.compatibility.yml delete mode 100644 dev/tasks/r/github.linux.versions.yml delete mode 100644 dev/tasks/r/github.macos-linux.local.yml delete mode 100644 dev/tasks/r/github.macos.autobrew.yml delete mode 100644 dev/tasks/tasks.yml delete mode 100644 dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat delete mode 100644 dev/tasks/vcpkg-tests/github.windows.yml delete mode 100644 dev/tasks/verify-rc/github.linux.yml delete mode 100644 dev/tasks/verify-rc/github.macos.yml delete mode 100644 dev/tasks/verify-rc/github.win.yml delete mode 100644 dev/test_merge_arrow_pr.py delete mode 100755 dev/update_arrow_deps.py diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 9e103003eeedf..6ca095328af17 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -33,13 +33,13 @@ jobs: - name: Checkout Arrow uses: actions/checkout@v2 with: - path: arrow + repository: apache/arrow - name: Set up Python uses: actions/setup-python@v2 with: python-version: 3.8 - name: Install Archery and Crossbow dependencies - run: pip install -e arrow/dev/archery[bot] + run: pip install -e dev/archery[bot] - name: Handle Github comment event env: ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -49,78 +49,6 @@ jobs: --event-name ${{ github.event_name }} \ --event-payload ${{ github.event_path }} - autotune: - name: "Fix all the things" - if: startsWith(github.event.comment.body, '@github-actions autotune') - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: r-lib/actions/pr-fetch@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: See what is different - run: | - set -ex - git remote add upstream https://github.com/apache/arrow - git fetch upstream - changed() { - git diff --name-only HEAD..upstream/master | grep -e "$1" >/dev/null 2>&1 - } - if changed '^r/.*\.R$'; then - echo "R_DOCS=true" >> $GITHUB_ENV - fi - if changed 'cmake' || changed 'CMake'; then - echo "CMAKE_FORMAT=true" >> $GITHUB_ENV - fi - if changed '^cpp/src'; then - echo "CLANG_FORMAT_CPP=true" >> $GITHUB_ENV - fi - if changed '^r/src'; then - echo "CLANG_FORMAT_R=true" >> $GITHUB_ENV - fi - - name: Run cmake_format - if: env.CMAKE_FORMAT == 'true' || endsWith(github.event.comment.body, 'everything') - run: | - set -ex - export PATH=/home/runner/.local/bin:$PATH - python3 -m pip install --upgrade pip setuptools wheel - python3 -m pip install -r dev/archery/requirements-lint.txt - python3 run-cmake-format.py - - name: Run clang-format on cpp - if: env.CLANG_FORMAT_CPP == 'true' || endsWith(github.event.comment.body, 'everything') - run: | - . 
.env # To get the clang version we use - cpp/build-support/run_clang_format.py \ - --clang_format_binary=clang-format-${CLANG_TOOLS} \ - --exclude_glob=cpp/build-support/lint_exclusions.txt \ - --source_dir=cpp/src --quiet --fix - - name: Run clang-format on r - if: env.CLANG_FORMAT_R == 'true' || endsWith(github.event.comment.body, 'everything') - run: | - . .env # To get the clang version we use - cpp/build-support/run_clang_format.py \ - --clang_format_binary=clang-format-${CLANG_TOOLS} \ - --exclude_glob=cpp/build-support/lint_exclusions.txt \ - --source_dir=r/src --quiet --fix - - uses: r-lib/actions/setup-r@v1 - if: env.R_DOCS == 'true' || endsWith(github.event.comment.body, 'everything') - - name: Update R docs - if: env.R_DOCS == 'true' || endsWith(github.event.comment.body, 'everything') - shell: Rscript {0} - run: | - source("ci/etc/rprofile") - install.packages(c("remotes", "roxygen2")) - remotes::install_deps("r") - roxygen2::roxygenize("r") - - name: Commit results - run: | - git config user.name "$(git log -1 --pretty=format:%an)" - git config user.email "$(git log -1 --pretty=format:%ae)" - git commit -a -m 'Autoformat/render all the things [automated commit]' || echo "No changes to commit" - - uses: r-lib/actions/pr-push@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - rebase: name: "Rebase" if: startsWith(github.event.comment.body, '@github-actions rebase') diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index f9e6b27fdb80e..a7e574eef97c9 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -27,13 +27,18 @@ jobs: name: Lint C++, Python, R, Rust, Docker, RAT runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - name: Checkout Arrow + uses: actions/checkout@v2 + with: + repository: apache/arrow + submodules: true + fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: 3.8 - name: Setup Archery - run: pip install -e dev/archery[docker] + run: pip install -e dev/archery[lint] - name: Lint run: archery lint --rat @@ -49,11 +54,8 @@ jobs: run: | # if you encounter error, try rerun the command below with --write instead of --check # and commit the changes - npx prettier@2.3.0 --check {ballista,datafusion,datafusion-examples,dev,docs,python}/**/*.md README.md DEVELOPERS.md - - name: Prettier check for Ballista UI - run: | - cd ballista/ui/scheduler - npx yarn - # if you encounter error, try rerun the command below with --write instead of --check - # and commit the changes - npx yarn prettier --check **/*.{ts,tsx} + npx prettier@2.3.2 --check \ + {ballista,datafusion,datafusion-examples,docs,python}/**/*.md \ + README.md \ + DEVELOPERS.md \ + ballista/**/*.{ts,tsx} diff --git a/dev/.gitignore b/dev/.gitignore deleted file mode 100644 index 399c30926260c..0000000000000 --- a/dev/.gitignore +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Python virtual environments for dev tools -.venv*/ - -__pycache__ diff --git a/dev/README.md b/dev/README.md deleted file mode 100644 index 27440878bbcec..0000000000000 --- a/dev/README.md +++ /dev/null @@ -1,199 +0,0 @@ - - -# Arrow Developer Scripts - -This directory contains scripts useful to developers when packaging, -testing, or committing to Arrow. - -Merging a pull request requires being a committer on the project. In addition -you need to have linked your GitHub and ASF accounts on -https://gitbox.apache.org/setup/ to be able to push to GitHub as the main -remote. - -NOTE: It may take some time (a few hours) between when you complete -the setup at GitBox, and when your GitHub account will be added as a -committer. - -## How to merge a Pull request - -Please don't merge PRs using the Github Web interface. Instead, set up -your git clone such as to have a remote named `apache` pointing to the -official Arrow repository: - -``` -git remote add apache git@github.com:apache/arrow.git -``` - -and then run the following command: - -``` -./dev/merge_arrow_pr.sh -``` - -This creates a new Python virtual environment under `dev/.venv[PY_VERSION]` -and installs all the necessary dependencies to run the Arrow merge script. -After installed, it runs the merge script. - -(we don't provide a wrapper script for Windows yet, so under Windows you'll -have to install Python dependencies yourself and then run `dev/merge_arrow_pr.py` -directly) - -The merge script uses the GitHub REST API; if you encounter rate limit issues, -you may set a `ARROW_GITHUB_API_TOKEN` environment variable to use a Personal -Access Token. - -You can specify the username and the password of your JIRA account in -`APACHE_JIRA_USERNAME` and `APACHE_JIRA_PASSWORD` environment variables. -If these aren't supplied, the script will ask you the values of them. - -Note that the directory name of your Arrow git clone must be called `arrow`. - -example output: - -``` -Which pull request would you like to merge? (e.g. 34): -``` - -Type the pull request number (from https://github.com/apache/arrow/pulls) and hit enter. - -``` -=== Pull Request #X === -title Blah Blah Blah -source repo/branch -target master -url https://api.github.com/repos/apache/arrow/pulls/X - -Proceed with merging pull request #3? (y/n): -``` - -If this looks good, type y and hit enter. - -``` -From git-wip-us.apache.org:/repos/asf/arrow.git - * [new branch] master -> PR_TOOL_MERGE_PR_3_MASTER -Switched to branch 'PR_TOOL_MERGE_PR_3_MASTER' - -Merge complete (local ref PR_TOOL_MERGE_PR_3_MASTER). Push to apache? (y/n): -``` - -A local branch with the merge has been created. -type y and hit enter to push it to apache master - -``` -Counting objects: 67, done. -Delta compression using up to 4 threads. -Compressing objects: 100% (26/26), done. -Writing objects: 100% (36/36), 5.32 KiB, done. -Total 36 (delta 17), reused 0 (delta 0) -To git-wip-us.apache.org:/repos/arrow-mr.git - b767ac4..485658a PR_TOOL_MERGE_PR_X_MASTER -> master -Restoring head pointer to b767ac4e -Note: checking out 'b767ac4e'. - -You are in 'detached HEAD' state. 
You can look around, make experimental -changes and commit them, and you can discard any commits you make in this -state without impacting any branches by performing another checkout. - -If you want to create a new branch to retain commits you create, you may -do so (now or later) by using -b with the checkout command again. Example: - - git checkout -b new_branch_name - -HEAD is now at b767ac4... Update README.md -Deleting local branch PR_TOOL_MERGE_PR_X -Deleting local branch PR_TOOL_MERGE_PR_X_MASTER -Pull request #X merged! -Merge hash: 485658a5 - -Would you like to pick 485658a5 into another branch? (y/n): -``` - -For now just say n as we have 1 branch - -## Verifying Release Candidates - -We have provided a script to assist with verifying release candidates: - -```shell -bash dev/release/verify-release-candidate.sh 0.7.0 0 -``` - -Currently this only works on Linux (patches to expand to macOS welcome!). Read -the script for information about system dependencies. - -On Windows, we have a script that verifies C++ and Python (requires Visual -Studio 2015): - -``` -dev/release/verify-release-candidate.bat apache-arrow-0.7.0.tar.gz -``` - -### Verifying the JavaScript release - -For JavaScript-specific releases, use a different verification script: - -```shell -bash dev/release/js-verify-release-candidate.sh 0.7.0 0 -``` - -# Integration testing - -Build the following base image used by multiple tests: - -```shell -docker build -t arrow_integration_xenial_base -f docker_common/Dockerfile.xenial.base . -``` - -## HDFS C++ / Python support - -```shell -docker-compose build conda-cpp -docker-compose build conda-python -docker-compose build conda-python-hdfs -docker-compose run --rm conda-python-hdfs -``` - -## Apache Spark Integration Tests - -Tests can be run to ensure that the current snapshot of Java and Python Arrow -works with Spark. This will run a docker image to build Arrow C++ -and Python in a Conda environment, build and install Arrow Java to the local -Maven repository, build Spark with the new Arrow artifact, and run Arrow -related unit tests in Spark for Java and Python. Any errors will exit with a -non-zero value. To run, use the following command: - -```shell -docker-compose build conda-cpp -docker-compose build conda-python -docker-compose build conda-python-spark -docker-compose run --rm conda-python-spark -``` - -If you already are building Spark, these commands will map your local Maven -repo to the image and save time by not having to download all dependencies. -Be aware, that docker write files as root, which can cause problems for maven -on the host. - -```shell -docker-compose run --rm -v $HOME/.m2:/root/.m2 conda-python-spark -``` - -NOTE: If the Java API has breaking changes, a patched version of Spark might -need to be used to successfully build. diff --git a/dev/archery/MANIFEST.in b/dev/archery/MANIFEST.in deleted file mode 100644 index 90fe034c21341..0000000000000 --- a/dev/archery/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include ../../LICENSE.txt -include ../../NOTICE.txt - -include archery/reports/* diff --git a/dev/archery/archery/__init__.py b/dev/archery/archery/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/dev/archery/archery/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/dev/archery/archery/benchmark/__init__.py b/dev/archery/archery/benchmark/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/dev/archery/archery/benchmark/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/dev/archery/archery/benchmark/codec.py b/dev/archery/archery/benchmark/codec.py deleted file mode 100644 index 4157890d13d0e..0000000000000 --- a/dev/archery/archery/benchmark/codec.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- - -import json - -from ..benchmark.core import Benchmark, BenchmarkSuite -from ..benchmark.runner import BenchmarkRunner, StaticBenchmarkRunner -from ..benchmark.compare import BenchmarkComparator - - -class JsonEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, Benchmark): - return BenchmarkCodec.encode(o) - - if isinstance(o, BenchmarkSuite): - return BenchmarkSuiteCodec.encode(o) - - if isinstance(o, BenchmarkRunner): - return BenchmarkRunnerCodec.encode(o) - - if isinstance(o, BenchmarkComparator): - return BenchmarkComparatorCodec.encode(o) - - return json.JSONEncoder.default(self, o) - - -class BenchmarkCodec: - @staticmethod - def encode(b): - return { - "name": b.name, - "unit": b.unit, - "less_is_better": b.less_is_better, - "values": b.values, - "time_unit": b.time_unit, - "times": b.times, - "counters": b.counters, - } - - @staticmethod - def decode(dct, **kwargs): - return Benchmark(**dct, **kwargs) - - -class BenchmarkSuiteCodec: - @staticmethod - def encode(bs): - return { - "name": bs.name, - "benchmarks": [BenchmarkCodec.encode(b) for b in bs.benchmarks] - } - - @staticmethod - def decode(dct, **kwargs): - benchmarks = [BenchmarkCodec.decode(b) - for b in dct.pop("benchmarks", [])] - return BenchmarkSuite(benchmarks=benchmarks, **dct, **kwargs) - - -class BenchmarkRunnerCodec: - @staticmethod - def encode(br): - return {"suites": [BenchmarkSuiteCodec.encode(s) for s in br.suites]} - - @staticmethod - def decode(dct, **kwargs): - suites = [BenchmarkSuiteCodec.decode(s) - for s in dct.pop("suites", [])] - return StaticBenchmarkRunner(suites=suites, **dct, **kwargs) - - -class BenchmarkComparatorCodec: - @staticmethod - def encode(bc): - comparator = bc.formatted - - suite_name = bc.suite_name - if suite_name: - comparator["suite"] = suite_name - - return comparator diff --git a/dev/archery/archery/benchmark/compare.py b/dev/archery/archery/benchmark/compare.py deleted file mode 100644 index 622b80179178b..0000000000000 --- a/dev/archery/archery/benchmark/compare.py +++ /dev/null @@ -1,173 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# Define a global regression threshold as 5%. This is purely subjective and -# flawed. This does not track cumulative regression. 
-DEFAULT_THRESHOLD = 0.05 - - -def items_per_seconds_fmt(value): - if value < 1000: - return "{} items/sec".format(value) - if value < 1000**2: - return "{:.3f}K items/sec".format(value / 1000) - if value < 1000**3: - return "{:.3f}M items/sec".format(value / 1000**2) - else: - return "{:.3f}G items/sec".format(value / 1000**3) - - -def bytes_per_seconds_fmt(value): - if value < 1024: - return "{} bytes/sec".format(value) - if value < 1024**2: - return "{:.3f} KiB/sec".format(value / 1024) - if value < 1024**3: - return "{:.3f} MiB/sec".format(value / 1024**2) - if value < 1024**4: - return "{:.3f} GiB/sec".format(value / 1024**3) - else: - return "{:.3f} TiB/sec".format(value / 1024**4) - - -def change_fmt(value): - return "{:.3%}".format(value) - - -def formatter_for_unit(unit): - if unit == "bytes_per_second": - return bytes_per_seconds_fmt - elif unit == "items_per_second": - return items_per_seconds_fmt - else: - return lambda x: x - - -class BenchmarkComparator: - """ Compares two benchmarks. - - Encodes the logic of comparing two benchmarks and taking a decision on - if it induce a regression. - """ - - def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD, - suite_name=None): - self.contender = contender - self.baseline = baseline - self.threshold = threshold - self.suite_name = suite_name - - @property - def name(self): - return self.baseline.name - - @property - def less_is_better(self): - return self.baseline.less_is_better - - @property - def unit(self): - return self.baseline.unit - - @property - def change(self): - new = self.contender.value - old = self.baseline.value - - if old == 0 and new == 0: - return 0.0 - if old == 0: - return 0.0 - - return float(new - old) / abs(old) - - @property - def confidence(self): - """ Indicate if a comparison of benchmarks should be trusted. """ - return True - - @property - def regression(self): - change = self.change - adjusted_change = change if self.less_is_better else -change - return (self.confidence and adjusted_change > self.threshold) - - @property - def formatted(self): - fmt = formatter_for_unit(self.unit) - return { - "benchmark": self.name, - "change": change_fmt(self.change), - "regression": self.regression, - "baseline": fmt(self.baseline.value), - "contender": fmt(self.contender.value), - "unit": self.unit, - "less_is_better": self.less_is_better, - "counters": str(self.baseline.counters) - } - - def compare(self, comparator=None): - return { - "benchmark": self.name, - "change": self.change, - "regression": self.regression, - "baseline": self.baseline.value, - "contender": self.contender.value, - "unit": self.unit, - "less_is_better": self.less_is_better, - "counters": self.baseline.counters - } - - def __call__(self, **kwargs): - return self.compare(**kwargs) - - -def pairwise_compare(contender, baseline): - dict_contender = {e.name: e for e in contender} - dict_baseline = {e.name: e for e in baseline} - - for name in (dict_contender.keys() & dict_baseline.keys()): - yield name, (dict_contender[name], dict_baseline[name]) - - -class RunnerComparator: - """ Compares suites/benchmarks from runners. - - It is up to the caller that ensure that runners are compatible (both from - the same language implementation). 
- """ - - def __init__(self, contender, baseline, threshold=DEFAULT_THRESHOLD): - self.contender = contender - self.baseline = baseline - self.threshold = threshold - - @property - def comparisons(self): - contender = self.contender.suites - baseline = self.baseline.suites - suites = pairwise_compare(contender, baseline) - - for suite_name, (suite_cont, suite_base) in suites: - benchmarks = pairwise_compare( - suite_cont.benchmarks, suite_base.benchmarks) - - for _, (bench_cont, bench_base) in benchmarks: - yield BenchmarkComparator(bench_cont, bench_base, - threshold=self.threshold, - suite_name=suite_name) diff --git a/dev/archery/archery/benchmark/core.py b/dev/archery/archery/benchmark/core.py deleted file mode 100644 index 5a92271a35391..0000000000000 --- a/dev/archery/archery/benchmark/core.py +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -def median(values): - n = len(values) - if n == 0: - raise ValueError("median requires at least one value") - elif n % 2 == 0: - return (values[(n // 2) - 1] + values[n // 2]) / 2 - else: - return values[n // 2] - - -class Benchmark: - def __init__(self, name, unit, less_is_better, values, time_unit, - times, counters=None): - self.name = name - self.unit = unit - self.less_is_better = less_is_better - self.values = sorted(values) - self.time_unit = time_unit - self.times = sorted(times) - self.median = median(self.values) - self.counters = counters or {} - - @property - def value(self): - return self.median - - def __repr__(self): - return "Benchmark[name={},value={}]".format(self.name, self.value) - - -class BenchmarkSuite: - def __init__(self, name, benchmarks): - self.name = name - self.benchmarks = benchmarks - - def __repr__(self): - return "BenchmarkSuite[name={}, benchmarks={}]".format( - self.name, self.benchmarks - ) diff --git a/dev/archery/archery/benchmark/google.py b/dev/archery/archery/benchmark/google.py deleted file mode 100644 index ebcc5263645f2..0000000000000 --- a/dev/archery/archery/benchmark/google.py +++ /dev/null @@ -1,174 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -from itertools import filterfalse, groupby, tee -import json -import subprocess -from tempfile import NamedTemporaryFile - -from .core import Benchmark -from ..utils.command import Command - - -def partition(pred, iterable): - # adapted from python's examples - t1, t2 = tee(iterable) - return list(filter(pred, t1)), list(filterfalse(pred, t2)) - - -class GoogleBenchmarkCommand(Command): - """ Run a google benchmark binary. - - This assumes the binary supports the standard command line options, - notably `--benchmark_filter`, `--benchmark_format`, etc... - """ - - def __init__(self, benchmark_bin, benchmark_filter=None): - self.bin = benchmark_bin - self.benchmark_filter = benchmark_filter - - def list_benchmarks(self): - argv = ["--benchmark_list_tests"] - if self.benchmark_filter: - argv.append("--benchmark_filter={}".format(self.benchmark_filter)) - result = self.run(*argv, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - return str.splitlines(result.stdout.decode("utf-8")) - - def results(self, repetitions=1): - with NamedTemporaryFile() as out: - argv = ["--benchmark_repetitions={}".format(repetitions), - "--benchmark_out={}".format(out.name), - "--benchmark_out_format=json"] - - if self.benchmark_filter: - argv.append( - "--benchmark_filter={}".format(self.benchmark_filter) - ) - - self.run(*argv, check=True) - return json.load(out) - - -class GoogleBenchmarkObservation: - """ Represents one run of a single (google c++) benchmark. - - Aggregates are reported by Google Benchmark executables alongside - other observations whenever repetitions are specified (with - `--benchmark_repetitions` on the bare benchmark, or with the - archery option `--repetitions`). Aggregate observations are not - included in `GoogleBenchmark.runs`. - - RegressionSumKernel/32768/0 1 us 1 us 25.8077GB/s - RegressionSumKernel/32768/0 1 us 1 us 25.7066GB/s - RegressionSumKernel/32768/0 1 us 1 us 25.1481GB/s - RegressionSumKernel/32768/0 1 us 1 us 25.846GB/s - RegressionSumKernel/32768/0 1 us 1 us 25.6453GB/s - RegressionSumKernel/32768/0_mean 1 us 1 us 25.6307GB/s - RegressionSumKernel/32768/0_median 1 us 1 us 25.7066GB/s - RegressionSumKernel/32768/0_stddev 0 us 0 us 288.046MB/s - """ - - def __init__(self, name, real_time, cpu_time, time_unit, run_type, - size=None, bytes_per_second=None, items_per_second=None, - **counters): - self._name = name - self.real_time = real_time - self.cpu_time = cpu_time - self.time_unit = time_unit - self.run_type = run_type - self.size = size - self.bytes_per_second = bytes_per_second - self.items_per_second = items_per_second - self.counters = counters - - @property - def is_aggregate(self): - """ Indicate if the observation is a run or an aggregate. """ - return self.run_type == "aggregate" - - @property - def is_realtime(self): - """ Indicate if the preferred value is realtime instead of cputime. 
""" - return self.name.find("/real_time") != -1 - - @property - def name(self): - name = self._name - return name.rsplit("_", maxsplit=1)[0] if self.is_aggregate else name - - @property - def time(self): - return self.real_time if self.is_realtime else self.cpu_time - - @property - def value(self): - """ Return the benchmark value.""" - return self.bytes_per_second or self.items_per_second or self.time - - @property - def unit(self): - if self.bytes_per_second: - return "bytes_per_second" - elif self.items_per_second: - return "items_per_second" - else: - return self.time_unit - - def __repr__(self): - return str(self.value) - - -class GoogleBenchmark(Benchmark): - """ A set of GoogleBenchmarkObservations. """ - - def __init__(self, name, runs): - """ Initialize a GoogleBenchmark. - - Parameters - ---------- - name: str - Name of the benchmark - runs: list(GoogleBenchmarkObservation) - Repetitions of GoogleBenchmarkObservation run. - - """ - self.name = name - # exclude google benchmark aggregate artifacts - _, runs = partition(lambda b: b.is_aggregate, runs) - self.runs = sorted(runs, key=lambda b: b.value) - unit = self.runs[0].unit - time_unit = self.runs[0].time_unit - less_is_better = not unit.endswith("per_second") - values = [b.value for b in self.runs] - times = [b.real_time for b in self.runs] - # Slight kludge to extract the UserCounters for each benchmark - counters = self.runs[0].counters - super().__init__(name, unit, less_is_better, values, time_unit, times, - counters) - - def __repr__(self): - return "GoogleBenchmark[name={},runs={}]".format(self.names, self.runs) - - @classmethod - def from_json(cls, payload): - def group_key(x): - return x.name - - benchmarks = map(lambda x: GoogleBenchmarkObservation(**x), payload) - groups = groupby(sorted(benchmarks, key=group_key), group_key) - return [cls(k, list(bs)) for k, bs in groups] diff --git a/dev/archery/archery/benchmark/runner.py b/dev/archery/archery/benchmark/runner.py deleted file mode 100644 index 5718bcaf108c7..0000000000000 --- a/dev/archery/archery/benchmark/runner.py +++ /dev/null @@ -1,212 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import glob -import json -import os -import re - -from .core import BenchmarkSuite -from .google import GoogleBenchmarkCommand, GoogleBenchmark -from ..lang.cpp import CppCMakeDefinition, CppConfiguration -from ..utils.cmake import CMakeBuild -from ..utils.logger import logger - - -def regex_filter(re_expr): - if re_expr is None: - return lambda s: True - re_comp = re.compile(re_expr) - return lambda s: re_comp.search(s) - - -DEFAULT_REPETITIONS = 1 - - -class BenchmarkRunner: - def __init__(self, suite_filter=None, benchmark_filter=None, - repetitions=DEFAULT_REPETITIONS): - self.suite_filter = suite_filter - self.benchmark_filter = benchmark_filter - self.repetitions = repetitions - - @property - def suites(self): - raise NotImplementedError("BenchmarkRunner must implement suites") - - @staticmethod - def from_rev_or_path(src, root, rev_or_path, cmake_conf, **kwargs): - """ Returns a BenchmarkRunner from a path or a git revision. - - First, it checks if `rev_or_path` is a valid path (or string) of a json - object that can deserialize to a BenchmarkRunner. If so, it initialize - a StaticBenchmarkRunner from it. This allows memoizing the result of a - run in a file or a string. - - Second, it checks if `rev_or_path` points to a valid CMake build - directory. If so, it creates a CppBenchmarkRunner with this existing - CMakeBuild. - - Otherwise, it assumes `rev_or_path` is a revision and clone/checkout - the given revision and create a fresh CMakeBuild. - """ - build = None - if StaticBenchmarkRunner.is_json_result(rev_or_path): - return StaticBenchmarkRunner.from_json(rev_or_path, **kwargs) - elif CMakeBuild.is_build_dir(rev_or_path): - build = CMakeBuild.from_path(rev_or_path) - return CppBenchmarkRunner(build, **kwargs) - else: - # Revisions can references remote via the `/` character, ensure - # that the revision is path friendly - path_rev = rev_or_path.replace("/", "_") - root_rev = os.path.join(root, path_rev) - os.mkdir(root_rev) - - clone_dir = os.path.join(root_rev, "arrow") - # Possibly checkout the sources at given revision, no need to - # perform cleanup on cloned repository as root_rev is reclaimed. - src_rev, _ = src.at_revision(rev_or_path, clone_dir) - cmake_def = CppCMakeDefinition(src_rev.cpp, cmake_conf) - build_dir = os.path.join(root_rev, "build") - return CppBenchmarkRunner(cmake_def.build(build_dir), **kwargs) - - -class StaticBenchmarkRunner(BenchmarkRunner): - """ Run suites from a (static) set of suites. 
""" - - def __init__(self, suites, **kwargs): - self._suites = suites - super().__init__(**kwargs) - - @property - def list_benchmarks(self): - for suite in self._suites: - for benchmark in suite.benchmarks: - yield "{}.{}".format(suite.name, benchmark.name) - - @property - def suites(self): - suite_fn = regex_filter(self.suite_filter) - benchmark_fn = regex_filter(self.benchmark_filter) - - for suite in (s for s in self._suites if suite_fn(s.name)): - benchmarks = [b for b in suite.benchmarks if benchmark_fn(b.name)] - yield BenchmarkSuite(suite.name, benchmarks) - - @classmethod - def is_json_result(cls, path_or_str): - builder = None - try: - builder = cls.from_json(path_or_str) - except BaseException: - pass - - return builder is not None - - @staticmethod - def from_json(path_or_str, **kwargs): - # .codec imported here to break recursive imports - from .codec import BenchmarkRunnerCodec - if os.path.isfile(path_or_str): - with open(path_or_str) as f: - loaded = json.load(f) - else: - loaded = json.loads(path_or_str) - return BenchmarkRunnerCodec.decode(loaded, **kwargs) - - def __repr__(self): - return "BenchmarkRunner[suites={}]".format(list(self.suites)) - - -class CppBenchmarkRunner(BenchmarkRunner): - """ Run suites from a CMakeBuild. """ - - def __init__(self, build, **kwargs): - """ Initialize a CppBenchmarkRunner. """ - self.build = build - super().__init__(**kwargs) - - @staticmethod - def default_configuration(**kwargs): - """ Returns the default benchmark configuration. """ - return CppConfiguration( - build_type="release", with_tests=False, with_benchmarks=True, - with_compute=True, - with_csv=True, - with_dataset=True, - with_json=True, - with_parquet=True, - with_python=False, - with_brotli=True, - with_bz2=True, - with_lz4=True, - with_snappy=True, - with_zlib=True, - with_zstd=True, - **kwargs) - - @property - def suites_binaries(self): - """ Returns a list of benchmark binaries for this build. """ - # Ensure build is up-to-date to run benchmarks - self.build() - # Not the best method, but works for now - glob_expr = os.path.join(self.build.binaries_dir, "*-benchmark") - return {os.path.basename(b): b for b in glob.glob(glob_expr)} - - def suite(self, name, suite_bin): - """ Returns the resulting benchmarks for a given suite. """ - suite_cmd = GoogleBenchmarkCommand(suite_bin, self.benchmark_filter) - - # Ensure there will be data - benchmark_names = suite_cmd.list_benchmarks() - if not benchmark_names: - return None - - results = suite_cmd.results(repetitions=self.repetitions) - benchmarks = GoogleBenchmark.from_json(results.get("benchmarks")) - return BenchmarkSuite(name, benchmarks) - - @property - def list_benchmarks(self): - for suite_name, suite_bin in self.suites_binaries.items(): - suite_cmd = GoogleBenchmarkCommand(suite_bin) - for benchmark_name in suite_cmd.list_benchmarks(): - yield "{}.{}".format(suite_name, benchmark_name) - - @property - def suites(self): - """ Returns all suite for a runner. 
""" - suite_matcher = regex_filter(self.suite_filter) - - suite_and_binaries = self.suites_binaries - for suite_name in suite_and_binaries: - if not suite_matcher(suite_name): - logger.debug("Ignoring suite {}".format(suite_name)) - continue - - suite_bin = suite_and_binaries[suite_name] - suite = self.suite(suite_name, suite_bin) - - # Filter may exclude all benchmarks - if not suite: - logger.debug("Suite {} executed but no results" - .format(suite_name)) - continue - - yield suite diff --git a/dev/archery/archery/bot.py b/dev/archery/archery/bot.py deleted file mode 100644 index c69cf9112da86..0000000000000 --- a/dev/archery/archery/bot.py +++ /dev/null @@ -1,261 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import shlex -from pathlib import Path -from functools import partial -import tempfile - -import click -import github - -from .utils.git import git -from .utils.logger import logger -from .crossbow import Repo, Queue, Config, Target, Job, CommentReport - - -class EventError(Exception): - pass - - -class CommandError(Exception): - - def __init__(self, message): - self.message = message - - -class _CommandMixin: - - def get_help_option(self, ctx): - def show_help(ctx, param, value): - if value and not ctx.resilient_parsing: - raise click.UsageError(ctx.get_help()) - option = super().get_help_option(ctx) - option.callback = show_help - return option - - def __call__(self, message, **kwargs): - args = shlex.split(message) - try: - with self.make_context(self.name, args=args, obj=kwargs) as ctx: - return self.invoke(ctx) - except click.ClickException as e: - raise CommandError(e.format_message()) - - -class Command(_CommandMixin, click.Command): - pass - - -class Group(_CommandMixin, click.Group): - - def command(self, *args, **kwargs): - kwargs.setdefault('cls', Command) - return super().command(*args, **kwargs) - - def group(self, *args, **kwargs): - kwargs.setdefault('cls', Group) - return super().group(*args, **kwargs) - - def parse_args(self, ctx, args): - if not args and self.no_args_is_help and not ctx.resilient_parsing: - raise click.UsageError(ctx.get_help()) - return super().parse_args(ctx, args) - - -command = partial(click.command, cls=Command) -group = partial(click.group, cls=Group) - - -class CommentBot: - - def __init__(self, name, handler, token=None): - # TODO(kszucs): validate - assert isinstance(name, str) - assert callable(handler) - self.name = name - self.handler = handler - self.github = github.Github(token) - - def parse_command(self, payload): - # only allow users of apache org to submit commands, for more see - # https://developer.github.com/v4/enum/commentauthorassociation/ - allowed_roles = {'OWNER', 'MEMBER', 'CONTRIBUTOR'} - mention = '@{}'.format(self.name) - comment = payload['comment'] - - if 
payload['sender']['login'] == self.name: - raise EventError("Don't respond to itself") - elif payload['action'] not in {'created', 'edited'}: - raise EventError("Don't respond to comment deletion") - elif comment['author_association'] not in allowed_roles: - raise EventError( - "Don't respond to comments from non-authorized users" - ) - elif not comment['body'].lstrip().startswith(mention): - raise EventError("The bot is not mentioned") - - return payload['comment']['body'].split(mention)[-1].strip() - - def handle(self, event, payload): - try: - command = self.parse_command(payload) - except EventError as e: - logger.error(e) - # see the possible reasons in the validate method - return - - if event == 'issue_comment': - return self.handle_issue_comment(command, payload) - elif event == 'pull_request_review_comment': - return self.handle_review_comment(command, payload) - else: - raise ValueError("Unexpected event type {}".format(event)) - - def handle_issue_comment(self, command, payload): - repo = self.github.get_repo(payload['repository']['id'], lazy=True) - issue = repo.get_issue(payload['issue']['number']) - - try: - pull = issue.as_pull_request() - except github.GithubException: - return issue.create_comment( - "The comment bot only listens to pull request comments!" - ) - - comment = pull.get_issue_comment(payload['comment']['id']) - try: - self.handler(command, issue=issue, pull_request=pull, - comment=comment) - except CommandError as e: - logger.error(e) - pull.create_issue_comment("```\n{}\n```".format(e.message)) - except Exception as e: - logger.exception(e) - comment.create_reaction('-1') - else: - comment.create_reaction('+1') - - def handle_review_comment(self, payload): - raise NotImplementedError() - - -@group(name='@github-actions') -@click.pass_context -def actions(ctx): - """Ursabot""" - ctx.ensure_object(dict) - - -@actions.group() -@click.option('--crossbow', '-c', default='ursacomputing/crossbow', - help='Crossbow repository on github to use') -@click.pass_obj -def crossbow(obj, crossbow): - """ - Trigger crossbow builds for this pull request - """ - obj['crossbow_repo'] = crossbow - - -def _clone_arrow_and_crossbow(dest, crossbow_repo, pull_request): - """ - Clone the repositories and initialize crossbow objects. - - Parameters - ---------- - dest : Path - Filesystem path to clone the repositories to. - crossbow_repo : str - Github repository name, like kszucs/crossbow. - pull_request : pygithub.PullRequest - Object containing information about the pull request the comment bot - was triggered from. 
- """ - arrow_path = dest / 'arrow' - queue_path = dest / 'crossbow' - - # clone arrow and checkout the pull request's branch - pull_request_ref = 'pull/{}/head:{}'.format( - pull_request.number, pull_request.head.ref - ) - git.clone(pull_request.base.repo.clone_url, str(arrow_path)) - git.fetch('origin', pull_request_ref, git_dir=arrow_path) - git.checkout(pull_request.head.ref, git_dir=arrow_path) - - # clone crossbow repository - crossbow_url = 'https://github.com/{}'.format(crossbow_repo) - git.clone(crossbow_url, str(queue_path)) - - # initialize crossbow objects - github_token = os.environ['CROSSBOW_GITHUB_TOKEN'] - arrow = Repo(arrow_path) - queue = Queue(queue_path, github_token=github_token, require_https=True) - - return (arrow, queue) - - -@crossbow.command() -@click.argument('tasks', nargs=-1, required=False) -@click.option('--group', '-g', 'groups', multiple=True, - help='Submit task groups as defined in tests.yml') -@click.option('--param', '-p', 'params', multiple=True, - help='Additional task parameters for rendering the CI templates') -@click.option('--arrow-version', '-v', default=None, - help='Set target version explicitly.') -@click.pass_obj -def submit(obj, tasks, groups, params, arrow_version): - """ - Submit crossbow testing tasks. - - See groups defined in arrow/dev/tasks/tests.yml - """ - crossbow_repo = obj['crossbow_repo'] - pull_request = obj['pull_request'] - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - arrow, queue = _clone_arrow_and_crossbow( - dest=Path(tmpdir), - crossbow_repo=crossbow_repo, - pull_request=pull_request, - ) - # load available tasks configuration and groups from yaml - config = Config.load_yaml(arrow.path / "dev" / "tasks" / "tasks.yml") - config.validate() - - # initialize the crossbow build's target repository - target = Target.from_repo(arrow, version=arrow_version, - remote=pull_request.head.repo.clone_url, - branch=pull_request.head.ref) - - # parse additional job parameters - params = dict([p.split("=") for p in params]) - - # instantiate the job object - job = Job.from_config(config=config, target=target, tasks=tasks, - groups=groups, params=params) - - # add the job to the crossbow queue and push to the remote repository - queue.put(job, prefix="actions") - queue.push() - - # render the response comment's content - report = CommentReport(job, crossbow_repo=crossbow_repo) - - # send the response - pull_request.create_issue_comment(report.show()) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py deleted file mode 100644 index 4bbde75b74cf4..0000000000000 --- a/dev/archery/archery/cli.py +++ /dev/null @@ -1,1092 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from collections import namedtuple -from io import StringIO -import click -import errno -import json -import logging -import os -import pathlib -import sys - -from .benchmark.codec import JsonEncoder -from .benchmark.compare import RunnerComparator, DEFAULT_THRESHOLD -from .benchmark.runner import BenchmarkRunner, CppBenchmarkRunner -from .lang.cpp import CppCMakeDefinition, CppConfiguration -from .utils.lint import linter, python_numpydoc, LintValidationException -from .utils.logger import logger, ctx as log_ctx -from .utils.source import ArrowSources, InvalidArrowSource -from .utils.tmpdir import tmpdir - -# Set default logging to INFO in command line. -logging.basicConfig(level=logging.INFO) - - -class ArrowBool(click.types.BoolParamType): - """ - ArrowBool supports the 'ON' and 'OFF' values on top of the values - supported by BoolParamType. This is convenient to port script which exports - CMake options variables. - """ - name = "boolean" - - def convert(self, value, param, ctx): - if isinstance(value, str): - lowered = value.lower() - if lowered == "on": - return True - elif lowered == "off": - return False - - return super().convert(value, param, ctx) - - -BOOL = ArrowBool() - - -@click.group() -@click.option("--debug", type=BOOL, is_flag=True, default=False, - help="Increase logging with debugging output.") -@click.option("--pdb", type=BOOL, is_flag=True, default=False, - help="Invoke pdb on uncaught exception.") -@click.option("-q", "--quiet", type=BOOL, is_flag=True, default=False, - help="Silence executed commands.") -@click.pass_context -def archery(ctx, debug, pdb, quiet): - """ Apache Arrow developer utilities. - - See sub-commands help with `archery --help`. - - """ - # Ensure ctx.obj exists - ctx.ensure_object(dict) - - log_ctx.quiet = quiet - if debug: - logger.setLevel(logging.DEBUG) - - ctx.debug = debug - - if pdb: - import pdb - sys.excepthook = lambda t, v, e: pdb.pm() - - -def validate_arrow_sources(ctx, param, src): - """ Ensure a directory contains Arrow cpp sources. 
""" - try: - return ArrowSources.find(src) - except InvalidArrowSource as e: - raise click.BadParameter(str(e)) - - -build_dir_type = click.Path(dir_okay=True, file_okay=False, resolve_path=True) -# Supported build types -build_type = click.Choice(["debug", "relwithdebinfo", "release"], - case_sensitive=False) -# Supported warn levels -warn_level_type = click.Choice(["everything", "checkin", "production"], - case_sensitive=False) - -simd_level = click.Choice(["NONE", "SSE4_2", "AVX2", "AVX512"], - case_sensitive=True) - - -def cpp_toolchain_options(cmd): - options = [ - click.option("--cc", metavar="", help="C compiler."), - click.option("--cxx", metavar="", help="C++ compiler."), - click.option("--cxx-flags", help="C++ compiler flags."), - click.option("--cpp-package-prefix", - help=("Value to pass for ARROW_PACKAGE_PREFIX and " - "use ARROW_DEPENDENCY_SOURCE=SYSTEM")) - ] - return _apply_options(cmd, options) - - -def _apply_options(cmd, options): - for option in options: - cmd = option(cmd) - return cmd - - -@archery.command(short_help="Initialize an Arrow C++ build") -@click.option("--src", metavar="", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory") -# toolchain -@cpp_toolchain_options -@click.option("--build-type", default=None, type=build_type, - help="CMake's CMAKE_BUILD_TYPE") -@click.option("--warn-level", default="production", type=warn_level_type, - help="Controls compiler warnings -W(no-)error.") -@click.option("--use-gold-linker", default=True, type=BOOL, - help="Toggles ARROW_USE_LD_GOLD option.") -@click.option("--simd-level", default="SSE4_2", type=simd_level, - help="Toggles ARROW_SIMD_LEVEL option.") -# Tests and benchmarks -@click.option("--with-tests", default=True, type=BOOL, - help="Build with tests.") -@click.option("--with-benchmarks", default=None, type=BOOL, - help="Build with benchmarks.") -@click.option("--with-examples", default=None, type=BOOL, - help="Build with examples.") -@click.option("--with-integration", default=None, type=BOOL, - help="Build with integration test executables.") -# Static checks -@click.option("--use-asan", default=None, type=BOOL, - help="Toggle ARROW_USE_ASAN sanitizer.") -@click.option("--use-tsan", default=None, type=BOOL, - help="Toggle ARROW_USE_TSAN sanitizer.") -@click.option("--use-ubsan", default=None, type=BOOL, - help="Toggle ARROW_USE_UBSAN sanitizer.") -@click.option("--with-fuzzing", default=None, type=BOOL, - help="Toggle ARROW_FUZZING.") -# Components -@click.option("--with-compute", default=None, type=BOOL, - help="Build the Arrow compute module.") -@click.option("--with-csv", default=None, type=BOOL, - help="Build the Arrow CSV parser module.") -@click.option("--with-cuda", default=None, type=BOOL, - help="Build the Arrow CUDA extensions.") -@click.option("--with-dataset", default=None, type=BOOL, - help="Build the Arrow dataset module.") -@click.option("--with-filesystem", default=None, type=BOOL, - help="Build the Arrow filesystem layer.") -@click.option("--with-flight", default=None, type=BOOL, - help="Build with Flight rpc support.") -@click.option("--with-gandiva", default=None, type=BOOL, - help="Build with Gandiva expression compiler support.") -@click.option("--with-hdfs", default=None, type=BOOL, - help="Build the Arrow HDFS bridge.") -@click.option("--with-hiveserver2", default=None, type=BOOL, - help="Build the HiveServer2 client and arrow adapater.") -@click.option("--with-ipc", default=None, type=BOOL, - help="Build the Arrow IPC extensions.") 
-@click.option("--with-json", default=None, type=BOOL, - help="Build the Arrow JSON parser module.") -@click.option("--with-jni", default=None, type=BOOL, - help="Build the Arrow JNI lib.") -@click.option("--with-mimalloc", default=None, type=BOOL, - help="Build the Arrow mimalloc based allocator.") -@click.option("--with-parquet", default=None, type=BOOL, - help="Build with Parquet file support.") -@click.option("--with-plasma", default=None, type=BOOL, - help="Build with Plasma object store support.") -@click.option("--with-python", default=None, type=BOOL, - help="Build the Arrow CPython extesions.") -@click.option("--with-r", default=None, type=BOOL, - help="Build the Arrow R extensions. This is not a CMake option, " - "it will toggle required options") -@click.option("--with-s3", default=None, type=BOOL, - help="Build Arrow with S3 support.") -# Compressions -@click.option("--with-brotli", default=None, type=BOOL, - help="Build Arrow with brotli compression.") -@click.option("--with-bz2", default=None, type=BOOL, - help="Build Arrow with bz2 compression.") -@click.option("--with-lz4", default=None, type=BOOL, - help="Build Arrow with lz4 compression.") -@click.option("--with-snappy", default=None, type=BOOL, - help="Build Arrow with snappy compression.") -@click.option("--with-zlib", default=None, type=BOOL, - help="Build Arrow with zlib compression.") -@click.option("--with-zstd", default=None, type=BOOL, - help="Build Arrow with zstd compression.") -# CMake extra feature -@click.option("--cmake-extras", type=str, multiple=True, - help="Extra flags/options to pass to cmake invocation. " - "Can be stacked") -@click.option("--install-prefix", type=str, - help="Destination directory where files are installed. Expand to" - "CMAKE_INSTALL_PREFIX. Defaults to to $CONDA_PREFIX if the" - "variable exists.") -# misc -@click.option("-f", "--force", type=BOOL, is_flag=True, default=False, - help="Delete existing build directory if found.") -@click.option("--targets", type=str, multiple=True, - help="Generator targets to run. Can be stacked.") -@click.argument("build_dir", type=build_dir_type) -@click.pass_context -def build(ctx, src, build_dir, force, targets, **kwargs): - """ Initialize a C++ build directory. - - The build command creates a directory initialized with Arrow's cpp source - cmake and configuration. It can also optionally invoke the generator to - test the build (and used in scripts). - - Note that archery will carry the caller environment. It will also not touch - an existing directory, one must use the `--force` option to remove the - existing directory. - - Examples: - - \b - # Initialize build with clang8 and avx2 support in directory `clang8-build` - \b - archery build --cc=clang-8 --cxx=clang++-8 --cxx-flags=-mavx2 clang8-build - - \b - # Builds and run test - archery build --targets=all --targets=test build - """ - # Arrow's cpp cmake configuration - conf = CppConfiguration(**kwargs) - # This is a closure around cmake invocation, e.g. 
calling `def.build()` - # yields a directory ready to be run with the generator - cmake_def = CppCMakeDefinition(src.cpp, conf) - # Create build directory - build = cmake_def.build(build_dir, force=force) - - for target in targets: - build.run(target) - - -LintCheck = namedtuple('LintCheck', ('option_name', 'help')) - -lint_checks = [ - LintCheck('clang-format', "Format C++ files with clang-format."), - LintCheck('clang-tidy', "Lint C++ files with clang-tidy."), - LintCheck('cpplint', "Lint C++ files with cpplint."), - LintCheck('iwyu', "Lint changed C++ files with Include-What-You-Use."), - LintCheck('python', - "Format and lint Python files with autopep8 and flake8."), - LintCheck('numpydoc', "Lint Python files with numpydoc."), - LintCheck('cmake-format', "Format CMake files with cmake-format.py."), - LintCheck('rat', - "Check all sources files for license texts via Apache RAT."), - LintCheck('r', "Lint R files."), - LintCheck('rust', "Lint Rust files."), - LintCheck('docker', "Lint Dockerfiles with hadolint."), -] - - -def decorate_lint_command(cmd): - """ - Decorate the lint() command function to add individual per-check options. - """ - for check in lint_checks: - option = click.option("--{0}/--no-{0}".format(check.option_name), - default=None, help=check.help) - cmd = option(cmd) - return cmd - - -@archery.command(short_help="Check Arrow source tree for errors") -@click.option("--src", metavar="", default=".", - help="Specify Arrow source directory") -@click.option("--fix", is_flag=True, type=BOOL, default=False, - help="Toggle fixing the lint errors if the linter supports it.") -@click.option("--iwyu_all", is_flag=True, type=BOOL, default=False, - help="Run IWYU on all C++ files if enabled") -@click.option("-a", "--all", is_flag=True, default=False, - help="Enable all checks.") -@decorate_lint_command -@click.pass_context -def lint(ctx, src, fix, iwyu_all, **checks): - src = ArrowSources(src) - - if checks.pop('all'): - # "--all" is given => enable all non-selected checks - for k, v in checks.items(): - if v is None: - checks[k] = True - if not any(checks.values()): - raise click.UsageError( - "Need to enable at least one lint check (try --help)") - try: - linter(src, fix, iwyu_all=iwyu_all, **checks) - except LintValidationException: - sys.exit(1) - - -@archery.command(short_help="Lint python docstring with NumpyDoc") -@click.argument('symbols', nargs=-1) -@click.option("--src", metavar="", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory") -@click.option("--allow-rule", "-a", multiple=True, - help="Allow only these rules") -@click.option("--disallow-rule", "-d", multiple=True, - help="Disallow these rules") -def numpydoc(src, symbols, allow_rule, disallow_rule): - """ - Pass list of modules or symbols as arguments to restrict the validation. - - By default all modules of pyarrow are tried to be validated. - - Examples - -------- - archery numpydoc pyarrow.dataset - archery numpydoc pyarrow.csv pyarrow.json pyarrow.parquet - archery numpydoc pyarrow.array - """ - disallow_rule = disallow_rule or {'GL01', 'SA01', 'EX01', 'ES01'} - try: - results = python_numpydoc(symbols, allow_rules=allow_rule, - disallow_rule=disallow_rule) - for result in results: - result.ok() - except LintValidationException: - sys.exit(1) - - -@archery.group() -@click.pass_context -def benchmark(ctx): - """ Arrow benchmarking. - - Use the diff sub-command to benchmark revisions, and/or build directories. 
- """ - pass - - -def benchmark_common_options(cmd): - options = [ - click.option("--src", metavar="", show_default=True, - default=None, callback=validate_arrow_sources, - help="Specify Arrow source directory"), - click.option("--preserve", type=BOOL, default=False, show_default=True, - is_flag=True, - help="Preserve workspace for investigation."), - click.option("--output", metavar="", - type=click.File("w", encoding="utf8"), default="-", - help="Capture output result into file."), - click.option("--cmake-extras", type=str, multiple=True, - help="Extra flags/options to pass to cmake invocation. " - "Can be stacked"), - ] - - cmd = cpp_toolchain_options(cmd) - return _apply_options(cmd, options) - - -def benchmark_filter_options(cmd): - options = [ - click.option("--suite-filter", metavar="", show_default=True, - type=str, default=None, - help="Regex filtering benchmark suites."), - click.option("--benchmark-filter", metavar="", - show_default=True, type=str, default=None, - help="Regex filtering benchmarks.") - ] - return _apply_options(cmd, options) - - -@benchmark.command(name="list", short_help="List benchmark suite") -@click.argument("rev_or_path", metavar="[]", - default="WORKSPACE", required=False) -@benchmark_common_options -@click.pass_context -def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, - **kwargs): - """ List benchmark suite. - """ - with tmpdir(preserve=preserve) as root: - logger.debug("Running benchmark {}".format(rev_or_path)) - - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) - - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf) - - for b in runner_base.list_benchmarks: - click.echo(b, file=output) - - -@benchmark.command(name="run", short_help="Run benchmark suite") -@click.argument("rev_or_path", metavar="[]", - default="WORKSPACE", required=False) -@benchmark_common_options -@benchmark_filter_options -@click.option("--repetitions", type=int, default=1, show_default=True, - help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision.")) -@click.pass_context -def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, - suite_filter, benchmark_filter, repetitions, **kwargs): - """ Run benchmark suite. - - This command will run the benchmark suite for a single build. This is - used to capture (and/or publish) the results. - - The caller can optionally specify a target which is either a git revision - (commit, tag, special values like HEAD) or a cmake build directory. - - When a commit is referenced, a local clone of the arrow sources (specified - via --src) is performed and the proper branch is created. This is done in - a temporary directory which can be left intact with the `--preserve` flag. - - The special token "WORKSPACE" is reserved to specify the current git - workspace. This imply that no clone will be performed. 
- - Examples: - - \b - # Run the benchmarks on current git workspace - \b - archery benchmark run - - \b - # Run the benchmarks on current previous commit - \b - archery benchmark run HEAD~1 - - \b - # Run the benchmarks on current previous commit - \b - archery benchmark run --output=run.json - """ - with tmpdir(preserve=preserve) as root: - logger.debug("Running benchmark {}".format(rev_or_path)) - - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) - - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, rev_or_path, conf, - repetitions=repetitions, - suite_filter=suite_filter, benchmark_filter=benchmark_filter) - - json.dump(runner_base, output, cls=JsonEncoder) - - -@benchmark.command(name="diff", short_help="Compare benchmark suites") -@benchmark_common_options -@benchmark_filter_options -@click.option("--threshold", type=float, default=DEFAULT_THRESHOLD, - show_default=True, - help="Regression failure threshold in percentage.") -@click.option("--repetitions", type=int, default=1, show_default=True, - help=("Number of repetitions of each benchmark. Increasing " - "may improve result precision.")) -@click.option("--no-counters", type=BOOL, default=False, is_flag=True, - help="Hide counters field in diff report.") -@click.argument("contender", metavar="[", - default=ArrowSources.WORKSPACE, required=False) -@click.argument("baseline", metavar="[]]", default="origin/master", - required=False) -@click.pass_context -def benchmark_diff(ctx, src, preserve, output, cmake_extras, - suite_filter, benchmark_filter, repetitions, no_counters, - threshold, contender, baseline, **kwargs): - """Compare (diff) benchmark runs. - - This command acts like git-diff but for benchmark results. - - The caller can optionally specify both the contender and the baseline. If - unspecified, the contender will default to the current workspace (like git) - and the baseline will default to master. - - Each target (contender or baseline) can either be a git revision - (commit, tag, special values like HEAD) or a cmake build directory. This - allow comparing git commits, and/or different compilers and/or compiler - flags. - - When a commit is referenced, a local clone of the arrow sources (specified - via --src) is performed and the proper branch is created. This is done in - a temporary directory which can be left intact with the `--preserve` flag. - - The special token "WORKSPACE" is reserved to specify the current git - workspace. This imply that no clone will be performed. - - Examples: - - \b - # Compare workspace (contender) with master (baseline) - \b - archery benchmark diff - - \b - # Compare master (contender) with latest version (baseline) - \b - export LAST=$(git tag -l "apache-arrow-[0-9]*" | sort -rV | head -1) - \b - archery benchmark diff master "$LAST" - - \b - # Compare g++7 (contender) with clang++-8 (baseline) builds - \b - archery build --with-benchmarks=true \\ - --cxx-flags=-ftree-vectorize \\ - --cc=gcc-7 --cxx=g++-7 gcc7-build - \b - archery build --with-benchmarks=true \\ - --cxx-flags=-flax-vector-conversions \\ - --cc=clang-8 --cxx=clang++-8 clang8-build - \b - archery benchmark diff gcc7-build clang8-build - - \b - # Compare default targets but scoped to the suites matching - # `^arrow-compute-aggregate` and benchmarks matching `(Sum|Mean)Kernel`. 
- \b - archery benchmark diff --suite-filter="^arrow-compute-aggregate" \\ - --benchmark-filter="(Sum|Mean)Kernel" - - \b - # Capture result in file `result.json` - \b - archery benchmark diff --output=result.json - \b - # Equivalently with no stdout clutter. - archery --quiet benchmark diff > result.json - - \b - # Comparing with a cached results from `archery benchmark run` - \b - archery benchmark run --output=run.json HEAD~1 - \b - # This should not recompute the benchmark from run.json - archery --quiet benchmark diff WORKSPACE run.json > result.json - """ - with tmpdir(preserve=preserve) as root: - logger.debug("Comparing {} (contender) with {} (baseline)" - .format(contender, baseline)) - - conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) - - runner_cont = BenchmarkRunner.from_rev_or_path( - src, root, contender, conf, - repetitions=repetitions, - suite_filter=suite_filter, - benchmark_filter=benchmark_filter) - runner_base = BenchmarkRunner.from_rev_or_path( - src, root, baseline, conf, - repetitions=repetitions, - suite_filter=suite_filter, - benchmark_filter=benchmark_filter) - - runner_comp = RunnerComparator(runner_cont, runner_base, threshold) - - # TODO(kszucs): test that the output is properly formatted jsonlines - comparisons_json = _get_comparisons_as_json(runner_comp.comparisons) - formatted = _format_comparisons_with_pandas(comparisons_json, - no_counters) - output.write(formatted) - output.write('\n') - - -def _get_comparisons_as_json(comparisons): - buf = StringIO() - for comparator in comparisons: - json.dump(comparator, buf, cls=JsonEncoder) - buf.write("\n") - - return buf.getvalue() - - -def _format_comparisons_with_pandas(comparisons_json, no_counters): - import pandas as pd - df = pd.read_json(StringIO(comparisons_json), lines=True) - # parse change % so we can sort by it - df['change %'] = df.pop('change').str[:-1].map(float) - first_regression = len(df) - df['regression'].sum() - - fields = ['benchmark', 'baseline', 'contender', 'change %'] - if not no_counters: - fields += ['counters'] - - df = df[fields].sort_values(by='change %', ascending=False) - - def labelled(title, df): - if len(df) == 0: - return '' - title += ': ({})'.format(len(df)) - df_str = df.to_string(index=False) - bar = '-' * df_str.index('\n') - return '\n'.join([bar, title, bar, df_str]) - - return '\n\n'.join([labelled('Non-regressions', df[:first_regression]), - labelled('Regressions', df[first_regression:])]) - - -# ---------------------------------------------------------------------- -# Integration testing - -def _set_default(opt, default): - if opt is None: - return default - return opt - - -@archery.command(short_help="Execute protocol and Flight integration tests") -@click.option('--with-all', is_flag=True, default=False, - help=('Include all known languages by default ' - 'in integration tests')) -@click.option('--random-seed', type=int, default=12345, - help="Seed for PRNG when generating test data") -@click.option('--with-cpp', type=bool, default=False, - help='Include C++ in integration tests') -@click.option('--with-java', type=bool, default=False, - help='Include Java in integration tests') -@click.option('--with-js', type=bool, default=False, - help='Include JavaScript in integration tests') -@click.option('--with-go', type=bool, default=False, - help='Include Go in integration tests') -@click.option('--with-rust', type=bool, default=False, - help='Include Rust in integration tests') -@click.option('--write_generated_json', default=False, 
- help='Generate test JSON to indicated path') -@click.option('--run-flight', is_flag=True, default=False, - help='Run Flight integration tests') -@click.option('--debug', is_flag=True, default=False, - help='Run executables in debug mode as relevant') -@click.option('--serial', is_flag=True, default=False, - help='Run tests serially, rather than in parallel') -@click.option('--tempdir', default=None, - help=('Directory to use for writing ' - 'integration test temporary files')) -@click.option('stop_on_error', '-x', '--stop-on-error', - is_flag=True, default=False, - help='Stop on first error') -@click.option('--gold-dirs', multiple=True, - help="gold integration test file paths") -@click.option('-k', '--match', - help=("Substring for test names to include in run, " - "e.g. -k primitive")) -def integration(with_all=False, random_seed=12345, **args): - from .integration.runner import write_js_test_json, run_all_tests - import numpy as np - - # FIXME(bkietz) Include help strings for individual testers. - # For example, CPPTester's ARROW_CPP_EXE_PATH environment variable. - - # Make runs involving data generation deterministic - np.random.seed(random_seed) - - gen_path = args['write_generated_json'] - - languages = ['cpp', 'java', 'js', 'go', 'rust'] - - enabled_languages = 0 - for lang in languages: - param = 'with_{}'.format(lang) - if with_all: - args[param] = with_all - - if args[param]: - enabled_languages += 1 - - if gen_path: - try: - os.makedirs(gen_path) - except OSError as e: - if e.errno != errno.EEXIST: - raise - write_js_test_json(gen_path) - else: - if enabled_languages == 0: - raise Exception("Must enable at least 1 language to test") - run_all_tests(**args) - - -@archery.command() -@click.option('--event-name', '-n', required=True) -@click.option('--event-payload', '-p', type=click.File('r', encoding='utf8'), - default='-', required=True) -@click.option('--arrow-token', envvar='ARROW_GITHUB_TOKEN', - help='OAuth token for responding comment in the arrow repo') -@click.option('--crossbow-token', '-ct', envvar='CROSSBOW_GITHUB_TOKEN', - help='OAuth token for pushing to the crossow repository') -def trigger_bot(event_name, event_payload, arrow_token, crossbow_token): - from .bot import CommentBot, actions - - event_payload = json.loads(event_payload.read()) - - bot = CommentBot(name='github-actions', handler=actions, token=arrow_token) - bot.handle(event_name, event_payload) - - -def _mock_compose_calls(compose): - from types import MethodType - from subprocess import CompletedProcess - - def _mock(compose, executable): - def _execute(self, *args, **kwargs): - params = ['{}={}'.format(k, v) - for k, v in self.config.params.items()] - command = ' '.join(params + [executable] + list(args)) - click.echo(command) - return CompletedProcess([], 0) - return MethodType(_execute, compose) - - compose._execute_docker = _mock(compose, executable='docker') - compose._execute_compose = _mock(compose, executable='docker-compose') - - -@archery.group('docker') -@click.option("--src", metavar="", default=None, - callback=validate_arrow_sources, - help="Specify Arrow source directory.") -@click.option('--dry-run/--execute', default=False, - help="Display the docker-compose commands instead of executing " - "them.") -@click.pass_obj -def docker_compose(obj, src, dry_run): - """Interact with docker-compose based builds.""" - from .docker import DockerCompose - - config_path = src.path / 'docker-compose.yml' - if not config_path.exists(): - raise click.ClickException( - "Docker compose configuration 
cannot be found in directory {}, " - "try to pass the arrow source directory explicitly.".format(src) - ) - - # take the docker-compose parameters like PYTHON, PANDAS, UBUNTU from the - # environment variables to keep the usage similar to docker-compose - compose = DockerCompose(config_path, params=os.environ) - if dry_run: - _mock_compose_calls(compose) - obj['compose'] = compose - - -@docker_compose.command('build') -@click.argument('image') -@click.option('--force-pull/--no-pull', default=True, - help="Whether to force pull the image and its ancestor images") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. This option makes the build cache reusable " - "across hosts.") -@click.option('--use-cache/--no-cache', default=True, - help="Whether to use cache when building the image and its " - "ancestor images") -@click.option('--use-leaf-cache/--no-leaf-cache', default=True, - help="Whether to use cache when building only the (leaf) image " - "passed as the argument. To disable caching for both the " - "image and its ancestors use --no-cache option.") -@click.pass_obj -def docker_compose_build(obj, image, *, force_pull, using_docker_cli, - using_docker_buildx, use_cache, use_leaf_cache): - """ - Execute docker-compose builds. - """ - from .docker import UndefinedImage - - compose = obj['compose'] - - using_docker_cli |= using_docker_buildx - try: - if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) - compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx) - except UndefinedImage as e: - raise click.ClickException( - "There is no service/image defined in docker-compose.yml with " - "name: {}".format(str(e)) - ) - except RuntimeError as e: - raise click.ClickException(str(e)) - - -@docker_compose.command('run') -@click.argument('image') -@click.argument('command', required=False, default=None) -@click.option('--env', '-e', multiple=True, - help="Set environment variable within the container") -@click.option('--user', '-u', default=None, - help="Username or UID to run the container with") -@click.option('--force-pull/--no-pull', default=True, - help="Whether to force pull the image and its ancestor images") -@click.option('--force-build/--no-build', default=True, - help="Whether to force build the image and its ancestor images") -@click.option('--build-only', default=False, is_flag=True, - help="Pull and/or build the image, but do not run it") -@click.option('--using-docker-cli', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_CLI', - help="Use docker CLI directly for building instead of calling " - "docker-compose. This may help to reuse cached layers.") -@click.option('--using-docker-buildx', default=False, is_flag=True, - envvar='ARCHERY_USE_DOCKER_BUILDX', - help="Use buildx with docker CLI directly for building instead " - "of calling docker-compose or the plain docker build " - "command. 
This option makes the build cache reusable " - "across hosts.") -@click.option('--use-cache/--no-cache', default=True, - help="Whether to use cache when building the image and its " - "ancestor images") -@click.option('--use-leaf-cache/--no-leaf-cache', default=True, - help="Whether to use cache when building only the (leaf) image " - "passed as the argument. To disable caching for both the " - "image and its ancestors use --no-cache option.") -@click.option('--volume', '-v', multiple=True, - help="Set volume within the container") -@click.pass_obj -def docker_compose_run(obj, image, command, *, env, user, force_pull, - force_build, build_only, using_docker_cli, - using_docker_buildx, use_cache, - use_leaf_cache, volume): - """Execute docker-compose builds. - - To see the available builds run `archery docker images`. - - Examples: - - # execute a single build - archery docker run conda-python - - # execute the builds but disable the image pulling - archery docker run --no-cache conda-python - - # pass a docker-compose parameter, like the python version - PYTHON=3.8 archery docker run conda-python - - # disable the cache only for the leaf image - PANDAS=master archery docker run --no-leaf-cache conda-python-pandas - - # entirely skip building the image - archery docker run --no-pull --no-build conda-python - - # pass runtime parameters via docker environment variables - archery docker run -e CMAKE_BUILD_TYPE=release ubuntu-cpp - - # set a volume - archery docker run -v $PWD/build:/build ubuntu-cpp - - # starting an interactive bash session for debugging - archery docker run ubuntu-cpp bash - """ - from .docker import UndefinedImage - - compose = obj['compose'] - using_docker_cli |= using_docker_buildx - - env = dict(kv.split('=', 1) for kv in env) - try: - if force_pull: - compose.pull(image, pull_leaf=use_leaf_cache, - using_docker=using_docker_cli) - if force_build: - compose.build(image, use_cache=use_cache, - use_leaf_cache=use_leaf_cache, - using_docker=using_docker_cli, - using_buildx=using_docker_buildx) - if build_only: - return - compose.run( - image, - command=command, - env=env, - user=user, - using_docker=using_docker_cli, - volumes=volume - ) - except UndefinedImage as e: - raise click.ClickException( - "There is no service/image defined in docker-compose.yml with " - "name: {}".format(str(e)) - ) - except RuntimeError as e: - raise click.ClickException(str(e)) - - -@docker_compose.command('push') -@click.argument('image') -@click.option('--user', '-u', required=False, envvar='ARCHERY_DOCKER_USER', - help='Docker repository username') -@click.option('--password', '-p', required=False, - envvar='ARCHERY_DOCKER_PASSWORD', - help='Docker repository password') -@click.option('--using-docker-cli', default=False, is_flag=True, - help="Use docker CLI directly for building instead of calling " - "docker-compose. 
This may help to reuse cached layers.")
-@click.pass_obj
-def docker_compose_push(obj, image, user, password, using_docker_cli):
-    """Push the generated docker-compose image."""
-    compose = obj['compose']
-    compose.push(image, user=user, password=password,
-                 using_docker=using_docker_cli)
-
-
-@docker_compose.command('images')
-@click.pass_obj
-def docker_compose_images(obj):
-    """List the available docker-compose images."""
-    compose = obj['compose']
-    click.echo('Available images:')
-    for image in compose.images():
-        click.echo(' - {}'.format(image))
-
-
-@archery.group('release')
-@click.option("--src", metavar="", default=None,
-              callback=validate_arrow_sources,
-              help="Specify Arrow source directory.")
-@click.option("--jira-cache", type=click.Path(), default=None,
-              help="File path to cache queried JIRA issues per version.")
-@click.pass_obj
-def release(obj, src, jira_cache):
-    """Release related commands."""
-    from .release import Jira, CachedJira
-
-    jira = Jira()
-    if jira_cache is not None:
-        jira = CachedJira(jira_cache, jira=jira)
-
-    obj['jira'] = jira
-    obj['repo'] = src.path
-
-
-@release.command('curate')
-@click.argument('version')
-@click.pass_obj
-def release_curate(obj, version):
-    """Release curation."""
-    from .release import Release
-
-    release = Release.from_jira(version, jira=obj['jira'], repo=obj['repo'])
-    curation = release.curate()
-
-    click.echo(curation.render('console'))
-
-
-@release.group('changelog')
-def release_changelog():
-    """Release changelog."""
-    pass
-
-
-@release_changelog.command('add')
-@click.argument('version')
-@click.pass_obj
-def release_changelog_add(obj, version):
-    """Prepend the changelog with the current release"""
-    from .release import Release
-
-    jira, repo = obj['jira'], obj['repo']
-
-    # just handle the current version
-    release = Release.from_jira(version, jira=jira, repo=repo)
-    if release.is_released:
-        raise ValueError('This version has already been released!')
-
-    changelog = release.changelog()
-    changelog_path = pathlib.Path(repo) / 'CHANGELOG.md'
-
-    current_content = changelog_path.read_text()
-    new_content = changelog.render('markdown') + current_content
-
-    changelog_path.write_text(new_content)
-    click.echo("CHANGELOG.md is updated!")
-
-
-@release_changelog.command('generate')
-@click.argument('version')
-@click.argument('output', type=click.File('w', encoding='utf8'), default='-')
-@click.pass_obj
-def release_changelog_generate(obj, version, output):
-    """Generate the changelog of a specific release."""
-    from .release import Release
-
-    jira, repo = obj['jira'], obj['repo']
-
-    # just handle the current version
-    release = Release.from_jira(version, jira=jira, repo=repo)
-
-    changelog = release.changelog()
-    output.write(changelog.render('markdown'))
-
-
-@release_changelog.command('regenerate')
-@click.pass_obj
-def release_changelog_regenerate(obj):
-    """Regenerate the whole CHANGELOG.md file"""
-    from .release import Release
-
-    jira, repo = obj['jira'], obj['repo']
-    changelogs = []
-
-    for version in jira.arrow_versions():
-        if not version.released:
-            continue
-        release = Release.from_jira(version, jira=jira, repo=repo)
-        click.echo('Querying changelog for version: {}'.format(version))
-        changelogs.append(release.changelog())
-
-    click.echo('Rendering new CHANGELOG.md file...')
-    changelog_path = pathlib.Path(repo) / 'CHANGELOG.md'
-    with changelog_path.open('w') as fp:
-        for cl in changelogs:
-            fp.write(cl.render('markdown'))
-
-
-@release.command('cherry-pick')
-@click.argument('version')
-@click.option('--dry-run/--execute', default=True, - help="Display the git commands instead of executing them.") -@click.option('--recreate/--continue', default=True, - help="Recreate the maintenance branch or only apply unapplied " - "patches.") -@click.pass_obj -def release_cherry_pick(obj, version, dry_run, recreate): - """ - Cherry pick commits. - """ - from .release import Release, MinorRelease, PatchRelease - - release = Release.from_jira(version, jira=obj['jira'], repo=obj['repo']) - if not isinstance(release, (MinorRelease, PatchRelease)): - raise click.UsageError('Cherry-pick command only supported for minor ' - 'and patch releases') - - if not dry_run: - release.cherry_pick_commits(recreate_branch=recreate) - click.echo('Executed the following commands:\n') - - click.echo( - 'git checkout {} -b {}'.format(release.previous.tag, release.branch) - ) - for commit in release.commits_to_pick(): - click.echo('git cherry-pick {}'.format(commit.hexsha)) - - -try: - from .crossbow.cli import crossbow # noqa -except ImportError as exc: - missing_package = exc.name - - @archery.command( - 'crossbow', - context_settings={"ignore_unknown_options": True} - ) - def crossbow(): - raise click.ClickException( - "Couldn't import crossbow because of missing dependency: {}" - .format(missing_package) - ) -else: - archery.add_command(crossbow) - - -if __name__ == "__main__": - archery(obj={}) diff --git a/dev/archery/archery/compat.py b/dev/archery/archery/compat.py deleted file mode 100644 index 22cb9fc7957d2..0000000000000 --- a/dev/archery/archery/compat.py +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import pathlib - - -def _is_path_like(path): - # PEP519 filesystem path protocol is available from python 3.6, so pathlib - # doesn't implement __fspath__ for earlier versions - return (isinstance(path, str) or - hasattr(path, '__fspath__') or - isinstance(path, pathlib.Path)) - - -def _ensure_path(path): - if isinstance(path, pathlib.Path): - return path - else: - return pathlib.Path(_stringify_path(path)) - - -def _stringify_path(path): - """ - Convert *path* to a string or unicode path if possible. 
- """ - if isinstance(path, str): - return path - - # checking whether path implements the filesystem protocol - try: - return path.__fspath__() # new in python 3.6 - except AttributeError: - # fallback pathlib ckeck for earlier python versions than 3.6 - if isinstance(path, pathlib.Path): - return str(path) - - raise TypeError("not a path-like object") diff --git a/dev/archery/archery/crossbow/__init__.py b/dev/archery/archery/crossbow/__init__.py deleted file mode 100644 index bc72e81f05054..0000000000000 --- a/dev/archery/archery/crossbow/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from .core import Config, Repo, Queue, Target, Job # noqa -from .reports import CommentReport, ConsoleReport, EmailReport # noqa diff --git a/dev/archery/archery/crossbow/cli.py b/dev/archery/archery/crossbow/cli.py deleted file mode 100644 index 71c25e0460f1a..0000000000000 --- a/dev/archery/archery/crossbow/cli.py +++ /dev/null @@ -1,352 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pathlib import Path - -import click - -from .core import Config, Repo, Queue, Target, Job, CrossbowError -from .reports import EmailReport, ConsoleReport -from ..utils.source import ArrowSources - - -_default_arrow_path = ArrowSources.find().path -_default_queue_path = _default_arrow_path.parent / "crossbow" -_default_config_path = _default_arrow_path / "dev" / "tasks" / "tasks.yml" - - -@click.group() -@click.option('--github-token', '-t', default=None, - envvar="CROSSBOW_GITHUB_TOKEN", - help='OAuth token for GitHub authentication') -@click.option('--arrow-path', '-a', - type=click.Path(), default=_default_arrow_path, - help='Arrow\'s repository path. Defaults to the repository of ' - 'this script') -@click.option('--queue-path', '-q', - type=click.Path(), default=_default_queue_path, - help='The repository path used for scheduling the tasks. 
'
-                   'Defaults to crossbow directory placed next to arrow')
-@click.option('--queue-remote', '-qr', default=None,
-              help='Force to use this remote URL for the Queue repository')
-@click.option('--output-file', metavar='',
-              type=click.File('w', encoding='utf8'), default='-',
-              help='Capture output result into file.')
-@click.pass_context
-def crossbow(ctx, github_token, arrow_path, queue_path, queue_remote,
-             output_file):
-    """
-    Schedule packaging tasks or nightly builds on CI services.
-    """
-    ctx.ensure_object(dict)
-    ctx.obj['output'] = output_file
-    ctx.obj['arrow'] = Repo(arrow_path)
-    ctx.obj['queue'] = Queue(queue_path, remote_url=queue_remote,
-                             github_token=github_token, require_https=True)
-
-
-@crossbow.command()
-@click.option('--config-path', '-c',
-              type=click.Path(exists=True), default=_default_config_path,
-              help='Task configuration yml. Defaults to tasks.yml')
-@click.pass_obj
-def check_config(obj, config_path):
-    # load available tasks configuration and groups from yaml
-    config = Config.load_yaml(config_path)
-    config.validate()
-
-    output = obj['output']
-    config.show(output)
-
-
-@crossbow.command()
-@click.argument('tasks', nargs=-1, required=False)
-@click.option('--group', '-g', 'groups', multiple=True,
-              help='Submit task groups as defined in task.yml')
-@click.option('--param', '-p', 'params', multiple=True,
-              help='Additional task parameters for rendering the CI templates')
-@click.option('--job-prefix', default='build',
-              help='Arbitrary prefix for branch names, e.g. nightly')
-@click.option('--config-path', '-c',
-              type=click.Path(exists=True), default=_default_config_path,
-              help='Task configuration yml. Defaults to tasks.yml')
-@click.option('--arrow-version', '-v', default=None,
-              help='Set target version explicitly.')
-@click.option('--arrow-remote', '-r', default=None,
-              help='Set GitHub remote explicitly, which is going to be cloned '
-                   'on the CI services. Note, that no validation happens '
-                   'locally. Examples: https://github.com/apache/arrow or '
-                   'https://github.com/kszucs/arrow.')
-@click.option('--arrow-branch', '-b', default=None,
-              help='Give the branch name explicitly, e.g. master, ARROW-1949.')
-@click.option('--arrow-sha', '-t', default=None,
-              help='Set commit SHA or Tag name explicitly, e.g. f67a515, '
-                   'apache-arrow-0.11.1.')
-@click.option('--fetch/--no-fetch', default=True,
-              help='Fetch references (branches and tags) from the remote')
-@click.option('--dry-run/--commit', default=False,
-              help='Just display the rendered CI configurations without '
-                   'committing them')
-@click.option('--no-push/--push', default=False,
-              help='Don\'t push the changes')
-@click.pass_obj
-def submit(obj, tasks, groups, params, job_prefix, config_path, arrow_version,
-           arrow_remote, arrow_branch, arrow_sha, fetch, dry_run, no_push):
-    output = obj['output']
-    queue, arrow = obj['queue'], obj['arrow']
-
-    # load available tasks configuration and groups from yaml
-    config = Config.load_yaml(config_path)
-    try:
-        config.validate()
-    except CrossbowError as e:
-        raise click.ClickException(str(e))
-
-    # Override the detected repo url / remote, branch and sha - this aims to
-    # make the release procedure a bit simpler.
-    # Note that the target revision's crossbow templates must be
-    # compatible with the locally checked out version of crossbow (which is
-    # the case during the release procedure), because the templates still
-    # contain some business logic (dependency installation, deployments)
-    # which will be reduced to a single command in the future.
- target = Target.from_repo(arrow, remote=arrow_remote, branch=arrow_branch, - head=arrow_sha, version=arrow_version) - - # parse additional job parameters - params = dict([p.split("=") for p in params]) - - # instantiate the job object - try: - job = Job.from_config(config=config, target=target, tasks=tasks, - groups=groups, params=params) - except CrossbowError as e: - raise click.ClickException(str(e)) - - job.show(output) - if dry_run: - return - - if fetch: - queue.fetch() - queue.put(job, prefix=job_prefix) - - if no_push: - click.echo('Branches and commits created but not pushed: `{}`' - .format(job.branch)) - else: - queue.push() - click.echo('Pushed job identifier is: `{}`'.format(job.branch)) - - -@crossbow.command() -@click.argument('task', required=True) -@click.option('--config-path', '-c', - type=click.Path(exists=True), default=_default_config_path, - help='Task configuration yml. Defaults to tasks.yml') -@click.option('--arrow-version', '-v', default=None, - help='Set target version explicitly.') -@click.option('--arrow-remote', '-r', default=None, - help='Set GitHub remote explicitly, which is going to be cloned ' - 'on the CI services. Note, that no validation happens ' - 'locally. Examples: https://github.com/apache/arrow or ' - 'https://github.com/kszucs/arrow.') -@click.option('--arrow-branch', '-b', default=None, - help='Give the branch name explicitly, e.g. master, ARROW-1949.') -@click.option('--arrow-sha', '-t', default=None, - help='Set commit SHA or Tag name explicitly, e.g. f67a515, ' - 'apache-arrow-0.11.1.') -@click.option('--param', '-p', 'params', multiple=True, - help='Additional task parameters for rendering the CI templates') -@click.pass_obj -def render(obj, task, config_path, arrow_version, arrow_remote, arrow_branch, - arrow_sha, params): - """ - Utility command to check the rendered CI templates. 
- """ - from .core import _flatten - - def highlight(code): - try: - from pygments import highlight - from pygments.lexers import YamlLexer - from pygments.formatters import TerminalFormatter - return highlight(code, YamlLexer(), TerminalFormatter()) - except ImportError: - return code - - arrow = obj['arrow'] - - target = Target.from_repo(arrow, remote=arrow_remote, branch=arrow_branch, - head=arrow_sha, version=arrow_version) - config = Config.load_yaml(config_path) - params = dict([p.split("=") for p in params]) - job = Job.from_config(config=config, target=target, tasks=[task], - params=params) - - for task_name, rendered_files in job.render_tasks().items(): - for path, content in _flatten(rendered_files).items(): - click.echo('#' * 80) - click.echo('### {:^72} ###'.format("/".join(path))) - click.echo('#' * 80) - click.echo(highlight(content)) - - -@crossbow.command() -@click.argument('job-name', required=True) -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.pass_obj -def status(obj, job_name, fetch): - output = obj['output'] - queue = obj['queue'] - if fetch: - queue.fetch() - job = queue.get(job_name) - ConsoleReport(job).show(output) - - -@crossbow.command() -@click.argument('prefix', required=True) -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.pass_obj -def latest_prefix(obj, prefix, fetch): - queue = obj['queue'] - if fetch: - queue.fetch() - latest = queue.latest_for_prefix(prefix) - click.echo(latest.branch) - - -@crossbow.command() -@click.argument('job-name', required=True) -@click.option('--sender-name', '-n', - help='Name to use for report e-mail.') -@click.option('--sender-email', '-e', - help='E-mail to use for report e-mail.') -@click.option('--recipient-email', '-r', - help='Where to send the e-mail report') -@click.option('--smtp-user', '-u', - help='E-mail address to use for SMTP login') -@click.option('--smtp-password', '-P', - help='SMTP password to use for report e-mail.') -@click.option('--smtp-server', '-s', default='smtp.gmail.com', - help='SMTP server to use for report e-mail.') -@click.option('--smtp-port', '-p', default=465, - help='SMTP port to use for report e-mail.') -@click.option('--poll/--no-poll', default=False, - help='Wait for completion if there are tasks pending') -@click.option('--poll-max-minutes', default=180, - help='Maximum amount of time waiting for job completion') -@click.option('--poll-interval-minutes', default=10, - help='Number of minutes to wait to check job status again') -@click.option('--send/--dry-run', default=False, - help='Just display the report, don\'t send it') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.pass_obj -def report(obj, job_name, sender_name, sender_email, recipient_email, - smtp_user, smtp_password, smtp_server, smtp_port, poll, - poll_max_minutes, poll_interval_minutes, send, fetch): - """ - Send an e-mail report showing success/failure of tasks in a Crossbow run - """ - output = obj['output'] - queue = obj['queue'] - if fetch: - queue.fetch() - - job = queue.get(job_name) - report = EmailReport( - job=job, - sender_name=sender_name, - sender_email=sender_email, - recipient_email=recipient_email - ) - - if poll: - job.wait_until_finished( - poll_max_minutes=poll_max_minutes, - poll_interval_minutes=poll_interval_minutes - ) - - if send: - report.send( - smtp_user=smtp_user, - 
smtp_password=smtp_password, - smtp_server=smtp_server, - smtp_port=smtp_port - ) - else: - report.show(output) - - -@crossbow.command() -@click.argument('job-name', required=True) -@click.option('-t', '--target-dir', - default=_default_arrow_path / 'packages', - type=click.Path(file_okay=False, dir_okay=True), - help='Directory to download the build artifacts') -@click.option('--dry-run/--execute', default=False, - help='Just display process, don\'t download anything') -@click.option('--fetch/--no-fetch', default=True, - help='Fetch references (branches and tags) from the remote') -@click.pass_obj -def download_artifacts(obj, job_name, target_dir, dry_run, fetch): - """Download build artifacts from GitHub releases""" - output = obj['output'] - - # fetch the queue repository - queue = obj['queue'] - if fetch: - queue.fetch() - - # query the job's artifacts - job = queue.get(job_name) - - # create directory to download the assets to - target_dir = Path(target_dir).absolute() / job_name - target_dir.mkdir(parents=True, exist_ok=True) - - # download the assets while showing the job status - def asset_callback(task_name, task, asset): - if asset is not None: - path = target_dir / task_name / asset.name - path.parent.mkdir(exist_ok=True) - if not dry_run: - asset.download(path) - - click.echo('Downloading {}\'s artifacts.'.format(job_name)) - click.echo('Destination directory is {}'.format(target_dir)) - click.echo() - - report = ConsoleReport(job) - report.show(output, asset_callback=asset_callback) - - -@crossbow.command() -@click.option('--sha', required=True, help='Target committish') -@click.option('--tag', required=True, help='Target tag') -@click.option('--method', default='curl', help='Use cURL to upload') -@click.option('--pattern', '-p', 'patterns', required=True, multiple=True, - help='File pattern to upload as assets') -@click.pass_obj -def upload_artifacts(obj, tag, sha, patterns, method): - queue = obj['queue'] - queue.github_overwrite_release_assets( - tag_name=tag, target_commitish=sha, method=method, patterns=patterns - ) diff --git a/dev/archery/archery/crossbow/core.py b/dev/archery/archery/crossbow/core.py deleted file mode 100644 index 9d3074a21d583..0000000000000 --- a/dev/archery/archery/crossbow/core.py +++ /dev/null @@ -1,1162 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import re -import fnmatch -import glob -import time -import logging -import mimetypes -import subprocess -import textwrap -from io import StringIO -from pathlib import Path -from datetime import date - -import jinja2 -from ruamel.yaml import YAML - -try: - import github3 - _have_github3 = True -except ImportError: - github3 = object - _have_github3 = False - -try: - import pygit2 -except ImportError: - PygitRemoteCallbacks = object -else: - PygitRemoteCallbacks = pygit2.RemoteCallbacks - -from ..utils.source import ArrowSources - - -for pkg in ["requests", "urllib3", "github3"]: - logging.getLogger(pkg).setLevel(logging.WARNING) - -logger = logging.getLogger("crossbow") - - -class CrossbowError(Exception): - pass - - -def _flatten(mapping): - """Converts a hierarchical mapping to a flat dictionary""" - result = {} - for k, v in mapping.items(): - if isinstance(v, dict): - for ik, iv in _flatten(v).items(): - ik = ik if isinstance(ik, tuple) else (ik,) - result[(k,) + ik] = iv - elif isinstance(v, list): - for ik, iv in enumerate(_flatten(v)): - ik = ik if isinstance(ik, tuple) else (ik,) - result[(k,) + ik] = iv - else: - result[(k,)] = v - return result - - -def _unflatten(mapping): - """Converts a flat tuple => object mapping to hierarchical one""" - result = {} - for path, value in mapping.items(): - parents, leaf = path[:-1], path[-1] - # create the hierarchy until we reach the leaf value - temp = result - for parent in parents: - temp.setdefault(parent, {}) - temp = temp[parent] - # set the leaf value - temp[leaf] = value - - return result - - -def _unflatten_tree(files): - """Converts a flat path => object mapping to a hierarchical directories - - Input: - { - 'path/to/file.a': a_content, - 'path/to/file.b': b_content, - 'path/file.c': c_content - } - Output: - { - 'path': { - 'to': { - 'file.a': a_content, - 'file.b': b_content - }, - 'file.c': c_content - } - } - """ - files = {tuple(k.split('/')): v for k, v in files.items()} - return _unflatten(files) - - -def _render_jinja_template(searchpath, template, params): - def format_all(items, pattern): - return [pattern.format(item) for item in items] - - loader = jinja2.FileSystemLoader(searchpath) - env = jinja2.Environment(loader=loader, trim_blocks=True, - lstrip_blocks=True, - undefined=jinja2.StrictUndefined) - env.filters['format_all'] = format_all - template = env.get_template(template) - return template.render(**params) - - -# configurations for setting up branch skipping -# - appveyor has a feature to skip builds without an appveyor.yml -# - travis reads from the master branch and applies the rules -# - circle requires the configuration to be present on all branch, even ones -# that are configured to be skipped -# - azure skips branches without azure-pipelines.yml by default -# - github skips branches without .github/workflows/ by default - -_default_travis_yml = """ -branches: - only: - - master - - /.*-travis-.*/ - -os: linux -dist: trusty -language: generic -""" - -_default_circle_yml = """ -version: 2 - -jobs: - build: - machine: true - -workflows: - version: 2 - build: - jobs: - - build: - filters: - branches: - only: - - /.*-circle-.*/ -""" - -_default_tree = { - '.travis.yml': _default_travis_yml, - '.circleci/config.yml': _default_circle_yml -} - - -class GitRemoteCallbacks(PygitRemoteCallbacks): - - def __init__(self, token): - self.token = token - self.attempts = 0 - super().__init__() - - def push_update_reference(self, refname, message): - pass - - def update_tips(self, refname, old, new): - pass - - 
def credentials(self, url, username_from_url, allowed_types): - # its a libgit2 bug, that it infinitely retries the authentication - self.attempts += 1 - - if self.attempts >= 5: - # pygit2 doesn't propagate the exception properly - msg = 'Wrong oauth personal access token' - print(msg) - raise CrossbowError(msg) - - if allowed_types & pygit2.credentials.GIT_CREDTYPE_USERPASS_PLAINTEXT: - return pygit2.UserPass(self.token, 'x-oauth-basic') - else: - return None - - -def _git_ssh_to_https(url): - return url.replace('git@github.com:', 'https://github.com/') - - -class Repo: - """ - Base class for interaction with local git repositories - - A high level wrapper used for both reading revision information from - arrow's repository and pushing continuous integration tasks to the queue - repository. - - Parameters - ---------- - require_https : boolean, default False - Raise exception for SSH origin URLs - """ - - def __init__(self, path, github_token=None, remote_url=None, - require_https=False): - self.path = Path(path) - self.github_token = github_token - self.require_https = require_https - self._remote_url = remote_url - self._pygit_repo = None - self._github_repo = None # set by as_github_repo() - self._updated_refs = [] - - def __str__(self): - tpl = textwrap.dedent(''' - Repo: {remote}@{branch} - Commit: {head} - ''') - return tpl.format( - remote=self.remote_url, - branch=self.branch.branch_name, - head=self.head - ) - - @property - def repo(self): - if self._pygit_repo is None: - self._pygit_repo = pygit2.Repository(str(self.path)) - return self._pygit_repo - - @property - def origin(self): - remote = self.repo.remotes['origin'] - if self.require_https and remote.url.startswith('git@github.com'): - raise CrossbowError("Change SSH origin URL to HTTPS to use " - "Crossbow: {}".format(remote.url)) - return remote - - def fetch(self): - refspec = '+refs/heads/*:refs/remotes/origin/*' - self.origin.fetch([refspec]) - - def push(self, refs=None, github_token=None): - github_token = github_token or self.github_token - if github_token is None: - raise RuntimeError( - 'Could not determine GitHub token. Please set the ' - 'CROSSBOW_GITHUB_TOKEN environment variable to a ' - 'valid GitHub access token or pass one to --github-token.' - ) - callbacks = GitRemoteCallbacks(github_token) - refs = refs or [] - try: - self.origin.push(refs + self._updated_refs, callbacks=callbacks) - except pygit2.GitError: - raise RuntimeError('Failed to push updated references, ' - 'potentially because of credential issues: {}' - .format(self._updated_refs)) - else: - self.updated_refs = [] - - @property - def head(self): - """Currently checked out commit's sha""" - return self.repo.head - - @property - def branch(self): - """Currently checked out branch""" - try: - return self.repo.branches[self.repo.head.shorthand] - except KeyError: - return None # detached - - @property - def remote(self): - """Currently checked out branch's remote counterpart""" - try: - return self.repo.remotes[self.branch.upstream.remote_name] - except (AttributeError, KeyError): - return None # cannot detect - - @property - def remote_url(self): - """Currently checked out branch's remote counterpart URL - - If an SSH github url is set, it will be replaced by the https - equivalent usable with GitHub OAuth token. 
- """ - try: - return self._remote_url or _git_ssh_to_https(self.remote.url) - except AttributeError: - return None - - @property - def user_name(self): - try: - return next(self.repo.config.get_multivar('user.name')) - except StopIteration: - return os.environ.get('GIT_COMMITTER_NAME', 'unknown') - - @property - def user_email(self): - try: - return next(self.repo.config.get_multivar('user.email')) - except StopIteration: - return os.environ.get('GIT_COMMITTER_EMAIL', 'unknown') - - @property - def signature(self): - return pygit2.Signature(self.user_name, self.user_email, - int(time.time())) - - def create_tree(self, files): - builder = self.repo.TreeBuilder() - - for filename, content in files.items(): - if isinstance(content, dict): - # create a subtree - tree_id = self.create_tree(content) - builder.insert(filename, tree_id, pygit2.GIT_FILEMODE_TREE) - else: - # create a file - blob_id = self.repo.create_blob(content) - builder.insert(filename, blob_id, pygit2.GIT_FILEMODE_BLOB) - - tree_id = builder.write() - return tree_id - - def create_commit(self, files, parents=None, message='', - reference_name=None): - if parents is None: - # by default use the main branch as the base of the new branch - # required to reuse github actions cache across crossbow tasks - commit, _ = self.repo.resolve_refish("master") - parents = [commit.id] - tree_id = self.create_tree(files) - - author = committer = self.signature - commit_id = self.repo.create_commit(reference_name, author, committer, - message, tree_id, parents) - return self.repo[commit_id] - - def create_branch(self, branch_name, files, parents=None, message='', - signature=None): - # create commit with the passed tree - commit = self.create_commit(files, parents=parents, message=message) - - # create branch pointing to the previously created commit - branch = self.repo.create_branch(branch_name, commit) - - # append to the pushable references - self._updated_refs.append('refs/heads/{}'.format(branch_name)) - - return branch - - def create_tag(self, tag_name, commit_id, message=''): - tag_id = self.repo.create_tag(tag_name, commit_id, - pygit2.GIT_OBJ_COMMIT, self.signature, - message) - - # append to the pushable references - self._updated_refs.append('refs/tags/{}'.format(tag_name)) - - return self.repo[tag_id] - - def file_contents(self, commit_id, file): - commit = self.repo[commit_id] - entry = commit.tree[file] - blob = self.repo[entry.id] - return blob.data - - def _parse_github_user_repo(self): - m = re.match(r'.*\/([^\/]+)\/([^\/\.]+)(\.git)?$', self.remote_url) - if m is None: - raise CrossbowError( - "Unable to parse the github owner and repository from the " - "repository's remote url '{}'".format(self.remote_url) - ) - user, repo = m.group(1), m.group(2) - return user, repo - - def as_github_repo(self, github_token=None): - """Converts it to a repository object which wraps the GitHub API""" - if self._github_repo is None: - if not _have_github3: - raise ImportError('Must install github3.py') - github_token = github_token or self.github_token - username, reponame = self._parse_github_user_repo() - session = github3.session.GitHubSession( - default_connect_timeout=10, - default_read_timeout=30 - ) - github = github3.GitHub(session=session) - github.login(token=github_token) - self._github_repo = github.repository(username, reponame) - return self._github_repo - - def github_commit(self, sha): - repo = self.as_github_repo() - return repo.commit(sha) - - def github_release(self, tag): - repo = self.as_github_repo() - try: - return 
repo.release_from_tag(tag) - except github3.exceptions.NotFoundError: - return None - - def github_upload_asset_requests(self, release, path, name, mime, - max_retries=None, retry_backoff=None): - if max_retries is None: - max_retries = int(os.environ.get('CROSSBOW_MAX_RETRIES', 8)) - if retry_backoff is None: - retry_backoff = int(os.environ.get('CROSSBOW_RETRY_BACKOFF', 5)) - - for i in range(max_retries): - try: - with open(path, 'rb') as fp: - result = release.upload_asset(name=name, asset=fp, - content_type=mime) - except github3.exceptions.ResponseError as e: - logger.error('Attempt {} has failed with message: {}.' - .format(i + 1, str(e))) - logger.error('Error message {}'.format(e.msg)) - logger.error('List of errors provided by Github:') - for err in e.errors: - logger.error(' - {}'.format(err)) - - if e.code == 422: - # 422 Validation Failed, probably raised because - # ReleaseAsset already exists, so try to remove it before - # reattempting the asset upload - for asset in release.assets(): - if asset.name == name: - logger.info('Release asset {} already exists, ' - 'removing it...'.format(name)) - asset.delete() - logger.info('Asset {} removed.'.format(name)) - break - except github3.exceptions.ConnectionError as e: - logger.error('Attempt {} has failed with message: {}.' - .format(i + 1, str(e))) - else: - logger.info('Attempt {} has finished.'.format(i + 1)) - return result - - time.sleep(retry_backoff) - - raise RuntimeError('Github asset uploading has failed!') - - def github_upload_asset_curl(self, release, path, name, mime): - upload_url, _ = release.upload_url.split('{?') - upload_url += '?name={}'.format(name) - - command = [ - 'curl', - '--fail', - '-H', "Authorization: token {}".format(self.github_token), - '-H', "Content-Type: {}".format(mime), - '--data-binary', '@{}'.format(path), - upload_url - ] - return subprocess.run(command, shell=False, check=True) - - def github_overwrite_release_assets(self, tag_name, target_commitish, - patterns, method='requests'): - # Since github has changed something the asset uploading via requests - # got instable, so prefer the cURL alternative. - # Potential cause: - # sigmavirus24/github3.py/issues/779#issuecomment-379470626 - repo = self.as_github_repo() - if not tag_name: - raise CrossbowError('Empty tag name') - if not target_commitish: - raise CrossbowError('Empty target commit for the release tag') - - # remove the whole release if it already exists - try: - release = repo.release_from_tag(tag_name) - except github3.exceptions.NotFoundError: - pass - else: - release.delete() - - release = repo.create_release(tag_name, target_commitish) - for pattern in patterns: - for path in glob.glob(pattern, recursive=True): - name = os.path.basename(path) - size = os.path.getsize(path) - mime = mimetypes.guess_type(name)[0] or 'application/zip' - - logger.info( - 'Uploading asset `{}` with mimetype {} and size {}...' 
- .format(name, mime, size) - ) - - if method == 'requests': - self.github_upload_asset_requests(release, path, name=name, - mime=mime) - elif method == 'curl': - self.github_upload_asset_curl(release, path, name=name, - mime=mime) - else: - raise CrossbowError( - 'Unsupported upload method {}'.format(method) - ) - - -class Queue(Repo): - - def _latest_prefix_id(self, prefix): - pattern = re.compile(r'[\w\/-]*{}-(\d+)'.format(prefix)) - matches = list(filter(None, map(pattern.match, self.repo.branches))) - if matches: - latest = max(int(m.group(1)) for m in matches) - else: - latest = -1 - return latest - - def _next_job_id(self, prefix): - """Auto increments the branch's identifier based on the prefix""" - latest_id = self._latest_prefix_id(prefix) - return '{}-{}'.format(prefix, latest_id + 1) - - def latest_for_prefix(self, prefix): - latest_id = self._latest_prefix_id(prefix) - if latest_id < 0: - raise RuntimeError( - 'No job has been submitted with prefix {} yet'.format(prefix) - ) - job_name = '{}-{}'.format(prefix, latest_id) - return self.get(job_name) - - def date_of(self, job): - # it'd be better to bound to the queue repository on deserialization - # and reorganize these methods to Job - branch_name = 'origin/{}'.format(job.branch) - branch = self.repo.branches[branch_name] - commit = self.repo[branch.target] - return date.fromtimestamp(commit.commit_time) - - def jobs(self, pattern): - """Return jobs sorted by its identifier in reverse order""" - job_names = [] - for name in self.repo.branches.remote: - origin, name = name.split('/', 1) - result = re.match(pattern, name) - if result: - job_names.append(name) - - for name in sorted(job_names, reverse=True): - yield self.get(name) - - def get(self, job_name): - branch_name = 'origin/{}'.format(job_name) - branch = self.repo.branches[branch_name] - try: - content = self.file_contents(branch.target, 'job.yml') - except KeyError: - raise CrossbowError( - 'No job is found with name: {}'.format(job_name) - ) - - buffer = StringIO(content.decode('utf-8')) - job = yaml.load(buffer) - job.queue = self - return job - - def put(self, job, prefix='build'): - if not isinstance(job, Job): - raise CrossbowError('`job` must be an instance of Job') - if job.branch is not None: - raise CrossbowError('`job.branch` is automatically generated, ' - 'thus it must be blank') - - if job.target.remote is None: - raise CrossbowError( - 'Cannot determine git remote for the Arrow repository to ' - 'clone or push to, try to push the `{}` branch first to have ' - 'a remote tracking counterpart.'.format(job.target.branch) - ) - if job.target.branch is None: - raise CrossbowError( - 'Cannot determine the current branch of the Arrow repository ' - 'to clone or push to, perhaps it is in detached HEAD state. ' - 'Please checkout a branch.' - ) - - # auto increment and set next job id, e.g. 
build-85
-        job._queue = self
-        job.branch = self._next_job_id(prefix)
-
-        # create tasks' branches
-        for task_name, task in job.tasks.items():
-            # adding CI's name to the end of the branch in order to use skip
-            # patterns on travis and circleci
-            task.branch = '{}-{}-{}'.format(job.branch, task.ci, task_name)
-            params = {
-                **job.params,
-                "arrow": job.target,
-                "queue_remote_url": self.remote_url
-            }
-            files = task.render_files(job.template_searchpath, params=params)
-            branch = self.create_branch(task.branch, files=files)
-            self.create_tag(task.tag, branch.target)
-            task.commit = str(branch.target)
-
-        # create job's branch with its description
-        return self.create_branch(job.branch, files=job.render_files())
-
-
-def get_version(root, **kwargs):
-    """
-    Parse function for setuptools_scm that ignores tags for non-C++
-    subprojects, e.g. apache-arrow-js-XXX tags.
-    """
-    from setuptools_scm.git import parse as parse_git_version
-
-    # query the calculated version based on the git tags
-    kwargs['describe_command'] = (
-        'git describe --dirty --tags --long --match "apache-arrow-[0-9].*"'
-    )
-    version = parse_git_version(root, **kwargs)
-
-    # increment the minor version, because there can be patch releases created
-    # from maintenance branches where the tags are unreachable from the
-    # master's HEAD, so the git command above generates 0.17.0.dev300 even if
-    # arrow has a newer 0.17.1 patch release
-    pattern = r"^(\d+)\.(\d+)\.(\d+)$"
-    match = re.match(pattern, str(version.tag))
-    major, minor, patch = map(int, match.groups())
-
-    # the bumped version number after 0.17.x will be 0.18.0.dev300
-    return "{}.{}.{}.dev{}".format(major, minor + 1, patch, version.distance)
-
-
-class Serializable:
-
-    @classmethod
-    def to_yaml(cls, representer, data):
-        tag = '!{}'.format(cls.__name__)
-        dct = {k: v for k, v in data.__dict__.items() if not k.startswith('_')}
-        return representer.represent_mapping(tag, dct)
-
-
-class Target(Serializable):
-    """
-    Describes target repository and revision the builds run against
-
-    This serializable data container holds information about arrow's
-    git remote, branch, sha and version number as well as some metadata
-    (currently only an email address where the notification should be sent).
-    """
-
-    def __init__(self, head, branch, remote, version, email=None):
-        self.head = head
-        self.email = email
-        self.branch = branch
-        self.remote = remote
-        self.version = version
-        self.no_rc_version = re.sub(r'-rc\d+\Z', '', version)
-        # Semantic Versioning 1.0.0: https://semver.org/spec/v1.0.0.html
-        #
-        # > A pre-release version number MAY be denoted by appending an
-        # > arbitrary string immediately following the patch version and a
-        # > dash. The string MUST be comprised of only alphanumerics plus
-        # > dash [0-9A-Za-z-].
-        #
-        # Example:
-        #
-        # '0.16.1.dev10' ->
-        # '0.16.1-dev10'
-        self.no_rc_semver_version = \
-            re.sub(r'\.(dev\d+)\Z', r'-\1', self.no_rc_version)
-
-    @classmethod
-    def from_repo(cls, repo, head=None, branch=None, remote=None, version=None,
-                  email=None):
-        """Initialize from a repository
-
-        Optionally override detected remote, branch, head, and/or version.
- """ - assert isinstance(repo, Repo) - - if head is None: - head = str(repo.head.target) - if branch is None: - branch = repo.branch.branch_name - if remote is None: - remote = repo.remote_url - if version is None: - version = get_version(repo.path) - if email is None: - email = repo.user_email - - return cls(head=head, email=email, branch=branch, remote=remote, - version=version) - - -class Task(Serializable): - """ - Describes a build task and metadata required to render CI templates - - A task is represented as a single git commit and branch containing jinja2 - rendered files (currently appveyor.yml or .travis.yml configurations). - - A task can't be directly submitted to a queue, must belong to a job. - Each task's unique identifier is its branch name, which is generated after - submitting the job to a queue. - """ - - def __init__(self, ci, template, artifacts=None, params=None): - assert ci in { - 'circle', - 'travis', - 'appveyor', - 'azure', - 'github', - 'drone', - } - self.ci = ci - self.template = template - self.artifacts = artifacts or [] - self.params = params or {} - self.branch = None # filled after adding to a queue - self.commit = None # filled after adding to a queue - self._queue = None # set by the queue object after put or get - self._status = None # status cache - self._assets = None # assets cache - - def render_files(self, searchpath, params=None): - params = {**self.params, **(params or {}), "task": self} - try: - rendered = _render_jinja_template(searchpath, self.template, - params=params) - except jinja2.TemplateError as e: - raise RuntimeError( - 'Failed to render template `{}` with {}: {}'.format( - self.template, e.__class__.__name__, str(e) - ) - ) - - tree = {**_default_tree, self.filename: rendered} - return _unflatten_tree(tree) - - @property - def tag(self): - return self.branch - - @property - def filename(self): - config_files = { - 'circle': '.circleci/config.yml', - 'travis': '.travis.yml', - 'appveyor': 'appveyor.yml', - 'azure': 'azure-pipelines.yml', - 'github': '.github/workflows/crossbow.yml', - 'drone': '.drone.yml', - } - return config_files[self.ci] - - def status(self, force_query=False): - _status = getattr(self, '_status', None) - if force_query or _status is None: - github_commit = self._queue.github_commit(self.commit) - self._status = TaskStatus(github_commit) - return self._status - - def assets(self, force_query=False): - _assets = getattr(self, '_assets', None) - if force_query or _assets is None: - github_release = self._queue.github_release(self.tag) - self._assets = TaskAssets(github_release, - artifact_patterns=self.artifacts) - return self._assets - - -class TaskStatus: - """ - Combine the results from status and checks API to a single state. - - Azure pipelines uses checks API which doesn't provide a combined - interface like status API does, so we need to manually combine - both the commit statuses and the commit checks coming from - different API endpoint - - Status.state: error, failure, pending or success, default pending - CheckRun.status: queued, in_progress or completed, default: queued - CheckRun.conclusion: success, failure, neutral, cancelled, timed_out - or action_required, only set if - CheckRun.status == 'completed' - - 1. Convert CheckRun's status and conclusion to one of Status.state - 2. 
Merge the states based on the following rules:
-       - failure if any of the contexts report as error or failure
-       - pending if there are no statuses or a context is pending
-       - success if the latest status for all contexts is success
-       error otherwise.
-
-    Parameters
-    ----------
-    commit : github3.Commit
-        Commit to query the combined status for.
-
-    Returns
-    -------
-    TaskStatus(
-        combined_state='error|failure|pending|success',
-        github_status='original github status object',
-        github_check_runs='github checks associated with the commit',
-        total_count='number of statuses and checks'
-    )
-    """
-
-    def __init__(self, commit):
-        status = commit.status()
-        check_runs = list(commit.check_runs())
-        states = [s.state for s in status.statuses]
-
-        for check in check_runs:
-            if check.status == 'completed':
-                if check.conclusion in {'success', 'failure'}:
-                    states.append(check.conclusion)
-                elif check.conclusion in {'cancelled', 'timed_out',
-                                          'action_required'}:
-                    states.append('error')
-                # omit `neutral` conclusion
-            else:
-                states.append('pending')
-
-        # it could be more efficient, but the following is more descriptive
-        combined_state = 'error'
-        if len(states):
-            if any(state in {'error', 'failure'} for state in states):
-                combined_state = 'failure'
-            elif any(state == 'pending' for state in states):
-                combined_state = 'pending'
-            elif all(state == 'success' for state in states):
-                combined_state = 'success'
-
-        # show links to the actual builds; some of the CI providers implement
-        # the statuses API, others implement the checks API, so display both
-        build_links = [s.target_url for s in status.statuses]
-        build_links += [c.html_url for c in check_runs]
-
-        self.combined_state = combined_state
-        self.github_status = status
-        self.github_check_runs = check_runs
-        self.total_count = len(states)
-        self.build_links = build_links
-
-
-class TaskAssets(dict):
-
-    def __init__(self, github_release, artifact_patterns):
-        # HACK(kszucs): don't expect uploaded assets if no artifacts were
-        # defined for the tasks, in order to spare a bit of github rate limit
-        if not artifact_patterns:
-            return
-
-        if github_release is None:
-            github_assets = {}  # no assets have been uploaded for the task
-        else:
-            github_assets = {a.name: a for a in github_release.assets()}
-
-        for pattern in artifact_patterns:
-            # artifact can be a regex pattern
-            compiled = re.compile(pattern)
-            matches = list(
-                filter(None, map(compiled.match, github_assets.keys()))
-            )
-            num_matches = len(matches)
-
-            # validate artifact pattern matches single asset
-            if num_matches == 0:
-                self[pattern] = None
-            elif num_matches == 1:
-                self[pattern] = github_assets[matches[0].group(0)]
-            else:
-                raise CrossbowError(
-                    'Only a single asset should match pattern `{}`, there are '
-                    'multiple ones: {}'.format(pattern, ', '.join(matches))
-                )
-
-    def missing_patterns(self):
-        return [pattern for pattern, asset in self.items() if asset is None]
-
-    def uploaded_assets(self):
-        return [asset for asset in self.values() if asset is not None]
-
-
-class Job(Serializable):
-    """Describes multiple tasks against a single target repository"""
-
-    def __init__(self, target, tasks, params=None, template_searchpath=None):
-        if not tasks:
-            raise ValueError('no tasks were provided for the job')
-        if not all(isinstance(task, Task) for task in tasks.values()):
-            raise ValueError('each `tasks` must be an instance of Task')
-        if not isinstance(target, Target):
-            raise ValueError('`target` must be an instance of Target')
ValueError('`target` must be an instance of Target') - if not isinstance(params, dict): - raise ValueError('`params` must be an instance of dict') - - self.target = target - self.tasks = tasks - self.params = params or {} # additional parameters for the tasks - self.branch = None # filled after adding to a queue - self._queue = None # set by the queue object after put or get - if template_searchpath is None: - self._template_searchpath = ArrowSources.find().path - else: - self._template_searchpath = template_searchpath - - def render_files(self): - with StringIO() as buf: - yaml.dump(self, buf) - content = buf.getvalue() - tree = {**_default_tree, "job.yml": content} - return _unflatten_tree(tree) - - def render_tasks(self, params=None): - result = {} - params = { - **self.params, - "arrow": self.target, - **(params or {}) - } - for task_name, task in self.tasks.items(): - files = task.render_files(self._template_searchpath, params) - result[task_name] = files - return result - - @property - def template_searchpath(self): - return self._template_searchpath - - @property - def queue(self): - assert isinstance(self._queue, Queue) - return self._queue - - @queue.setter - def queue(self, queue): - assert isinstance(queue, Queue) - self._queue = queue - for task in self.tasks.values(): - task._queue = queue - - @property - def email(self): - return os.environ.get('CROSSBOW_EMAIL', self.target.email) - - @property - def date(self): - return self.queue.date_of(self) - - def show(self, stream=None): - return yaml.dump(self, stream=stream) - - @classmethod - def from_config(cls, config, target, tasks=None, groups=None, params=None): - """ - Intantiate a job from based on a config. - - Parameters - ---------- - config : dict - Deserialized content of tasks.yml - target : Target - Describes target repository and revision the builds run against. - tasks : Optional[List[str]], default None - List of glob patterns for matching task names. - groups : Optional[List[str]], default None - List of exact group names matching predefined task sets in the - config. - params : Optional[Dict[str, str]], default None - Additional rendering parameters for the task templates. - - Returns - ------- - Job - - Raises - ------ - Exception: - If invalid groups or tasks has been passed. 
-        """
-        task_definitions = config.select(tasks, groups=groups)
-
-        # instantiate the tasks
-        tasks = {}
-        versions = {'version': target.version,
-                    'no_rc_version': target.no_rc_version,
-                    'no_rc_semver_version': target.no_rc_semver_version}
-        for task_name, task in task_definitions.items():
-            artifacts = task.pop('artifacts', None) or []  # because of yaml
-            artifacts = [fn.format(**versions) for fn in artifacts]
-            tasks[task_name] = Task(artifacts=artifacts, **task)
-
-        return cls(target=target, tasks=tasks, params=params,
-                   template_searchpath=config.template_searchpath)
-
-    def is_finished(self):
-        for task in self.tasks.values():
-            status = task.status(force_query=True)
-            if status.combined_state == 'pending':
-                return False
-        return True
-
-    def wait_until_finished(self, poll_max_minutes=120,
-                            poll_interval_minutes=10):
-        started_at = time.time()
-        while True:
-            if self.is_finished():
-                break
-
-            waited_for_minutes = (time.time() - started_at) / 60
-            if waited_for_minutes > poll_max_minutes:
-                msg = ('Exceeded the maximum amount of time waiting for job '
-                       'to finish, waited for {} minutes.')
-                raise RuntimeError(msg.format(waited_for_minutes))
-
-            logger.info('Waiting {} minutes and then checking again'
-                        .format(poll_interval_minutes))
-            time.sleep(poll_interval_minutes * 60)
-
-
-class Config(dict):
-
-    def __init__(self, tasks, template_searchpath):
-        super().__init__(tasks)
-        self.template_searchpath = template_searchpath
-
-    @classmethod
-    def load_yaml(cls, path):
-        path = Path(path)
-        searchpath = path.parent
-        rendered = _render_jinja_template(searchpath, template=path.name,
-                                          params={})
-        config = yaml.load(rendered)
-        return cls(config, template_searchpath=searchpath)
-
-    def show(self, stream=None):
-        return yaml.dump(dict(self), stream=stream)
-
-    def select(self, tasks=None, groups=None):
-        config_groups = dict(self['groups'])
-        config_tasks = dict(self['tasks'])
-        valid_groups = set(config_groups.keys())
-        valid_tasks = set(config_tasks.keys())
-        group_whitelist = list(groups or [])
-        task_whitelist = list(tasks or [])
-
-        # validate that the passed groups are defined in the config
-        requested_groups = set(group_whitelist)
-        invalid_groups = requested_groups - valid_groups
-        if invalid_groups:
-            msg = 'Invalid group(s) {!r}. Must be one of {!r}'.format(
-                invalid_groups, valid_groups
-            )
-            raise CrossbowError(msg)
-
-        # merge the tasks defined in the selected groups
-        task_patterns = [list(config_groups[name]) for name in group_whitelist]
-        task_patterns = set(sum(task_patterns, task_whitelist))
-
-        # treat the task names as glob patterns to select tasks more easily
-        requested_tasks = set()
-        for pattern in task_patterns:
-            matches = fnmatch.filter(valid_tasks, pattern)
-            if len(matches):
-                requested_tasks.update(matches)
-            else:
-                raise CrossbowError(
-                    "Unable to match any tasks for `{}`".format(pattern)
-                )
-
-        # validate that the passed and matched tasks are defined in the config
-        invalid_tasks = requested_tasks - valid_tasks
-        if invalid_tasks:
-            msg = 'Invalid task(s) {!r}. Must be one of {!r}'.format(
-                invalid_tasks, valid_tasks
-            )
-            raise CrossbowError(msg)
-
-        return {
-            task_name: config_tasks[task_name] for task_name in requested_tasks
-        }
-
-    def validate(self):
-        # validate that the task groups are properly referening the tasks
-        for group_name, group in self['groups'].items():
-            for pattern in group:
-                tasks = self.select(tasks=[pattern])
-                if not tasks:
-                    raise CrossbowError(
-                        "The pattern `{}` defined for task group `{}` is not "
-                        "matching any of the tasks defined in the "
-                        "configuration file.".format(pattern, group_name)
-                    )
-
-        # validate that the tasks are constructible
-        for task_name, task in self['tasks'].items():
-            try:
-                Task(**task)
-            except Exception as e:
-                raise CrossbowError(
-                    'Unable to construct a task object from the '
-                    'definition of task `{}`. The original error message '
-                    'is: `{}`'.format(task_name, str(e))
-                )
-
-        # validate that the defined tasks are renderable, in order to to that
-        # define the required object with dummy data
-        target = Target(
-            head='e279a7e06e61c14868ca7d71dea795420aea6539',
-            branch='master',
-            remote='https://github.com/apache/arrow',
-            version='1.0.0dev123',
-            email='dummy@example.ltd'
-        )
-
-        for task_name, task in self['tasks'].items():
-            task = Task(**task)
-            files = task.render_files(
-                self.template_searchpath,
-                params=dict(
-                    arrow=target,
-                    queue_remote_url='https://github.com/org/crossbow'
-                )
-            )
-            if not files:
-                raise CrossbowError('No files have been rendered for task `{}`'
-                                    .format(task_name))
-
-
-# configure yaml serializer
-yaml = YAML()
-yaml.register_class(Job)
-yaml.register_class(Task)
-yaml.register_class(Target)
diff --git a/dev/archery/archery/crossbow/reports.py b/dev/archery/archery/crossbow/reports.py
deleted file mode 100644
index bc82db7f51a54..0000000000000
--- a/dev/archery/archery/crossbow/reports.py
+++ /dev/null
@@ -1,302 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -import click -import collections -import operator -import functools -from io import StringIO -import textwrap - - -# TODO(kszucs): use archery.report.JinjaReport instead -class Report: - - def __init__(self, job): - self.job = job - - def show(self): - raise NotImplementedError() - - -class ConsoleReport(Report): - """Report the status of a Job to the console using click""" - - # output table's header template - HEADER = '[{state:>7}] {branch:<52} {content:>16}' - DETAILS = ' └ {url}' - - # output table's row template for assets - ARTIFACT_NAME = '{artifact:>69} ' - ARTIFACT_STATE = '[{state:>7}]' - - # state color mapping to highlight console output - COLORS = { - # from CombinedStatus - 'error': 'red', - 'failure': 'red', - 'pending': 'yellow', - 'success': 'green', - # custom state messages - 'ok': 'green', - 'missing': 'red' - } - - def lead(self, state, branch, n_uploaded, n_expected): - line = self.HEADER.format( - state=state.upper(), - branch=branch, - content='uploaded {} / {}'.format(n_uploaded, n_expected) - ) - return click.style(line, fg=self.COLORS[state.lower()]) - - def header(self): - header = self.HEADER.format( - state='state', - branch='Task / Branch', - content='Artifacts' - ) - delimiter = '-' * len(header) - return '{}\n{}'.format(header, delimiter) - - def artifact(self, state, pattern, asset): - if asset is None: - artifact = pattern - state = 'pending' if state == 'pending' else 'missing' - else: - artifact = asset.name - state = 'ok' - - name_ = self.ARTIFACT_NAME.format(artifact=artifact) - state_ = click.style( - self.ARTIFACT_STATE.format(state=state.upper()), - self.COLORS[state] - ) - return name_ + state_ - - def show(self, outstream, asset_callback=None): - echo = functools.partial(click.echo, file=outstream) - - # write table's header - echo(self.header()) - - # write table's body - for task_name, task in sorted(self.job.tasks.items()): - # if not task_name.startswith("test-debian-10-python-3"): - # continue - # write summary of the uploaded vs total assets - status = task.status() - assets = task.assets() - - # mapping of artifact pattern to asset or None of not uploaded - n_expected = len(task.artifacts) - n_uploaded = len(assets.uploaded_assets()) - echo(self.lead(status.combined_state, task_name, n_uploaded, - n_expected)) - - # show link to the actual build, some of the CI providers implement - # the statuses API others implement the checks API, so display both - for link in status.build_links: - echo(self.DETAILS.format(url=link)) - - # write per asset status - for artifact_pattern, asset in assets.items(): - if asset_callback is not None: - asset_callback(task_name, task, asset) - echo(self.artifact(status.combined_state, artifact_pattern, - asset)) - - -class EmailReport(Report): - - HEADER = textwrap.dedent(""" - Arrow Build Report for Job {job_name} - - All tasks: {all_tasks_url} - """) - - TASK = textwrap.dedent(""" - - {name}: - URL: {url} - """).strip() - - EMAIL = textwrap.dedent(""" - From: {sender_name} <{sender_email}> - To: {recipient_email} - Subject: {subject} - - {body} - """).strip() - - STATUS_HEADERS = { - # from CombinedStatus - 'error': 'Errored Tasks:', - 'failure': 'Failed Tasks:', - 'pending': 'Pending Tasks:', - 'success': 'Succeeded Tasks:', - } - - def __init__(self, job, sender_name, sender_email, recipient_email): - self.sender_name = sender_name - self.sender_email = sender_email - self.recipient_email = recipient_email - super().__init__(job) - - def url(self, query): - repo_url = 
self.job.queue.remote_url.strip('.git') - return '{}/branches/all?query={}'.format(repo_url, query) - - def listing(self, tasks): - return '\n'.join( - sorted( - self.TASK.format(name=task_name, url=self.url(task.branch)) - for task_name, task in tasks.items() - ) - ) - - def header(self): - url = self.url(self.job.branch) - return self.HEADER.format(job_name=self.job.branch, all_tasks_url=url) - - def subject(self): - return ( - "[NIGHTLY] Arrow Build Report for Job {}".format(self.job.branch) - ) - - def body(self): - buffer = StringIO() - buffer.write(self.header()) - - tasks_by_state = collections.defaultdict(dict) - for task_name, task in self.job.tasks.items(): - state = task.status().combined_state - tasks_by_state[state][task_name] = task - - for state in ('failure', 'error', 'pending', 'success'): - if state in tasks_by_state: - tasks = tasks_by_state[state] - buffer.write('\n') - buffer.write(self.STATUS_HEADERS[state]) - buffer.write('\n') - buffer.write(self.listing(tasks)) - buffer.write('\n') - - return buffer.getvalue() - - def email(self): - return self.EMAIL.format( - sender_name=self.sender_name, - sender_email=self.sender_email, - recipient_email=self.recipient_email, - subject=self.subject(), - body=self.body() - ) - - def show(self, outstream): - outstream.write(self.email()) - - def send(self, smtp_user, smtp_password, smtp_server, smtp_port): - import smtplib - - email = self.email() - - server = smtplib.SMTP_SSL(smtp_server, smtp_port) - server.ehlo() - server.login(smtp_user, smtp_password) - server.sendmail(smtp_user, self.recipient_email, email) - server.close() - - -class CommentReport(Report): - - _markdown_badge = '[![{title}]({badge})]({url})' - - badges = { - 'github': _markdown_badge.format( - title='Github Actions', - url='https://github.com/{repo}/actions?query=branch:{branch}', - badge=( - 'https://github.com/{repo}/workflows/Crossbow/' - 'badge.svg?branch={branch}' - ), - ), - 'azure': _markdown_badge.format( - title='Azure', - url=( - 'https://dev.azure.com/{repo}/_build/latest' - '?definitionId=1&branchName={branch}' - ), - badge=( - 'https://dev.azure.com/{repo}/_apis/build/status/' - '{repo_dotted}?branchName={branch}' - ) - ), - 'travis': _markdown_badge.format( - title='TravisCI', - url='https://travis-ci.com/{repo}/branches', - badge='https://img.shields.io/travis/{repo}/{branch}.svg' - ), - 'circle': _markdown_badge.format( - title='CircleCI', - url='https://circleci.com/gh/{repo}/tree/{branch}', - badge=( - 'https://img.shields.io/circleci/build/github' - '/{repo}/{branch}.svg' - ) - ), - 'appveyor': _markdown_badge.format( - title='Appveyor', - url='https://ci.appveyor.com/project/{repo}/history', - badge='https://img.shields.io/appveyor/ci/{repo}/{branch}.svg' - ), - 'drone': _markdown_badge.format( - title='Drone', - url='https://cloud.drone.io/{repo}', - badge='https://img.shields.io/drone/build/{repo}/{branch}.svg' - ), - } - - def __init__(self, job, crossbow_repo): - self.crossbow_repo = crossbow_repo - super().__init__(job) - - def show(self): - url = 'https://github.com/{repo}/branches/all?query={branch}' - sha = self.job.target.head - - msg = 'Revision: {}\n\n'.format(sha) - msg += 'Submitted crossbow builds: [{repo} @ {branch}]' - msg += '({})\n'.format(url) - msg += '\n|Task|Status|\n|----|------|' - - tasks = sorted(self.job.tasks.items(), key=operator.itemgetter(0)) - for key, task in tasks: - branch = task.branch - - try: - template = self.badges[task.ci] - badge = template.format( - repo=self.crossbow_repo, - 
repo_dotted=self.crossbow_repo.replace('/', '.'), - branch=branch - ) - except KeyError: - badge = 'unsupported CI service `{}`'.format(task.ci) - - msg += '\n|{}|{}|'.format(key, badge) - - return msg.format(repo=self.crossbow_repo, branch=self.job.branch) diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml b/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml deleted file mode 100644 index c37c7b553a4e7..0000000000000 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-job.yaml +++ /dev/null @@ -1,51 +0,0 @@ -!Job -target: !Target - head: f766a1d615dd1b7ee706d05102e579195951a61c - email: unkown - branch: refs/pull/4435/merge - remote: https://github.com/apache/arrow - version: 0.13.0.dev306 - no_rc_version: 0.13.0.dev306 -tasks: - docker-cpp-cmake32: !Task - ci: circle - platform: linux - template: docker-tests/circle.linux.yml - artifacts: [] - params: - commands: - - docker-compose build cpp-cmake32 - - docker-compose run cpp-cmake32 - branch: ursabot-1-circle-docker-cpp-cmake32 - commit: a56b077c8d1b891a7935048e5672bf6fc07599ec - wheel-osx-cp37m: !Task - ci: travis - platform: osx - template: python-wheels/travis.osx.yml - artifacts: - - pyarrow-0.13.0.dev306-cp37-cp37m-macosx_10_6_intel.whl - params: - python_version: 3.7 - branch: ursabot-1-travis-wheel-osx-cp37m - commit: a56b077c8d1b891a7935048e5672bf6fc07599ec - wheel-osx-cp36m: !Task - ci: travis - platform: osx - template: python-wheels/travis.osx.yml - artifacts: - - pyarrow-0.13.0.dev306-cp36-cp36m-macosx_10_6_intel.whl - params: - python_version: 3.6 - branch: ursabot-1-travis-wheel-osx-cp36m - commit: a56b077c8d1b891a7935048e5672bf6fc07599ec - wheel-win-cp36m: !Task - ci: appveyor - platform: win - template: python-wheels/appveyor.yml - artifacts: - - pyarrow-0.13.0.dev306-cp36-cp36m-win_amd64.whl - params: - python_version: 3.6 - branch: ursabot-1-appveyor-wheel-win-cp36m - commit: a56b077c8d1b891a7935048e5672bf6fc07599ec -branch: ursabot-1 diff --git a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md b/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md deleted file mode 100644 index f914287dcc092..0000000000000 --- a/dev/archery/archery/crossbow/tests/fixtures/crossbow-success-message.md +++ /dev/null @@ -1,10 +0,0 @@ -Revision: {revision} - -Submitted crossbow builds: [{repo} @ {branch}](https://github.com/{repo}/branches/all?query={branch}) - -| Task | Status | -| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| docker-cpp-cmake32 | [![CircleCI](https://img.shields.io/circleci/build/github/{repo}/{branch}-circle-docker-cpp-cmake32.svg)](https://circleci.com/gh/{repo}/tree/{branch}-circle-docker-cpp-cmake32) | -| wheel-osx-cp36m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp36m.svg)](https://travis-ci.com/{repo}/branches) | -| wheel-osx-cp37m | [![TravisCI](https://img.shields.io/travis/{repo}/{branch}-travis-wheel-osx-cp37m.svg)](https://travis-ci.com/{repo}/branches) | -| wheel-win-cp36m | [![Appveyor](https://img.shields.io/appveyor/ci/{repo}/{branch}-appveyor-wheel-win-cp36m.svg)](https://ci.appveyor.com/project/{repo}/history) | diff --git a/dev/archery/archery/crossbow/tests/test_core.py b/dev/archery/archery/crossbow/tests/test_core.py deleted file mode 100644 index 518474236aca1..0000000000000 --- 
a/dev/archery/archery/crossbow/tests/test_core.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from archery.utils.source import ArrowSources -from archery.crossbow import Config - - -def test_config(): - src = ArrowSources.find() - conf = Config.load_yaml(src.dev / "tasks" / "tasks.yml") - conf.validate() diff --git a/dev/archery/archery/crossbow/tests/test_crossbow_cli.py b/dev/archery/archery/crossbow/tests/test_crossbow_cli.py deleted file mode 100644 index ee9ba1ee2fc83..0000000000000 --- a/dev/archery/archery/crossbow/tests/test_crossbow_cli.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from click.testing import CliRunner -import pytest - -from archery.crossbow.cli import crossbow -from archery.utils.git import git - - -@pytest.mark.integration -def test_crossbow_submit(tmp_path): - runner = CliRunner() - - def invoke(*args): - return runner.invoke(crossbow, ['--queue-path', str(tmp_path), *args]) - - # initialize an empty crossbow repository - git.run_cmd("init", str(tmp_path)) - git.run_cmd("-C", str(tmp_path), "remote", "add", "origin", - "https://github.com/dummy/repo") - git.run_cmd("-C", str(tmp_path), "commit", "-m", "initial", - "--allow-empty") - - result = invoke('check-config') - assert result.exit_code == 0 - - result = invoke('submit', '--no-fetch', '--no-push', '-g', 'wheel') - assert result.exit_code == 0 diff --git a/dev/archery/archery/crossbow/tests/test_reports.py b/dev/archery/archery/crossbow/tests/test_reports.py deleted file mode 100644 index 0df292bb557aa..0000000000000 --- a/dev/archery/archery/crossbow/tests/test_reports.py +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import textwrap - -from archery.crossbow.core import yaml -from archery.crossbow.reports import CommentReport - - -def test_crossbow_comment_formatter(load_fixture): - msg = load_fixture('crossbow-success-message.md') - job = load_fixture('crossbow-job.yaml', decoder=yaml.load) - - report = CommentReport(job, crossbow_repo='ursa-labs/crossbow') - expected = msg.format( - repo='ursa-labs/crossbow', - branch='ursabot-1', - revision='f766a1d615dd1b7ee706d05102e579195951a61c', - status='has been succeeded.' - ) - assert report.show() == textwrap.dedent(expected).strip() diff --git a/dev/archery/archery/docker.py b/dev/archery/archery/docker.py deleted file mode 100644 index 17d4c713afc91..0000000000000 --- a/dev/archery/archery/docker.py +++ /dev/null @@ -1,402 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import re -import subprocess -from io import StringIO - -from dotenv import dotenv_values -from ruamel.yaml import YAML - -from .utils.command import Command, default_bin -from .compat import _ensure_path - - -def flatten(node, parents=None): - parents = list(parents or []) - if isinstance(node, str): - yield (node, parents) - elif isinstance(node, list): - for value in node: - yield from flatten(value, parents=parents) - elif isinstance(node, dict): - for key, value in node.items(): - yield (key, parents) - yield from flatten(value, parents=parents + [key]) - else: - raise TypeError(node) - - -def _sanitize_command(cmd): - if isinstance(cmd, list): - cmd = " ".join(cmd) - return re.sub(r"\s+", " ", cmd) - - -class UndefinedImage(Exception): - pass - - -class ComposeConfig: - - def __init__(self, config_path, dotenv_path, compose_bin, params=None): - config_path = _ensure_path(config_path) - if dotenv_path: - dotenv_path = _ensure_path(dotenv_path) - else: - dotenv_path = config_path.parent / '.env' - self._read_env(dotenv_path, params) - self._read_config(config_path, compose_bin) - - def _read_env(self, dotenv_path, params): - """ - Read .env and merge it with explicitly passed parameters. 
- """ - self.dotenv = dotenv_values(str(dotenv_path)) - if params is None: - self.params = {} - else: - self.params = {k: v for k, v in params.items() if k in self.dotenv} - - # forward the process' environment variables - self.env = os.environ.copy() - # set the defaults from the dotenv files - self.env.update(self.dotenv) - # override the defaults passed as parameters - self.env.update(self.params) - - # translate docker's architecture notation to a more widely used one - arch = self.env.get('ARCH', 'amd64') - arch_aliases = { - 'amd64': 'x86_64', - 'arm64v8': 'aarch64', - 's390x': 's390x' - } - arch_short_aliases = { - 'amd64': 'x64', - 'arm64v8': 'arm64', - 's390x': 's390x' - } - self.env['ARCH_ALIAS'] = arch_aliases.get(arch, arch) - self.env['ARCH_SHORT_ALIAS'] = arch_short_aliases.get(arch, arch) - - def _read_config(self, config_path, compose_bin): - """ - Validate and read the docker-compose.yml - """ - yaml = YAML() - with config_path.open() as fp: - config = yaml.load(fp) - - services = config['services'].keys() - self.hierarchy = dict(flatten(config.get('x-hierarchy', {}))) - self.with_gpus = config.get('x-with-gpus', []) - nodes = self.hierarchy.keys() - errors = [] - - for name in self.with_gpus: - if name not in services: - errors.append( - 'Service `{}` defined in `x-with-gpus` bot not in ' - '`services`'.format(name) - ) - for name in nodes - services: - errors.append( - 'Service `{}` is defined in `x-hierarchy` bot not in ' - '`services`'.format(name) - ) - for name in services - nodes: - errors.append( - 'Service `{}` is defined in `services` but not in ' - '`x-hierarchy`'.format(name) - ) - - # trigger docker-compose's own validation - compose = Command('docker-compose') - args = ['--file', str(config_path), 'config'] - result = compose.run(*args, env=self.env, check=False, - stderr=subprocess.PIPE, stdout=subprocess.PIPE) - - if result.returncode != 0: - # strip the intro line of docker-compose errors - errors += result.stderr.decode().splitlines() - - if errors: - msg = '\n'.join([' - {}'.format(msg) for msg in errors]) - raise ValueError( - 'Found errors with docker-compose:\n{}'.format(msg) - ) - - rendered_config = StringIO(result.stdout.decode()) - self.path = config_path - self.config = yaml.load(rendered_config) - - def get(self, service_name): - try: - service = self.config['services'][service_name] - except KeyError: - raise UndefinedImage(service_name) - service['name'] = service_name - service['need_gpu'] = service_name in self.with_gpus - service['ancestors'] = self.hierarchy[service_name] - return service - - def __getitem__(self, service_name): - return self.get(service_name) - - -class Docker(Command): - - def __init__(self, docker_bin=None): - self.bin = default_bin(docker_bin, "docker") - - -class DockerCompose(Command): - - def __init__(self, config_path, dotenv_path=None, compose_bin=None, - params=None): - compose_bin = default_bin(compose_bin, 'docker-compose') - self.config = ComposeConfig(config_path, dotenv_path, compose_bin, - params) - self.bin = compose_bin - self.pull_memory = set() - - def clear_pull_memory(self): - self.pull_memory = set() - - def _execute_compose(self, *args, **kwargs): - # execute as a docker compose command - try: - result = super().run('--file', str(self.config.path), *args, - env=self.config.env, **kwargs) - result.check_returncode() - except subprocess.CalledProcessError as e: - def formatdict(d, template): - return '\n'.join( - template.format(k, v) for k, v in sorted(d.items()) - ) - msg = ( - "`{cmd}` exited with 
a non-zero exit code {code}, see the " - "process log above.\n\nThe docker-compose command was " - "invoked with the following parameters:\n\nDefaults defined " - "in .env:\n{dotenv}\n\nArchery was called with:\n{params}" - ) - raise RuntimeError( - msg.format( - cmd=' '.join(e.cmd), - code=e.returncode, - dotenv=formatdict(self.config.dotenv, template=' {}: {}'), - params=formatdict( - self.config.params, template=' export {}={}' - ) - ) - ) - - def _execute_docker(self, *args, **kwargs): - # execute as a plain docker cli command - try: - result = Docker().run(*args, **kwargs) - result.check_returncode() - except subprocess.CalledProcessError as e: - raise RuntimeError( - "{} exited with non-zero exit code {}".format( - ' '.join(e.cmd), e.returncode - ) - ) - - def pull(self, service_name, pull_leaf=True, using_docker=False): - def _pull(service): - args = ['pull'] - if service['image'] in self.pull_memory: - return - - if using_docker: - try: - self._execute_docker(*args, service['image']) - except Exception as e: - # better --ignore-pull-failures handling - print(e) - else: - args.append('--ignore-pull-failures') - self._execute_compose(*args, service['name']) - - self.pull_memory.add(service['image']) - - service = self.config.get(service_name) - for ancestor in service['ancestors']: - _pull(self.config.get(ancestor)) - if pull_leaf: - _pull(service) - - def build(self, service_name, use_cache=True, use_leaf_cache=True, - using_docker=False, using_buildx=False): - def _build(service, use_cache): - if 'build' not in service: - # nothing to do - return - - args = [] - cache_from = list(service.get('build', {}).get('cache_from', [])) - if use_cache: - for image in cache_from: - if image not in self.pull_memory: - try: - self._execute_docker('pull', image) - except Exception as e: - print(e) - finally: - self.pull_memory.add(image) - else: - args.append('--no-cache') - - # turn on inline build cache, this is a docker buildx feature - # used to bundle the image build cache to the pushed image manifest - # so the build cache can be reused across hosts, documented at - # https://github.com/docker/buildx#--cache-tonametypetypekeyvalue - if self.config.env.get('BUILDKIT_INLINE_CACHE') == '1': - args.extend(['--build-arg', 'BUILDKIT_INLINE_CACHE=1']) - - if using_buildx: - for k, v in service['build'].get('args', {}).items(): - args.extend(['--build-arg', '{}={}'.format(k, v)]) - - if use_cache: - cache_ref = '{}-cache'.format(service['image']) - cache_from = 'type=registry,ref={}'.format(cache_ref) - cache_to = ( - 'type=registry,ref={},mode=max'.format(cache_ref) - ) - args.extend([ - '--cache-from', cache_from, - '--cache-to', cache_to, - ]) - - args.extend([ - '--output', 'type=docker', - '-f', service['build']['dockerfile'], - '-t', service['image'], - service['build'].get('context', '.') - ]) - self._execute_docker("buildx", "build", *args) - elif using_docker: - # better for caching - for k, v in service['build'].get('args', {}).items(): - args.extend(['--build-arg', '{}={}'.format(k, v)]) - for img in cache_from: - args.append('--cache-from="{}"'.format(img)) - args.extend([ - '-f', service['build']['dockerfile'], - '-t', service['image'], - service['build'].get('context', '.') - ]) - self._execute_docker("build", *args) - else: - self._execute_compose("build", *args, service['name']) - - service = self.config.get(service_name) - # build ancestor services - for ancestor in service['ancestors']: - _build(self.config.get(ancestor), use_cache=use_cache) - # build the leaf/target service - 
_build(service, use_cache=use_cache and use_leaf_cache) - - def run(self, service_name, command=None, *, env=None, volumes=None, - user=None, using_docker=False): - service = self.config.get(service_name) - - args = [] - if user is not None: - args.extend(['-u', user]) - - if env is not None: - for k, v in env.items(): - args.extend(['-e', '{}={}'.format(k, v)]) - - if volumes is not None: - for volume in volumes: - args.extend(['--volume', volume]) - - if using_docker or service['need_gpu']: - # use gpus, requires docker>=19.03 - if service['need_gpu']: - args.extend(['--gpus', 'all']) - - if service.get('shm_size'): - args.extend(['--shm-size', service['shm_size']]) - - # append env variables from the compose conf - for k, v in service.get('environment', {}).items(): - args.extend(['-e', '{}={}'.format(k, v)]) - - # append volumes from the compose conf - for v in service.get('volumes', []): - if not isinstance(v, str): - # if not the compact string volume definition - v = "{}:{}".format(v['source'], v['target']) - args.extend(['-v', v]) - - # infer whether an interactive shell is desired or not - if command in ['cmd.exe', 'bash', 'sh', 'powershell']: - args.append('-it') - - # get the actual docker image name instead of the compose service - # name which we refer as image in general - args.append(service['image']) - - # add command from compose if it wasn't overridden - if command is not None: - args.append(command) - else: - # replace whitespaces from the preformatted compose command - cmd = _sanitize_command(service.get('command', '')) - if cmd: - args.append(cmd) - - # execute as a plain docker cli command - self._execute_docker('run', '--rm', *args) - else: - # execute as a docker-compose command - args.append(service_name) - if command is not None: - args.append(command) - self._execute_compose('run', '--rm', *args) - - def push(self, service_name, user=None, password=None, using_docker=False): - def _push(service): - if using_docker: - return self._execute_docker('push', service['image']) - else: - return self._execute_compose('push', service['name']) - - if user is not None: - try: - # TODO(kszucs): have an option for a prompt - self._execute_docker('login', '-u', user, '-p', password) - except subprocess.CalledProcessError: - # hide credentials - msg = ('Failed to push `{}`, check the passed credentials' - .format(service_name)) - raise RuntimeError(msg) from None - - service = self.config.get(service_name) - for ancestor in service['ancestors']: - _push(self.config.get(ancestor)) - _push(service) - - def images(self): - return sorted(self.config.hierarchy.keys()) diff --git a/dev/archery/archery/integration/__init__.py b/dev/archery/archery/integration/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/dev/archery/archery/integration/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py deleted file mode 100644 index 35ab289cc33db..0000000000000 --- a/dev/archery/archery/integration/datagen.py +++ /dev/null @@ -1,1604 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from collections import namedtuple, OrderedDict -import binascii -import json -import os -import random -import tempfile - -import numpy as np - -from .util import frombytes, tobytes, random_bytes, random_utf8 - - -def metadata_key_values(pairs): - return [{'key': k, 'value': v} for k, v in pairs] - - -class Field(object): - - def __init__(self, name, *, nullable=True, metadata=None): - self.name = name - self.nullable = nullable - self.metadata = metadata or [] - - def get_json(self): - entries = [ - ('name', self.name), - ('type', self._get_type()), - ('nullable', self.nullable), - ('children', self._get_children()), - ] - - dct = self._get_dictionary() - if dct: - entries.append(('dictionary', dct)) - - if self.metadata is not None and len(self.metadata) > 0: - entries.append(('metadata', metadata_key_values(self.metadata))) - - return OrderedDict(entries) - - def _get_dictionary(self): - return None - - def _make_is_valid(self, size, null_probability=0.4): - if self.nullable: - return (np.random.random_sample(size) > null_probability - ).astype(np.int8) - else: - return np.ones(size, dtype=np.int8) - - -class Column(object): - - def __init__(self, name, count): - self.name = name - self.count = count - - def __len__(self): - return self.count - - def _get_children(self): - return [] - - def _get_buffers(self): - return [] - - def get_json(self): - entries = [ - ('name', self.name), - ('count', self.count) - ] - - buffers = self._get_buffers() - entries.extend(buffers) - - children = self._get_children() - if len(children) > 0: - entries.append(('children', children)) - - return OrderedDict(entries) - - -class PrimitiveField(Field): - - def _get_children(self): - return [] - - -class PrimitiveColumn(Column): - - def __init__(self, name, count, is_valid, values): - super().__init__(name, count) - self.is_valid = is_valid - self.values = values - - def _encode_value(self, x): - return x - - def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('DATA', list([self._encode_value(x) for x in self.values])) - ] - - -class NullColumn(Column): - # This subclass is for readability only - pass - - -class NullField(PrimitiveField): - - def __init__(self, name, metadata=None): - super().__init__(name, nullable=True, - metadata=metadata) - - def _get_type(self): - return OrderedDict([('name', 'null')]) - - def generate_column(self, size, 
name=None): - return NullColumn(name or self.name, size) - - -TEST_INT_MAX = 2 ** 31 - 1 -TEST_INT_MIN = ~TEST_INT_MAX - - -class IntegerField(PrimitiveField): - - def __init__(self, name, is_signed, bit_width, *, nullable=True, - metadata=None, - min_value=TEST_INT_MIN, - max_value=TEST_INT_MAX): - super().__init__(name, nullable=nullable, - metadata=metadata) - self.is_signed = is_signed - self.bit_width = bit_width - self.min_value = min_value - self.max_value = max_value - - def _get_generated_data_bounds(self): - if self.is_signed: - signed_iinfo = np.iinfo('int' + str(self.bit_width)) - min_value, max_value = signed_iinfo.min, signed_iinfo.max - else: - unsigned_iinfo = np.iinfo('uint' + str(self.bit_width)) - min_value, max_value = 0, unsigned_iinfo.max - - lower_bound = max(min_value, self.min_value) - upper_bound = min(max_value, self.max_value) - return lower_bound, upper_bound - - def _get_type(self): - return OrderedDict([ - ('name', 'int'), - ('isSigned', self.is_signed), - ('bitWidth', self.bit_width) - ]) - - def generate_column(self, size, name=None): - lower_bound, upper_bound = self._get_generated_data_bounds() - return self.generate_range(size, lower_bound, upper_bound, - name=name, include_extremes=True) - - def generate_range(self, size, lower, upper, name=None, - include_extremes=False): - values = np.random.randint(lower, upper, size=size, dtype=np.int64) - if include_extremes and size >= 2: - values[:2] = [lower, upper] - values = list(map(int if self.bit_width < 64 else str, values)) - - is_valid = self._make_is_valid(size) - - if name is None: - name = self.name - return PrimitiveColumn(name, size, is_valid, values) - - -class DateField(IntegerField): - - DAY = 0 - MILLISECOND = 1 - - # 1/1/1 to 12/31/9999 - _ranges = { - DAY: [-719162, 2932896], - MILLISECOND: [-62135596800000, 253402214400000] - } - - def __init__(self, name, unit, *, nullable=True, metadata=None): - bit_width = 32 if unit == self.DAY else 64 - - min_value, max_value = self._ranges[unit] - super().__init__( - name, True, bit_width, - nullable=nullable, metadata=metadata, - min_value=min_value, max_value=max_value - ) - self.unit = unit - - def _get_type(self): - return OrderedDict([ - ('name', 'date'), - ('unit', 'DAY' if self.unit == self.DAY else 'MILLISECOND') - ]) - - -TIMEUNIT_NAMES = { - 's': 'SECOND', - 'ms': 'MILLISECOND', - 'us': 'MICROSECOND', - 'ns': 'NANOSECOND' -} - - -class TimeField(IntegerField): - - BIT_WIDTHS = { - 's': 32, - 'ms': 32, - 'us': 64, - 'ns': 64 - } - - _ranges = { - 's': [0, 86400], - 'ms': [0, 86400000], - 'us': [0, 86400000000], - 'ns': [0, 86400000000000] - } - - def __init__(self, name, unit='s', *, nullable=True, - metadata=None): - min_val, max_val = self._ranges[unit] - super().__init__(name, True, self.BIT_WIDTHS[unit], - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) - self.unit = unit - - def _get_type(self): - return OrderedDict([ - ('name', 'time'), - ('unit', TIMEUNIT_NAMES[self.unit]), - ('bitWidth', self.bit_width) - ]) - - -class TimestampField(IntegerField): - - # 1/1/1 to 12/31/9999 - _ranges = { - 's': [-62135596800, 253402214400], - 'ms': [-62135596800000, 253402214400000], - 'us': [-62135596800000000, 253402214400000000], - - # Physical range for int64, ~584 years and change - 'ns': [np.iinfo('int64').min, np.iinfo('int64').max] - } - - def __init__(self, name, unit='s', tz=None, *, nullable=True, - metadata=None): - min_val, max_val = self._ranges[unit] - super().__init__(name, True, 64, - nullable=nullable, - 
metadata=metadata, - min_value=min_val, - max_value=max_val) - self.unit = unit - self.tz = tz - - def _get_type(self): - fields = [ - ('name', 'timestamp'), - ('unit', TIMEUNIT_NAMES[self.unit]) - ] - - if self.tz is not None: - fields.append(('timezone', self.tz)) - - return OrderedDict(fields) - - -class DurationIntervalField(IntegerField): - - def __init__(self, name, unit='s', *, nullable=True, - metadata=None): - min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max, - super().__init__( - name, True, 64, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) - self.unit = unit - - def _get_type(self): - fields = [ - ('name', 'duration'), - ('unit', TIMEUNIT_NAMES[self.unit]) - ] - - return OrderedDict(fields) - - -class YearMonthIntervalField(IntegerField): - def __init__(self, name, *, nullable=True, metadata=None): - min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years. - super().__init__( - name, True, 32, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) - - def _get_type(self): - fields = [ - ('name', 'interval'), - ('unit', 'YEAR_MONTH'), - ] - - return OrderedDict(fields) - - -class DayTimeIntervalField(PrimitiveField): - def __init__(self, name, *, nullable=True, metadata=None): - super().__init__(name, - nullable=True, - metadata=metadata) - - @property - def numpy_type(self): - return object - - def _get_type(self): - - return OrderedDict([ - ('name', 'interval'), - ('unit', 'DAY_TIME'), - ]) - - def generate_column(self, size, name=None): - min_day_value, max_day_value = -10000*366, 10000*366 - values = [{'days': random.randint(min_day_value, max_day_value), - 'milliseconds': random.randint(-86400000, +86400000)} - for _ in range(size)] - - is_valid = self._make_is_valid(size) - if name is None: - name = self.name - return PrimitiveColumn(name, size, is_valid, values) - - -class FloatingPointField(PrimitiveField): - - def __init__(self, name, bit_width, *, nullable=True, - metadata=None): - super().__init__(name, - nullable=nullable, - metadata=metadata) - - self.bit_width = bit_width - self.precision = { - 16: 'HALF', - 32: 'SINGLE', - 64: 'DOUBLE' - }[self.bit_width] - - @property - def numpy_type(self): - return 'float' + str(self.bit_width) - - def _get_type(self): - return OrderedDict([ - ('name', 'floatingpoint'), - ('precision', self.precision) - ]) - - def generate_column(self, size, name=None): - values = np.random.randn(size) * 1000 - values = np.round(values, 3) - - is_valid = self._make_is_valid(size) - if name is None: - name = self.name - return PrimitiveColumn(name, size, is_valid, values) - - -DECIMAL_PRECISION_TO_VALUE = { - key: (1 << (8 * i - 1)) - 1 for i, key in enumerate( - [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36, - 40, 42, 44, 50, 60, 70], - start=1, - ) -} - - -def decimal_range_from_precision(precision): - assert 1 <= precision <= 76 - try: - max_value = DECIMAL_PRECISION_TO_VALUE[precision] - except KeyError: - return decimal_range_from_precision(precision - 1) - else: - return ~max_value, max_value - - -class DecimalField(PrimitiveField): - def __init__(self, name, precision, scale, bit_width, *, - nullable=True, metadata=None): - super().__init__(name, nullable=True, - metadata=metadata) - self.precision = precision - self.scale = scale - self.bit_width = bit_width - - @property - def numpy_type(self): - return object - - def _get_type(self): - return OrderedDict([ - ('name', 'decimal'), - ('precision', self.precision), - ('scale', self.scale), - 
('bitWidth', self.bit_width), - ]) - - def generate_column(self, size, name=None): - min_value, max_value = decimal_range_from_precision(self.precision) - values = [random.randint(min_value, max_value) for _ in range(size)] - - is_valid = self._make_is_valid(size) - if name is None: - name = self.name - return DecimalColumn(name, size, is_valid, values, self.bit_width) - - -class DecimalColumn(PrimitiveColumn): - - def __init__(self, name, count, is_valid, values, bit_width): - super().__init__(name, count, is_valid, values) - self.bit_width = bit_width - - def _encode_value(self, x): - return str(x) - - -class BooleanField(PrimitiveField): - bit_width = 1 - - def _get_type(self): - return OrderedDict([('name', 'bool')]) - - @property - def numpy_type(self): - return 'bool' - - def generate_column(self, size, name=None): - values = list(map(bool, np.random.randint(0, 2, size=size))) - is_valid = self._make_is_valid(size) - if name is None: - name = self.name - return PrimitiveColumn(name, size, is_valid, values) - - -class FixedSizeBinaryField(PrimitiveField): - - def __init__(self, name, byte_width, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - self.byte_width = byte_width - - @property - def numpy_type(self): - return object - - @property - def column_class(self): - return FixedSizeBinaryColumn - - def _get_type(self): - return OrderedDict([('name', 'fixedsizebinary'), - ('byteWidth', self.byte_width)]) - - def generate_column(self, size, name=None): - is_valid = self._make_is_valid(size) - values = [] - - for i in range(size): - values.append(random_bytes(self.byte_width)) - - if name is None: - name = self.name - return self.column_class(name, size, is_valid, values) - - -class BinaryField(PrimitiveField): - - @property - def numpy_type(self): - return object - - @property - def column_class(self): - return BinaryColumn - - def _get_type(self): - return OrderedDict([('name', 'binary')]) - - def _random_sizes(self, size): - return np.random.exponential(scale=4, size=size).astype(np.int32) - - def generate_column(self, size, name=None): - is_valid = self._make_is_valid(size) - values = [] - - sizes = self._random_sizes(size) - - for i, nbytes in enumerate(sizes): - if is_valid[i]: - values.append(random_bytes(nbytes)) - else: - values.append(b"") - - if name is None: - name = self.name - return self.column_class(name, size, is_valid, values) - - -class StringField(BinaryField): - - @property - def column_class(self): - return StringColumn - - def _get_type(self): - return OrderedDict([('name', 'utf8')]) - - def generate_column(self, size, name=None): - K = 7 - is_valid = self._make_is_valid(size) - values = [] - - for i in range(size): - if is_valid[i]: - values.append(tobytes(random_utf8(K))) - else: - values.append(b"") - - if name is None: - name = self.name - return self.column_class(name, size, is_valid, values) - - -class LargeBinaryField(BinaryField): - - @property - def column_class(self): - return LargeBinaryColumn - - def _get_type(self): - return OrderedDict([('name', 'largebinary')]) - - -class LargeStringField(StringField): - - @property - def column_class(self): - return LargeStringColumn - - def _get_type(self): - return OrderedDict([('name', 'largeutf8')]) - - -class Schema(object): - - def __init__(self, fields, metadata=None): - self.fields = fields - self.metadata = metadata - - def get_json(self): - entries = [ - ('fields', [field.get_json() for field in self.fields]) - ] - - if self.metadata is not None and 
len(self.metadata) > 0: - entries.append(('metadata', metadata_key_values(self.metadata))) - - return OrderedDict(entries) - - -class _NarrowOffsetsMixin: - - def _encode_offsets(self, offsets): - return list(map(int, offsets)) - - -class _LargeOffsetsMixin: - - def _encode_offsets(self, offsets): - # 64-bit offsets have to be represented as strings to roundtrip - # through JSON. - return list(map(str, offsets)) - - -class _BaseBinaryColumn(PrimitiveColumn): - - def _encode_value(self, x): - return frombytes(binascii.hexlify(x).upper()) - - def _get_buffers(self): - offset = 0 - offsets = [0] - - data = [] - for i, v in enumerate(self.values): - if self.is_valid[i]: - offset += len(v) - else: - v = b"" - - offsets.append(offset) - data.append(self._encode_value(v)) - - return [ - ('VALIDITY', [int(x) for x in self.is_valid]), - ('OFFSET', self._encode_offsets(offsets)), - ('DATA', data) - ] - - -class _BaseStringColumn(_BaseBinaryColumn): - - def _encode_value(self, x): - return frombytes(x) - - -class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin): - pass - - -class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin): - pass - - -class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin): - pass - - -class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): - pass - - -class FixedSizeBinaryColumn(PrimitiveColumn): - - def _encode_value(self, x): - return frombytes(binascii.hexlify(x).upper()) - - def _get_buffers(self): - data = [] - for i, v in enumerate(self.values): - data.append(self._encode_value(v)) - - return [ - ('VALIDITY', [int(x) for x in self.is_valid]), - ('DATA', data) - ] - - -class ListField(Field): - - def __init__(self, name, value_field, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - self.value_field = value_field - - @property - def column_class(self): - return ListColumn - - def _get_type(self): - return OrderedDict([ - ('name', 'list') - ]) - - def _get_children(self): - return [self.value_field.get_json()] - - def generate_column(self, size, name=None): - MAX_LIST_SIZE = 4 - - is_valid = self._make_is_valid(size) - list_sizes = np.random.randint(0, MAX_LIST_SIZE + 1, size=size) - offsets = [0] - - offset = 0 - for i in range(size): - if is_valid[i]: - offset += int(list_sizes[i]) - offsets.append(offset) - - # The offset now is the total number of elements in the child array - values = self.value_field.generate_column(offset) - - if name is None: - name = self.name - return self.column_class(name, size, is_valid, offsets, values) - - -class LargeListField(ListField): - - @property - def column_class(self): - return LargeListColumn - - def _get_type(self): - return OrderedDict([ - ('name', 'largelist') - ]) - - -class _BaseListColumn(Column): - - def __init__(self, name, count, is_valid, offsets, values): - super().__init__(name, count) - self.is_valid = is_valid - self.offsets = offsets - self.values = values - - def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', self._encode_offsets(self.offsets)) - ] - - def _get_children(self): - return [self.values.get_json()] - - -class ListColumn(_BaseListColumn, _NarrowOffsetsMixin): - pass - - -class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin): - pass - - -class MapField(Field): - - def __init__(self, name, key_field, item_field, *, nullable=True, - metadata=None, keys_sorted=False, entries_name='entries'): - super().__init__(name, nullable=nullable, - metadata=metadata) - - assert not 
key_field.nullable - self.key_field = key_field - self.item_field = item_field - self.pair_field = StructField(entries_name, [key_field, item_field], - nullable=False) - self.keys_sorted = keys_sorted - - def _get_type(self): - return OrderedDict([ - ('name', 'map'), - ('keysSorted', self.keys_sorted) - ]) - - def _get_children(self): - return [self.pair_field.get_json()] - - def generate_column(self, size, name=None): - MAX_MAP_SIZE = 4 - - is_valid = self._make_is_valid(size) - map_sizes = np.random.randint(0, MAX_MAP_SIZE + 1, size=size) - offsets = [0] - - offset = 0 - for i in range(size): - if is_valid[i]: - offset += int(map_sizes[i]) - offsets.append(offset) - - # The offset now is the total number of elements in the child array - pairs = self.pair_field.generate_column(offset) - if name is None: - name = self.name - - return MapColumn(name, size, is_valid, offsets, pairs) - - -class MapColumn(Column): - - def __init__(self, name, count, is_valid, offsets, pairs): - super().__init__(name, count) - self.is_valid = is_valid - self.offsets = offsets - self.pairs = pairs - - def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', list(self.offsets)) - ] - - def _get_children(self): - return [self.pairs.get_json()] - - -class FixedSizeListField(Field): - - def __init__(self, name, value_field, list_size, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - self.value_field = value_field - self.list_size = list_size - - def _get_type(self): - return OrderedDict([ - ('name', 'fixedsizelist'), - ('listSize', self.list_size) - ]) - - def _get_children(self): - return [self.value_field.get_json()] - - def generate_column(self, size, name=None): - is_valid = self._make_is_valid(size) - values = self.value_field.generate_column(size * self.list_size) - - if name is None: - name = self.name - return FixedSizeListColumn(name, size, is_valid, values) - - -class FixedSizeListColumn(Column): - - def __init__(self, name, count, is_valid, values): - super().__init__(name, count) - self.is_valid = is_valid - self.values = values - - def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]) - ] - - def _get_children(self): - return [self.values.get_json()] - - -class StructField(Field): - - def __init__(self, name, fields, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - self.fields = fields - - def _get_type(self): - return OrderedDict([ - ('name', 'struct') - ]) - - def _get_children(self): - return [field.get_json() for field in self.fields] - - def generate_column(self, size, name=None): - is_valid = self._make_is_valid(size) - - field_values = [field.generate_column(size) for field in self.fields] - if name is None: - name = self.name - return StructColumn(name, size, is_valid, field_values) - - -class _BaseUnionField(Field): - - def __init__(self, name, fields, type_ids=None, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, metadata=metadata) - if type_ids is None: - type_ids = list(range(fields)) - else: - assert len(fields) == len(type_ids) - self.fields = fields - self.type_ids = type_ids - assert all(x >= 0 for x in self.type_ids) - - def _get_type(self): - return OrderedDict([ - ('name', 'union'), - ('mode', self.mode), - ('typeIds', self.type_ids), - ]) - - def _get_children(self): - return [field.get_json() for field in self.fields] - - def _make_type_ids(self, size): - return 
np.random.choice(self.type_ids, size) - - -class SparseUnionField(_BaseUnionField): - mode = 'SPARSE' - - def generate_column(self, size, name=None): - array_type_ids = self._make_type_ids(size) - field_values = [field.generate_column(size) for field in self.fields] - - if name is None: - name = self.name - return SparseUnionColumn(name, size, array_type_ids, field_values) - - -class DenseUnionField(_BaseUnionField): - mode = 'DENSE' - - def generate_column(self, size, name=None): - # Reverse mapping {logical type id => physical child id} - child_ids = [None] * (max(self.type_ids) + 1) - for i, type_id in enumerate(self.type_ids): - child_ids[type_id] = i - - array_type_ids = self._make_type_ids(size) - offsets = [] - child_sizes = [0] * len(self.fields) - - for i in range(size): - child_id = child_ids[array_type_ids[i]] - offset = child_sizes[child_id] - offsets.append(offset) - child_sizes[child_id] = offset + 1 - - field_values = [ - field.generate_column(child_size) - for field, child_size in zip(self.fields, child_sizes)] - - if name is None: - name = self.name - return DenseUnionColumn(name, size, array_type_ids, offsets, - field_values) - - -class Dictionary(object): - - def __init__(self, id_, field, size, name=None, ordered=False): - self.id_ = id_ - self.field = field - self.values = field.generate_column(size=size, name=name) - self.ordered = ordered - - def __len__(self): - return len(self.values) - - def get_json(self): - dummy_batch = RecordBatch(len(self.values), [self.values]) - return OrderedDict([ - ('id', self.id_), - ('data', dummy_batch.get_json()) - ]) - - -class DictionaryField(Field): - - def __init__(self, name, index_field, dictionary, *, nullable=True, - metadata=None): - super().__init__(name, nullable=nullable, - metadata=metadata) - assert index_field.name == '' - assert isinstance(index_field, IntegerField) - assert isinstance(dictionary, Dictionary) - - self.index_field = index_field - self.dictionary = dictionary - - def _get_type(self): - return self.dictionary.field._get_type() - - def _get_children(self): - return self.dictionary.field._get_children() - - def _get_dictionary(self): - return OrderedDict([ - ('id', self.dictionary.id_), - ('indexType', self.index_field._get_type()), - ('isOrdered', self.dictionary.ordered) - ]) - - def generate_column(self, size, name=None): - if name is None: - name = self.name - return self.index_field.generate_range(size, 0, len(self.dictionary), - name=name) - - -ExtensionType = namedtuple( - 'ExtensionType', ['extension_name', 'serialized', 'storage_field']) - - -class ExtensionField(Field): - - def __init__(self, name, extension_type, *, nullable=True, metadata=None): - metadata = (metadata or []) + [ - ('ARROW:extension:name', extension_type.extension_name), - ('ARROW:extension:metadata', extension_type.serialized), - ] - super().__init__(name, nullable=nullable, metadata=metadata) - self.extension_type = extension_type - - def _get_type(self): - return self.extension_type.storage_field._get_type() - - def _get_children(self): - return self.extension_type.storage_field._get_children() - - def _get_dictionary(self): - return self.extension_type.storage_field._get_dictionary() - - def generate_column(self, size, name=None): - if name is None: - name = self.name - return self.extension_type.storage_field.generate_column(size, name) - - -class StructColumn(Column): - - def __init__(self, name, count, is_valid, field_values): - super().__init__(name, count) - self.is_valid = is_valid - self.field_values = field_values - 
- def _get_buffers(self): - return [ - ('VALIDITY', [int(v) for v in self.is_valid]) - ] - - def _get_children(self): - return [field.get_json() for field in self.field_values] - - -class SparseUnionColumn(Column): - - def __init__(self, name, count, type_ids, field_values): - super().__init__(name, count) - self.type_ids = type_ids - self.field_values = field_values - - def _get_buffers(self): - return [ - ('TYPE_ID', [int(v) for v in self.type_ids]) - ] - - def _get_children(self): - return [field.get_json() for field in self.field_values] - - -class DenseUnionColumn(Column): - - def __init__(self, name, count, type_ids, offsets, field_values): - super().__init__(name, count) - self.type_ids = type_ids - self.offsets = offsets - self.field_values = field_values - - def _get_buffers(self): - return [ - ('TYPE_ID', [int(v) for v in self.type_ids]), - ('OFFSET', [int(v) for v in self.offsets]), - ] - - def _get_children(self): - return [field.get_json() for field in self.field_values] - - -class RecordBatch(object): - - def __init__(self, count, columns): - self.count = count - self.columns = columns - - def get_json(self): - return OrderedDict([ - ('count', self.count), - ('columns', [col.get_json() for col in self.columns]) - ]) - - -class File(object): - - def __init__(self, name, schema, batches, dictionaries=None, - skip=None, path=None): - self.name = name - self.schema = schema - self.dictionaries = dictionaries or [] - self.batches = batches - self.skip = set() - self.path = path - if skip: - self.skip.update(skip) - - def get_json(self): - entries = [ - ('schema', self.schema.get_json()) - ] - - if len(self.dictionaries) > 0: - entries.append(('dictionaries', - [dictionary.get_json() - for dictionary in self.dictionaries])) - - entries.append(('batches', [batch.get_json() - for batch in self.batches])) - return OrderedDict(entries) - - def write(self, path): - with open(path, 'wb') as f: - f.write(json.dumps(self.get_json(), indent=2).encode('utf-8')) - self.path = path - - def skip_category(self, category): - """Skip this test for the given category. - - Category should be SKIP_ARROW or SKIP_FLIGHT. 
- """ - self.skip.add(category) - return self - - -def get_field(name, type_, **kwargs): - if type_ == 'binary': - return BinaryField(name, **kwargs) - elif type_ == 'utf8': - return StringField(name, **kwargs) - elif type_ == 'largebinary': - return LargeBinaryField(name, **kwargs) - elif type_ == 'largeutf8': - return LargeStringField(name, **kwargs) - elif type_.startswith('fixedsizebinary_'): - byte_width = int(type_.split('_')[1]) - return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs) - - dtype = np.dtype(type_) - - if dtype.kind in ('i', 'u'): - signed = dtype.kind == 'i' - bit_width = dtype.itemsize * 8 - return IntegerField(name, signed, bit_width, **kwargs) - elif dtype.kind == 'f': - bit_width = dtype.itemsize * 8 - return FloatingPointField(name, bit_width, **kwargs) - elif dtype.kind == 'b': - return BooleanField(name, **kwargs) - else: - raise TypeError(dtype) - - -def _generate_file(name, fields, batch_sizes, dictionaries=None, skip=None, - metadata=None): - schema = Schema(fields, metadata=metadata) - batches = [] - for size in batch_sizes: - columns = [] - for field in fields: - col = field.generate_column(size) - columns.append(col) - - batches.append(RecordBatch(size, columns)) - - return File(name, schema, batches, dictionaries, skip=skip) - - -def generate_custom_metadata_case(): - def meta(items): - # Generate a simple block of metadata where each value is '{}'. - # Keys are delimited by whitespace in `items`. - return [(k, '{}') for k in items.split()] - - fields = [ - get_field('sort_of_pandas', 'int8', metadata=meta('pandas')), - - get_field('lots_of_meta', 'int8', metadata=meta('a b c d .. w x y z')), - - get_field( - 'unregistered_extension', 'int8', - metadata=[ - ('ARROW:extension:name', '!nonexistent'), - ('ARROW:extension:metadata', ''), - ('ARROW:integration:allow_unregistered_extension', 'true'), - ]), - - ListField('list_with_odd_values', - get_field('item', 'int32', metadata=meta('odd_values'))), - ] - - batch_sizes = [1] - return _generate_file('custom_metadata', fields, batch_sizes, - metadata=meta('schema_custom_0 schema_custom_1')) - - -def generate_duplicate_fieldnames_case(): - fields = [ - get_field('ints', 'int8'), - get_field('ints', 'int32'), - - StructField('struct', [get_field('', 'int32'), get_field('', 'utf8')]), - ] - - batch_sizes = [1] - return _generate_file('duplicate_fieldnames', fields, batch_sizes) - - -def generate_primitive_case(batch_sizes, name='primitive'): - types = ['bool', 'int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'binary', 'utf8', - 'fixedsizebinary_19', 'fixedsizebinary_120'] - - fields = [] - - for type_ in types: - fields.append(get_field(type_ + "_nullable", type_, nullable=True)) - fields.append(get_field(type_ + "_nonnullable", type_, nullable=False)) - - return _generate_file(name, fields, batch_sizes) - - -def generate_primitive_large_offsets_case(batch_sizes): - types = ['largebinary', 'largeutf8'] - - fields = [] - - for type_ in types: - fields.append(get_field(type_ + "_nullable", type_, nullable=True)) - fields.append(get_field(type_ + "_nonnullable", type_, nullable=False)) - - return _generate_file('primitive_large_offsets', fields, batch_sizes) - - -def generate_null_case(batch_sizes): - # Interleave null with non-null types to ensure the appropriate number of - # buffers (0) is read and written - fields = [ - NullField(name='f0'), - get_field('f1', 'int32'), - NullField(name='f2'), - get_field('f3', 'float64'), - NullField(name='f4') - ] 
- return _generate_file('null', fields, batch_sizes) - - -def generate_null_trivial_case(batch_sizes): - # Generate a case with no buffers - fields = [ - NullField(name='f0'), - ] - return _generate_file('null_trivial', fields, batch_sizes) - - -def generate_decimal128_case(): - fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=2, - bit_width=128) - for i, precision in enumerate(range(3, 39)) - ] - - possible_batch_sizes = 7, 10 - batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))] - # 'decimal' is the original name for the test, and it must match - # provide "gold" files that test backwards compatibility, so they - # can be appropriately skipped. - return _generate_file('decimal', fields, batch_sizes) - - -def generate_decimal256_case(): - fields = [ - DecimalField(name='f{}'.format(i), precision=precision, scale=5, - bit_width=256) - for i, precision in enumerate(range(37, 70)) - ] - - possible_batch_sizes = 7, 10 - batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))] - return _generate_file('decimal256', fields, batch_sizes) - - -def generate_datetime_case(): - fields = [ - DateField('f0', DateField.DAY), - DateField('f1', DateField.MILLISECOND), - TimeField('f2', 's'), - TimeField('f3', 'ms'), - TimeField('f4', 'us'), - TimeField('f5', 'ns'), - TimestampField('f6', 's'), - TimestampField('f7', 'ms'), - TimestampField('f8', 'us'), - TimestampField('f9', 'ns'), - TimestampField('f10', 'ms', tz=None), - TimestampField('f11', 's', tz='UTC'), - TimestampField('f12', 'ms', tz='US/Eastern'), - TimestampField('f13', 'us', tz='Europe/Paris'), - TimestampField('f14', 'ns', tz='US/Pacific'), - ] - - batch_sizes = [7, 10] - return _generate_file("datetime", fields, batch_sizes) - - -def generate_interval_case(): - fields = [ - DurationIntervalField('f1', 's'), - DurationIntervalField('f2', 'ms'), - DurationIntervalField('f3', 'us'), - DurationIntervalField('f4', 'ns'), - YearMonthIntervalField('f5'), - DayTimeIntervalField('f6'), - ] - - batch_sizes = [7, 10] - return _generate_file("interval", fields, batch_sizes) - - -def generate_map_case(): - fields = [ - MapField('map_nullable', get_field('key', 'utf8', nullable=False), - get_field('value', 'int32')), - ] - - batch_sizes = [7, 10] - return _generate_file("map", fields, batch_sizes) - - -def generate_non_canonical_map_case(): - fields = [ - MapField('map_other_names', - get_field('some_key', 'utf8', nullable=False), - get_field('some_value', 'int32'), - entries_name='some_entries'), - ] - - batch_sizes = [7] - return _generate_file("map_non_canonical", fields, batch_sizes) - - -def generate_nested_case(): - fields = [ - ListField('list_nullable', get_field('item', 'int32')), - FixedSizeListField('fixedsizelist_nullable', - get_field('item', 'int32'), 4), - StructField('struct_nullable', [get_field('f1', 'int32'), - get_field('f2', 'utf8')]), - # Fails on Go (ARROW-8452) - # ListField('list_nonnullable', get_field('item', 'int32'), - # nullable=False), - ] - - batch_sizes = [7, 10] - return _generate_file("nested", fields, batch_sizes) - - -def generate_recursive_nested_case(): - fields = [ - ListField('lists_list', - ListField('inner_list', get_field('item', 'int16'))), - ListField('structs_list', - StructField('inner_struct', - [get_field('f1', 'int32'), - get_field('f2', 'utf8')])), - ] - - batch_sizes = [7, 10] - return _generate_file("recursive_nested", fields, batch_sizes) - - -def generate_nested_large_offsets_case(): - fields = [ - LargeListField('large_list_nullable', 
get_field('item', 'int32')), - LargeListField('large_list_nonnullable', - get_field('item', 'int32'), nullable=False), - LargeListField('large_list_nested', - ListField('inner_list', get_field('item', 'int16'))), - ] - - batch_sizes = [0, 13] - return _generate_file("nested_large_offsets", fields, batch_sizes) - - -def generate_unions_case(): - fields = [ - SparseUnionField('sparse', [get_field('f1', 'int32'), - get_field('f2', 'utf8')], - type_ids=[5, 7]), - DenseUnionField('dense', [get_field('f1', 'int16'), - get_field('f2', 'binary')], - type_ids=[10, 20]), - SparseUnionField('sparse', [get_field('f1', 'float32', nullable=False), - get_field('f2', 'bool')], - type_ids=[5, 7], nullable=False), - DenseUnionField('dense', [get_field('f1', 'uint8', nullable=False), - get_field('f2', 'uint16'), - NullField('f3')], - type_ids=[42, 43, 44], nullable=False), - ] - - batch_sizes = [0, 11] - return _generate_file("union", fields, batch_sizes) - - -def generate_dictionary_case(): - dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0') - dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') - dict2 = Dictionary(2, get_field('dictionary2', 'int64'), - size=50, name='DICT2') - - fields = [ - DictionaryField('dict0', get_field('', 'int8'), dict0), - DictionaryField('dict1', get_field('', 'int32'), dict1), - DictionaryField('dict2', get_field('', 'int16'), dict2) - ] - batch_sizes = [7, 10] - return _generate_file("dictionary", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) - - -def generate_dictionary_unsigned_case(): - dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') - dict2 = Dictionary(2, StringField('dictionary2'), size=5, name='DICT2') - - # TODO: JavaScript does not support uint64 dictionary indices, so disabled - # for now - - # dict3 = Dictionary(3, StringField('dictionary3'), size=5, name='DICT3') - fields = [ - DictionaryField('f0', get_field('', 'uint8'), dict0), - DictionaryField('f1', get_field('', 'uint16'), dict1), - DictionaryField('f2', get_field('', 'uint32'), dict2), - # DictionaryField('f3', get_field('', 'uint64'), dict3) - ] - batch_sizes = [7, 10] - return _generate_file("dictionary_unsigned", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) - - -def generate_nested_dictionary_case(): - dict0 = Dictionary(0, StringField('str'), size=10, name='DICT0') - - list_of_dict = ListField( - 'list', - DictionaryField('str_dict', get_field('', 'int8'), dict0)) - dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1') - - struct_of_dict = StructField('struct', [ - DictionaryField('str_dict_a', get_field('', 'int8'), dict0), - DictionaryField('str_dict_b', get_field('', 'int8'), dict0) - ]) - dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2') - - fields = [ - DictionaryField('list_dict', get_field('', 'int8'), dict1), - DictionaryField('struct_dict', get_field('', 'int8'), dict2) - ] - - batch_sizes = [10, 13] - return _generate_file("nested_dictionary", fields, batch_sizes, - dictionaries=[dict0, dict1, dict2]) - - -def generate_extension_case(): - dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - - uuid_type = ExtensionType('uuid', 'uuid-serialized', - FixedSizeBinaryField('', 16)) - dict_ext_type = ExtensionType( - 'dict-extension', 'dict-extension-serialized', - DictionaryField('str_dict', get_field('', 'int8'), dict0)) - - fields = [ - ExtensionField('uuids', uuid_type), - 
ExtensionField('dict_exts', dict_ext_type), - ] - - batch_sizes = [0, 13] - return _generate_file("extension", fields, batch_sizes, - dictionaries=[dict0]) - - -def get_generated_json_files(tempdir=None): - tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') - - def _temp_path(): - return - - file_objs = [ - generate_primitive_case([], name='primitive_no_batches'), - generate_primitive_case([17, 20], name='primitive'), - generate_primitive_case([0, 0, 0], name='primitive_zerolength'), - - generate_primitive_large_offsets_case([17, 20]) - .skip_category('Go') - .skip_category('JS'), - - generate_null_case([10, 0]) - .skip_category('JS') # TODO(ARROW-7900) - .skip_category('Go'), # TODO(ARROW-7901) - - generate_null_trivial_case([0, 0]) - .skip_category('JS') # TODO(ARROW-7900) - .skip_category('Go'), # TODO(ARROW-7901) - - generate_decimal128_case() - .skip_category('Go') # TODO(ARROW-7948): Decimal + Go - .skip_category('Rust'), - - generate_decimal256_case() - .skip_category('Go') # TODO(ARROW-7948): Decimal + Go - .skip_category('JS') - .skip_category('Rust'), - - generate_datetime_case(), - - generate_interval_case() - .skip_category('JS') # TODO(ARROW-5239): Intervals + JS - .skip_category('Rust'), - - generate_map_case() - .skip_category('Go') # TODO(ARROW-5620): Map + Go - .skip_category('Rust'), - - generate_non_canonical_map_case() - .skip_category('Go') # TODO(ARROW-5620) - .skip_category('Java') # TODO(ARROW-8715) - .skip_category('JS') # TODO(ARROW-8716) - .skip_category('Rust'), - - generate_nested_case(), - - generate_recursive_nested_case() - .skip_category('Go'), # TODO(ARROW-8453) - - generate_nested_large_offsets_case() - .skip_category('Go') - .skip_category('JS') - .skip_category('Rust'), - - generate_unions_case() - .skip_category('Go') - .skip_category('JS') - .skip_category('Rust'), - - generate_custom_metadata_case() - .skip_category('Go') - .skip_category('JS'), - - generate_duplicate_fieldnames_case() - .skip_category('Go') - .skip_category('JS'), - - # TODO(ARROW-3039, ARROW-5267): Dictionaries in GO - generate_dictionary_case() - .skip_category('Go'), - - generate_dictionary_unsigned_case() - .skip_category('Go') # TODO(ARROW-9378) - .skip_category('Java'), # TODO(ARROW-9377) - - generate_nested_dictionary_case() - .skip_category('Go') - .skip_category('Java') # TODO(ARROW-7779) - .skip_category('JS') - .skip_category('Rust'), - - generate_extension_case() - .skip_category('Go') - .skip_category('JS') - .skip_category('Rust'), - ] - - generated_paths = [] - for file_obj in file_objs: - out_path = os.path.join(tempdir, 'generated_' + - file_obj.name + '.json') - file_obj.write(out_path) - generated_paths.append(file_obj) - - return generated_paths diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py deleted file mode 100644 index 8aef163749078..0000000000000 --- a/dev/archery/archery/integration/runner.py +++ /dev/null @@ -1,419 +0,0 @@ -# licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from collections import namedtuple -from concurrent.futures import ThreadPoolExecutor -from functools import partial -import glob -import gzip -import itertools -import os -import sys -import tempfile -import traceback - -from .scenario import Scenario -from .tester_cpp import CPPTester -from .tester_go import GoTester -from .tester_rust import RustTester -from .tester_java import JavaTester -from .tester_js import JSTester -from .util import (ARROW_ROOT_DEFAULT, guid, SKIP_ARROW, SKIP_FLIGHT, - printer) -from . import datagen - - -Failure = namedtuple('Failure', - ('test_case', 'producer', 'consumer', 'exc_info')) - -log = printer.print - - -class Outcome: - def __init__(self): - self.failure = None - self.skipped = False - - -class IntegrationRunner(object): - - def __init__(self, json_files, flight_scenarios, testers, tempdir=None, - debug=False, stop_on_error=True, gold_dirs=None, - serial=False, match=None, **unused_kwargs): - self.json_files = json_files - self.flight_scenarios = flight_scenarios - self.testers = testers - self.temp_dir = tempdir or tempfile.mkdtemp() - self.debug = debug - self.stop_on_error = stop_on_error - self.serial = serial - self.gold_dirs = gold_dirs - self.failures = [] - self.match = match - - if self.match is not None: - print("-- Only running tests with {} in their name" - .format(self.match)) - self.json_files = [json_file for json_file in self.json_files - if self.match in json_file.name] - - def run(self): - """ - Run Arrow IPC integration tests for the matrix of enabled - implementations. - """ - for producer, consumer in itertools.product( - filter(lambda t: t.PRODUCER, self.testers), - filter(lambda t: t.CONSUMER, self.testers)): - self._compare_implementations( - producer, consumer, self._produce_consume, - self.json_files) - if self.gold_dirs: - for gold_dir, consumer in itertools.product( - self.gold_dirs, - filter(lambda t: t.CONSUMER, self.testers)): - log('\n\n\n\n') - log('******************************************************') - log('Tests against golden files in {}'.format(gold_dir)) - log('******************************************************') - - def run_gold(producer, consumer, outcome, test_case): - self._run_gold(gold_dir, producer, consumer, outcome, - test_case) - self._compare_implementations( - consumer, consumer, run_gold, - self._gold_tests(gold_dir)) - - def run_flight(self): - """ - Run Arrow Flight integration tests for the matrix of enabled - implementations. 
- """ - servers = filter(lambda t: t.FLIGHT_SERVER, self.testers) - clients = filter(lambda t: (t.FLIGHT_CLIENT and t.CONSUMER), - self.testers) - for server, client in itertools.product(servers, clients): - self._compare_flight_implementations(server, client) - - def _gold_tests(self, gold_dir): - prefix = os.path.basename(os.path.normpath(gold_dir)) - SUFFIX = ".json.gz" - golds = [jf for jf in os.listdir(gold_dir) if jf.endswith(SUFFIX)] - for json_path in golds: - name = json_path[json_path.index('_')+1: -len(SUFFIX)] - base_name = prefix + "_" + name + ".gold.json" - out_path = os.path.join(self.temp_dir, base_name) - with gzip.open(os.path.join(gold_dir, json_path)) as i: - with open(out_path, "wb") as out: - out.write(i.read()) - - try: - skip = next(f for f in self.json_files - if f.name == name).skip - except StopIteration: - skip = set() - if name == 'union' and prefix == '0.17.1': - skip.add("Java") - if prefix == '1.0.0-bigendian' or prefix == '1.0.0-littleendian': - skip.add("Go") - skip.add("Java") - skip.add("JS") - skip.add("Rust") - if prefix == '2.0.0-compression': - skip.add("JS") - skip.add("Rust") - - # See https://github.com/apache/arrow/pull/9822 for how to - # disable specific compression type tests. - - if prefix == '4.0.0-shareddict': - skip.add("Go") - - yield datagen.File(name, None, None, skip=skip, path=out_path) - - def _run_test_cases(self, producer, consumer, case_runner, - test_cases): - def case_wrapper(test_case): - with printer.cork(): - return case_runner(test_case) - - if self.failures and self.stop_on_error: - return - - if self.serial: - for outcome in map(case_wrapper, test_cases): - if outcome.failure is not None: - self.failures.append(outcome.failure) - if self.stop_on_error: - break - - else: - with ThreadPoolExecutor() as executor: - for outcome in executor.map(case_wrapper, test_cases): - if outcome.failure is not None: - self.failures.append(outcome.failure) - if self.stop_on_error: - break - - def _compare_implementations( - self, producer, consumer, run_binaries, test_cases): - """ - Compare Arrow IPC for two implementations (one producer, one consumer). - """ - log('##########################################################') - log('IPC: {0} producing, {1} consuming' - .format(producer.name, consumer.name)) - log('##########################################################') - - case_runner = partial(self._run_ipc_test_case, - producer, consumer, run_binaries) - self._run_test_cases(producer, consumer, case_runner, test_cases) - - def _run_ipc_test_case(self, producer, consumer, run_binaries, test_case): - """ - Run one IPC test case. 
- """ - outcome = Outcome() - - json_path = test_case.path - log('==========================================================') - log('Testing file {0}'.format(json_path)) - log('==========================================================') - - if producer.name in test_case.skip: - log('-- Skipping test because producer {0} does ' - 'not support'.format(producer.name)) - outcome.skipped = True - - elif consumer.name in test_case.skip: - log('-- Skipping test because consumer {0} does ' - 'not support'.format(consumer.name)) - outcome.skipped = True - - elif SKIP_ARROW in test_case.skip: - log('-- Skipping test') - outcome.skipped = True - - else: - try: - run_binaries(producer, consumer, outcome, test_case) - except Exception: - traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) - - return outcome - - def _produce_consume(self, producer, consumer, outcome, test_case): - # Make the random access file - json_path = test_case.path - file_id = guid()[:8] - name = os.path.splitext(os.path.basename(json_path))[0] - - producer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.json_as_file') - producer_stream_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.producer_file_as_stream') - consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.consumer_stream_as_file') - - log('-- Creating binary inputs') - producer.json_to_file(json_path, producer_file_path) - - # Validate the file - log('-- Validating file') - consumer.validate(json_path, producer_file_path) - - log('-- Validating stream') - producer.file_to_stream(producer_file_path, producer_stream_path) - consumer.stream_to_file(producer_stream_path, consumer_file_path) - consumer.validate(json_path, consumer_file_path) - - def _run_gold(self, gold_dir, producer, consumer, outcome, test_case): - json_path = test_case.path - - # Validate the file - log('-- Validating file') - producer_file_path = os.path.join( - gold_dir, "generated_" + test_case.name + ".arrow_file") - consumer.validate(json_path, producer_file_path) - - log('-- Validating stream') - consumer_stream_path = os.path.join( - gold_dir, "generated_" + test_case.name + ".stream") - file_id = guid()[:8] - name = os.path.splitext(os.path.basename(json_path))[0] - - consumer_file_path = os.path.join(self.temp_dir, file_id + '_' + - name + '.consumer_stream_as_file') - - consumer.stream_to_file(consumer_stream_path, consumer_file_path) - consumer.validate(json_path, consumer_file_path) - - def _compare_flight_implementations(self, producer, consumer): - log('##########################################################') - log('Flight: {0} serving, {1} requesting' - .format(producer.name, consumer.name)) - log('##########################################################') - - case_runner = partial(self._run_flight_test_case, producer, consumer) - self._run_test_cases(producer, consumer, case_runner, - self.json_files + self.flight_scenarios) - - def _run_flight_test_case(self, producer, consumer, test_case): - """ - Run one Flight test case. 
- """ - outcome = Outcome() - - log('=' * 58) - log('Testing file {0}'.format(test_case.name)) - log('=' * 58) - - if producer.name in test_case.skip: - log('-- Skipping test because producer {0} does ' - 'not support'.format(producer.name)) - outcome.skipped = True - - elif consumer.name in test_case.skip: - log('-- Skipping test because consumer {0} does ' - 'not support'.format(consumer.name)) - outcome.skipped = True - - elif SKIP_FLIGHT in test_case.skip: - log('-- Skipping test') - outcome.skipped = True - - else: - try: - if isinstance(test_case, Scenario): - server = producer.flight_server(test_case.name) - client_args = {'scenario_name': test_case.name} - else: - server = producer.flight_server() - client_args = {'json_path': test_case.path} - - with server as port: - # Have the client upload the file, then download and - # compare - consumer.flight_request(port, **client_args) - except Exception: - traceback.print_exc(file=printer.stdout) - outcome.failure = Failure(test_case, producer, consumer, - sys.exc_info()) - - return outcome - - -def get_static_json_files(): - glob_pattern = os.path.join(ARROW_ROOT_DEFAULT, - 'integration', 'data', '*.json') - return [ - datagen.File(name=os.path.basename(p), path=p, skip=set(), - schema=None, batches=None) - for p in glob.glob(glob_pattern) - ] - - -def run_all_tests(with_cpp=True, with_java=True, with_js=True, - with_go=True, with_rust=False, run_flight=False, - tempdir=None, **kwargs): - tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') - - testers = [] - - if with_cpp: - testers.append(CPPTester(**kwargs)) - - if with_java: - testers.append(JavaTester(**kwargs)) - - if with_js: - testers.append(JSTester(**kwargs)) - - if with_go: - testers.append(GoTester(**kwargs)) - - if with_rust: - testers.append(RustTester(**kwargs)) - - static_json_files = get_static_json_files() - generated_json_files = datagen.get_generated_json_files(tempdir=tempdir) - json_files = static_json_files + generated_json_files - - # Additional integration test cases for Arrow Flight. 
- flight_scenarios = [ - Scenario( - "auth:basic_proto", - description="Authenticate using the BasicAuth protobuf."), - Scenario( - "middleware", - description="Ensure headers are propagated via middleware.", - skip={"Rust"} # TODO(ARROW-10961): tonic upgrade needed - ), - ] - - runner = IntegrationRunner(json_files, flight_scenarios, testers, **kwargs) - runner.run() - if run_flight: - runner.run_flight() - - fail_count = 0 - if runner.failures: - log("################# FAILURES #################") - for test_case, producer, consumer, exc_info in runner.failures: - fail_count += 1 - log("FAILED TEST:", end=" ") - log(test_case.name, producer.name, "producing, ", - consumer.name, "consuming") - if exc_info: - traceback.print_exception(*exc_info) - log() - - log(fail_count, "failures") - if fail_count > 0: - sys.exit(1) - - -def write_js_test_json(directory): - datagen.generate_map_case().write( - os.path.join(directory, 'map.json') - ) - datagen.generate_nested_case().write( - os.path.join(directory, 'nested.json') - ) - datagen.generate_decimal_case().write( - os.path.join(directory, 'decimal.json') - ) - datagen.generate_datetime_case().write( - os.path.join(directory, 'datetime.json') - ) - datagen.generate_dictionary_case().write( - os.path.join(directory, 'dictionary.json') - ) - datagen.generate_dictionary_unsigned_case().write( - os.path.join(directory, 'dictionary_unsigned.json') - ) - datagen.generate_primitive_case([]).write( - os.path.join(directory, 'primitive_no_batches.json') - ) - datagen.generate_primitive_case([7, 10]).write( - os.path.join(directory, 'primitive.json') - ) - datagen.generate_primitive_case([0, 0, 0]).write( - os.path.join(directory, 'primitive-empty.json') - ) diff --git a/dev/archery/archery/integration/scenario.py b/dev/archery/archery/integration/scenario.py deleted file mode 100644 index 1fcbca64e6a1f..0000000000000 --- a/dev/archery/archery/integration/scenario.py +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -class Scenario: - """ - An integration test scenario for Arrow Flight. - - Does not correspond to a particular IPC JSON file. - """ - - def __init__(self, name, description, skip=None): - self.name = name - self.description = description - self.skip = skip or set() diff --git a/dev/archery/archery/integration/tester.py b/dev/archery/archery/integration/tester.py deleted file mode 100644 index 122e4f2e4a78b..0000000000000 --- a/dev/archery/archery/integration/tester.py +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Base class for language-specific integration test harnesses - -import subprocess - -from .util import log - - -class Tester(object): - PRODUCER = False - CONSUMER = False - FLIGHT_SERVER = False - FLIGHT_CLIENT = False - - def __init__(self, debug=False, **args): - self.args = args - self.debug = debug - - def run_shell_command(self, cmd): - cmd = ' '.join(cmd) - if self.debug: - log(cmd) - subprocess.check_call(cmd, shell=True) - - def json_to_file(self, json_path, arrow_path): - raise NotImplementedError - - def stream_to_file(self, stream_path, file_path): - raise NotImplementedError - - def file_to_stream(self, file_path, stream_path): - raise NotImplementedError - - def validate(self, json_path, arrow_path): - raise NotImplementedError - - def flight_server(self, scenario_name=None): - """Start the Flight server on a free port. - - This should be a context manager that returns the port as the - managed object, and cleans up the server on exit. - """ - raise NotImplementedError - - def flight_request(self, port, json_path=None, scenario_name=None): - raise NotImplementedError diff --git a/dev/archery/archery/integration/tester_cpp.py b/dev/archery/archery/integration/tester_cpp.py deleted file mode 100644 index d35c9550e58ea..0000000000000 --- a/dev/archery/archery/integration/tester_cpp.py +++ /dev/null @@ -1,116 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
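For orientation, the Tester base class deleted above defines the contract each language harness fills in: produce an Arrow IPC file from integration JSON, convert between file and stream form, validate against the JSON, and optionally serve Flight as a context manager that yields its port. A minimal sketch of such a subclass follows; the executable names are placeholders for illustration, not real binaries from the repository.

    import contextlib
    import subprocess

    from .tester import Tester
    from .util import run_cmd


    class MinimalTester(Tester):
        # Hypothetical subclass sketching the contract defined in tester.py
        # above; 'my-*' executable names are invented placeholders.
        PRODUCER = True
        CONSUMER = True

        name = 'Minimal'

        def json_to_file(self, json_path, arrow_path):
            # Convert an integration JSON file to the Arrow IPC file format.
            run_cmd(['my-json-integration-test', '--json=' + json_path,
                     '--arrow=' + arrow_path, '--mode=JSON_TO_ARROW'])

        def validate(self, json_path, arrow_path):
            # Check that the Arrow file round-trips the JSON reference data.
            run_cmd(['my-json-integration-test', '--json=' + json_path,
                     '--arrow=' + arrow_path, '--mode=VALIDATE'])

        def stream_to_file(self, stream_path, file_path):
            # run_shell_command() joins the arguments and runs them through a
            # shell, so '<' and '>' act as real redirections.
            self.run_shell_command(['my-stream-to-file', '<', stream_path,
                                    '>', file_path])

        def file_to_stream(self, file_path, stream_path):
            self.run_shell_command(['my-file-to-stream', file_path,
                                    '>', stream_path])

        @contextlib.contextmanager
        def flight_server(self, scenario_name=None):
            # Yield the port the server listens on; kill the process on exit.
            # Assumes the server announces its port on the first stdout line,
            # e.g. "Server listening on localhost:12345".
            server = subprocess.Popen(['my-flight-server', '-port=0'],
                                      stdout=subprocess.PIPE)
            try:
                line = server.stdout.readline().decode()
                yield int(line.rsplit(':', 1)[1])
            finally:
                server.kill()
                server.wait(5)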
- -import contextlib -import os -import subprocess - -from .tester import Tester -from .util import run_cmd, ARROW_ROOT_DEFAULT, log - - -class CPPTester(Tester): - PRODUCER = True - CONSUMER = True - FLIGHT_SERVER = True - FLIGHT_CLIENT = True - - EXE_PATH = os.environ.get( - 'ARROW_CPP_EXE_PATH', - os.path.join(ARROW_ROOT_DEFAULT, 'cpp/build/debug')) - - CPP_INTEGRATION_EXE = os.path.join(EXE_PATH, 'arrow-json-integration-test') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'arrow-stream-to-file') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'arrow-file-to-stream') - - FLIGHT_SERVER_CMD = [ - os.path.join(EXE_PATH, 'flight-test-integration-server')] - FLIGHT_CLIENT_CMD = [ - os.path.join(EXE_PATH, 'flight-test-integration-client'), - "-host", "localhost"] - - name = 'C++' - - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): - cmd = [self.CPP_INTEGRATION_EXE, '--integration'] - - if arrow_path is not None: - cmd.append('--arrow=' + arrow_path) - - if json_path is not None: - cmd.append('--json=' + json_path) - - cmd.append('--mode=' + command) - - if self.debug: - log(' '.join(cmd)) - - run_cmd(cmd) - - def validate(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'VALIDATE') - - def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') - - def stream_to_file(self, stream_path, file_path): - cmd = [self.STREAM_TO_FILE, '<', stream_path, '>', file_path] - self.run_shell_command(cmd) - - def file_to_stream(self, file_path, stream_path): - cmd = [self.FILE_TO_STREAM, file_path, '>', stream_path] - self.run_shell_command(cmd) - - @contextlib.contextmanager - def flight_server(self, scenario_name=None): - cmd = self.FLIGHT_SERVER_CMD + ['-port=0'] - if scenario_name: - cmd = cmd + ["-scenario", scenario_name] - if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - try: - output = server.stdout.readline().decode() - if not output.startswith("Server listening on localhost:"): - server.kill() - out, err = server.communicate() - raise RuntimeError( - "Flight-C++ server did not start properly, " - "stdout:\n{}\n\nstderr:\n{}\n" - .format(output + out.decode(), err.decode())) - port = int(output.split(":")[1]) - yield port - finally: - server.kill() - server.wait(5) - - def flight_request(self, port, json_path=None, scenario_name=None): - cmd = self.FLIGHT_CLIENT_CMD + [ - '-port=' + str(port), - ] - if json_path: - cmd.extend(('-path', json_path)) - elif scenario_name: - cmd.extend(('-scenario', scenario_name)) - else: - raise TypeError("Must provide one of json_path or scenario_name") - - if self.debug: - log(' '.join(cmd)) - run_cmd(cmd) diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py deleted file mode 100644 index ea799c5a1bd2a..0000000000000 --- a/dev/archery/archery/integration/tester_go.py +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -from .tester import Tester -from .util import run_cmd, log - - -class GoTester(Tester): - PRODUCER = True - CONSUMER = True - - # FIXME(sbinet): revisit for Go modules - HOME = os.getenv('HOME', '~') - GOPATH = os.getenv('GOPATH', os.path.join(HOME, 'go')) - GOBIN = os.environ.get('GOBIN', os.path.join(GOPATH, 'bin')) - - GO_INTEGRATION_EXE = os.path.join(GOBIN, 'arrow-json-integration-test') - STREAM_TO_FILE = os.path.join(GOBIN, 'arrow-stream-to-file') - FILE_TO_STREAM = os.path.join(GOBIN, 'arrow-file-to-stream') - - name = 'Go' - - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): - cmd = [self.GO_INTEGRATION_EXE] - - if arrow_path is not None: - cmd.extend(['-arrow', arrow_path]) - - if json_path is not None: - cmd.extend(['-json', json_path]) - - cmd.extend(['-mode', command]) - - if self.debug: - log(' '.join(cmd)) - - run_cmd(cmd) - - def validate(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'VALIDATE') - - def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') - - def stream_to_file(self, stream_path, file_path): - cmd = [self.STREAM_TO_FILE, '<', stream_path, '>', file_path] - self.run_shell_command(cmd) - - def file_to_stream(self, file_path, stream_path): - cmd = [self.FILE_TO_STREAM, file_path, '>', stream_path] - self.run_shell_command(cmd) diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py deleted file mode 100644 index f283f6cd255c7..0000000000000 --- a/dev/archery/archery/integration/tester_java.py +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
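Note that stream_to_file and file_to_stream in these harnesses rely on Tester.run_shell_command, which joins the argument list and executes it with shell=True so that the '<' and '>' tokens behave as real redirections. A rough equivalent without a shell, sketched here with a placeholder executable name, would be:

    import subprocess


    def stream_to_file_without_shell(exe, stream_path, file_path):
        # Equivalent of "exe < stream_path > file_path": feed the IPC stream
        # on stdin and capture the IPC file written to stdout.
        with open(stream_path, 'rb') as src, open(file_path, 'wb') as dst:
            subprocess.check_call([exe], stdin=src, stdout=dst)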
- -import contextlib -import os -import subprocess - -from .tester import Tester -from .util import run_cmd, ARROW_ROOT_DEFAULT, log - - -def load_version_from_pom(): - import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) - tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' - version_tag = list(tree.getroot().findall(tag_pattern))[0] - return version_tag.text - - -class JavaTester(Tester): - PRODUCER = True - CONSUMER = True - FLIGHT_SERVER = True - FLIGHT_CLIENT = True - - JAVA_OPTS = ['-Dio.netty.tryReflectionSetAccessible=true', - '-Darrow.struct.conflict.policy=CONFLICT_APPEND'] - - _arrow_version = load_version_from_pom() - ARROW_TOOLS_JAR = os.environ.get( - 'ARROW_JAVA_INTEGRATION_JAR', - os.path.join(ARROW_ROOT_DEFAULT, - 'java/tools/target/arrow-tools-{}-' - 'jar-with-dependencies.jar'.format(_arrow_version))) - ARROW_FLIGHT_JAR = os.environ.get( - 'ARROW_FLIGHT_JAVA_INTEGRATION_JAR', - os.path.join(ARROW_ROOT_DEFAULT, - 'java/flight/flight-core/target/flight-core-{}-' - 'jar-with-dependencies.jar'.format(_arrow_version))) - ARROW_FLIGHT_SERVER = ('org.apache.arrow.flight.example.integration.' - 'IntegrationTestServer') - ARROW_FLIGHT_CLIENT = ('org.apache.arrow.flight.example.integration.' - 'IntegrationTestClient') - - name = 'Java' - - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): - cmd = ['java'] + self.JAVA_OPTS + \ - ['-cp', self.ARROW_TOOLS_JAR, 'org.apache.arrow.tools.Integration'] - - if arrow_path is not None: - cmd.extend(['-a', arrow_path]) - - if json_path is not None: - cmd.extend(['-j', json_path]) - - cmd.extend(['-c', command]) - - if self.debug: - log(' '.join(cmd)) - - run_cmd(cmd) - - def validate(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'VALIDATE') - - def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') - - def stream_to_file(self, stream_path, file_path): - cmd = ['java'] + self.JAVA_OPTS + \ - ['-cp', self.ARROW_TOOLS_JAR, - 'org.apache.arrow.tools.StreamToFile', stream_path, file_path] - if self.debug: - log(' '.join(cmd)) - run_cmd(cmd) - - def file_to_stream(self, file_path, stream_path): - cmd = ['java'] + self.JAVA_OPTS + \ - ['-cp', self.ARROW_TOOLS_JAR, - 'org.apache.arrow.tools.FileToStream', file_path, stream_path] - if self.debug: - log(' '.join(cmd)) - run_cmd(cmd) - - def flight_request(self, port, json_path=None, scenario_name=None): - cmd = ['java'] + self.JAVA_OPTS + \ - ['-cp', self.ARROW_FLIGHT_JAR, self.ARROW_FLIGHT_CLIENT, - '-port', str(port)] - - if json_path: - cmd.extend(('-j', json_path)) - elif scenario_name: - cmd.extend(('-scenario', scenario_name)) - else: - raise TypeError("Must provide one of json_path or scenario_name") - - if self.debug: - log(' '.join(cmd)) - run_cmd(cmd) - - @contextlib.contextmanager - def flight_server(self, scenario_name=None): - cmd = ['java'] + self.JAVA_OPTS + \ - ['-cp', self.ARROW_FLIGHT_JAR, self.ARROW_FLIGHT_SERVER, - '-port', '0'] - if scenario_name: - cmd.extend(('-scenario', scenario_name)) - if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - try: - output = server.stdout.readline().decode() - if not output.startswith("Server listening on localhost:"): - server.kill() - out, err = server.communicate() - raise RuntimeError( - "Flight-Java server did not start properly, " - "stdout:\n{}\n\nstderr:\n{}\n" - .format(output + out.decode(), err.decode())) - port = 
int(output.split(":")[1]) - yield port - finally: - server.kill() - server.wait(5) diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py deleted file mode 100644 index e24eec0cadaa7..0000000000000 --- a/dev/archery/archery/integration/tester_js.py +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -from .tester import Tester -from .util import run_cmd, ARROW_ROOT_DEFAULT, log - - -class JSTester(Tester): - PRODUCER = True - CONSUMER = True - - EXE_PATH = os.path.join(ARROW_ROOT_DEFAULT, 'js/bin') - VALIDATE = os.path.join(EXE_PATH, 'integration.js') - JSON_TO_ARROW = os.path.join(EXE_PATH, 'json-to-arrow.js') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'stream-to-file.js') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'file-to-stream.js') - - name = 'JS' - - def _run(self, exe_cmd, arrow_path=None, json_path=None, - command='VALIDATE'): - cmd = [exe_cmd] - - if arrow_path is not None: - cmd.extend(['-a', arrow_path]) - - if json_path is not None: - cmd.extend(['-j', json_path]) - - cmd.extend(['--mode', command]) - - if self.debug: - log(' '.join(cmd)) - - run_cmd(cmd) - - def validate(self, json_path, arrow_path): - return self._run(self.VALIDATE, arrow_path, json_path, 'VALIDATE') - - def json_to_file(self, json_path, arrow_path): - cmd = ['node', - '--no-warnings', self.JSON_TO_ARROW, - '-a', arrow_path, - '-j', json_path] - self.run_shell_command(cmd) - - def stream_to_file(self, stream_path, file_path): - cmd = ['node', '--no-warnings', self.STREAM_TO_FILE, - '<', stream_path, - '>', file_path] - self.run_shell_command(cmd) - - def file_to_stream(self, file_path, stream_path): - cmd = ['node', '--no-warnings', self.FILE_TO_STREAM, - '<', file_path, - '>', stream_path] - self.run_shell_command(cmd) diff --git a/dev/archery/archery/integration/tester_rust.py b/dev/archery/archery/integration/tester_rust.py deleted file mode 100644 index bca80ebae3c60..0000000000000 --- a/dev/archery/archery/integration/tester_rust.py +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -import contextlib -import os -import subprocess - -from .tester import Tester -from .util import run_cmd, ARROW_ROOT_DEFAULT, log - - -class RustTester(Tester): - PRODUCER = True - CONSUMER = True - FLIGHT_SERVER = True - FLIGHT_CLIENT = True - - EXE_PATH = os.path.join(ARROW_ROOT_DEFAULT, 'rust/target/debug') - - RUST_INTEGRATION_EXE = os.path.join(EXE_PATH, - 'arrow-json-integration-test') - STREAM_TO_FILE = os.path.join(EXE_PATH, 'arrow-stream-to-file') - FILE_TO_STREAM = os.path.join(EXE_PATH, 'arrow-file-to-stream') - - FLIGHT_SERVER_CMD = [ - os.path.join(EXE_PATH, 'flight-test-integration-server')] - FLIGHT_CLIENT_CMD = [ - os.path.join(EXE_PATH, 'flight-test-integration-client'), - "--host", "localhost"] - - name = 'Rust' - - def _run(self, arrow_path=None, json_path=None, command='VALIDATE'): - cmd = [self.RUST_INTEGRATION_EXE, '--integration'] - - if arrow_path is not None: - cmd.append('--arrow=' + arrow_path) - - if json_path is not None: - cmd.append('--json=' + json_path) - - cmd.append('--mode=' + command) - - if self.debug: - log(' '.join(cmd)) - - run_cmd(cmd) - - def validate(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'VALIDATE') - - def json_to_file(self, json_path, arrow_path): - return self._run(arrow_path, json_path, 'JSON_TO_ARROW') - - def stream_to_file(self, stream_path, file_path): - cmd = [self.STREAM_TO_FILE, '<', stream_path, '>', file_path] - self.run_shell_command(cmd) - - def file_to_stream(self, file_path, stream_path): - cmd = [self.FILE_TO_STREAM, file_path, '>', stream_path] - self.run_shell_command(cmd) - - @contextlib.contextmanager - def flight_server(self, scenario_name=None): - cmd = self.FLIGHT_SERVER_CMD + ['--port=0'] - if scenario_name: - cmd = cmd + ["--scenario", scenario_name] - if self.debug: - log(' '.join(cmd)) - server = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - try: - output = server.stdout.readline().decode() - if not output.startswith("Server listening on localhost:"): - server.kill() - out, err = server.communicate() - raise RuntimeError( - "Flight-Rust server did not start properly, " - "stdout:\n{}\n\nstderr:\n{}\n" - .format(output + out.decode(), err.decode())) - port = int(output.split(":")[1]) - yield port - finally: - server.kill() - server.wait(5) - - def flight_request(self, port, json_path=None, scenario_name=None): - cmd = self.FLIGHT_CLIENT_CMD + [ - '--port=' + str(port), - ] - if json_path: - cmd.extend(('--path', json_path)) - elif scenario_name: - cmd.extend(('--scenario', scenario_name)) - else: - raise TypeError("Must provide one of json_path or scenario_name") - - if self.debug: - log(' '.join(cmd)) - run_cmd(cmd) diff --git a/dev/archery/archery/integration/util.py b/dev/archery/archery/integration/util.py deleted file mode 100644 index a4c4982ecb38f..0000000000000 --- a/dev/archery/archery/integration/util.py +++ /dev/null @@ -1,166 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import contextlib -import io -import os -import random -import socket -import subprocess -import sys -import threading -import uuid - -import numpy as np - - -def guid(): - return uuid.uuid4().hex - - -# SKIP categories -SKIP_ARROW = 'arrow' -SKIP_FLIGHT = 'flight' - -ARROW_ROOT_DEFAULT = os.environ.get( - 'ARROW_ROOT', - os.path.abspath(__file__).rsplit("/", 5)[0] -) - - -class _Printer: - """ - A print()-providing object that can override the stream output on - a per-thread basis. - """ - - def __init__(self): - self._tls = threading.local() - - def _get_stdout(self): - try: - return self._tls.stdout - except AttributeError: - self._tls.stdout = sys.stdout - self._tls.corked = False - return self._tls.stdout - - def print(self, *args, **kwargs): - """ - A variant of print() that writes to a thread-local stream. - """ - print(*args, file=self._get_stdout(), **kwargs) - - @property - def stdout(self): - """ - A thread-local stdout wrapper that may be temporarily buffered - using `cork()`. - """ - return self._get_stdout() - - @contextlib.contextmanager - def cork(self): - """ - Temporarily buffer this thread's stream and write out its contents - at the end of the context manager. Useful to avoid interleaved - output when multiple threads output progress information. - """ - outer_stdout = self._get_stdout() - assert not self._tls.corked, "reentrant call" - inner_stdout = self._tls.stdout = io.StringIO() - self._tls.corked = True - try: - yield - finally: - self._tls.stdout = outer_stdout - self._tls.corked = False - outer_stdout.write(inner_stdout.getvalue()) - outer_stdout.flush() - - -printer = _Printer() -log = printer.print - - -_RAND_CHARS = np.array(list("abcdefghijklmnop123456Ârrôwµ£°€矢"), dtype="U") - - -def random_utf8(nchars): - """ - Generate one random UTF8 string. - """ - return ''.join(np.random.choice(_RAND_CHARS, nchars)) - - -def random_bytes(nbytes): - """ - Generate one random binary string. - """ - # NOTE getrandbits(0) fails - if nbytes > 0: - return random.getrandbits(nbytes * 8).to_bytes(nbytes, - byteorder='little') - else: - return b"" - - -def tobytes(o): - if isinstance(o, str): - return o.encode('utf8') - return o - - -def frombytes(o): - if isinstance(o, bytes): - return o.decode('utf8') - return o - - -def run_cmd(cmd): - if isinstance(cmd, str): - cmd = cmd.split(' ') - - try: - output = subprocess.check_output(cmd, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - # this avoids hiding the stdout / stderr of failed processes - sio = io.StringIO() - print('Command failed:', " ".join(cmd), file=sio) - print('With output:', file=sio) - print('--------------', file=sio) - print(frombytes(e.output), file=sio) - print('--------------', file=sio) - raise RuntimeError(sio.getvalue()) - - return frombytes(output) - - -# Adapted from CPython -def find_unused_port(family=socket.AF_INET, socktype=socket.SOCK_STREAM): - """Returns an unused port that should be suitable for binding. 
This is - achieved by creating a temporary socket with the same family and type as - the 'sock' parameter (default is AF_INET, SOCK_STREAM), and binding it to - the specified host address (defaults to 0.0.0.0) with the port set to 0, - eliciting an unused ephemeral port from the OS. The temporary socket is - then closed and deleted, and the ephemeral port is returned. - """ - with socket.socket(family, socktype) as tempsock: - tempsock.bind(('', 0)) - port = tempsock.getsockname()[1] - del tempsock - return port diff --git a/dev/archery/archery/lang/__init__.py b/dev/archery/archery/lang/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/dev/archery/archery/lang/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py deleted file mode 100644 index 045d23b56b154..0000000000000 --- a/dev/archery/archery/lang/cpp.py +++ /dev/null @@ -1,295 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
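The CppConfiguration class in the cpp.py module removed below drives CMake for Arrow C++ builds: _gen_defs() yields (name, value) pairs such as ('ARROW_BUILD_TESTS', 'ON'), with truthifier() mapping Python booleans to ON/OFF. As a rough sketch of how such pairs are typically flattened into -D flags (render_definitions is illustrative only, not this module's actual API):

    def render_definitions(pairs):
        # ('ARROW_BUILD_TESTS', 'ON') becomes '-DARROW_BUILD_TESTS=ON'.
        return ['-D{}={}'.format(name, value) for name, value in pairs]


    # Example rendering for a hypothetical release build with tests enabled:
    flags = render_definitions([('CMAKE_BUILD_TYPE', 'release'),
                                ('ARROW_BUILD_TESTS', 'ON'),
                                ('ARROW_PYTHON', 'ON')])
    # flags == ['-DCMAKE_BUILD_TYPE=release', '-DARROW_BUILD_TESTS=ON',
    #           '-DARROW_PYTHON=ON']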
- -import os - -from ..utils.cmake import CMakeDefinition - - -def truthifier(value): - return "ON" if value else "OFF" - - -def or_else(value, default): - return value if value else default - - -def coalesce(value, fallback): - return fallback if value is None else value - - -LLVM_VERSION = 7 - - -class CppConfiguration: - def __init__(self, - - # toolchain - cc=None, cxx=None, cxx_flags=None, - build_type=None, warn_level=None, - cpp_package_prefix=None, install_prefix=None, use_conda=None, - build_static=False, build_shared=True, - # tests & examples - with_tests=None, with_benchmarks=None, with_examples=None, - with_integration=None, - # static checks - use_asan=None, use_tsan=None, use_ubsan=None, - with_fuzzing=None, - # Components - with_compute=None, with_csv=None, with_cuda=None, - with_dataset=None, with_filesystem=None, with_flight=None, - with_gandiva=None, with_hdfs=None, with_hiveserver2=None, - with_ipc=True, with_json=None, with_jni=None, - with_mimalloc=None, - with_parquet=None, with_plasma=None, with_python=True, - with_r=None, with_s3=None, - # Compressions - with_brotli=None, with_bz2=None, with_lz4=None, - with_snappy=None, with_zlib=None, with_zstd=None, - # extras - with_lint_only=False, - use_gold_linker=True, - simd_level="SSE4_2", - cmake_extras=None): - self._cc = cc - self._cxx = cxx - self.cxx_flags = cxx_flags - - self._build_type = build_type - self.warn_level = warn_level - self._install_prefix = install_prefix - self._package_prefix = cpp_package_prefix - self._use_conda = use_conda - self.build_static = build_static - self.build_shared = build_shared - - self.with_tests = with_tests - self.with_benchmarks = with_benchmarks - self.with_examples = with_examples - self.with_integration = with_integration - - self.use_asan = use_asan - self.use_tsan = use_tsan - self.use_ubsan = use_ubsan - self.with_fuzzing = with_fuzzing - - self.with_compute = with_compute - self.with_csv = with_csv - self.with_cuda = with_cuda - self.with_dataset = with_dataset - self.with_filesystem = with_filesystem - self.with_flight = with_flight - self.with_gandiva = with_gandiva - self.with_hdfs = with_hdfs - self.with_hiveserver2 = with_hiveserver2 - self.with_ipc = with_ipc - self.with_json = with_json - self.with_jni = with_jni - self.with_mimalloc = with_mimalloc - self.with_parquet = with_parquet - self.with_plasma = with_plasma - self.with_python = with_python - self.with_r = with_r - self.with_s3 = with_s3 - - self.with_brotli = with_brotli - self.with_bz2 = with_bz2 - self.with_lz4 = with_lz4 - self.with_snappy = with_snappy - self.with_zlib = with_zlib - self.with_zstd = with_zstd - - self.with_lint_only = with_lint_only - self.use_gold_linker = use_gold_linker - self.simd_level = simd_level - - self.cmake_extras = cmake_extras - - # Fixup required dependencies by providing sane defaults if the caller - # didn't specify the option. 
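The dependency fixup block that follows relies on the difference between the two small helpers defined above: or_else() substitutes the default whenever the value is falsy, while coalesce() substitutes the fallback only when the value is None, so an explicit False passed by the caller survives. A quick standalone illustration of that distinction:

    def or_else(value, default):
        return value if value else default

    def coalesce(value, fallback):
        return fallback if value is None else value

    # Only None counts as "unset" for coalesce(); an explicit False is kept.
    print(or_else(False, "default"))    # "default" (falsy value replaced)
    print(coalesce(False, "fallback"))  # False
    print(coalesce(None, "fallback"))   # "fallback"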
- if self.with_r: - self.with_csv = coalesce(with_csv, True) - self.with_dataset = coalesce(with_dataset, True) - self.with_filesystem = coalesce(with_filesystem, True) - self.with_ipc = coalesce(with_ipc, True) - self.with_json = coalesce(with_json, True) - self.with_parquet = coalesce(with_parquet, True) - - if self.with_python: - self.with_zlib = coalesce(with_zlib, True) - self.with_lz4 = coalesce(with_lz4, True) - - if self.with_dataset: - self.with_filesystem = coalesce(with_filesystem, True) - self.with_parquet = coalesce(with_parquet, True) - - if self.with_parquet: - self.with_snappy = coalesce(with_snappy, True) - - @property - def build_type(self): - if self._build_type: - return self._build_type - - if self.with_fuzzing: - return "relwithdebinfo" - - return "release" - - @property - def cc(self): - if self._cc: - return self._cc - - if self.with_fuzzing: - return "clang-{}".format(LLVM_VERSION) - - return None - - @property - def cxx(self): - if self._cxx: - return self._cxx - - if self.with_fuzzing: - return "clang++-{}".format(LLVM_VERSION) - - return None - - def _gen_defs(self): - if self.cxx_flags: - yield ("ARROW_CXXFLAGS", self.cxx_flags) - - yield ("CMAKE_EXPORT_COMPILE_COMMANDS", truthifier(True)) - yield ("CMAKE_BUILD_TYPE", self.build_type) - yield ("CMAKE_UNITY_BUILD", True) - - if not self.with_lint_only: - yield ("BUILD_WARNING_LEVEL", - or_else(self.warn_level, "production")) - - # if not ctx.quiet: - # yield ("ARROW_VERBOSE_THIRDPARTY_BUILD", "ON") - - maybe_prefix = self.install_prefix - if maybe_prefix: - yield ("CMAKE_INSTALL_PREFIX", maybe_prefix) - - if self._package_prefix is not None: - yield ("ARROW_DEPENDENCY_SOURCE", "SYSTEM") - yield ("ARROW_PACKAGE_PREFIX", self._package_prefix) - - yield ("ARROW_BUILD_STATIC", truthifier(self.build_static)) - yield ("ARROW_BUILD_SHARED", truthifier(self.build_shared)) - - # Tests and benchmarks - yield ("ARROW_BUILD_TESTS", truthifier(self.with_tests)) - yield ("ARROW_BUILD_BENCHMARKS", truthifier(self.with_benchmarks)) - yield ("ARROW_BUILD_EXAMPLES", truthifier(self.with_examples)) - yield ("ARROW_BUILD_INTEGRATION", truthifier(self.with_integration)) - - # Static checks - yield ("ARROW_USE_ASAN", truthifier(self.use_asan)) - yield ("ARROW_USE_TSAN", truthifier(self.use_tsan)) - yield ("ARROW_USE_UBSAN", truthifier(self.use_ubsan)) - yield ("ARROW_FUZZING", truthifier(self.with_fuzzing)) - - # Components - yield ("ARROW_COMPUTE", truthifier(self.with_compute)) - yield ("ARROW_CSV", truthifier(self.with_csv)) - yield ("ARROW_CUDA", truthifier(self.with_cuda)) - yield ("ARROW_DATASET", truthifier(self.with_dataset)) - yield ("ARROW_FILESYSTEM", truthifier(self.with_filesystem)) - yield ("ARROW_FLIGHT", truthifier(self.with_flight)) - yield ("ARROW_GANDIVA", truthifier(self.with_gandiva)) - yield ("ARROW_PARQUET", truthifier(self.with_parquet)) - yield ("ARROW_HDFS", truthifier(self.with_hdfs)) - yield ("ARROW_HIVESERVER2", truthifier(self.with_hiveserver2)) - yield ("ARROW_IPC", truthifier(self.with_ipc)) - yield ("ARROW_JSON", truthifier(self.with_json)) - yield ("ARROW_JNI", truthifier(self.with_jni)) - yield ("ARROW_MIMALLOC", truthifier(self.with_mimalloc)) - yield ("ARROW_PLASMA", truthifier(self.with_plasma)) - yield ("ARROW_PYTHON", truthifier(self.with_python)) - yield ("ARROW_S3", truthifier(self.with_s3)) - - # Compressions - yield ("ARROW_WITH_BROTLI", truthifier(self.with_brotli)) - yield ("ARROW_WITH_BZ2", truthifier(self.with_bz2)) - yield ("ARROW_WITH_LZ4", truthifier(self.with_lz4)) - yield 
("ARROW_WITH_SNAPPY", truthifier(self.with_snappy)) - yield ("ARROW_WITH_ZLIB", truthifier(self.with_zlib)) - yield ("ARROW_WITH_ZSTD", truthifier(self.with_zstd)) - - yield ("ARROW_LINT_ONLY", truthifier(self.with_lint_only)) - - # Some configurations don't like gnu gold linker. - broken_with_gold_ld = [self.with_fuzzing, self.with_gandiva] - if self.use_gold_linker and not any(broken_with_gold_ld): - yield ("ARROW_USE_LD_GOLD", truthifier(self.use_gold_linker)) - yield ("ARROW_SIMD_LEVEL", or_else(self.simd_level, "SSE4_2")) - - # Detect custom conda toolchain - if self.use_conda: - for d, v in [('CMAKE_AR', 'AR'), ('CMAKE_RANLIB', 'RANLIB')]: - v = os.environ.get(v) - if v: - yield (d, v) - - @property - def install_prefix(self): - if self._install_prefix: - return self._install_prefix - - if self.use_conda: - return os.environ.get("CONDA_PREFIX") - - return None - - @property - def use_conda(self): - # If the user didn't specify a preference, guess via environment - if self._use_conda is None: - return os.environ.get("CONDA_PREFIX") is not None - - return self._use_conda - - @property - def definitions(self): - extras = list(self.cmake_extras) if self.cmake_extras else [] - definitions = ["-D{}={}".format(d[0], d[1]) for d in self._gen_defs()] - return definitions + extras - - @property - def environment(self): - env = os.environ.copy() - - if self.cc: - env["CC"] = self.cc - - if self.cxx: - env["CXX"] = self.cxx - - return env - - -class CppCMakeDefinition(CMakeDefinition): - def __init__(self, source, conf, **kwargs): - self.configuration = conf - super().__init__(source, **kwargs, - definitions=conf.definitions, env=conf.environment, - build_type=conf.build_type) diff --git a/dev/archery/archery/lang/java.py b/dev/archery/archery/lang/java.py deleted file mode 100644 index 24743b67fd747..0000000000000 --- a/dev/archery/archery/lang/java.py +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from ..utils.command import Command, CommandStackMixin, default_bin - - -class Java(Command): - def __init__(self, java_bin=None): - self.bin = default_bin(java_bin, "java") - - -class Jar(CommandStackMixin, Java): - def __init__(self, jar, *args, **kwargs): - self.jar = jar - self.argv = ("-jar", jar) - Java.__init__(self, *args, **kwargs) diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py deleted file mode 100644 index 4952d5f23051f..0000000000000 --- a/dev/archery/archery/lang/python.py +++ /dev/null @@ -1,218 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
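The CppConfiguration class above reduces each build option to a CMake cache entry: booleans pass through truthifier() to become ON/OFF, _gen_defs() yields (name, value) pairs, and the definitions property renders them as -DNAME=VALUE strings that CppCMakeDefinition hands to CMake. A simplified standalone sketch of that pattern (illustrative option names only, not the full archery class):

    def truthifier(value):
        return "ON" if value else "OFF"

    def gen_defs(build_type="release", with_tests=None, with_parquet=None):
        # Yield (name, value) pairs in the spirit of CppConfiguration._gen_defs().
        yield ("CMAKE_BUILD_TYPE", build_type)
        yield ("ARROW_BUILD_TESTS", truthifier(with_tests))
        yield ("ARROW_PARQUET", truthifier(with_parquet))

    definitions = ["-D{}={}".format(name, value)
                   for name, value in gen_defs(with_tests=True)]
    print(definitions)
    # ['-DCMAKE_BUILD_TYPE=release', '-DARROW_BUILD_TESTS=ON', '-DARROW_PARQUET=OFF']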
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import inspect -import tokenize -from contextlib import contextmanager - -try: - from numpydoc.validate import Docstring, validate -except ImportError: - have_numpydoc = False -else: - have_numpydoc = True - -from ..utils.command import Command, capture_stdout, default_bin - - -class Flake8(Command): - def __init__(self, flake8_bin=None): - self.bin = default_bin(flake8_bin, "flake8") - - -class Autopep8(Command): - def __init__(self, autopep8_bin=None): - self.bin = default_bin(autopep8_bin, "autopep8") - - @capture_stdout() - def run_captured(self, *args, **kwargs): - return self.run(*args, **kwargs) - - -def _tokenize_signature(s): - lines = s.encode('ascii').splitlines() - generator = iter(lines).__next__ - return tokenize.tokenize(generator) - - -def _convert_typehint(tokens): - names = [] - opening_bracket_reached = False - for token in tokens: - # omit the tokens before the opening bracket - if not opening_bracket_reached: - if token.string == '(': - opening_bracket_reached = True - else: - continue - - if token.type == 1: # type 1 means NAME token - names.append(token) - else: - if len(names) == 1: - yield (names[0].type, names[0].string) - elif len(names) == 2: - # two "NAME" tokens follow each other which means a cython - # typehint like `bool argument`, so remove the typehint - # note that we could convert it to python typehints, but hints - # are not supported by _signature_fromstr - yield (names[1].type, names[1].string) - elif len(names) > 2: - raise ValueError('More than two NAME tokens follow each other') - names = [] - yield (token.type, token.string) - - -def inspect_signature(obj): - """ - Custom signature inspection primarily for cython generated callables. - - Cython puts the signatures to the first line of the docstrings, which we - can reuse to parse the python signature from, but some gymnastics are - required, like removing the cython typehints. - - It converts the cython signature: - array(obj, type=None, mask=None, size=None, from_pandas=None, - bool safe=True, MemoryPool memory_pool=None) - To: - - """ - cython_signature = obj.__doc__.splitlines()[0] - cython_tokens = _tokenize_signature(cython_signature) - python_tokens = _convert_typehint(cython_tokens) - python_signature = tokenize.untokenize(python_tokens) - return inspect._signature_fromstr(inspect.Signature, obj, python_signature) - - -class NumpyDoc: - - def __init__(self, symbols=None): - if not have_numpydoc: - raise RuntimeError( - 'Numpydoc is not available, install the development version ' - 'with command: pip install ' - 'git+https://github.com/numpy/numpydoc' - ) - self.symbols = set(symbols or {'pyarrow'}) - - def traverse(self, fn, obj, from_package): - """Apply a function on publicly exposed API components. - - Recursively iterates over the members of the passed object. It omits - any '_' prefixed and thirdparty (non pyarrow) symbols. 
- - Parameters - ---------- - obj : Any - from_package : string, default 'pyarrow' - Predicate to only consider objects from this package. - """ - todo = [obj] - seen = set() - - while todo: - obj = todo.pop() - if obj in seen: - continue - else: - seen.add(obj) - - fn(obj) - - for name in dir(obj): - if name.startswith('_'): - continue - - member = getattr(obj, name) - module = getattr(member, '__module__', None) - if not (module and module.startswith(from_package)): - continue - - todo.append(member) - - @contextmanager - def _apply_patches(self): - """ - Patch Docstring class to bypass loading already loaded python objects. - """ - orig_load_obj = Docstring._load_obj - orig_signature = inspect.signature - - @staticmethod - def _load_obj(obj): - # By default it expects a qualname and import the object, but we - # have already loaded object after the API traversal. - if isinstance(obj, str): - return orig_load_obj(obj) - else: - return obj - - def signature(obj): - # inspect.signature tries to parse __text_signature__ if other - # properties like __signature__ doesn't exists, but cython - # doesn't set that property despite that embedsignature cython - # directive is set. The only way to inspect a cython compiled - # callable's signature to parse it from __doc__ while - # embedsignature directive is set during the build phase. - # So path inspect.signature function to attempt to parse the first - # line of callable.__doc__ as a signature. - try: - return orig_signature(obj) - except Exception as orig_error: - try: - return inspect_signature(obj) - except Exception: - raise orig_error - - try: - Docstring._load_obj = _load_obj - inspect.signature = signature - yield - finally: - Docstring._load_obj = orig_load_obj - inspect.signature = orig_signature - - def validate(self, from_package='', allow_rules=None, - disallow_rules=None): - results = [] - - def callback(obj): - result = validate(obj) - - errors = [] - for errcode, errmsg in result.get('errors', []): - if allow_rules and errcode not in allow_rules: - continue - if disallow_rules and errcode in disallow_rules: - continue - errors.append((errcode, errmsg)) - - if len(errors): - result['errors'] = errors - results.append((obj, result)) - - with self._apply_patches(): - for symbol in self.symbols: - try: - obj = Docstring._load_obj(symbol) - except (ImportError, AttributeError): - print('{} is not available for import'.format(symbol)) - else: - self.traverse(callback, obj, from_package=from_package) - - return results diff --git a/dev/archery/archery/lang/rust.py b/dev/archery/archery/lang/rust.py deleted file mode 100644 index b1d765b7d52e3..0000000000000 --- a/dev/archery/archery/lang/rust.py +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
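inspect_signature() in the removed python.py recovers a Python signature from the first docstring line of a cython-compiled callable by tokenizing it and dropping the cython typehint whenever two NAME tokens appear back to back (for example "bool safe"). A self-contained sketch of that tokenize-based cleanup, kept separate from the archery helpers (the spacing of the result is rough, which is acceptable because the string is only ever re-parsed):

    import tokenize

    def strip_cython_typehints(signature):
        # tokenize.tokenize() wants a bytes readline callable and treats
        # StopIteration as end of input.
        readline = iter(signature.encode("ascii").splitlines()).__next__
        tokens = tokenize.tokenize(readline)

        out, names, in_params = [], [], False
        for tok in tokens:
            if not in_params:
                if tok.string == "(":
                    in_params = True   # skip the ENCODING token and callable name
                else:
                    continue
            if tok.type == tokenize.NAME:
                names.append(tok)
                continue
            if names:
                # Consecutive NAME tokens mean "<cython type> <argument>";
                # keep only the argument name.
                out.append((names[-1].type, names[-1].string))
                names = []
            out.append((tok.type, tok.string))
        return tokenize.untokenize(out)

    print(strip_cython_typehints(
        "array(obj, type=None, mask=None, bool safe=True, "
        "MemoryPool memory_pool=None)"))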
- -from ..utils.command import Command, default_bin - - -class Cargo(Command): - def __init__(self, cargo_bin=None): - self.bin = default_bin(cargo_bin, "cargo") diff --git a/dev/archery/archery/release.py b/dev/archery/archery/release.py deleted file mode 100644 index acfe3fc237370..0000000000000 --- a/dev/archery/archery/release.py +++ /dev/null @@ -1,535 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from collections import defaultdict -import functools -import os -import re -import pathlib -import shelve -import warnings - -from git import Repo -from jira import JIRA -from semver import VersionInfo as SemVer - -from .utils.source import ArrowSources -from .utils.report import JinjaReport - - -def cached_property(fn): - return property(functools.lru_cache(maxsize=1)(fn)) - - -class Version(SemVer): - - __slots__ = ('released', 'release_date') - - def __init__(self, released=False, release_date=None, **kwargs): - super().__init__(**kwargs) - self.released = released - self.release_date = release_date - - @classmethod - def parse(cls, version, **kwargs): - return cls(**SemVer.parse(version).to_dict(), **kwargs) - - @classmethod - def from_jira(cls, jira_version): - return cls.parse( - jira_version.name, - released=jira_version.released, - release_date=getattr(jira_version, 'releaseDate', None) - ) - - -class Issue: - - def __init__(self, key, type, summary): - self.key = key - self.type = type - self.summary = summary - - @classmethod - def from_jira(cls, jira_issue): - return cls( - key=jira_issue.key, - type=jira_issue.fields.issuetype.name, - summary=jira_issue.fields.summary - ) - - @property - def project(self): - return self.key.split('-')[0] - - @property - def number(self): - return int(self.key.split('-')[1]) - - -class Jira(JIRA): - - def __init__(self, user=None, password=None, - url='https://issues.apache.org/jira'): - user = user or os.environ.get('APACHE_JIRA_USER') - password = password or os.environ.get('APACHE_JIRA_PASSWORD') - super().__init__(url, basic_auth=(user, password)) - - def project_version(self, version_string, project='ARROW'): - # query version from jira to populated with additional metadata - versions = {str(v): v for v in self.project_versions(project)} - return versions[version_string] - - def project_versions(self, project): - versions = [] - for v in super().project_versions(project): - try: - versions.append(Version.from_jira(v)) - except ValueError: - # ignore invalid semantic versions like JS-0.4.0 - continue - return sorted(versions, reverse=True) - - def issue(self, key): - return Issue.from_jira(super().issue(key)) - - def project_issues(self, version, project='ARROW'): - query = "project={} AND fixVersion={}".format(project, version) - issues = super().search_issues(query, maxResults=False) - return 
list(map(Issue.from_jira, issues)) - - -class CachedJira: - - def __init__(self, cache_path, jira=None): - self.jira = jira or Jira() - self.cache_path = cache_path - - def __getattr__(self, name): - attr = getattr(self.jira, name) - return self._cached(name, attr) if callable(attr) else attr - - def _cached(self, name, method): - def wrapper(*args, **kwargs): - key = str((name, args, kwargs)) - with shelve.open(self.cache_path) as cache: - try: - result = cache[key] - except KeyError: - cache[key] = result = method(*args, **kwargs) - return result - return wrapper - - -_TITLE_REGEX = re.compile( - r"(?P(?P(ARROW|PARQUET))\-\d+)?\s*:?\s*" - r"(?P\[.*\])?\s*(?P

.*)" -) -_COMPONENT_REGEX = re.compile(r"\[([^\[\]]+)\]") - - -class CommitTitle: - - def __init__(self, summary, project=None, issue=None, components=None): - self.project = project - self.issue = issue - self.components = components or [] - self.summary = summary - - def __str__(self): - out = "" - if self.issue: - out += "{}: ".format(self.issue) - if self.components: - for component in self.components: - out += "[{}]".format(component) - out += " " - out += self.summary - return out - - def __eq__(self, other): - return ( - self.summary == other.summary and - self.project == other.project and - self.issue == other.issue and - self.components == other.components - ) - - def __hash__(self): - return hash( - (self.summary, self.project, self.issue, tuple(self.components)) - ) - - @classmethod - def parse(cls, headline): - matches = _TITLE_REGEX.match(headline) - if matches is None: - warnings.warn( - "Unable to parse commit message `{}`".format(headline) - ) - return CommitTitle(headline) - - values = matches.groupdict() - components = values.get('components') or '' - components = _COMPONENT_REGEX.findall(components) - - return CommitTitle( - values['summary'], - project=values.get('project'), - issue=values.get('issue'), - components=components - ) - - -class Commit: - - def __init__(self, wrapped): - self._title = CommitTitle.parse(wrapped.summary) - self._wrapped = wrapped - - def __getattr__(self, attr): - if hasattr(self._title, attr): - return getattr(self._title, attr) - else: - return getattr(self._wrapped, attr) - - def __repr__(self): - template = '' - return template.format(self.hexsha, self.issue, self.components, - self.summary) - - @property - def url(self): - return 'https://github.com/apache/arrow/commit/{}'.format(self.hexsha) - - @property - def title(self): - return self._title - - -class ReleaseCuration(JinjaReport): - templates = { - 'console': 'release_curation.txt.j2' - } - fields = [ - 'release', - 'within', - 'outside', - 'nojira', - 'parquet', - 'nopatch' - ] - - -class JiraChangelog(JinjaReport): - templates = { - 'markdown': 'release_changelog.md.j2', - 'html': 'release_changelog.html.j2' - } - fields = [ - 'release', - 'categories' - ] - - -class Release: - - def __init__(self): - raise TypeError("Do not initialize Release class directly, use " - "Release.from_jira(version) instead.") - - def __repr__(self): - if self.version.released: - status = "released_at={!r}".format(self.version.release_date) - else: - status = "pending" - return "<{} {!r} {}>".format(self.__class__.__name__, - str(self.version), status) - - @staticmethod - def from_jira(version, jira=None, repo=None): - if jira is None: - jira = Jira() - elif isinstance(jira, str): - jira = Jira(jira) - elif not isinstance(jira, (Jira, CachedJira)): - raise TypeError("`jira` argument must be a server url or a valid " - "Jira instance") - - if repo is None: - arrow = ArrowSources.find() - repo = Repo(arrow.path) - elif isinstance(repo, (str, pathlib.Path)): - repo = Repo(repo) - elif not isinstance(repo, Repo): - raise TypeError("`repo` argument must be a path or a valid Repo " - "instance") - - if isinstance(version, str): - version = jira.project_version(version, project='ARROW') - elif not isinstance(version, Version): - raise TypeError(version) - - # decide the type of the release based on the version number - if version.patch == 0: - if version.minor == 0: - klass = MajorRelease - elif version.major == 0: - # handle minor releases before 1.0 as major releases - klass = MajorRelease - else: - klass = 
MinorRelease - else: - klass = PatchRelease - - # prevent instantiating release object directly - obj = klass.__new__(klass) - obj.version = version - obj.jira = jira - obj.repo = repo - - return obj - - @property - def is_released(self): - return self.version.released - - @property - def tag(self): - return "apache-arrow-{}".format(str(self.version)) - - @property - def branch(self): - raise NotImplementedError() - - @property - def siblings(self): - """ - Releases to consider when calculating previous and next releases. - """ - raise NotImplementedError() - - @cached_property - def previous(self): - # select all non-patch releases - position = self.siblings.index(self.version) - try: - previous = self.siblings[position + 1] - except IndexError: - # first release doesn't have a previous one - return None - else: - return Release.from_jira(previous, jira=self.jira, repo=self.repo) - - @cached_property - def next(self): - # select all non-patch releases - position = self.siblings.index(self.version) - if position <= 0: - raise ValueError("There is no upcoming release set in JIRA after " - "version {}".format(self.version)) - upcoming = self.siblings[position - 1] - return Release.from_jira(upcoming, jira=self.jira, repo=self.repo) - - @cached_property - def issues(self): - issues = self.jira.project_issues(self.version, project='ARROW') - return {i.key: i for i in issues} - - @cached_property - def commits(self): - """ - All commits applied between two versions. - """ - if self.previous is None: - # first release - lower = '' - else: - lower = self.repo.tags[self.previous.tag] - - if self.version.released: - upper = self.repo.tags[self.tag] - else: - try: - upper = self.repo.branches[self.branch] - except IndexError: - warnings.warn("Release branch `{}` doesn't exist." 
- .format(self.branch)) - return [] - - commit_range = "{}..{}".format(lower, upper) - return list(map(Commit, self.repo.iter_commits(commit_range))) - - def curate(self): - # handle commits with parquet issue key specially and query them from - # jira and add it to the issues - release_issues = self.issues - - within, outside, nojira, parquet = [], [], [], [] - for c in self.commits: - if c.issue is None: - nojira.append(c) - elif c.issue in release_issues: - within.append((release_issues[c.issue], c)) - elif c.project == 'PARQUET': - parquet.append((self.jira.issue(c.issue), c)) - else: - outside.append((self.jira.issue(c.issue), c)) - - # remaining jira tickets - within_keys = {i.key for i, c in within} - nopatch = [issue for key, issue in release_issues.items() - if key not in within_keys] - - return ReleaseCuration(release=self, within=within, outside=outside, - nojira=nojira, parquet=parquet, nopatch=nopatch) - - def changelog(self): - release_issues = [] - - # get organized report for the release - curation = self.curate() - - # jira tickets having patches in the release - for issue, _ in curation.within: - release_issues.append(issue) - - # jira tickets without patches - for issue in curation.nopatch: - release_issues.append(issue) - - # parquet patches in the release - for issue, _ in curation.parquet: - release_issues.append(issue) - - # organize issues into categories - issue_types = { - 'Bug': 'Bug Fixes', - 'Improvement': 'New Features and Improvements', - 'New Feature': 'New Features and Improvements', - 'Sub-task': 'New Features and Improvements', - 'Task': 'New Features and Improvements', - 'Test': 'Bug Fixes', - 'Wish': 'New Features and Improvements', - } - categories = defaultdict(list) - for issue in release_issues: - categories[issue_types[issue.type]].append(issue) - - # sort issues by the issue key in ascending order - for name, issues in categories.items(): - issues.sort(key=lambda issue: (issue.project, issue.number)) - - return JiraChangelog(release=self, categories=categories) - - -class MaintenanceMixin: - """ - Utility methods for cherry-picking commits from the main branch. 
- """ - - def commits_to_pick(self, exclude_already_applied=True): - # collect commits applied on the main branch since the root of the - # maintenance branch (the previous major release) - if self.version.major == 0: - # treat minor releases as major releases preceeding 1.0.0 release - commit_range = "apache-arrow-0.{}.0..master".format( - self.version.minor - 1 - ) - else: - commit_range = "apache-arrow-{}.0.0..master".format( - self.version.major - ) - - # keeping the original order of the commits helps to minimize the merge - # conflicts during cherry-picks - commits = map(Commit, self.repo.iter_commits(commit_range)) - - # exclude patches that have been already applied to the maintenance - # branch, we cannot identify patches based on sha because it changes - # after the cherry pick so use commit title instead - if exclude_already_applied: - already_applied = {c.title for c in self.commits} - else: - already_applied = set() - - # iterate over the commits applied on the main branch and filter out - # the ones that are included in the jira release - patches_to_pick = [c for c in commits if - c.issue in self.issues and - c.title not in already_applied] - - return reversed(patches_to_pick) - - def cherry_pick_commits(self, recreate_branch=True): - if recreate_branch: - # delete, create and checkout the maintenance branch based off of - # the previous tag - if self.branch in self.repo.branches: - self.repo.git.branch('-D', self.branch) - self.repo.git.checkout(self.previous.tag, b=self.branch) - else: - # just checkout the already existing maintenance branch - self.repo.git.checkout(self.branch) - - # cherry pick the commits based on the jira tickets - for commit in self.commits_to_pick(): - self.repo.git.cherry_pick(commit.hexsha) - - -class MajorRelease(Release): - - @property - def branch(self): - return "master" - - @cached_property - def siblings(self): - """ - Filter only the major releases. - """ - # handle minor releases before 1.0 as major releases - return [v for v in self.jira.project_versions('ARROW') - if v.patch == 0 and (v.major == 0 or v.minor == 0)] - - -class MinorRelease(Release, MaintenanceMixin): - - @property - def branch(self): - return "maint-{}.x.x".format(self.version.major) - - @cached_property - def siblings(self): - """ - Filter the major and minor releases. - """ - return [v for v in self.jira.project_versions('ARROW') if v.patch == 0] - - -class PatchRelease(Release, MaintenanceMixin): - - @property - def branch(self): - return "maint-{}.{}.x".format(self.version.major, self.version.minor) - - @cached_property - def siblings(self): - """ - No filtering, consider all releases. - """ - return self.jira.project_versions('ARROW') diff --git a/dev/archery/archery/templates/release_changelog.md.j2 b/dev/archery/archery/templates/release_changelog.md.j2 deleted file mode 100644 index c0406ddf4e224..0000000000000 --- a/dev/archery/archery/templates/release_changelog.md.j2 +++ /dev/null @@ -1,29 +0,0 @@ -{# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
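MaintenanceMixin.commits_to_pick() above selects backport candidates by commit title rather than by sha, since cherry-picking rewrites the sha; a commit qualifies when its issue belongs to the JIRA release and no identically titled commit is already on the maintenance branch. A toy standalone sketch of that filter, using plain tuples instead of GitPython commit objects (the commit data here is illustrative only):

    from collections import namedtuple

    Commit = namedtuple("Commit", ["title", "issue"])

    def commits_to_pick(master_commits, maintenance_commits, release_issues):
        # Identify already applied patches by title, not sha, because the sha
        # changes after a cherry-pick.
        already_applied = {c.title for c in maintenance_commits}
        picks = [c for c in master_commits
                 if c.issue in release_issues and c.title not in already_applied]
        # Return oldest first to minimize merge conflicts while cherry-picking.
        return list(reversed(picks))

    master = [
        Commit("ARROW-1002: [Rust] Fix schema bug", "ARROW-1002"),    # newest
        Commit("ARROW-1001: [C++] Fix CSV reader", "ARROW-1001"),
        Commit("ARROW-0999: [Python] Unrelated work", "ARROW-0999"),  # oldest
    ]
    maintenance = [Commit("ARROW-1001: [C++] Fix CSV reader", "ARROW-1001")]
    print(commits_to_pick(master, maintenance, {"ARROW-1001", "ARROW-1002"}))
    # [Commit(title='ARROW-1002: [Rust] Fix schema bug', issue='ARROW-1002')]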
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#} -# Apache Arrow {{ release.version }} ({{ release.version.release_date or today() }}) - -{% for category, issues in categories.items() -%} - -## {{ category }} - -{% for issue in issues -%} -* [{{ issue.key }}](https://issues.apache.org/jira/browse/{{ issue.key }}) - {{ issue.summary | md }} -{% endfor %} - -{% endfor %} diff --git a/dev/archery/archery/templates/release_curation.txt.j2 b/dev/archery/archery/templates/release_curation.txt.j2 deleted file mode 100644 index a5d11e9d4af5f..0000000000000 --- a/dev/archery/archery/templates/release_curation.txt.j2 +++ /dev/null @@ -1,41 +0,0 @@ -{# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#} -Total number of JIRA tickets assigned to version {{ release.version }}: {{ release.issues|length }} - -Total number of applied patches since version {{ release.previous.version }}: {{ release.commits|length }} - -Patches with assigned issue in version {{ release.version }}: -{% for issue, commit in within -%} - - {{ commit.url }} {{ commit.title }} -{% endfor %} - -Patches with assigned issue outside of version {{ release.version }}: -{% for issue, commit in outside -%} - - {{ commit.url }} {{ commit.title }} -{% endfor %} - -Patches in version {{ release.version }} without a linked issue: -{% for commit in nojira -%} - - {{ commit.url }} {{ commit.title }} -{% endfor %} - -JIRA issues in version {{ release.version }} without a linked patch: -{% for issue in nopatch -%} - - https://issues.apache.org/jira/browse/{{ issue.key }} -{% endfor %} diff --git a/dev/archery/archery/testing.py b/dev/archery/archery/testing.py deleted file mode 100644 index 471a54d4c72cf..0000000000000 --- a/dev/archery/archery/testing.py +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from contextlib import contextmanager -import os -from unittest import mock -import re - - -class DotDict(dict): - - def __getattr__(self, key): - try: - item = self[key] - except KeyError: - raise AttributeError(key) - if isinstance(item, dict): - return DotDict(item) - else: - return item - - -class PartialEnv(dict): - - def __eq__(self, other): - return self.items() <= other.items() - - -_mock_call_type = type(mock.call()) - - -def _ensure_mock_call_object(obj, **kwargs): - if isinstance(obj, _mock_call_type): - return obj - elif isinstance(obj, str): - cmd = re.split(r"\s+", obj) - return mock.call(cmd, **kwargs) - elif isinstance(obj, list): - return mock.call(obj, **kwargs) - else: - raise TypeError(obj) - - -class SuccessfulSubprocessResult: - - def check_returncode(self): - return - - -@contextmanager -def assert_subprocess_calls(expected_commands_or_calls, **kwargs): - calls = [ - _ensure_mock_call_object(obj, **kwargs) - for obj in expected_commands_or_calls - ] - with mock.patch('subprocess.run', autospec=True) as run: - run.return_value = SuccessfulSubprocessResult() - yield run - run.assert_has_calls(calls) - - -@contextmanager -def override_env(mapping): - original = os.environ - try: - os.environ = dict(os.environ, **mapping) - yield os.environ - finally: - os.environ = original diff --git a/dev/archery/archery/tests/fixtures/archery-benchmark-diff-empty-lines.jsonl b/dev/archery/archery/tests/fixtures/archery-benchmark-diff-empty-lines.jsonl deleted file mode 100644 index 5854eb75c9792..0000000000000 --- a/dev/archery/archery/tests/fixtures/archery-benchmark-diff-empty-lines.jsonl +++ /dev/null @@ -1,6 +0,0 @@ -{"benchmark": "RegressionSumKernel/32768/10", "change": 0.0046756468886368545, "regression": false, "baseline": 13265442258.099466, "contender": 13327466781.91994, "unit": "bytes_per_second", "less_is_better": false, "suite": "arrow-compute-aggregate-benchmark"} -{"benchmark": "RegressionSumKernel/32768/1", "change": 0.0025108399115900733, "regression": false, "baseline": 15181891659.539782, "contender": 15220010959.05199, "unit": "bytes_per_second", "less_is_better": false, "suite": "arrow-compute-aggregate-benchmark"} - -{"benchmark": "RegressionSumKernel/32768/50", "change": 0.00346735806287155, "regression": false, "baseline": 11471825667.817123, "contender": 11511602595.042286, "unit": "bytes_per_second", "less_is_better": false, "suite": "arrow-compute-aggregate-benchmark"} - -{"benchmark": "RegressionSumKernel/32768/0", "change": 0.010140954727954987, "regression": false, "baseline": 18316987019.994465, "contender": 18502738756.116768, "unit": "bytes_per_second", "less_is_better": false, "suite": "arrow-compute-aggregate-benchmark"} diff --git a/dev/archery/archery/tests/fixtures/archery-benchmark-diff.jsonl b/dev/archery/archery/tests/fixtures/archery-benchmark-diff.jsonl deleted file mode 100644 index 1e25810d776ea..0000000000000 --- a/dev/archery/archery/tests/fixtures/archery-benchmark-diff.jsonl +++ /dev/null @@ -1,4 +0,0 @@ 
-{"benchmark":"RegressionSumKernel/32768/50","change":-0.001550846227215492,"regression":false,"baseline":19241207435.428757,"contender":19211367281.47045,"unit":"bytes_per_second","less_is_better":false,"suite":"arrow-compute-aggregate-benchmark"} -{"benchmark":"RegressionSumKernel/32768/1","change":0.0020681767923465765,"regression":true,"baseline":24823170673.777943,"contender":24771831968.277977,"unit":"bytes_per_second","less_is_better":false,"suite":"arrow-compute-aggregate-benchmark"} -{"benchmark":"RegressionSumKernel/32768/10","change":0.0033323376378746905,"regression":false,"baseline":21902707565.968014,"contender":21975694782.76145,"unit":"bytes_per_second","less_is_better":false,"suite":"arrow-compute-aggregate-benchmark"} -{"benchmark":"RegressionSumKernel/32768/0","change":-0.004918126090954414,"regression":true,"baseline":27685006611.446762,"contender":27821164964.790764,"unit":"bytes_per_second","less_is_better":false,"suite":"arrow-compute-aggregate-benchmark"} diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json b/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json deleted file mode 100644 index d591105f0798b..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-build-command.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "MEMBER", - "body": "@ursabot build", - "created_at": "2019-04-05T11:55:43Z", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248726", - "id": 480248726, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0ODcyNg==", - "updated_at": "2019-04-05T11:55:43Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248726", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "MEMBER", - "body": "", - "closed_at": null, - "comments": 3, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "created_at": "2019-04-05T11:22:15Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "labels": [], - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "pull_request": { - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "html_url": 
"https://github.com/ursa-labs/ursabot/pull/26", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Unittests for GithubHook", - "updated_at": "2019-04-05T11:55:43Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": 
"https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": 
"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json b/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json deleted file mode 100644 index 5a8f3461c0ca9..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-by-non-authorized-user.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "NONE", - "body": "Unknown command \"\"", - "created_at": "2019-04-05T11:35:47Z", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815", - "id": 480243815, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==", - "updated_at": "2019-04-05T11:35:47Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815", - "user": { - "avatar_url": "https://avatars2.githubusercontent.com/u/49275095?v=4", - "events_url": "https://api.github.com/users/ursabot/events{/privacy}", - "followers_url": "https://api.github.com/users/ursabot/followers", - "following_url": "https://api.github.com/users/ursabot/following{/other_user}", - "gists_url": "https://api.github.com/users/ursabot/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursabot", - "id": 49275095, - "login": "someone", - "node_id": "MDQ6VXNlcjQ5Mjc1MDk1", - "organizations_url": "https://api.github.com/users/ursabot/orgs", - "received_events_url": "https://api.github.com/users/ursabot/received_events", - "repos_url": "https://api.github.com/users/ursabot/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursabot/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursabot/subscriptions", - "type": "User", - "url": "https://api.github.com/users/ursabot" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "NONE", - "body": "", - "closed_at": null, - "comments": 2, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "created_at": "2019-04-05T11:22:15Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "labels": [], - "labels_url": 
"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "pull_request": { - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Unittests for GithubHook", - "updated_at": "2019-04-05T11:35:47Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - 
"description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "someone", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": 
"https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars2.githubusercontent.com/u/49275095?v=4", - "events_url": "https://api.github.com/users/ursabot/events{/privacy}", - "followers_url": "https://api.github.com/users/ursabot/followers", - "following_url": "https://api.github.com/users/ursabot/following{/other_user}", - "gists_url": "https://api.github.com/users/ursabot/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursabot", - "id": 49275095, - "login": "someone", - "node_id": "MDQ6VXNlcjQ5Mjc1MDk1", - "organizations_url": "https://api.github.com/users/ursabot/orgs", - "received_events_url": "https://api.github.com/users/ursabot/received_events", - "repos_url": "https://api.github.com/users/ursabot/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursabot/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursabot/subscriptions", - "type": "User", - "url": "https://api.github.com/users/ursabot" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json b/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json deleted file mode 100644 index bfb7210df8a3a..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-by-ursabot.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "NONE", - "body": "Unknown command \"\"", - "created_at": "2019-04-05T11:35:47Z", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815", - "id": 480243815, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==", - "updated_at": "2019-04-05T11:35:47Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815", - "user": { - "avatar_url": "https://avatars2.githubusercontent.com/u/49275095?v=4", - "events_url": "https://api.github.com/users/ursabot/events{/privacy}", - "followers_url": "https://api.github.com/users/ursabot/followers", - "following_url": "https://api.github.com/users/ursabot/following{/other_user}", - "gists_url": "https://api.github.com/users/ursabot/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursabot", - "id": 49275095, - "login": "ursabot", - "node_id": "MDQ6VXNlcjQ5Mjc1MDk1", - "organizations_url": "https://api.github.com/users/ursabot/orgs", - "received_events_url": "https://api.github.com/users/ursabot/received_events", - "repos_url": "https://api.github.com/users/ursabot/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursabot/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursabot/subscriptions", - "type": "User", - "url": "https://api.github.com/users/ursabot" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "MEMBER", - "body": "", - "closed_at": null, - "comments": 2, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "created_at": 
"2019-04-05T11:22:15Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "labels": [], - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "pull_request": { - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Unittests for GithubHook", - "updated_at": "2019-04-05T11:35:47Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": 
"https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": 
"https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars2.githubusercontent.com/u/49275095?v=4", - "events_url": "https://api.github.com/users/ursabot/events{/privacy}", - "followers_url": "https://api.github.com/users/ursabot/followers", - "following_url": "https://api.github.com/users/ursabot/following{/other_user}", - "gists_url": "https://api.github.com/users/ursabot/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursabot", - "id": 49275095, - "login": "ursabot", - "node_id": "MDQ6VXNlcjQ5Mjc1MDk1", - "organizations_url": "https://api.github.com/users/ursabot/orgs", - "received_events_url": "https://api.github.com/users/ursabot/received_events", - "repos_url": "https://api.github.com/users/ursabot/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursabot/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursabot/subscriptions", - "type": "User", - "url": "https://api.github.com/users/ursabot" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json b/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json deleted file mode 100644 index a3d450078aeb0..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-not-mentioning-ursabot.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "MEMBER", - "body": "bear is no game", - "created_at": "2019-04-05T11:26:56Z", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727", - "id": 480241727, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==", - "updated_at": "2019-04-05T11:26:56Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": 
"https://api.github.com/users/kszucs" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "MEMBER", - "body": "", - "closed_at": null, - "comments": 0, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "created_at": "2019-04-05T11:22:15Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "labels": [], - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "pull_request": { - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Unittests for GithubHook", - "updated_at": "2019-04-05T11:26:56Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": 
"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": 
"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json b/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json deleted file mode 100644 index c88197c8e0244..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-with-empty-command.json +++ /dev/null @@ -1,217 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "MEMBER", - "body": "@ursabot ", - "body_html": "", - "body_text": "", - "created_at": "2019-04-05T11:35:46Z", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811", - "id": 480243811, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==", - "updated_at": "2019-04-05T11:35:46Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": 
"https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "MEMBER", - "body": "", - "body_html": "", - "body_text": "", - "closed_at": null, - "closed_by": null, - "comments": 1, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "created_at": "2019-04-05T11:22:15Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "labels": [], - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "pull_request": { - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Unittests for GithubHook", - "updated_at": "2019-04-05T11:35:46Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": 
"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": 
"https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-issue-comment-without-pull-request.json b/dev/archery/archery/tests/fixtures/event-issue-comment-without-pull-request.json deleted file mode 100644 index 9e362fc0e1bc0..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-issue-comment-without-pull-request.json +++ /dev/null @@ -1,206 +0,0 @@ -{ - "action": "created", - "comment": { - "author_association": "MEMBER", - "body": "@ursabot build", - "created_at": "2019-04-05T13:07:57Z", - "html_url": "https://github.com/ursa-labs/ursabot/issues/19#issuecomment-480268708", - "id": 480268708, - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19", - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI2ODcwOA==", - "updated_at": "2019-04-05T13:07:57Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480268708", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": 
"https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "issue": { - "assignee": null, - "assignees": [], - "author_association": "MEMBER", - "body": "", - "closed_at": null, - "comments": 5, - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/comments", - "created_at": "2019-04-02T09:56:41Z", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/events", - "html_url": "https://github.com/ursa-labs/ursabot/issues/19", - "id": 428131685, - "labels": [], - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/labels{/name}", - "locked": false, - "milestone": null, - "node_id": "MDU6SXNzdWU0MjgxMzE2ODU=", - "number": 19, - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "state": "open", - "title": "Build ursabot itself via ursabot", - "updated_at": "2019-04-05T13:07:57Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19", - "user": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } - }, - "organization": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "description": "Innovation lab for open source data science tools, powered by Apache Arrow", - "events_url": "https://api.github.com/orgs/ursa-labs/events", - "hooks_url": "https://api.github.com/orgs/ursa-labs/hooks", - "id": 46514972, - "issues_url": "https://api.github.com/orgs/ursa-labs/issues", - "login": "ursa-labs", - "members_url": "https://api.github.com/orgs/ursa-labs/members{/member}", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "public_members_url": "https://api.github.com/orgs/ursa-labs/public_members{/member}", - "repos_url": "https://api.github.com/orgs/ursa-labs/repos", - "url": "https://api.github.com/orgs/ursa-labs" - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - 
"branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": 
"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T12:01:40Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 898, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/event-pull-request-opened.json b/dev/archery/archery/tests/fixtures/event-pull-request-opened.json deleted file mode 100644 index 9cf5c0dda7843..0000000000000 --- a/dev/archery/archery/tests/fixtures/event-pull-request-opened.json +++ /dev/null @@ -1,445 +0,0 @@ -{ - "action": "opened", - "number": 26, - "pull_request": { - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26", - "id": 267785552, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "number": 26, - "state": "open", - "locked": false, - "title": "Unittests for GithubHook", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": 
"https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "body": "", - "created_at": "2019-04-05T11:22:15Z", - "updated_at": "2019-04-05T12:01:40Z", - "closed_at": null, - "merged_at": null, - "merge_commit_sha": "cc5dc3606988b3824be54df779ed2028776113cb", - "assignee": null, - "assignees": [], - "requested_reviewers": [], - "requested_teams": [], - "labels": [], - "milestone": null, - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits", - "review_comments_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments", - "review_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d", - "head": { - "label": "ursa-labs:test-hook", - "ref": "test-hook", - "sha": "2705da2b616b98fa6010a25813c5a7a27456f71d", - "user": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "repo": { - "id": 169101701, - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "name": "ursabot", - "full_name": "ursa-labs/ursabot", - "private": false, - "owner": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - 
"events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "html_url": "https://github.com/ursa-labs/ursabot", - "description": null, - "fork": false, - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "created_at": "2019-02-04T15:40:31Z", - "updated_at": "2019-04-04T17:49:10Z", - "pushed_at": "2019-04-05T12:01:40Z", - "git_url": "git://github.com/ursa-labs/ursabot.git", - 
"ssh_url": "git@github.com:ursa-labs/ursabot.git", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "svn_url": "https://github.com/ursa-labs/ursabot", - "homepage": null, - "size": 898, - "stargazers_count": 1, - "watchers_count": 1, - "language": "Jupyter Notebook", - "has_issues": true, - "has_projects": true, - "has_downloads": true, - "has_wiki": true, - "has_pages": false, - "forks_count": 0, - "mirror_url": null, - "archived": false, - "disabled": false, - "open_issues_count": 19, - "license": null, - "forks": 0, - "open_issues": 19, - "watchers": 1, - "default_branch": "master" - } - }, - "base": { - "label": "ursa-labs:master", - "ref": "master", - "sha": "a162ad254b589b924db47e057791191b39613fd5", - "user": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "repo": { - "id": 169101701, - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "name": "ursabot", - "full_name": "ursa-labs/ursabot", - "private": false, - "owner": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "html_url": "https://github.com/ursa-labs/ursabot", - "description": null, - "fork": false, - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "issue_events_url": 
"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "created_at": "2019-02-04T15:40:31Z", - "updated_at": "2019-04-04T17:49:10Z", - "pushed_at": "2019-04-05T12:01:40Z", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "svn_url": "https://github.com/ursa-labs/ursabot", - "homepage": null, - "size": 898, - "stargazers_count": 1, - "watchers_count": 1, - "language": "Jupyter Notebook", - "has_issues": true, - "has_projects": true, - "has_downloads": true, - "has_wiki": true, - "has_pages": false, - "forks_count": 0, - "mirror_url": null, - "archived": false, - "disabled": false, - "open_issues_count": 19, - "license": null, - "forks": 0, - "open_issues": 19, - "watchers": 1, - "default_branch": "master" - } - }, - "_links": { - "self": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "html": { - "href": "https://github.com/ursa-labs/ursabot/pull/26" - }, 
- "issue": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/issues/26" - }, - "comments": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments" - }, - "review_comments": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments" - }, - "review_comment": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}" - }, - "commits": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits" - }, - "statuses": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d" - } - }, - "author_association": "MEMBER", - "merged": false, - "mergeable": true, - "rebaseable": true, - "mergeable_state": "unstable", - "merged_by": null, - "comments": 5, - "review_comments": 0, - "maintainer_can_modify": false, - "commits": 2, - "additions": 1124, - "deletions": 0, - "changed_files": 7 - }, - "repository": { - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "archived": false, - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "created_at": "2019-02-04T15:40:31Z", - "default_branch": "master", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "description": null, - "disabled": false, - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "fork": false, - "forks": 0, - "forks_count": 0, - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "full_name": "ursa-labs/ursabot", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "has_downloads": true, - "has_issues": true, - "has_pages": false, - "has_projects": true, - "has_wiki": true, - "homepage": null, - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "html_url": "https://github.com/ursa-labs/ursabot", - "id": 169101701, - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "language": "Jupyter Notebook", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - 
"license": null, - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "mirror_url": null, - "name": "ursabot", - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "open_issues": 19, - "open_issues_count": 19, - "owner": { - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/ursa-labs", - "id": 46514972, - "login": "ursa-labs", - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "type": "Organization", - "url": "https://api.github.com/users/ursa-labs" - }, - "private": false, - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "pushed_at": "2019-04-05T11:22:16Z", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "size": 892, - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "stargazers_count": 1, - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "svn_url": "https://github.com/ursa-labs/ursabot", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "updated_at": "2019-04-04T17:49:10Z", - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "watchers": 1, - "watchers_count": 1 - }, - "sender": { - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "gravatar_id": "", - "html_url": "https://github.com/kszucs", - "id": 961747, - "login": "kszucs", - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "repos_url": "https://api.github.com/users/kszucs/repos", - "site_admin": false, - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "type": "User", - "url": "https://api.github.com/users/kszucs" - } -} \ No newline at end of file diff --git 
a/dev/archery/archery/tests/fixtures/issue-19.json b/dev/archery/archery/tests/fixtures/issue-19.json deleted file mode 100644 index 1e49397765e14..0000000000000 --- a/dev/archery/archery/tests/fixtures/issue-19.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19", - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/labels{/name}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/comments", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/19/events", - "html_url": "https://github.com/ursa-labs/ursabot/issues/19", - "id": 428131685, - "node_id": "MDU6SXNzdWU0MjgxMzE2ODU=", - "number": 19, - "title": "Build ursabot itself via ursabot", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "labels": [], - "state": "closed", - "locked": false, - "assignee": null, - "assignees": [], - "milestone": null, - "comments": 8, - "created_at": "2019-04-02T09:56:41Z", - "updated_at": "2019-04-05T13:30:49Z", - "closed_at": "2019-04-05T13:30:49Z", - "author_association": "MEMBER", - "body": "", - "closed_by": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/issue-26.json b/dev/archery/archery/tests/fixtures/issue-26.json deleted file mode 100644 index 44c4d3bedef48..0000000000000 --- a/dev/archery/archery/tests/fixtures/issue-26.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "repository_url": "https://api.github.com/repos/ursa-labs/ursabot", - "labels_url": 
"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/events", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "id": 429706959, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "number": 26, - "title": "Unittests for GithubHook + native asyncio syntax", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "labels": [], - "state": "closed", - "locked": false, - "assignee": null, - "assignees": [], - "milestone": null, - "comments": 9, - "created_at": "2019-04-05T11:22:15Z", - "updated_at": "2019-08-28T00:34:19Z", - "closed_at": "2019-04-05T13:54:34Z", - "author_association": "MEMBER", - "pull_request": { - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch" - }, - "body": "Resolves:\r\n- #26 Unittests for GithubHook + native asyncio syntax\r\n- #27 Use native async/await keywords instead of @inlineCallbacks and yield\r\n", - "closed_by": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - } -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/issue-comment-480243811.json b/dev/archery/archery/tests/fixtures/issue-comment-480243811.json deleted file mode 100644 index 93ee4b13cd469..0000000000000 --- a/dev/archery/archery/tests/fixtures/issue-comment-480243811.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/479081273", 
- "html_url": "https://github.com/ursa-labs/ursabot/pull/21#issuecomment-479081273", - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/21", - "id": 480243811, - "node_id": "MDEyOklzc3VlQ29tbWVudDQ3OTA4MTI3Mw==", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "created_at": "2019-04-02T16:29:46Z", - "updated_at": "2019-04-02T16:29:46Z", - "author_association": "MEMBER", - "body": "@ursabot" -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/issue-comment-480248726.json b/dev/archery/archery/tests/fixtures/issue-comment-480248726.json deleted file mode 100644 index f3cd34083ed10..0000000000000 --- a/dev/archery/archery/tests/fixtures/issue-comment-480248726.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248726", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248726", - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "id": 480248726, - "node_id": "MDEyOklzc3VlQ29tbWVudDQ4MDI0ODcyNg==", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "created_at": "2019-04-05T11:55:43Z", - "updated_at": "2019-04-05T11:55:43Z", - "author_association": "MEMBER", - "body": "@ursabot build" -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/pull-request-26-commit.json b/dev/archery/archery/tests/fixtures/pull-request-26-commit.json deleted file mode 100644 index ffc48943a6ca8..0000000000000 --- a/dev/archery/archery/tests/fixtures/pull-request-26-commit.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "sha": "2705da2b616b98fa6010a25813c5a7a27456f71d", - "node_id": 
"MDY6Q29tbWl0MTY5MTAxNzAxOjI3MDVkYTJiNjE2Yjk4ZmE2MDEwYTI1ODEzYzVhN2EyNzQ1NmY3MWQ=", - "commit": { - "author": { - "name": "Krisztián Szűcs", - "email": "szucs.krisztian@gmail.com", - "date": "2019-04-05T12:01:31Z" - }, - "committer": { - "name": "Krisztián Szűcs", - "email": "szucs.krisztian@gmail.com", - "date": "2019-04-05T12:01:31Z" - }, - "message": "add recorded event requests", - "tree": { - "sha": "16a7bb186833a67e9c2d84a58393503b85500ceb", - "url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees/16a7bb186833a67e9c2d84a58393503b85500ceb" - }, - "url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits/2705da2b616b98fa6010a25813c5a7a27456f71d", - "comment_count": 0, - "verification": { - "verified": true, - "reason": "valid", - "signature": "-----BEGIN PGP SIGNATURE-----\n\niQFOBAABCAA4FiEEOOW2r8dr6sA77zHlgjqBKYe1QKUFAlynQ58aHHN6dWNzLmty\naXN6dGlhbkBnbWFpbC5jb20ACgkQgjqBKYe1QKUYKwf6AiXDMaLqNLNSjRY7lIXX\nudioewz0hSb4bgIXBv30nswu9CoOA0+mHCokEVtZhYbXzXDsZ1KJrilSC4j+Ws4q\nkRGA6iEmrne2HcSKNZXzcVnwV9zpwKxlVh2QCTNb1PuOYFBLH0kwE704uWIWMGDN\nbo8cjQPwegePCRguCvPh/5wa5J3uiq5gmJLG6bC/d1XYE+FJVtlnyzqzLMIryGKe\ntIciw+wwkF413Q/YVbZ49vLUeCX9H8PHC4mZYGDWuvjFW1WTfkjK5bAH+oaTVM6h\n350I5ZFloHmMA/QeRge5qFxXoEBMDGiXHHktzYZDXnliFOQNxzqwirA5lQQ6LRSS\naQ==\n=7rqi\n-----END PGP SIGNATURE-----", - "payload": "tree 16a7bb186833a67e9c2d84a58393503b85500ceb\nparent 446ae69b9385e8d0f40aa9595f723d34383af2f7\nauthor Krisztián Szűcs 1554465691 +0200\ncommitter Krisztián Szűcs 1554465691 +0200\n\nadd recorded event requests\n" - } - }, - "url": "https://api.github.com/repos/ursa-labs/ursabot/commits/2705da2b616b98fa6010a25813c5a7a27456f71d", - "html_url": "https://github.com/ursa-labs/ursabot/commit/2705da2b616b98fa6010a25813c5a7a27456f71d", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/commits/2705da2b616b98fa6010a25813c5a7a27456f71d/comments", - "author": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "committer": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/kszucs/subscriptions", - "organizations_url": 
"https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "parents": [ - { - "sha": "446ae69b9385e8d0f40aa9595f723d34383af2f7", - "url": "https://api.github.com/repos/ursa-labs/ursabot/commits/446ae69b9385e8d0f40aa9595f723d34383af2f7", - "html_url": "https://github.com/ursa-labs/ursabot/commit/446ae69b9385e8d0f40aa9595f723d34383af2f7" - } - ], - "stats": { - "total": 1062, - "additions": 1058, - "deletions": 4 - }, - "files": [ - { - "sha": "dfae6eeaef384ae6180c6302a58b49e39982dc33", - "filename": "ursabot/tests/fixtures/issue-comment-build-command.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-build-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"I've successfully started builds for this PR\",\n+ \"created_at\": \"2019-04-05T11:55:44Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248730\",\n+ \"id\": 480248730,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODczMA==\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248730\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 4,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": 
[],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:55:44Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ 
\"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" - }, - { - "sha": "7ef554e333327f0e62aa1fd76b4b17844a39adeb", - "filename": "ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-by-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"Unknown command \\\"\\\"\",\n+ \"created_at\": \"2019-04-05T11:35:47Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815\",\n+ \"id\": 480243815,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815\",\n+ \"user\": {\n+ \"avatar_url\": 
\"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 2,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data 
science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": 
\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" - }, - { - "sha": "a8082dbc91fdfe815b795e49ec10e49000771ef5", - "filename": "ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"bear is no game\",\n+ \"created_at\": \"2019-04-05T11:26:56Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727\",\n+ \"id\": 480241727,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 0,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": 
\"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "2770e29ba9086394455315e590c0b433d08e437e", - "filename": "ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-with-empty-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot \",\n+ \"created_at\": \"2019-04-05T11:35:46Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811\",\n+ \"id\": 480243811,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ 
\"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 1,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ 
\"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ 
\"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "80ff46510a2f39ae60f7c3a98e5fdaef8e688784", - "filename": "ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "status": "added", - "additions": 206, - "deletions": 0, - "changes": 206, - 
"blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-without-pull-request.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -0,0 +1,206 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"Ursabot only listens to pull request comments!\",\n+ \"created_at\": \"2019-04-05T11:53:43Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/issues/19#issuecomment-480248217\",\n+ \"id\": 480248217,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODIxNw==\",\n+ \"updated_at\": \"2019-04-05T11:53:43Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248217\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 4,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19/comments\",\n+ \"created_at\": \"2019-04-02T09:56:41Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/issues/19\",\n+ \"id\": 428131685,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDU6SXNzdWU0MjgxMzE2ODU=\",\n+ \"number\": 19,\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Build ursabot itself via ursabot\",\n+ \"updated_at\": \"2019-04-05T11:53:43Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": 
\"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ 
\"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ 
\"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" - }, - { - "sha": "c738bb0eb54c87ba0f23e97e827d77c2be74d0b6", - "filename": "ursabot/tests/test_hooks.py", - "status": "modified", - "additions": 4, - "deletions": 4, - "changes": 8, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/test_hooks.py?ref=2705da2b616b98fa6010a25813c5a7a27456f71d", - "patch": "@@ -54,7 +54,7 @@ class TestGithubHook(ChangeHookTestCase):\n await self.request('ping', {})\n assert len(self.hook.master.data.updates.changesAdded) == 0\n \n- @ensure_deferred\n- async def test_issue_comment(self):\n- payload = {}\n- await self.request('issue_comment', payload)\n+ # @ensure_deferred\n+ # async def test_issue_comment(self):\n+ # payload = {}\n+ # await self.request('issue_comment', payload)" - } - ] -} \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/pull-request-26-files.json b/dev/archery/archery/tests/fixtures/pull-request-26-files.json deleted file mode 100644 index b039b3d10539f..0000000000000 --- a/dev/archery/archery/tests/fixtures/pull-request-26-files.json +++ /dev/null @@ -1,170 +0,0 @@ -[ - { - "sha": "ebfe3f6c5e98723f9751c99ce8ce798f1ba529c5", - "filename": ".travis.yml", - "status": "modified", - "additions": 4, - "deletions": 1, - "changes": 5, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/.travis.yml", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/.travis.yml", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/.travis.yml?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -4,7 +4,10 @@ services:\n python:\n - 3.6\n script:\n- - pip install \"pytest>=3.9\" flake8 -e .\n+ # --no-binary buildbot is required because buildbot doesn't bundle its tests\n+ # to binary wheels, but ursabot's test suite depends on buildbot's so install\n+ # it from source\n+ - pip install --no-binary buildbot \"pytest>=3.9\" mock flake8 -e .\n \n # run linter\n - flake8 ursabot" - }, - { - "sha": "86ad809d3f74c175b92ac58c6c645b0fbf5fa2c5", - "filename": "setup.py", - "status": "modified", - "additions": 6, - "deletions": 1, - "changes": 7, - "blob_url": 
"https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/setup.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/setup.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/setup.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -1,8 +1,13 @@\n #!/usr/bin/env python\n \n+import sys\n from setuptools import setup\n \n \n+if sys.version_info < (3, 6):\n+ sys.exit('Python < 3.6 is not supported due to missing asyncio support')\n+\n+\n # TODO(kszucs): add package data, change maintainer\n setup(\n name='ursabot',\n@@ -15,7 +20,7 @@\n setup_requires=['setuptools_scm'],\n install_requires=['click', 'dask', 'docker', 'docker-map', 'toolz',\n 'buildbot', 'treq'],\n- tests_require=['pytest>=3.9'],\n+ tests_require=['pytest>=3.9', 'mock'],\n entry_points='''\n [console_scripts]\n ursabot=ursabot.cli:ursabot" - }, - { - "sha": "c884f3f85bba499d77d9ad28bcd0ff5edf80f957", - "filename": "ursabot/factories.py", - "status": "modified", - "additions": 6, - "deletions": 2, - "changes": 8, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/factories.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/factories.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/factories.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -79,8 +79,12 @@ def prepend_step(self, step):\n repourl='https://github.com/ursa-labs/ursabot',\n mode='full'),\n ShellCommand(command=['ls', '-lah']),\n- ShellCommand(command=['pip', 'install', 'pytest', 'flake8']),\n- ShellCommand(command=['pip', 'install', '-e', '.']),\n+ ShellCommand(command=['pip', 'install', 'pytest', 'flake8', 'mock']),\n+ # --no-binary buildbot is required because buildbot doesn't bundle its\n+ # tests to binary wheels, but ursabot's test suite depends on buildbot's\n+ # so install it from source\n+ ShellCommand(command=['pip', 'install', '--no-binary', 'buildbot',\n+ '-e', '.']),\n ShellCommand(command=['flake8']),\n ShellCommand(command=['pytest', '-v', '-m', 'not docker', 'ursabot']),\n ShellCommand(command=['buildbot', 'checkconfig', '.'])" - }, - { - "sha": "0265cfbd9c2882f492469882a7bf513a1c1b5af4", - "filename": "ursabot/hooks.py", - "status": "modified", - "additions": 17, - "deletions": 19, - "changes": 36, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/hooks.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/hooks.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/hooks.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -1,11 +1,11 @@\n from urllib.parse import urlparse\n \n from twisted.python import log\n-from twisted.internet import defer\n \n from buildbot.www.hooks.github import GitHubEventHandler\n from buildbot.util.httpclientservice import HTTPClientService\n \n+from .utils import ensure_deferred\n \n BOTNAME = 'ursabot'\n \n@@ -22,20 +22,18 @@ def _client(self):\n self.master, self.github_api_endpoint, headers=headers,\n debug=self.debug, verify=self.verify)\n \n- @defer.inlineCallbacks\n- def _get(self, url):\n+ async def _get(self, url):\n url = urlparse(url)\n- client = yield self._client()\n- response = yield client.get(url.path)\n- result = yield response.json()\n+ client = await self._client()\n+ 
response = await client.get(url.path)\n+ result = await response.json()\n return result\n \n- @defer.inlineCallbacks\n- def _post(self, url, data):\n+ async def _post(self, url, data):\n url = urlparse(url)\n- client = yield self._client()\n- response = yield client.post(url.path, json=data)\n- result = yield response.json()\n+ client = await self._client()\n+ response = await client.post(url.path, json=data)\n+ result = await response.json()\n log.msg(f'POST to {url} with the following result: {result}')\n return result\n \n@@ -46,8 +44,8 @@ def _parse_command(self, message):\n return message.split(mention)[-1].lower().strip()\n return None\n \n- @defer.inlineCallbacks\n- def handle_issue_comment(self, payload, event):\n+ @ensure_deferred\n+ async def handle_issue_comment(self, payload, event):\n issue = payload['issue']\n comments_url = issue['comments_url']\n command = self._parse_command(payload['comment']['body'])\n@@ -64,16 +62,16 @@ def handle_issue_comment(self, payload, event):\n elif command == 'build':\n if 'pull_request' not in issue:\n message = 'Ursabot only listens to pull request comments!'\n- yield self._post(comments_url, {'body': message})\n+ await self._post(comments_url, {'body': message})\n return [], 'git'\n else:\n message = f'Unknown command \"{command}\"'\n- yield self._post(comments_url, {'body': message})\n+ await self._post(comments_url, {'body': message})\n return [], 'git'\n \n try:\n- pull_request = yield self._get(issue['pull_request']['url'])\n- changes, _ = yield self.handle_pull_request({\n+ pull_request = await self._get(issue['pull_request']['url'])\n+ changes, _ = await self.handle_pull_request({\n 'action': 'synchronize',\n 'sender': payload['sender'],\n 'repository': payload['repository'],\n@@ -82,11 +80,11 @@ def handle_issue_comment(self, payload, event):\n }, event)\n except Exception as e:\n message = \"I've failed to start builds for this PR\"\n- yield self._post(comments_url, {'body': message})\n+ await self._post(comments_url, {'body': message})\n raise e\n else:\n message = \"I've successfully started builds for this PR\"\n- yield self._post(comments_url, {'body': message})\n+ await self._post(comments_url, {'body': message})\n return changes, 'git'\n \n # TODO(kszucs):" - }, - { - "sha": "1e1ecf2ce47da929dbf1b93632640e7e6ae1cfe0", - "filename": "ursabot/steps.py", - "status": "modified", - "additions": 13, - "deletions": 13, - "changes": 26, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/steps.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/steps.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/steps.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -1,9 +1,9 @@\n-from twisted.internet import defer\n-\n from buildbot.plugins import steps, util\n from buildbot.process import buildstep\n from buildbot.process.results import SUCCESS\n \n+from .utils import ensure_deferred\n+\n \n class ShellMixin(buildstep.ShellMixin):\n \"\"\"Run command in a login bash shell\n@@ -49,10 +49,10 @@ def __init__(self, **kwargs):\n kwargs = self.setupShellMixin(kwargs)\n super().__init__(**kwargs)\n \n- @defer.inlineCallbacks\n- def run(self):\n- cmd = yield self.makeRemoteShellCommand(command=self.command)\n- yield self.runCommand(cmd)\n+ @ensure_deferred\n+ async def run(self):\n+ cmd = await self.makeRemoteShellCommand(command=self.command)\n+ await self.runCommand(cmd)\n return 
cmd.results()\n \n \n@@ -71,8 +71,8 @@ class CMake(ShellMixin, steps.CMake):\n \n name = 'CMake'\n \n- @defer.inlineCallbacks\n- def run(self):\n+ @ensure_deferred\n+ async def run(self):\n \"\"\"Create and run CMake command\n \n Copied from the original CMake implementation to handle None values as\n@@ -94,8 +94,8 @@ def run(self):\n if self.options is not None:\n command.extend(self.options)\n \n- cmd = yield self.makeRemoteShellCommand(command=command)\n- yield self.runCommand(cmd)\n+ cmd = await self.makeRemoteShellCommand(command=command)\n+ await self.runCommand(cmd)\n \n return cmd.results()\n \n@@ -117,8 +117,8 @@ def __init__(self, variables, source='WorkerEnvironment', **kwargs):\n self.source = source\n super().__init__(**kwargs)\n \n- @defer.inlineCallbacks\n- def run(self):\n+ @ensure_deferred\n+ async def run(self):\n # on Windows, environment variables are case-insensitive, but we have\n # a case-sensitive dictionary in worker_environ. Fortunately, that\n # dictionary is also folded to uppercase, so we can simply fold the\n@@ -139,7 +139,7 @@ def run(self):\n # TODO(kszucs) try with self.setProperty similarly like in\n # SetProperties\n properties.setProperty(prop, value, self.source, runtime=True)\n- yield self.addCompleteLog('set-prop', f'{prop}: {value}')\n+ await self.addCompleteLog('set-prop', f'{prop}: {value}')\n \n return SUCCESS\n " - }, - { - "sha": "6a7d5308be6608f542a810d410f9240157a1340f", - "filename": "ursabot/tests/fixtures/issue-comment-build-command.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-build-command.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-build-command.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-build-command.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot build\",\n+ \"created_at\": \"2019-04-05T11:55:43Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248726\",\n+ \"id\": 480248726,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODcyNg==\",\n+ \"updated_at\": \"2019-04-05T11:55:43Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248726\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": 
\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 3,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:55:43Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": 
\"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "7ef554e333327f0e62aa1fd76b4b17844a39adeb", - "filename": "ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-by-ursabot.json", - "contents_url": 
"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-by-ursabot.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"NONE\",\n+ \"body\": \"Unknown command \\\"\\\"\",\n+ \"created_at\": \"2019-04-05T11:35:47Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815\",\n+ \"id\": 480243815,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 2,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:47Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ 
\"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": 
\"https://avatars2.githubusercontent.com/u/49275095?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursabot/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursabot/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursabot/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursabot/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursabot\",\n+ \"id\": 49275095,\n+ \"login\": \"ursabot\",\n+ \"node_id\": \"MDQ6VXNlcjQ5Mjc1MDk1\",\n+ \"organizations_url\": \"https://api.github.com/users/ursabot/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursabot/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursabot/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursabot/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursabot/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/ursabot\"\n+ }\n+}" - }, - { - "sha": "a8082dbc91fdfe815b795e49ec10e49000771ef5", - "filename": "ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"bear is no game\",\n+ \"created_at\": \"2019-04-05T11:26:56Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727\",\n+ \"id\": 480241727,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ 
\"closed_at\": null,\n+ \"comments\": 0,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:26:56Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ 
\"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "2770e29ba9086394455315e590c0b433d08e437e", - "filename": "ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "status": "added", - "additions": 212, - "deletions": 0, - "changes": 212, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-with-empty-command.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-with-empty-command.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,212 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot \",\n+ \"created_at\": \"2019-04-05T11:35:46Z\",\n+ \"html_url\": 
\"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811\",\n+ \"id\": 480243811,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 1,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"id\": 429706959,\n+ \"labels\": [],\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"number\": 26,\n+ \"pull_request\": {\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Unittests for GithubHook\",\n+ \"updated_at\": \"2019-04-05T11:35:46Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": 
\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T11:22:16Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 892,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": 
\"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "b7de8d838332944101812ee2a46c08dd0144efe3", - "filename": "ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "status": "added", - "additions": 206, - "deletions": 0, - "changes": 206, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/issue-comment-without-pull-request.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-without-pull-request.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,206 @@\n+{\n+ \"action\": \"created\",\n+ \"comment\": {\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"@ursabot build\",\n+ \"created_at\": \"2019-04-05T13:07:57Z\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/issues/19#issuecomment-480268708\",\n+ \"id\": 480268708,\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19\",\n+ \"node_id\": \"MDEyOklzc3VlQ29tbWVudDQ4MDI2ODcwOA==\",\n+ \"updated_at\": \"2019-04-05T13:07:57Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480268708\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"issue\": {\n+ \"assignee\": null,\n+ \"assignees\": [],\n+ \"author_association\": \"MEMBER\",\n+ \"body\": \"\",\n+ \"closed_at\": null,\n+ \"comments\": 5,\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19/comments\",\n+ \"created_at\": \"2019-04-02T09:56:41Z\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19/events\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/issues/19\",\n+ \"id\": 428131685,\n+ \"labels\": [],\n+ \"labels_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/issues/19/labels{/name}\",\n+ \"locked\": false,\n+ \"milestone\": null,\n+ \"node_id\": \"MDU6SXNzdWU0MjgxMzE2ODU=\",\n+ \"number\": 19,\n+ \"repository_url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"state\": \"open\",\n+ \"title\": \"Build ursabot itself via ursabot\",\n+ \"updated_at\": \"2019-04-05T13:07:57Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/19\",\n+ \"user\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+ },\n+ \"organization\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"description\": \"Innovation lab for open source data science tools, powered by Apache Arrow\",\n+ \"events_url\": \"https://api.github.com/orgs/ursa-labs/events\",\n+ \"hooks_url\": \"https://api.github.com/orgs/ursa-labs/hooks\",\n+ \"id\": 46514972,\n+ \"issues_url\": \"https://api.github.com/orgs/ursa-labs/issues\",\n+ \"login\": \"ursa-labs\",\n+ \"members_url\": \"https://api.github.com/orgs/ursa-labs/members{/member}\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"public_members_url\": \"https://api.github.com/orgs/ursa-labs/public_members{/member}\",\n+ \"repos_url\": \"https://api.github.com/orgs/ursa-labs/repos\",\n+ \"url\": \"https://api.github.com/orgs/ursa-labs\"\n+ },\n+ \"repository\": {\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"archived\": false,\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"default_branch\": \"master\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"description\": 
null,\n+ \"disabled\": false,\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"fork\": false,\n+ \"forks\": 0,\n+ \"forks_count\": 0,\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"has_downloads\": true,\n+ \"has_issues\": true,\n+ \"has_pages\": false,\n+ \"has_projects\": true,\n+ \"has_wiki\": true,\n+ \"homepage\": null,\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"id\": 169101701,\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"language\": \"Jupyter Notebook\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"license\": null,\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"mirror_url\": null,\n+ \"name\": \"ursabot\",\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"open_issues\": 19,\n+ \"open_issues_count\": 19,\n+ \"owner\": {\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"id\": 46514972,\n+ \"login\": \"ursa-labs\",\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"type\": \"Organization\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\"\n+ },\n+ \"private\": false,\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"pushed_at\": \"2019-04-05T12:01:40Z\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"size\": 898,\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"stargazers_count\": 1,\n+ \"stargazers_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"watchers\": 1,\n+ \"watchers_count\": 1\n+ },\n+ \"sender\": {\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"gravatar_id\": \"\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"id\": 961747,\n+ \"login\": \"kszucs\",\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"site_admin\": false,\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"type\": \"User\",\n+ \"url\": \"https://api.github.com/users/kszucs\"\n+ }\n+}" - }, - { - "sha": "33e051455e866fb4774a16ae02ad40dcf9e6a7fd", - "filename": "ursabot/tests/fixtures/pull-request-26-commit.json", - "status": "added", - "additions": 158, - "deletions": 0, - "changes": 158, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/pull-request-26-commit.json", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/pull-request-26-commit.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/pull-request-26-commit.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,158 @@\n+{\n+ \"sha\": \"2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"node_id\": \"MDY6Q29tbWl0MTY5MTAxNzAxOjI3MDVkYTJiNjE2Yjk4ZmE2MDEwYTI1ODEzYzVhN2EyNzQ1NmY3MWQ=\",\n+ \"commit\": {\n+ \"author\": {\n+ \"name\": \"Krisztián Szűcs\",\n+ \"email\": \"szucs.krisztian@gmail.com\",\n+ \"date\": \"2019-04-05T12:01:31Z\"\n+ },\n+ \"committer\": {\n+ \"name\": \"Krisztián Szűcs\",\n+ \"email\": \"szucs.krisztian@gmail.com\",\n+ \"date\": \"2019-04-05T12:01:31Z\"\n+ },\n+ \"message\": \"add recorded event requests\",\n+ \"tree\": {\n+ \"sha\": \"16a7bb186833a67e9c2d84a58393503b85500ceb\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees/16a7bb186833a67e9c2d84a58393503b85500ceb\"\n+ },\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits/2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"comment_count\": 0,\n+ \"verification\": {\n+ \"verified\": true,\n+ \"reason\": \"valid\",\n+ \"signature\": \"-----BEGIN PGP 
SIGNATURE-----\\n\\niQFOBAABCAA4FiEEOOW2r8dr6sA77zHlgjqBKYe1QKUFAlynQ58aHHN6dWNzLmty\\naXN6dGlhbkBnbWFpbC5jb20ACgkQgjqBKYe1QKUYKwf6AiXDMaLqNLNSjRY7lIXX\\nudioewz0hSb4bgIXBv30nswu9CoOA0+mHCokEVtZhYbXzXDsZ1KJrilSC4j+Ws4q\\nkRGA6iEmrne2HcSKNZXzcVnwV9zpwKxlVh2QCTNb1PuOYFBLH0kwE704uWIWMGDN\\nbo8cjQPwegePCRguCvPh/5wa5J3uiq5gmJLG6bC/d1XYE+FJVtlnyzqzLMIryGKe\\ntIciw+wwkF413Q/YVbZ49vLUeCX9H8PHC4mZYGDWuvjFW1WTfkjK5bAH+oaTVM6h\\n350I5ZFloHmMA/QeRge5qFxXoEBMDGiXHHktzYZDXnliFOQNxzqwirA5lQQ6LRSS\\naQ==\\n=7rqi\\n-----END PGP SIGNATURE-----\",\n+ \"payload\": \"tree 16a7bb186833a67e9c2d84a58393503b85500ceb\\nparent 446ae69b9385e8d0f40aa9595f723d34383af2f7\\nauthor Krisztián Szűcs 1554465691 +0200\\ncommitter Krisztián Szűcs 1554465691 +0200\\n\\nadd recorded event requests\\n\"\n+ }\n+ },\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits/2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/commit/2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits/2705da2b616b98fa6010a25813c5a7a27456f71d/comments\",\n+ \"author\": {\n+ \"login\": \"kszucs\",\n+ \"id\": 961747,\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/kszucs\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"type\": \"User\",\n+ \"site_admin\": false\n+ },\n+ \"committer\": {\n+ \"login\": \"kszucs\",\n+ \"id\": 961747,\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/kszucs\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"type\": \"User\",\n+ \"site_admin\": false\n+ },\n+ \"parents\": [\n+ {\n+ \"sha\": \"446ae69b9385e8d0f40aa9595f723d34383af2f7\",\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits/446ae69b9385e8d0f40aa9595f723d34383af2f7\",\n+ \"html_url\": 
\"https://github.com/ursa-labs/ursabot/commit/446ae69b9385e8d0f40aa9595f723d34383af2f7\"\n+ }\n+ ],\n+ \"stats\": {\n+ \"total\": 1062,\n+ \"additions\": 1058,\n+ \"deletions\": 4\n+ },\n+ \"files\": [\n+ {\n+ \"sha\": \"dfae6eeaef384ae6180c6302a58b49e39982dc33\",\n+ \"filename\": \"ursabot/tests/fixtures/issue-comment-build-command.json\",\n+ \"status\": \"added\",\n+ \"additions\": 212,\n+ \"deletions\": 0,\n+ \"changes\": 212,\n+ \"blob_url\": \"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json\",\n+ \"raw_url\": \"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-build-command.json\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-build-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -0,0 +1,212 @@\\n+{\\n+ \\\"action\\\": \\\"created\\\",\\n+ \\\"comment\\\": {\\n+ \\\"author_association\\\": \\\"NONE\\\",\\n+ \\\"body\\\": \\\"I've successfully started builds for this PR\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:55:44Z\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480248730\\\",\\n+ \\\"id\\\": 480248730,\\n+ \\\"issue_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"node_id\\\": \\\"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODczMA==\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:55:44Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248730\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+ },\\n+ \\\"issue\\\": {\\n+ \\\"assignee\\\": null,\\n+ \\\"assignees\\\": [],\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"\\\",\\n+ \\\"closed_at\\\": null,\\n+ \\\"comments\\\": 4,\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:22:15Z\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"id\\\": 429706959,\\n+ \\\"labels\\\": [],\\n+ \\\"labels_url\\\": 
\\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\\\",\\n+ \\\"locked\\\": false,\\n+ \\\"milestone\\\": null,\\n+ \\\"node_id\\\": \\\"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\\\",\\n+ \\\"number\\\": 26,\\n+ \\\"pull_request\\\": {\\n+ \\\"diff_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.diff\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"patch_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.patch\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\\\"\\n+ },\\n+ \\\"repository_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"state\\\": \\\"open\\\",\\n+ \\\"title\\\": \\\"Unittests for GithubHook\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:55:44Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"organization\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"description\\\": \\\"Innovation lab for open source data science tools, powered by Apache Arrow\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/orgs/ursa-labs/events\\\",\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/orgs/ursa-labs/hooks\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"issues_url\\\": \\\"https://api.github.com/orgs/ursa-labs/issues\\\",\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/members{/member}\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"public_members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/public_members{/member}\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/orgs/ursa-labs/repos\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/orgs/ursa-labs\\\"\\n+ },\\n+ \\\"repository\\\": {\\n+ \\\"archive_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\\\",\\n+ \\\"archived\\\": false,\\n+ \\\"assignees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\\\",\\n+ \\\"blobs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\\\",\\n+ \\\"branches_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\\\",\\n+ \\\"clone_url\\\": 
\\\"https://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"collaborators_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\\\",\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\\\",\\n+ \\\"commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\\\",\\n+ \\\"compare_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\\\",\\n+ \\\"contents_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\\\",\\n+ \\\"contributors_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contributors\\\",\\n+ \\\"created_at\\\": \\\"2019-02-04T15:40:31Z\\\",\\n+ \\\"default_branch\\\": \\\"master\\\",\\n+ \\\"deployments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/deployments\\\",\\n+ \\\"description\\\": null,\\n+ \\\"disabled\\\": false,\\n+ \\\"downloads_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/downloads\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/events\\\",\\n+ \\\"fork\\\": false,\\n+ \\\"forks\\\": 0,\\n+ \\\"forks_count\\\": 0,\\n+ \\\"forks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/forks\\\",\\n+ \\\"full_name\\\": \\\"ursa-labs/ursabot\\\",\\n+ \\\"git_commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\\\",\\n+ \\\"git_refs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\\\",\\n+ \\\"git_tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\\\",\\n+ \\\"git_url\\\": \\\"git://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"has_downloads\\\": true,\\n+ \\\"has_issues\\\": true,\\n+ \\\"has_pages\\\": false,\\n+ \\\"has_projects\\\": true,\\n+ \\\"has_wiki\\\": true,\\n+ \\\"homepage\\\": null,\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/hooks\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"id\\\": 169101701,\\n+ \\\"issue_comment_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\\\",\\n+ \\\"issue_events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\\\",\\n+ \\\"issues_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\\\",\\n+ \\\"keys_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\\\",\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\\\",\\n+ \\\"language\\\": \\\"Jupyter Notebook\\\",\\n+ \\\"languages_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/languages\\\",\\n+ \\\"license\\\": null,\\n+ \\\"merges_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/merges\\\",\\n+ \\\"milestones_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\\\",\\n+ \\\"mirror_url\\\": null,\\n+ \\\"name\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\\\",\\n+ \\\"notifications_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\\\",\\n+ \\\"open_issues\\\": 19,\\n+ \\\"open_issues_count\\\": 19,\\n+ \\\"owner\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursa-labs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursa-labs/followers\\\",\\n+ \\\"following_url\\\": 
\\\"https://api.github.com/users/ursa-labs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursa-labs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursa-labs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursa-labs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursa-labs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursa-labs/subscriptions\\\",\\n+ \\\"type\\\": \\\"Organization\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursa-labs\\\"\\n+ },\\n+ \\\"private\\\": false,\\n+ \\\"pulls_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\\\",\\n+ \\\"pushed_at\\\": \\\"2019-04-05T11:22:16Z\\\",\\n+ \\\"releases_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\\\",\\n+ \\\"size\\\": 892,\\n+ \\\"ssh_url\\\": \\\"git@github.com:ursa-labs/ursabot.git\\\",\\n+ \\\"stargazers_count\\\": 1,\\n+ \\\"stargazers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\\\",\\n+ \\\"statuses_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\\\",\\n+ \\\"subscribers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscribers\\\",\\n+ \\\"subscription_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscription\\\",\\n+ \\\"svn_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/tags\\\",\\n+ \\\"teams_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/teams\\\",\\n+ \\\"trees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-04T17:49:10Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"watchers\\\": 1,\\n+ \\\"watchers_count\\\": 1\\n+ },\\n+ \\\"sender\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+}\"\n+ },\n+ {\n+ \"sha\": 
\"7ef554e333327f0e62aa1fd76b4b17844a39adeb\",\n+ \"filename\": \"ursabot/tests/fixtures/issue-comment-by-ursabot.json\",\n+ \"status\": \"added\",\n+ \"additions\": 212,\n+ \"deletions\": 0,\n+ \"changes\": 212,\n+ \"blob_url\": \"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json\",\n+ \"raw_url\": \"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-by-ursabot.json\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-by-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -0,0 +1,212 @@\\n+{\\n+ \\\"action\\\": \\\"created\\\",\\n+ \\\"comment\\\": {\\n+ \\\"author_association\\\": \\\"NONE\\\",\\n+ \\\"body\\\": \\\"Unknown command \\\\\\\"\\\\\\\"\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:35:47Z\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243815\\\",\\n+ \\\"id\\\": 480243815,\\n+ \\\"issue_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"node_id\\\": \\\"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxNQ==\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:35:47Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243815\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+ },\\n+ \\\"issue\\\": {\\n+ \\\"assignee\\\": null,\\n+ \\\"assignees\\\": [],\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"\\\",\\n+ \\\"closed_at\\\": null,\\n+ \\\"comments\\\": 2,\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:22:15Z\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"id\\\": 429706959,\\n+ \\\"labels\\\": [],\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\\\",\\n+ \\\"locked\\\": false,\\n+ \\\"milestone\\\": null,\\n+ \\\"node_id\\\": \\\"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\\\",\\n+ \\\"number\\\": 26,\\n+ \\\"pull_request\\\": {\\n+ \\\"diff_url\\\": 
\\\"https://github.com/ursa-labs/ursabot/pull/26.diff\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"patch_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.patch\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\\\"\\n+ },\\n+ \\\"repository_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"state\\\": \\\"open\\\",\\n+ \\\"title\\\": \\\"Unittests for GithubHook\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:35:47Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"organization\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"description\\\": \\\"Innovation lab for open source data science tools, powered by Apache Arrow\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/orgs/ursa-labs/events\\\",\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/orgs/ursa-labs/hooks\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"issues_url\\\": \\\"https://api.github.com/orgs/ursa-labs/issues\\\",\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/members{/member}\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"public_members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/public_members{/member}\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/orgs/ursa-labs/repos\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/orgs/ursa-labs\\\"\\n+ },\\n+ \\\"repository\\\": {\\n+ \\\"archive_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\\\",\\n+ \\\"archived\\\": false,\\n+ \\\"assignees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\\\",\\n+ \\\"blobs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\\\",\\n+ \\\"branches_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\\\",\\n+ \\\"clone_url\\\": \\\"https://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"collaborators_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\\\",\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\\\",\\n+ \\\"commits_url\\\": 
\\\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\\\",\\n+ \\\"compare_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\\\",\\n+ \\\"contents_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\\\",\\n+ \\\"contributors_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contributors\\\",\\n+ \\\"created_at\\\": \\\"2019-02-04T15:40:31Z\\\",\\n+ \\\"default_branch\\\": \\\"master\\\",\\n+ \\\"deployments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/deployments\\\",\\n+ \\\"description\\\": null,\\n+ \\\"disabled\\\": false,\\n+ \\\"downloads_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/downloads\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/events\\\",\\n+ \\\"fork\\\": false,\\n+ \\\"forks\\\": 0,\\n+ \\\"forks_count\\\": 0,\\n+ \\\"forks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/forks\\\",\\n+ \\\"full_name\\\": \\\"ursa-labs/ursabot\\\",\\n+ \\\"git_commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\\\",\\n+ \\\"git_refs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\\\",\\n+ \\\"git_tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\\\",\\n+ \\\"git_url\\\": \\\"git://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"has_downloads\\\": true,\\n+ \\\"has_issues\\\": true,\\n+ \\\"has_pages\\\": false,\\n+ \\\"has_projects\\\": true,\\n+ \\\"has_wiki\\\": true,\\n+ \\\"homepage\\\": null,\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/hooks\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"id\\\": 169101701,\\n+ \\\"issue_comment_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\\\",\\n+ \\\"issue_events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\\\",\\n+ \\\"issues_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\\\",\\n+ \\\"keys_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\\\",\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\\\",\\n+ \\\"language\\\": \\\"Jupyter Notebook\\\",\\n+ \\\"languages_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/languages\\\",\\n+ \\\"license\\\": null,\\n+ \\\"merges_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/merges\\\",\\n+ \\\"milestones_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\\\",\\n+ \\\"mirror_url\\\": null,\\n+ \\\"name\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\\\",\\n+ \\\"notifications_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\\\",\\n+ \\\"open_issues\\\": 19,\\n+ \\\"open_issues_count\\\": 19,\\n+ \\\"owner\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursa-labs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursa-labs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursa-labs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursa-labs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"login\\\": 
\\\"ursa-labs\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursa-labs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursa-labs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursa-labs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursa-labs/subscriptions\\\",\\n+ \\\"type\\\": \\\"Organization\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursa-labs\\\"\\n+ },\\n+ \\\"private\\\": false,\\n+ \\\"pulls_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\\\",\\n+ \\\"pushed_at\\\": \\\"2019-04-05T11:22:16Z\\\",\\n+ \\\"releases_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\\\",\\n+ \\\"size\\\": 892,\\n+ \\\"ssh_url\\\": \\\"git@github.com:ursa-labs/ursabot.git\\\",\\n+ \\\"stargazers_count\\\": 1,\\n+ \\\"stargazers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\\\",\\n+ \\\"statuses_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\\\",\\n+ \\\"subscribers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscribers\\\",\\n+ \\\"subscription_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscription\\\",\\n+ \\\"svn_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/tags\\\",\\n+ \\\"teams_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/teams\\\",\\n+ \\\"trees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-04T17:49:10Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"watchers\\\": 1,\\n+ \\\"watchers_count\\\": 1\\n+ },\\n+ \\\"sender\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+}\"\n+ },\n+ {\n+ \"sha\": \"a8082dbc91fdfe815b795e49ec10e49000771ef5\",\n+ \"filename\": \"ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json\",\n+ \"status\": \"added\",\n+ \"additions\": 212,\n+ \"deletions\": 0,\n+ \"changes\": 212,\n+ \"blob_url\": 
\"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json\",\n+ \"raw_url\": \"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-not-mentioning-ursabot.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -0,0 +1,212 @@\\n+{\\n+ \\\"action\\\": \\\"created\\\",\\n+ \\\"comment\\\": {\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"bear is no game\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:26:56Z\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480241727\\\",\\n+ \\\"id\\\": 480241727,\\n+ \\\"issue_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"node_id\\\": \\\"MDEyOklzc3VlQ29tbWVudDQ4MDI0MTcyNw==\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:26:56Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480241727\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"issue\\\": {\\n+ \\\"assignee\\\": null,\\n+ \\\"assignees\\\": [],\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"\\\",\\n+ \\\"closed_at\\\": null,\\n+ \\\"comments\\\": 0,\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:22:15Z\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"id\\\": 429706959,\\n+ \\\"labels\\\": [],\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\\\",\\n+ \\\"locked\\\": false,\\n+ \\\"milestone\\\": null,\\n+ \\\"node_id\\\": \\\"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\\\",\\n+ \\\"number\\\": 26,\\n+ \\\"pull_request\\\": {\\n+ \\\"diff_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.diff\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"patch_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.patch\\\",\\n+ \\\"url\\\": 
\\\"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\\\"\\n+ },\\n+ \\\"repository_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"state\\\": \\\"open\\\",\\n+ \\\"title\\\": \\\"Unittests for GithubHook\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:26:56Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"organization\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"description\\\": \\\"Innovation lab for open source data science tools, powered by Apache Arrow\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/orgs/ursa-labs/events\\\",\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/orgs/ursa-labs/hooks\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"issues_url\\\": \\\"https://api.github.com/orgs/ursa-labs/issues\\\",\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/members{/member}\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"public_members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/public_members{/member}\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/orgs/ursa-labs/repos\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/orgs/ursa-labs\\\"\\n+ },\\n+ \\\"repository\\\": {\\n+ \\\"archive_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\\\",\\n+ \\\"archived\\\": false,\\n+ \\\"assignees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\\\",\\n+ \\\"blobs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\\\",\\n+ \\\"branches_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\\\",\\n+ \\\"clone_url\\\": \\\"https://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"collaborators_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\\\",\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\\\",\\n+ \\\"commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\\\",\\n+ \\\"compare_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\\\",\\n+ \\\"contents_url\\\": 
\\\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\\\",\\n+ \\\"contributors_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contributors\\\",\\n+ \\\"created_at\\\": \\\"2019-02-04T15:40:31Z\\\",\\n+ \\\"default_branch\\\": \\\"master\\\",\\n+ \\\"deployments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/deployments\\\",\\n+ \\\"description\\\": null,\\n+ \\\"disabled\\\": false,\\n+ \\\"downloads_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/downloads\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/events\\\",\\n+ \\\"fork\\\": false,\\n+ \\\"forks\\\": 0,\\n+ \\\"forks_count\\\": 0,\\n+ \\\"forks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/forks\\\",\\n+ \\\"full_name\\\": \\\"ursa-labs/ursabot\\\",\\n+ \\\"git_commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\\\",\\n+ \\\"git_refs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\\\",\\n+ \\\"git_tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\\\",\\n+ \\\"git_url\\\": \\\"git://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"has_downloads\\\": true,\\n+ \\\"has_issues\\\": true,\\n+ \\\"has_pages\\\": false,\\n+ \\\"has_projects\\\": true,\\n+ \\\"has_wiki\\\": true,\\n+ \\\"homepage\\\": null,\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/hooks\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"id\\\": 169101701,\\n+ \\\"issue_comment_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\\\",\\n+ \\\"issue_events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\\\",\\n+ \\\"issues_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\\\",\\n+ \\\"keys_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\\\",\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\\\",\\n+ \\\"language\\\": \\\"Jupyter Notebook\\\",\\n+ \\\"languages_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/languages\\\",\\n+ \\\"license\\\": null,\\n+ \\\"merges_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/merges\\\",\\n+ \\\"milestones_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\\\",\\n+ \\\"mirror_url\\\": null,\\n+ \\\"name\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\\\",\\n+ \\\"notifications_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\\\",\\n+ \\\"open_issues\\\": 19,\\n+ \\\"open_issues_count\\\": 19,\\n+ \\\"owner\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursa-labs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursa-labs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursa-labs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursa-labs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursa-labs/orgs\\\",\\n+ \\\"received_events_url\\\": 
\\\"https://api.github.com/users/ursa-labs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursa-labs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursa-labs/subscriptions\\\",\\n+ \\\"type\\\": \\\"Organization\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursa-labs\\\"\\n+ },\\n+ \\\"private\\\": false,\\n+ \\\"pulls_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\\\",\\n+ \\\"pushed_at\\\": \\\"2019-04-05T11:22:16Z\\\",\\n+ \\\"releases_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\\\",\\n+ \\\"size\\\": 892,\\n+ \\\"ssh_url\\\": \\\"git@github.com:ursa-labs/ursabot.git\\\",\\n+ \\\"stargazers_count\\\": 1,\\n+ \\\"stargazers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\\\",\\n+ \\\"statuses_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\\\",\\n+ \\\"subscribers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscribers\\\",\\n+ \\\"subscription_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscription\\\",\\n+ \\\"svn_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/tags\\\",\\n+ \\\"teams_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/teams\\\",\\n+ \\\"trees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-04T17:49:10Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"watchers\\\": 1,\\n+ \\\"watchers_count\\\": 1\\n+ },\\n+ \\\"sender\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+}\"\n+ },\n+ {\n+ \"sha\": \"2770e29ba9086394455315e590c0b433d08e437e\",\n+ \"filename\": \"ursabot/tests/fixtures/issue-comment-with-empty-command.json\",\n+ \"status\": \"added\",\n+ \"additions\": 212,\n+ \"deletions\": 0,\n+ \"changes\": 212,\n+ \"blob_url\": \"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json\",\n+ \"raw_url\": 
\"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-with-empty-command.json\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-with-empty-command.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -0,0 +1,212 @@\\n+{\\n+ \\\"action\\\": \\\"created\\\",\\n+ \\\"comment\\\": {\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"@ursabot \\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:35:46Z\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26#issuecomment-480243811\\\",\\n+ \\\"id\\\": 480243811,\\n+ \\\"issue_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"node_id\\\": \\\"MDEyOklzc3VlQ29tbWVudDQ4MDI0MzgxMQ==\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:35:46Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480243811\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"issue\\\": {\\n+ \\\"assignee\\\": null,\\n+ \\\"assignees\\\": [],\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"\\\",\\n+ \\\"closed_at\\\": null,\\n+ \\\"comments\\\": 1,\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:22:15Z\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/events\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"id\\\": 429706959,\\n+ \\\"labels\\\": [],\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26/labels{/name}\\\",\\n+ \\\"locked\\\": false,\\n+ \\\"milestone\\\": null,\\n+ \\\"node_id\\\": \\\"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\\\",\\n+ \\\"number\\\": 26,\\n+ \\\"pull_request\\\": {\\n+ \\\"diff_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.diff\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26\\\",\\n+ \\\"patch_url\\\": \\\"https://github.com/ursa-labs/ursabot/pull/26.patch\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\\\"\\n+ },\\n+ \\\"repository_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"state\\\": \\\"open\\\",\\n+ 
\\\"title\\\": \\\"Unittests for GithubHook\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:35:46Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/26\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"organization\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"description\\\": \\\"Innovation lab for open source data science tools, powered by Apache Arrow\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/orgs/ursa-labs/events\\\",\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/orgs/ursa-labs/hooks\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"issues_url\\\": \\\"https://api.github.com/orgs/ursa-labs/issues\\\",\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/members{/member}\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"public_members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/public_members{/member}\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/orgs/ursa-labs/repos\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/orgs/ursa-labs\\\"\\n+ },\\n+ \\\"repository\\\": {\\n+ \\\"archive_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\\\",\\n+ \\\"archived\\\": false,\\n+ \\\"assignees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\\\",\\n+ \\\"blobs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\\\",\\n+ \\\"branches_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\\\",\\n+ \\\"clone_url\\\": \\\"https://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"collaborators_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\\\",\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\\\",\\n+ \\\"commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\\\",\\n+ \\\"compare_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\\\",\\n+ \\\"contents_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\\\",\\n+ \\\"contributors_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contributors\\\",\\n+ \\\"created_at\\\": \\\"2019-02-04T15:40:31Z\\\",\\n+ \\\"default_branch\\\": 
\\\"master\\\",\\n+ \\\"deployments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/deployments\\\",\\n+ \\\"description\\\": null,\\n+ \\\"disabled\\\": false,\\n+ \\\"downloads_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/downloads\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/events\\\",\\n+ \\\"fork\\\": false,\\n+ \\\"forks\\\": 0,\\n+ \\\"forks_count\\\": 0,\\n+ \\\"forks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/forks\\\",\\n+ \\\"full_name\\\": \\\"ursa-labs/ursabot\\\",\\n+ \\\"git_commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\\\",\\n+ \\\"git_refs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\\\",\\n+ \\\"git_tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\\\",\\n+ \\\"git_url\\\": \\\"git://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"has_downloads\\\": true,\\n+ \\\"has_issues\\\": true,\\n+ \\\"has_pages\\\": false,\\n+ \\\"has_projects\\\": true,\\n+ \\\"has_wiki\\\": true,\\n+ \\\"homepage\\\": null,\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/hooks\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"id\\\": 169101701,\\n+ \\\"issue_comment_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\\\",\\n+ \\\"issue_events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\\\",\\n+ \\\"issues_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\\\",\\n+ \\\"keys_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\\\",\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\\\",\\n+ \\\"language\\\": \\\"Jupyter Notebook\\\",\\n+ \\\"languages_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/languages\\\",\\n+ \\\"license\\\": null,\\n+ \\\"merges_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/merges\\\",\\n+ \\\"milestones_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\\\",\\n+ \\\"mirror_url\\\": null,\\n+ \\\"name\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\\\",\\n+ \\\"notifications_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\\\",\\n+ \\\"open_issues\\\": 19,\\n+ \\\"open_issues_count\\\": 19,\\n+ \\\"owner\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursa-labs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursa-labs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursa-labs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursa-labs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursa-labs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursa-labs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursa-labs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\\\",\\n+ 
\\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursa-labs/subscriptions\\\",\\n+ \\\"type\\\": \\\"Organization\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursa-labs\\\"\\n+ },\\n+ \\\"private\\\": false,\\n+ \\\"pulls_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\\\",\\n+ \\\"pushed_at\\\": \\\"2019-04-05T11:22:16Z\\\",\\n+ \\\"releases_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\\\",\\n+ \\\"size\\\": 892,\\n+ \\\"ssh_url\\\": \\\"git@github.com:ursa-labs/ursabot.git\\\",\\n+ \\\"stargazers_count\\\": 1,\\n+ \\\"stargazers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\\\",\\n+ \\\"statuses_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\\\",\\n+ \\\"subscribers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscribers\\\",\\n+ \\\"subscription_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscription\\\",\\n+ \\\"svn_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/tags\\\",\\n+ \\\"teams_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/teams\\\",\\n+ \\\"trees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-04T17:49:10Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"watchers\\\": 1,\\n+ \\\"watchers_count\\\": 1\\n+ },\\n+ \\\"sender\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+}\"\n+ },\n+ {\n+ \"sha\": \"80ff46510a2f39ae60f7c3a98e5fdaef8e688784\",\n+ \"filename\": \"ursabot/tests/fixtures/issue-comment-without-pull-request.json\",\n+ \"status\": \"added\",\n+ \"additions\": 206,\n+ \"deletions\": 0,\n+ \"changes\": 206,\n+ \"blob_url\": \"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-without-pull-request.json\",\n+ \"raw_url\": \"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/fixtures/issue-comment-without-pull-request.json\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/issue-comment-without-pull-request.json?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -0,0 +1,206 @@\\n+{\\n+ \\\"action\\\": 
\\\"created\\\",\\n+ \\\"comment\\\": {\\n+ \\\"author_association\\\": \\\"NONE\\\",\\n+ \\\"body\\\": \\\"Ursabot only listens to pull request comments!\\\",\\n+ \\\"created_at\\\": \\\"2019-04-05T11:53:43Z\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/issues/19#issuecomment-480248217\\\",\\n+ \\\"id\\\": 480248217,\\n+ \\\"issue_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/19\\\",\\n+ \\\"node_id\\\": \\\"MDEyOklzc3VlQ29tbWVudDQ4MDI0ODIxNw==\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:53:43Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments/480248217\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+ },\\n+ \\\"issue\\\": {\\n+ \\\"assignee\\\": null,\\n+ \\\"assignees\\\": [],\\n+ \\\"author_association\\\": \\\"MEMBER\\\",\\n+ \\\"body\\\": \\\"\\\",\\n+ \\\"closed_at\\\": null,\\n+ \\\"comments\\\": 4,\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/19/comments\\\",\\n+ \\\"created_at\\\": \\\"2019-04-02T09:56:41Z\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/19/events\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot/issues/19\\\",\\n+ \\\"id\\\": 428131685,\\n+ \\\"labels\\\": [],\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/19/labels{/name}\\\",\\n+ \\\"locked\\\": false,\\n+ \\\"milestone\\\": null,\\n+ \\\"node_id\\\": \\\"MDU6SXNzdWU0MjgxMzE2ODU=\\\",\\n+ \\\"number\\\": 19,\\n+ \\\"repository_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"state\\\": \\\"open\\\",\\n+ \\\"title\\\": \\\"Build ursabot itself via ursabot\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-05T11:53:43Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/19\\\",\\n+ \\\"user\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars1.githubusercontent.com/u/961747?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/kszucs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/kszucs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/kszucs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/kszucs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": 
\\\"https://github.com/kszucs\\\",\\n+ \\\"id\\\": 961747,\\n+ \\\"login\\\": \\\"kszucs\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjk2MTc0Nw==\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/kszucs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/kszucs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/kszucs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/kszucs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/kszucs/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/kszucs\\\"\\n+ }\\n+ },\\n+ \\\"organization\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"description\\\": \\\"Innovation lab for open source data science tools, powered by Apache Arrow\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/orgs/ursa-labs/events\\\",\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/orgs/ursa-labs/hooks\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"issues_url\\\": \\\"https://api.github.com/orgs/ursa-labs/issues\\\",\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/members{/member}\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"public_members_url\\\": \\\"https://api.github.com/orgs/ursa-labs/public_members{/member}\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/orgs/ursa-labs/repos\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/orgs/ursa-labs\\\"\\n+ },\\n+ \\\"repository\\\": {\\n+ \\\"archive_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\\\",\\n+ \\\"archived\\\": false,\\n+ \\\"assignees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\\\",\\n+ \\\"blobs_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\\\",\\n+ \\\"branches_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\\\",\\n+ \\\"clone_url\\\": \\\"https://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"collaborators_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\\\",\\n+ \\\"comments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\\\",\\n+ \\\"commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\\\",\\n+ \\\"compare_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\\\",\\n+ \\\"contents_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\\\",\\n+ \\\"contributors_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/contributors\\\",\\n+ \\\"created_at\\\": \\\"2019-02-04T15:40:31Z\\\",\\n+ \\\"default_branch\\\": \\\"master\\\",\\n+ \\\"deployments_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/deployments\\\",\\n+ \\\"description\\\": null,\\n+ \\\"disabled\\\": false,\\n+ \\\"downloads_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/downloads\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/events\\\",\\n+ \\\"fork\\\": false,\\n+ \\\"forks\\\": 0,\\n+ \\\"forks_count\\\": 0,\\n+ \\\"forks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/forks\\\",\\n+ \\\"full_name\\\": \\\"ursa-labs/ursabot\\\",\\n+ \\\"git_commits_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\\\",\\n+ \\\"git_refs_url\\\": 
\\\"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\\\",\\n+ \\\"git_tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\\\",\\n+ \\\"git_url\\\": \\\"git://github.com/ursa-labs/ursabot.git\\\",\\n+ \\\"has_downloads\\\": true,\\n+ \\\"has_issues\\\": true,\\n+ \\\"has_pages\\\": false,\\n+ \\\"has_projects\\\": true,\\n+ \\\"has_wiki\\\": true,\\n+ \\\"homepage\\\": null,\\n+ \\\"hooks_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/hooks\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"id\\\": 169101701,\\n+ \\\"issue_comment_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\\\",\\n+ \\\"issue_events_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\\\",\\n+ \\\"issues_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\\\",\\n+ \\\"keys_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\\\",\\n+ \\\"labels_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\\\",\\n+ \\\"language\\\": \\\"Jupyter Notebook\\\",\\n+ \\\"languages_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/languages\\\",\\n+ \\\"license\\\": null,\\n+ \\\"merges_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/merges\\\",\\n+ \\\"milestones_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\\\",\\n+ \\\"mirror_url\\\": null,\\n+ \\\"name\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\\\",\\n+ \\\"notifications_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\\\",\\n+ \\\"open_issues\\\": 19,\\n+ \\\"open_issues_count\\\": 19,\\n+ \\\"owner\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/46514972?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursa-labs/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursa-labs/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursa-labs/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursa-labs/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursa-labs\\\",\\n+ \\\"id\\\": 46514972,\\n+ \\\"login\\\": \\\"ursa-labs\\\",\\n+ \\\"node_id\\\": \\\"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursa-labs/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursa-labs/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursa-labs/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursa-labs/subscriptions\\\",\\n+ \\\"type\\\": \\\"Organization\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursa-labs\\\"\\n+ },\\n+ \\\"private\\\": false,\\n+ \\\"pulls_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\\\",\\n+ \\\"pushed_at\\\": \\\"2019-04-05T11:22:16Z\\\",\\n+ \\\"releases_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\\\",\\n+ \\\"size\\\": 892,\\n+ \\\"ssh_url\\\": \\\"git@github.com:ursa-labs/ursabot.git\\\",\\n+ \\\"stargazers_count\\\": 1,\\n+ \\\"stargazers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/stargazers\\\",\\n+ 
\\\"statuses_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\\\",\\n+ \\\"subscribers_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscribers\\\",\\n+ \\\"subscription_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/subscription\\\",\\n+ \\\"svn_url\\\": \\\"https://github.com/ursa-labs/ursabot\\\",\\n+ \\\"tags_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/tags\\\",\\n+ \\\"teams_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/teams\\\",\\n+ \\\"trees_url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\\\",\\n+ \\\"updated_at\\\": \\\"2019-04-04T17:49:10Z\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/repos/ursa-labs/ursabot\\\",\\n+ \\\"watchers\\\": 1,\\n+ \\\"watchers_count\\\": 1\\n+ },\\n+ \\\"sender\\\": {\\n+ \\\"avatar_url\\\": \\\"https://avatars2.githubusercontent.com/u/49275095?v=4\\\",\\n+ \\\"events_url\\\": \\\"https://api.github.com/users/ursabot/events{/privacy}\\\",\\n+ \\\"followers_url\\\": \\\"https://api.github.com/users/ursabot/followers\\\",\\n+ \\\"following_url\\\": \\\"https://api.github.com/users/ursabot/following{/other_user}\\\",\\n+ \\\"gists_url\\\": \\\"https://api.github.com/users/ursabot/gists{/gist_id}\\\",\\n+ \\\"gravatar_id\\\": \\\"\\\",\\n+ \\\"html_url\\\": \\\"https://github.com/ursabot\\\",\\n+ \\\"id\\\": 49275095,\\n+ \\\"login\\\": \\\"ursabot\\\",\\n+ \\\"node_id\\\": \\\"MDQ6VXNlcjQ5Mjc1MDk1\\\",\\n+ \\\"organizations_url\\\": \\\"https://api.github.com/users/ursabot/orgs\\\",\\n+ \\\"received_events_url\\\": \\\"https://api.github.com/users/ursabot/received_events\\\",\\n+ \\\"repos_url\\\": \\\"https://api.github.com/users/ursabot/repos\\\",\\n+ \\\"site_admin\\\": false,\\n+ \\\"starred_url\\\": \\\"https://api.github.com/users/ursabot/starred{/owner}{/repo}\\\",\\n+ \\\"subscriptions_url\\\": \\\"https://api.github.com/users/ursabot/subscriptions\\\",\\n+ \\\"type\\\": \\\"User\\\",\\n+ \\\"url\\\": \\\"https://api.github.com/users/ursabot\\\"\\n+ }\\n+}\"\n+ },\n+ {\n+ \"sha\": \"c738bb0eb54c87ba0f23e97e827d77c2be74d0b6\",\n+ \"filename\": \"ursabot/tests/test_hooks.py\",\n+ \"status\": \"modified\",\n+ \"additions\": 4,\n+ \"deletions\": 4,\n+ \"changes\": 8,\n+ \"blob_url\": \"https://github.com/ursa-labs/ursabot/blob/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py\",\n+ \"raw_url\": \"https://github.com/ursa-labs/ursabot/raw/2705da2b616b98fa6010a25813c5a7a27456f71d/ursabot/tests/test_hooks.py\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/test_hooks.py?ref=2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"patch\": \"@@ -54,7 +54,7 @@ class TestGithubHook(ChangeHookTestCase):\\n await self.request('ping', {})\\n assert len(self.hook.master.data.updates.changesAdded) == 0\\n \\n- @ensure_deferred\\n- async def test_issue_comment(self):\\n- payload = {}\\n- await self.request('issue_comment', payload)\\n+ # @ensure_deferred\\n+ # async def test_issue_comment(self):\\n+ # payload = {}\\n+ # await self.request('issue_comment', payload)\"\n+ }\n+ ]\n+}" - }, - { - "sha": "ad061d7244b917e6ea3853698dc3bc2a8c9c6857", - "filename": "ursabot/tests/fixtures/pull-request-26.json", - "status": "added", - "additions": 335, - "deletions": 0, - "changes": 335, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/pull-request-26.json", - "raw_url": 
"https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/fixtures/pull-request-26.json", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/fixtures/pull-request-26.json?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,335 @@\n+{\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\",\n+ \"id\": 267785552,\n+ \"node_id\": \"MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy\",\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot/pull/26\",\n+ \"diff_url\": \"https://github.com/ursa-labs/ursabot/pull/26.diff\",\n+ \"patch_url\": \"https://github.com/ursa-labs/ursabot/pull/26.patch\",\n+ \"issue_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\",\n+ \"number\": 26,\n+ \"state\": \"open\",\n+ \"locked\": false,\n+ \"title\": \"Unittests for GithubHook\",\n+ \"user\": {\n+ \"login\": \"kszucs\",\n+ \"id\": 961747,\n+ \"node_id\": \"MDQ6VXNlcjk2MTc0Nw==\",\n+ \"avatar_url\": \"https://avatars1.githubusercontent.com/u/961747?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/kszucs\",\n+ \"html_url\": \"https://github.com/kszucs\",\n+ \"followers_url\": \"https://api.github.com/users/kszucs/followers\",\n+ \"following_url\": \"https://api.github.com/users/kszucs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/kszucs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/kszucs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/kszucs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/kszucs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/kszucs/repos\",\n+ \"events_url\": \"https://api.github.com/users/kszucs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/kszucs/received_events\",\n+ \"type\": \"User\",\n+ \"site_admin\": false\n+ },\n+ \"body\": \"\",\n+ \"created_at\": \"2019-04-05T11:22:15Z\",\n+ \"updated_at\": \"2019-04-05T12:01:40Z\",\n+ \"closed_at\": null,\n+ \"merged_at\": null,\n+ \"merge_commit_sha\": \"cc5dc3606988b3824be54df779ed2028776113cb\",\n+ \"assignee\": null,\n+ \"assignees\": [\n+\n+ ],\n+ \"requested_reviewers\": [\n+\n+ ],\n+ \"requested_teams\": [\n+\n+ ],\n+ \"labels\": [\n+\n+ ],\n+ \"milestone\": null,\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits\",\n+ \"review_comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments\",\n+ \"review_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"head\": {\n+ \"label\": \"ursa-labs:test-hook\",\n+ \"ref\": \"test-hook\",\n+ \"sha\": \"2705da2b616b98fa6010a25813c5a7a27456f71d\",\n+ \"user\": {\n+ \"login\": \"ursa-labs\",\n+ \"id\": 46514972,\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": 
\"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"type\": \"Organization\",\n+ \"site_admin\": false\n+ },\n+ \"repo\": {\n+ \"id\": 169101701,\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"name\": \"ursabot\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"private\": false,\n+ \"owner\": {\n+ \"login\": \"ursa-labs\",\n+ \"id\": 46514972,\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"type\": \"Organization\",\n+ \"site_admin\": false\n+ },\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"description\": null,\n+ \"fork\": false,\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"contributors_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"pushed_at\": \"2019-04-05T12:01:40Z\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"homepage\": null,\n+ \"size\": 898,\n+ \"stargazers_count\": 1,\n+ \"watchers_count\": 1,\n+ \"language\": \"Jupyter Notebook\",\n+ \"has_issues\": true,\n+ \"has_projects\": true,\n+ \"has_downloads\": true,\n+ \"has_wiki\": true,\n+ \"has_pages\": false,\n+ \"forks_count\": 0,\n+ \"mirror_url\": null,\n+ \"archived\": false,\n+ \"disabled\": false,\n+ \"open_issues_count\": 19,\n+ \"license\": null,\n+ \"forks\": 0,\n+ \"open_issues\": 19,\n+ \"watchers\": 1,\n+ \"default_branch\": \"master\"\n+ }\n+ },\n+ \"base\": {\n+ \"label\": \"ursa-labs:master\",\n+ \"ref\": \"master\",\n+ \"sha\": \"a162ad254b589b924db47e057791191b39613fd5\",\n+ \"user\": {\n+ \"login\": \"ursa-labs\",\n+ \"id\": 46514972,\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ 
\"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"type\": \"Organization\",\n+ \"site_admin\": false\n+ },\n+ \"repo\": {\n+ \"id\": 169101701,\n+ \"node_id\": \"MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=\",\n+ \"name\": \"ursabot\",\n+ \"full_name\": \"ursa-labs/ursabot\",\n+ \"private\": false,\n+ \"owner\": {\n+ \"login\": \"ursa-labs\",\n+ \"id\": 46514972,\n+ \"node_id\": \"MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy\",\n+ \"avatar_url\": \"https://avatars2.githubusercontent.com/u/46514972?v=4\",\n+ \"gravatar_id\": \"\",\n+ \"url\": \"https://api.github.com/users/ursa-labs\",\n+ \"html_url\": \"https://github.com/ursa-labs\",\n+ \"followers_url\": \"https://api.github.com/users/ursa-labs/followers\",\n+ \"following_url\": \"https://api.github.com/users/ursa-labs/following{/other_user}\",\n+ \"gists_url\": \"https://api.github.com/users/ursa-labs/gists{/gist_id}\",\n+ \"starred_url\": \"https://api.github.com/users/ursa-labs/starred{/owner}{/repo}\",\n+ \"subscriptions_url\": \"https://api.github.com/users/ursa-labs/subscriptions\",\n+ \"organizations_url\": \"https://api.github.com/users/ursa-labs/orgs\",\n+ \"repos_url\": \"https://api.github.com/users/ursa-labs/repos\",\n+ \"events_url\": \"https://api.github.com/users/ursa-labs/events{/privacy}\",\n+ \"received_events_url\": \"https://api.github.com/users/ursa-labs/received_events\",\n+ \"type\": \"Organization\",\n+ \"site_admin\": false\n+ },\n+ \"html_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"description\": null,\n+ \"fork\": false,\n+ \"url\": \"https://api.github.com/repos/ursa-labs/ursabot\",\n+ \"forks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/forks\",\n+ \"keys_url\": \"https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}\",\n+ \"collaborators_url\": \"https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}\",\n+ \"teams_url\": \"https://api.github.com/repos/ursa-labs/ursabot/teams\",\n+ \"hooks_url\": \"https://api.github.com/repos/ursa-labs/ursabot/hooks\",\n+ \"issue_events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}\",\n+ \"events_url\": \"https://api.github.com/repos/ursa-labs/ursabot/events\",\n+ \"assignees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}\",\n+ \"branches_url\": \"https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}\",\n+ \"tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/tags\",\n+ \"blobs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}\",\n+ \"git_tags_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}\",\n+ \"git_refs_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}\",\n+ \"trees_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}\",\n+ \"statuses_url\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}\",\n+ \"languages_url\": \"https://api.github.com/repos/ursa-labs/ursabot/languages\",\n+ \"stargazers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/stargazers\",\n+ \"contributors_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contributors\",\n+ \"subscribers_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscribers\",\n+ \"subscription_url\": \"https://api.github.com/repos/ursa-labs/ursabot/subscription\",\n+ \"commits_url\": 
\"https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}\",\n+ \"git_commits_url\": \"https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}\",\n+ \"comments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/comments{/number}\",\n+ \"issue_comment_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}\",\n+ \"contents_url\": \"https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}\",\n+ \"compare_url\": \"https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}\",\n+ \"merges_url\": \"https://api.github.com/repos/ursa-labs/ursabot/merges\",\n+ \"archive_url\": \"https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}\",\n+ \"downloads_url\": \"https://api.github.com/repos/ursa-labs/ursabot/downloads\",\n+ \"issues_url\": \"https://api.github.com/repos/ursa-labs/ursabot/issues{/number}\",\n+ \"pulls_url\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}\",\n+ \"milestones_url\": \"https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}\",\n+ \"notifications_url\": \"https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}\",\n+ \"labels_url\": \"https://api.github.com/repos/ursa-labs/ursabot/labels{/name}\",\n+ \"releases_url\": \"https://api.github.com/repos/ursa-labs/ursabot/releases{/id}\",\n+ \"deployments_url\": \"https://api.github.com/repos/ursa-labs/ursabot/deployments\",\n+ \"created_at\": \"2019-02-04T15:40:31Z\",\n+ \"updated_at\": \"2019-04-04T17:49:10Z\",\n+ \"pushed_at\": \"2019-04-05T12:01:40Z\",\n+ \"git_url\": \"git://github.com/ursa-labs/ursabot.git\",\n+ \"ssh_url\": \"git@github.com:ursa-labs/ursabot.git\",\n+ \"clone_url\": \"https://github.com/ursa-labs/ursabot.git\",\n+ \"svn_url\": \"https://github.com/ursa-labs/ursabot\",\n+ \"homepage\": null,\n+ \"size\": 898,\n+ \"stargazers_count\": 1,\n+ \"watchers_count\": 1,\n+ \"language\": \"Jupyter Notebook\",\n+ \"has_issues\": true,\n+ \"has_projects\": true,\n+ \"has_downloads\": true,\n+ \"has_wiki\": true,\n+ \"has_pages\": false,\n+ \"forks_count\": 0,\n+ \"mirror_url\": null,\n+ \"archived\": false,\n+ \"disabled\": false,\n+ \"open_issues_count\": 19,\n+ \"license\": null,\n+ \"forks\": 0,\n+ \"open_issues\": 19,\n+ \"watchers\": 1,\n+ \"default_branch\": \"master\"\n+ }\n+ },\n+ \"_links\": {\n+ \"self\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26\"\n+ },\n+ \"html\": {\n+ \"href\": \"https://github.com/ursa-labs/ursabot/pull/26\"\n+ },\n+ \"issue\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26\"\n+ },\n+ \"comments\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments\"\n+ },\n+ \"review_comments\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments\"\n+ },\n+ \"review_comment\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}\"\n+ },\n+ \"commits\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits\"\n+ },\n+ \"statuses\": {\n+ \"href\": \"https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d\"\n+ }\n+ },\n+ \"author_association\": \"MEMBER\",\n+ \"merged\": false,\n+ \"mergeable\": true,\n+ \"rebaseable\": true,\n+ \"mergeable_state\": \"unstable\",\n+ \"merged_by\": null,\n+ \"comments\": 5,\n+ \"review_comments\": 0,\n+ \"maintainer_can_modify\": false,\n+ \"commits\": 2,\n+ \"additions\": 1124,\n+ \"deletions\": 0,\n+ \"changed_files\": 
7\n+}" - }, - { - "sha": "e87b27d2d7b4956d15f7468488b96cf6a06686f4", - "filename": "ursabot/tests/test_hooks.py", - "status": "added", - "additions": 116, - "deletions": 0, - "changes": 116, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/test_hooks.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/tests/test_hooks.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/tests/test_hooks.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,116 @@\n+import json\n+from pathlib import Path\n+from twisted.trial import unittest\n+\n+from buildbot.test.util.misc import TestReactorMixin\n+from buildbot.test.fake.httpclientservice import \\\n+ HTTPClientService as FakeHTTPClientService\n+from buildbot.test.unit.test_www_hooks_github import (\n+ _prepare_request, _prepare_github_change_hook)\n+\n+from ursabot.utils import ensure_deferred\n+from ursabot.hooks import GithubHook\n+\n+\n+class ChangeHookTestCase(unittest.TestCase, TestReactorMixin):\n+\n+ klass = None\n+\n+ @ensure_deferred\n+ async def setUp(self):\n+ self.setUpTestReactor()\n+\n+ assert self.klass is not None\n+ self.hook = _prepare_github_change_hook(self, **{'class': self.klass})\n+ self.master = self.hook.master\n+ self.http = await FakeHTTPClientService.getFakeService(\n+ self.master, self, 'https://api.github.com',\n+ headers={'User-Agent': 'Buildbot'}, debug=False, verify=False)\n+\n+ await self.master.startService()\n+\n+ @ensure_deferred\n+ async def tearDown(self):\n+ await self.master.stopService()\n+\n+ async def trigger(self, event, payload, headers=None, _secret=None):\n+ payload = json.dumps(payload).encode()\n+ request = _prepare_request(event, payload, _secret=_secret,\n+ headers=headers)\n+ await request.test_render(self.hook)\n+ return request\n+\n+ def load_fixture(self, name):\n+ path = Path(__file__).parent / 'fixtures' / f'{name}.json'\n+ with path.open('r') as fp:\n+ return json.load(fp)\n+\n+\n+class TestGithubHook(ChangeHookTestCase):\n+\n+ klass = GithubHook\n+\n+ @ensure_deferred\n+ async def test_ping(self):\n+ await self.trigger('ping', {})\n+ assert len(self.hook.master.data.updates.changesAdded) == 0\n+\n+ @ensure_deferred\n+ async def test_issue_comment_not_mentioning_ursabot(self):\n+ payload = self.load_fixture('issue-comment-not-mentioning-ursabot')\n+ await self.trigger('issue_comment', payload=payload)\n+ assert len(self.hook.master.data.updates.changesAdded) == 0\n+\n+ @ensure_deferred\n+ async def test_issue_comment_by_ursabot(self):\n+ payload = self.load_fixture('issue-comment-by-ursabot')\n+ await self.trigger('issue_comment', payload=payload)\n+ assert len(self.hook.master.data.updates.changesAdded) == 0\n+\n+ @ensure_deferred\n+ async def test_issue_comment_with_empty_command(self):\n+ # responds to the comment\n+ request_json = {'body': 'Unknown command \"\"'}\n+ response_json = ''\n+ self.http.expect('post', '/repos/ursa-labs/ursabot/issues/26/comments',\n+ json=request_json, content_json=response_json)\n+\n+ payload = self.load_fixture('issue-comment-with-empty-command')\n+ await self.trigger('issue_comment', payload=payload)\n+ assert len(self.hook.master.data.updates.changesAdded) == 0\n+\n+ @ensure_deferred\n+ async def test_issue_comment_without_pull_request(self):\n+ # responds to the comment\n+ request_json = {\n+ 'body': 'Ursabot only listens to pull request comments!'\n+ }\n+ response_json = ''\n+ 
self.http.expect('post', '/repos/ursa-labs/ursabot/issues/19/comments',\n+ json=request_json, content_json=response_json)\n+\n+ payload = self.load_fixture('issue-comment-without-pull-request')\n+ await self.trigger('issue_comment', payload=payload)\n+ assert len(self.hook.master.data.updates.changesAdded) == 0\n+\n+ @ensure_deferred\n+ async def test_issue_comment_build_command(self):\n+ # handle_issue_comment queries the pull request\n+ request_json = self.load_fixture('pull-request-26')\n+ self.http.expect('get', '/repos/ursa-labs/ursabot/pulls/26',\n+ content_json=request_json)\n+ # tigger handle_pull_request which fetches the commit\n+ request_json = self.load_fixture('pull-request-26-commit')\n+ commit = '2705da2b616b98fa6010a25813c5a7a27456f71d'\n+ self.http.expect('get', f'/repos/ursa-labs/ursabot/commits/{commit}',\n+ content_json=request_json)\n+\n+ # then responds to the comment\n+ request_json = {'body': \"I've successfully started builds for this PR\"}\n+ response_json = ''\n+ self.http.expect('post', '/repos/ursa-labs/ursabot/issues/26/comments',\n+ json=request_json, content_json=response_json)\n+\n+ payload = self.load_fixture('issue-comment-build-command')\n+ await self.trigger('issue_comment', payload=payload)\n+ assert len(self.hook.master.data.updates.changesAdded) == 1" - }, - { - "sha": "3ff0e88660cf186420e8bc672735e4d446963192", - "filename": "ursabot/utils.py", - "status": "added", - "additions": 10, - "deletions": 0, - "changes": 10, - "blob_url": "https://github.com/ursa-labs/ursabot/blob/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/utils.py", - "raw_url": "https://github.com/ursa-labs/ursabot/raw/70267dee34884e4b972388e1b30d57f6248c58d0/ursabot/utils.py", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/ursabot/utils.py?ref=70267dee34884e4b972388e1b30d57f6248c58d0", - "patch": "@@ -0,0 +1,10 @@\n+import functools\n+from twisted.internet import defer\n+\n+\n+def ensure_deferred(f):\n+ @functools.wraps(f)\n+ def wrapper(*args, **kwargs):\n+ result = f(*args, **kwargs)\n+ return defer.ensureDeferred(result)\n+ return wrapper" - } -] \ No newline at end of file diff --git a/dev/archery/archery/tests/fixtures/pull-request-26.json b/dev/archery/archery/tests/fixtures/pull-request-26.json deleted file mode 100644 index d295afb396e3c..0000000000000 --- a/dev/archery/archery/tests/fixtures/pull-request-26.json +++ /dev/null @@ -1,329 +0,0 @@ -{ - "url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26", - "id": 267785552, - "node_id": "MDExOlB1bGxSZXF1ZXN0MjY3Nzg1NTUy", - "html_url": "https://github.com/ursa-labs/ursabot/pull/26", - "diff_url": "https://github.com/ursa-labs/ursabot/pull/26.diff", - "patch_url": "https://github.com/ursa-labs/ursabot/pull/26.patch", - "issue_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26", - "number": 26, - "state": "open", - "locked": false, - "title": "Unittests for GithubHook", - "user": { - "login": "kszucs", - "id": 961747, - "node_id": "MDQ6VXNlcjk2MTc0Nw==", - "avatar_url": "https://avatars1.githubusercontent.com/u/961747?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/kszucs", - "html_url": "https://github.com/kszucs", - "followers_url": "https://api.github.com/users/kszucs/followers", - "following_url": "https://api.github.com/users/kszucs/following{/other_user}", - "gists_url": "https://api.github.com/users/kszucs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/kszucs/starred{/owner}{/repo}", - "subscriptions_url": 
"https://api.github.com/users/kszucs/subscriptions", - "organizations_url": "https://api.github.com/users/kszucs/orgs", - "repos_url": "https://api.github.com/users/kszucs/repos", - "events_url": "https://api.github.com/users/kszucs/events{/privacy}", - "received_events_url": "https://api.github.com/users/kszucs/received_events", - "type": "User", - "site_admin": false - }, - "body": "", - "body_html": "", - "body_text": "", - "created_at": "2019-04-05T11:22:15Z", - "updated_at": "2019-04-05T12:01:40Z", - "closed_at": null, - "merged_at": null, - "merge_commit_sha": "cc5dc3606988b3824be54df779ed2028776113cb", - "assignee": null, - "assignees": [], - "requested_reviewers": [], - "requested_teams": [], - "labels": [], - "milestone": null, - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits", - "review_comments_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments", - "review_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d", - "head": { - "label": "ursa-labs:test-hook", - "ref": "test-hook", - "sha": "2705da2b616b98fa6010a25813c5a7a27456f71d", - "user": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "repo": { - "id": 169101701, - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "name": "ursabot", - "full_name": "ursa-labs/ursabot", - "private": false, - "owner": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false 
- }, - "html_url": "https://github.com/ursa-labs/ursabot", - "description": null, - "fork": false, - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - "branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "created_at": "2019-02-04T15:40:31Z", - "updated_at": "2019-04-04T17:49:10Z", - "pushed_at": "2019-04-05T12:01:40Z", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "svn_url": "https://github.com/ursa-labs/ursabot", - "homepage": null, - "size": 898, - 
"stargazers_count": 1, - "watchers_count": 1, - "language": "Jupyter Notebook", - "has_issues": true, - "has_projects": true, - "has_downloads": true, - "has_wiki": true, - "has_pages": false, - "forks_count": 0, - "mirror_url": null, - "archived": false, - "disabled": false, - "open_issues_count": 19, - "license": null, - "forks": 0, - "open_issues": 19, - "watchers": 1, - "default_branch": "master" - } - }, - "base": { - "label": "ursa-labs:master", - "ref": "master", - "sha": "a162ad254b589b924db47e057791191b39613fd5", - "user": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "repo": { - "id": 169101701, - "node_id": "MDEwOlJlcG9zaXRvcnkxNjkxMDE3MDE=", - "name": "ursabot", - "full_name": "ursa-labs/ursabot", - "private": false, - "owner": { - "login": "ursa-labs", - "id": 46514972, - "node_id": "MDEyOk9yZ2FuaXphdGlvbjQ2NTE0OTcy", - "avatar_url": "https://avatars2.githubusercontent.com/u/46514972?v=4", - "gravatar_id": "", - "url": "https://api.github.com/users/ursa-labs", - "html_url": "https://github.com/ursa-labs", - "followers_url": "https://api.github.com/users/ursa-labs/followers", - "following_url": "https://api.github.com/users/ursa-labs/following{/other_user}", - "gists_url": "https://api.github.com/users/ursa-labs/gists{/gist_id}", - "starred_url": "https://api.github.com/users/ursa-labs/starred{/owner}{/repo}", - "subscriptions_url": "https://api.github.com/users/ursa-labs/subscriptions", - "organizations_url": "https://api.github.com/users/ursa-labs/orgs", - "repos_url": "https://api.github.com/users/ursa-labs/repos", - "events_url": "https://api.github.com/users/ursa-labs/events{/privacy}", - "received_events_url": "https://api.github.com/users/ursa-labs/received_events", - "type": "Organization", - "site_admin": false - }, - "html_url": "https://github.com/ursa-labs/ursabot", - "description": null, - "fork": false, - "url": "https://api.github.com/repos/ursa-labs/ursabot", - "forks_url": "https://api.github.com/repos/ursa-labs/ursabot/forks", - "keys_url": "https://api.github.com/repos/ursa-labs/ursabot/keys{/key_id}", - "collaborators_url": "https://api.github.com/repos/ursa-labs/ursabot/collaborators{/collaborator}", - "teams_url": "https://api.github.com/repos/ursa-labs/ursabot/teams", - "hooks_url": "https://api.github.com/repos/ursa-labs/ursabot/hooks", - "issue_events_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/events{/number}", - "events_url": "https://api.github.com/repos/ursa-labs/ursabot/events", - "assignees_url": "https://api.github.com/repos/ursa-labs/ursabot/assignees{/user}", - 
"branches_url": "https://api.github.com/repos/ursa-labs/ursabot/branches{/branch}", - "tags_url": "https://api.github.com/repos/ursa-labs/ursabot/tags", - "blobs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/blobs{/sha}", - "git_tags_url": "https://api.github.com/repos/ursa-labs/ursabot/git/tags{/sha}", - "git_refs_url": "https://api.github.com/repos/ursa-labs/ursabot/git/refs{/sha}", - "trees_url": "https://api.github.com/repos/ursa-labs/ursabot/git/trees{/sha}", - "statuses_url": "https://api.github.com/repos/ursa-labs/ursabot/statuses/{sha}", - "languages_url": "https://api.github.com/repos/ursa-labs/ursabot/languages", - "stargazers_url": "https://api.github.com/repos/ursa-labs/ursabot/stargazers", - "contributors_url": "https://api.github.com/repos/ursa-labs/ursabot/contributors", - "subscribers_url": "https://api.github.com/repos/ursa-labs/ursabot/subscribers", - "subscription_url": "https://api.github.com/repos/ursa-labs/ursabot/subscription", - "commits_url": "https://api.github.com/repos/ursa-labs/ursabot/commits{/sha}", - "git_commits_url": "https://api.github.com/repos/ursa-labs/ursabot/git/commits{/sha}", - "comments_url": "https://api.github.com/repos/ursa-labs/ursabot/comments{/number}", - "issue_comment_url": "https://api.github.com/repos/ursa-labs/ursabot/issues/comments{/number}", - "contents_url": "https://api.github.com/repos/ursa-labs/ursabot/contents/{+path}", - "compare_url": "https://api.github.com/repos/ursa-labs/ursabot/compare/{base}...{head}", - "merges_url": "https://api.github.com/repos/ursa-labs/ursabot/merges", - "archive_url": "https://api.github.com/repos/ursa-labs/ursabot/{archive_format}{/ref}", - "downloads_url": "https://api.github.com/repos/ursa-labs/ursabot/downloads", - "issues_url": "https://api.github.com/repos/ursa-labs/ursabot/issues{/number}", - "pulls_url": "https://api.github.com/repos/ursa-labs/ursabot/pulls{/number}", - "milestones_url": "https://api.github.com/repos/ursa-labs/ursabot/milestones{/number}", - "notifications_url": "https://api.github.com/repos/ursa-labs/ursabot/notifications{?since,all,participating}", - "labels_url": "https://api.github.com/repos/ursa-labs/ursabot/labels{/name}", - "releases_url": "https://api.github.com/repos/ursa-labs/ursabot/releases{/id}", - "deployments_url": "https://api.github.com/repos/ursa-labs/ursabot/deployments", - "created_at": "2019-02-04T15:40:31Z", - "updated_at": "2019-04-04T17:49:10Z", - "pushed_at": "2019-04-05T12:01:40Z", - "git_url": "git://github.com/ursa-labs/ursabot.git", - "ssh_url": "git@github.com:ursa-labs/ursabot.git", - "clone_url": "https://github.com/ursa-labs/ursabot.git", - "svn_url": "https://github.com/ursa-labs/ursabot", - "homepage": null, - "size": 898, - "stargazers_count": 1, - "watchers_count": 1, - "language": "Jupyter Notebook", - "has_issues": true, - "has_projects": true, - "has_downloads": true, - "has_wiki": true, - "has_pages": false, - "forks_count": 0, - "mirror_url": null, - "archived": false, - "disabled": false, - "open_issues_count": 19, - "license": null, - "forks": 0, - "open_issues": 19, - "watchers": 1, - "default_branch": "master" - } - }, - "_links": { - "self": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26" - }, - "html": { - "href": "https://github.com/ursa-labs/ursabot/pull/26" - }, - "issue": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/issues/26" - }, - "comments": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/issues/26/comments" - }, - "review_comments": { - "href": 
"https://api.github.com/repos/ursa-labs/ursabot/pulls/26/comments" - }, - "review_comment": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/comments{/number}" - }, - "commits": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/pulls/26/commits" - }, - "statuses": { - "href": "https://api.github.com/repos/ursa-labs/ursabot/statuses/2705da2b616b98fa6010a25813c5a7a27456f71d" - } - }, - "author_association": "MEMBER", - "merged": false, - "mergeable": true, - "rebaseable": true, - "mergeable_state": "unstable", - "merged_by": null, - "comments": 5, - "review_comments": 0, - "maintainer_can_modify": false, - "commits": 2, - "additions": 1124, - "deletions": 0, - "changed_files": 7 -} \ No newline at end of file diff --git a/dev/archery/archery/tests/test_benchmarks.py b/dev/archery/archery/tests/test_benchmarks.py deleted file mode 100644 index fab1e8d443219..0000000000000 --- a/dev/archery/archery/tests/test_benchmarks.py +++ /dev/null @@ -1,383 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import json - -from archery.benchmark.codec import JsonEncoder -from archery.benchmark.core import Benchmark, median -from archery.benchmark.compare import ( - BenchmarkComparator, RunnerComparator -) -from archery.benchmark.google import ( - GoogleBenchmark, GoogleBenchmarkObservation -) -from archery.benchmark.runner import StaticBenchmarkRunner - - -def test_benchmark_comparator(): - unit = "micros" - - assert not BenchmarkComparator( - Benchmark("contender", unit, True, [10], unit, [1]), - Benchmark("baseline", unit, True, [20], unit, [1]), - ).regression - - assert BenchmarkComparator( - Benchmark("contender", unit, False, [10], unit, [1]), - Benchmark("baseline", unit, False, [20], unit, [1]), - ).regression - - assert BenchmarkComparator( - Benchmark("contender", unit, True, [20], unit, [1]), - Benchmark("baseline", unit, True, [10], unit, [1]), - ).regression - - assert not BenchmarkComparator( - Benchmark("contender", unit, False, [20], unit, [1]), - Benchmark("baseline", unit, False, [10], unit, [1]), - ).regression - - -def test_static_runner_from_json_not_a_regression(): - archery_result = { - "suites": [ - { - "name": "arrow-value-parsing-benchmark", - "benchmarks": [ - { - "name": "FloatParsing", - "unit": "items_per_second", - "less_is_better": False, - "values": [ - 109941112.87296811 - ], - "time_unit": "ns", - "times": [ - 9095.800104330105 - ] - }, - ] - } - ] - } - - contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) - baseline = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) - [comparison] = RunnerComparator(contender, baseline).comparisons - assert not comparison.regression - - -def test_static_runner_from_json_regression(): - archery_result = { - "suites": [ - { - "name": "arrow-value-parsing-benchmark", - "benchmarks": [ - { - "name": "FloatParsing", - "unit": "items_per_second", - "less_is_better": False, - "values": [ - 109941112.87296811 - ], - "time_unit": "ns", - "times": [ - 9095.800104330105 - ] - }, - ] - } - ] - } - - contender = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) - - # introduce artificial regression - archery_result['suites'][0]['benchmarks'][0]['values'][0] *= 2 - baseline = StaticBenchmarkRunner.from_json(json.dumps(archery_result)) - - [comparison] = RunnerComparator(contender, baseline).comparisons - assert comparison.regression - - -def test_benchmark_median(): - assert median([10]) == 10 - assert median([1, 2, 3]) == 2 - assert median([1, 2]) == 1.5 - assert median([1, 2, 3, 4]) == 2.5 - assert median([1, 1, 1, 1]) == 1 - try: - median([]) - assert False - except ValueError: - pass - - -def assert_benchmark(name, google_result, archery_result): - observation = GoogleBenchmarkObservation(**google_result) - benchmark = GoogleBenchmark(name, [observation]) - result = json.dumps(benchmark, cls=JsonEncoder) - assert json.loads(result) == archery_result - - -def test_items_per_second(): - name = "ArrayArrayKernel/32768/0" - google_result = { - "cpu_time": 116292.58886653671, - "items_per_second": 281772039.9844759, - "iterations": 5964, - "name": name, - "null_percent": 0.0, - "real_time": 119811.77313729875, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "size": 32768.0, - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 5964, - "null_percent": 0.0, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "items_per_second", - "less_is_better": False, - 
"values": [281772039.9844759], - "time_unit": "ns", - "times": [119811.77313729875], - } - assert "items_per_second" in google_result - assert "bytes_per_second" not in google_result - assert_benchmark(name, google_result, archery_result) - - -def test_bytes_per_second(): - name = "BufferOutputStreamLargeWrites/real_time" - google_result = { - "bytes_per_second": 1890209037.3405428, - "cpu_time": 17018127.659574457, - "iterations": 47, - "name": name, - "real_time": 17458386.53190963, - "repetition_index": 1, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 47, - "repetition_index": 1, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "bytes_per_second", - "less_is_better": False, - "values": [1890209037.3405428], - "time_unit": "ns", - "times": [17458386.53190963], - } - assert "items_per_second" not in google_result - assert "bytes_per_second" in google_result - assert_benchmark(name, google_result, archery_result) - - -def test_both_items_and_bytes_per_second(): - name = "ArrayArrayKernel/32768/0" - google_result = { - "bytes_per_second": 281772039.9844759, - "cpu_time": 116292.58886653671, - "items_per_second": 281772039.9844759, - "iterations": 5964, - "name": name, - "null_percent": 0.0, - "real_time": 119811.77313729875, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "size": 32768.0, - "threads": 1, - "time_unit": "ns", - } - # Note that bytes_per_second trumps items_per_second - archery_result = { - "counters": {"iterations": 5964, - "null_percent": 0.0, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "bytes_per_second", - "less_is_better": False, - "values": [281772039.9844759], - "time_unit": "ns", - "times": [119811.77313729875], - } - assert "items_per_second" in google_result - assert "bytes_per_second" in google_result - assert_benchmark(name, google_result, archery_result) - - -def test_neither_items_nor_bytes_per_second(): - name = "AllocateDeallocate/size:1048576/real_time" - google_result = { - "cpu_time": 1778.6004847419827, - "iterations": 352765, - "name": name, - "real_time": 1835.3137357788837, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "ns", - "less_is_better": True, - "values": [1835.3137357788837], - "time_unit": "ns", - "times": [1835.3137357788837], - } - assert "items_per_second" not in google_result - assert "bytes_per_second" not in google_result - assert_benchmark(name, google_result, archery_result) - - -def test_prefer_real_time(): - name = "AllocateDeallocate/size:1048576/real_time" - google_result = { - "cpu_time": 1778.6004847419827, - "iterations": 352765, - "name": name, - "real_time": 1835.3137357788837, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "ns", - "less_is_better": True, - "values": [1835.3137357788837], - "time_unit": "ns", - "times": [1835.3137357788837], - } - assert name.endswith("/real_time") 
- assert_benchmark(name, google_result, archery_result) - - -def test_prefer_cpu_time(): - name = "AllocateDeallocate/size:1048576" - google_result = { - "cpu_time": 1778.6004847419827, - "iterations": 352765, - "name": name, - "real_time": 1835.3137357788837, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "ns", - "less_is_better": True, - "values": [1778.6004847419827], - "time_unit": "ns", - "times": [1835.3137357788837], - } - assert not name.endswith("/real_time") - assert_benchmark(name, google_result, archery_result) - - -def test_omits_aggregates(): - name = "AllocateDeallocate/size:1048576/real_time" - google_aggregate = { - "aggregate_name": "mean", - "cpu_time": 1757.428694267678, - "iterations": 3, - "name": "AllocateDeallocate/size:1048576/real_time_mean", - "real_time": 1849.3869337041162, - "repetitions": 0, - "run_name": name, - "run_type": "aggregate", - "threads": 1, - "time_unit": "ns", - } - google_result = { - "cpu_time": 1778.6004847419827, - "iterations": 352765, - "name": name, - "real_time": 1835.3137357788837, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "run_type": "iteration", - "threads": 1, - "time_unit": "ns", - } - archery_result = { - "counters": {"iterations": 352765, - "repetition_index": 0, - "repetitions": 0, - "run_name": name, - "threads": 1}, - "name": name, - "unit": "ns", - "less_is_better": True, - "values": [1835.3137357788837], - "time_unit": "ns", - "times": [1835.3137357788837], - } - assert google_aggregate["run_type"] == "aggregate" - assert google_result["run_type"] == "iteration" - observation1 = GoogleBenchmarkObservation(**google_aggregate) - observation2 = GoogleBenchmarkObservation(**google_result) - benchmark = GoogleBenchmark(name, [observation1, observation2]) - result = json.dumps(benchmark, cls=JsonEncoder) - assert json.loads(result) == archery_result diff --git a/dev/archery/archery/tests/test_bot.py b/dev/archery/archery/tests/test_bot.py deleted file mode 100644 index e00853ceb2cb1..0000000000000 --- a/dev/archery/archery/tests/test_bot.py +++ /dev/null @@ -1,201 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import json -from unittest.mock import Mock - -import click -import pytest -import responses as rsps - -from archery.bot import CommentBot, CommandError, group - - -@pytest.fixture -def responses(): - with rsps.RequestsMock() as mock: - yield mock - - -def github_url(path): - return 'https://api.github.com:443/{}'.format(path.strip('/')) - - -@group() -def custom_handler(): - pass - - -@custom_handler.command() -@click.pass_obj -def extra(obj): - return obj - - -@custom_handler.command() -@click.option('--force', '-f', is_flag=True) -def build(force): - return force - - -@custom_handler.command() -@click.option('--name', required=True) -def benchmark(name): - return name - - -def test_click_based_commands(): - assert custom_handler('build') is False - assert custom_handler('build -f') is True - - assert custom_handler('benchmark --name strings') == 'strings' - with pytest.raises(CommandError): - assert custom_handler('benchmark') - - assert custom_handler('extra', extra='data') == {'extra': 'data'} - - -@pytest.mark.parametrize('fixture_name', [ - # the bot is not mentioned, nothing to do - 'event-issue-comment-not-mentioning-ursabot.json', - # don't respond to itself, it prevents recursive comment storms! - 'event-issue-comment-by-ursabot.json', - # non-authorized user sent the comment, do not respond - 'event-issue-comment-by-non-authorized-user.json', -]) -def test_noop_events(load_fixture, fixture_name): - payload = load_fixture(fixture_name) - - handler = Mock() - bot = CommentBot(name='ursabot', token='', handler=handler) - bot.handle('issue_comment', payload) - - handler.assert_not_called() - - -def test_issue_comment_without_pull_request(load_fixture, responses): - responses.add( - responses.GET, - github_url('/repositories/169101701/issues/19'), - json=load_fixture('issue-19.json'), - status=200 - ) - responses.add( - responses.GET, - github_url('repos/ursa-labs/ursabot/pulls/19'), - json={}, - status=404 - ) - responses.add( - responses.POST, - github_url('/repos/ursa-labs/ursabot/issues/19/comments'), - json={} - ) - - def handler(command, **kwargs): - pass - - payload = load_fixture('event-issue-comment-without-pull-request.json') - bot = CommentBot(name='ursabot', token='', handler=handler) - bot.handle('issue_comment', payload) - - post = responses.calls[2] - assert json.loads(post.request.body) == { - 'body': "The comment bot only listens to pull request comments!" 
- } - - -def test_respond_with_usage(load_fixture, responses): - responses.add( - responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 - ) - responses.add( - responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 - ) - responses.add( - responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480243811'), - json=load_fixture('issue-comment-480243811.json') - ) - responses.add( - responses.POST, - github_url('/repos/ursa-labs/ursabot/issues/26/comments'), - json={} - ) - - def handler(command, **kwargs): - raise CommandError('test-usage') - - payload = load_fixture('event-issue-comment-with-empty-command.json') - bot = CommentBot(name='ursabot', token='', handler=handler) - bot.handle('issue_comment', payload) - - post = responses.calls[3] - assert json.loads(post.request.body) == {'body': '```\ntest-usage\n```'} - - -@pytest.mark.parametrize(('command', 'reaction'), [ - ('@ursabot build', '+1'), - ('@ursabot listen', '-1'), -]) -def test_issue_comment_with_commands(load_fixture, responses, command, - reaction): - responses.add( - responses.GET, - github_url('/repositories/169101701/issues/26'), - json=load_fixture('issue-26.json'), - status=200 - ) - responses.add( - responses.GET, - github_url('/repos/ursa-labs/ursabot/pulls/26'), - json=load_fixture('pull-request-26.json'), - status=200 - ) - responses.add( - responses.GET, - github_url('/repos/ursa-labs/ursabot/issues/comments/480248726'), - json=load_fixture('issue-comment-480248726.json') - ) - responses.add( - responses.POST, - github_url( - '/repos/ursa-labs/ursabot/issues/comments/480248726/reactions' - ), - json={} - ) - - def handler(command, **kwargs): - if command == 'build': - return True - else: - raise ValueError('Only `build` command is supported.') - - payload = load_fixture('event-issue-comment-build-command.json') - payload["comment"]["body"] = command - - bot = CommentBot(name='ursabot', token='', handler=handler) - bot.handle('issue_comment', payload) - - post = responses.calls[3] - assert json.loads(post.request.body) == {'content': reaction} diff --git a/dev/archery/archery/tests/test_cli.py b/dev/archery/archery/tests/test_cli.py deleted file mode 100644 index b3199dfaf1fbf..0000000000000 --- a/dev/archery/archery/tests/test_cli.py +++ /dev/null @@ -1,162 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from unittest.mock import patch - -from click.testing import CliRunner - -from archery.cli import archery -from archery.docker import DockerCompose - - -@patch.object(DockerCompose, "pull") -@patch.object(DockerCompose, "build") -@patch.object(DockerCompose, "run") -def test_docker_run_with_custom_command(run, build, pull): - # with custom command - args = ["docker", "run", "ubuntu-cpp", "bash"] - result = CliRunner().invoke(archery, args) - assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - using_docker=False, - using_buildx=False - ) - run.assert_called_once_with( - "ubuntu-cpp", - command="bash", - env={}, - user=None, - using_docker=False, - volumes=(), - ) - - -@patch.object(DockerCompose, "pull") -@patch.object(DockerCompose, "build") -@patch.object(DockerCompose, "run") -def test_docker_run_options(run, build, pull): - # environment variables and volumes - args = [ - "docker", - "run", - "-e", - "ARROW_GANDIVA=OFF", - "-e", - "ARROW_FLIGHT=ON", - "--volume", - "./build:/build", - "-v", - "./ccache:/ccache:delegated", - "-u", - "root", - "ubuntu-cpp", - ] - result = CliRunner().invoke(archery, args) - assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - using_docker=False, - using_buildx=False - ) - run.assert_called_once_with( - "ubuntu-cpp", - command=None, - env={"ARROW_GANDIVA": "OFF", "ARROW_FLIGHT": "ON"}, - user="root", - using_docker=False, - volumes=( - "./build:/build", - "./ccache:/ccache:delegated", - ), - ) - - -@patch.object(DockerCompose, "run") -def test_docker_run_without_pulling_or_building(run): - args = ["docker", "run", "--no-pull", "--no-build", "ubuntu-cpp"] - result = CliRunner().invoke(archery, args) - assert result.exit_code == 0 - run.assert_called_once_with( - "ubuntu-cpp", - command=None, - env={}, - user=None, - using_docker=False, - volumes=(), - ) - - -@patch.object(DockerCompose, "pull") -@patch.object(DockerCompose, "build") -def test_docker_run_only_pulling_and_building(build, pull): - args = ["docker", "run", "ubuntu-cpp", "--build-only"] - result = CliRunner().invoke(archery, args) - assert result.exit_code == 0 - pull.assert_called_once_with( - "ubuntu-cpp", pull_leaf=True, using_docker=False - ) - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=True, - use_leaf_cache=True, - using_docker=False, - using_buildx=False - ) - - -@patch.object(DockerCompose, "build") -@patch.object(DockerCompose, "run") -def test_docker_run_without_build_cache(run, build): - args = [ - "docker", - "run", - "--no-pull", - "--force-build", - "--user", - "me", - "--no-cache", - "--no-leaf-cache", - "ubuntu-cpp", - ] - result = CliRunner().invoke(archery, args) - assert result.exit_code == 0 - build.assert_called_once_with( - "ubuntu-cpp", - use_cache=False, - use_leaf_cache=False, - using_docker=False, - using_buildx=False - ) - run.assert_called_once_with( - "ubuntu-cpp", - command=None, - env={}, - user="me", - using_docker=False, - volumes=(), - ) diff --git a/dev/archery/archery/tests/test_docker.py b/dev/archery/archery/tests/test_docker.py deleted file mode 100644 index 09dcd27a71334..0000000000000 --- a/dev/archery/archery/tests/test_docker.py +++ /dev/null @@ -1,512 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or 
more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import collections -import os -import re -import subprocess -from unittest import mock - -import pytest - -from archery.docker import DockerCompose -from archery.testing import assert_subprocess_calls, override_env, PartialEnv - - -missing_service_compose_yml = """ -version: '3.5' - -x-hierarchy: - - foo: - - sub-foo: - - sub-sub-foo - - another-sub-sub-foo - - bar: - - sub-bar - - baz - -services: - foo: - image: org/foo - sub-sub-foo: - image: org/sub-sub-foo - another-sub-sub-foo: - image: org/another-sub-sub-foo - bar: - image: org/bar - sub-bar: - image: org/sub-bar - baz: - image: org/baz -""" - -missing_node_compose_yml = """ -version: '3.5' - -x-hierarchy: - - foo: - - sub-foo: - - sub-sub-foo - - another-sub-sub-foo - - bar - - baz - -services: - foo: - image: org/foo - sub-foo: - image: org/sub-foo - sub-sub-foo: - image: org/sub-foo-foo - another-sub-sub-foo: - image: org/another-sub-sub-foo - bar: - image: org/bar - sub-bar: - image: org/sub-bar - baz: - image: org/baz -""" - -ok_compose_yml = """ -version: '3.5' - -x-hierarchy: - - foo: - - sub-foo: - - sub-sub-foo - - another-sub-sub-foo - - bar: - - sub-bar - - baz - -services: - foo: - image: org/foo - sub-foo: - image: org/sub-foo - sub-sub-foo: - image: org/sub-sub-foo - another-sub-sub-foo: - image: org/another-sub-sub-foo - bar: - image: org/bar - sub-bar: - image: org/sub-bar - baz: - image: org/baz -""" - -arrow_compose_yml = """ -version: '3.5' - -x-with-gpus: - - ubuntu-cuda - -x-hierarchy: - - conda-cpp: - - conda-python: - - conda-python-pandas - - conda-python-dask - - ubuntu-cpp: - - ubuntu-cpp-cmake32 - - ubuntu-c-glib: - - ubuntu-ruby - - ubuntu-cuda - -services: - conda-cpp: - image: org/conda-cpp - build: - context: . - dockerfile: ci/docker/conda-cpp.dockerfile - conda-python: - image: org/conda-python - build: - context: . - dockerfile: ci/docker/conda-cpp.dockerfile - args: - python: 3.6 - conda-python-pandas: - image: org/conda-python-pandas - build: - context: . - dockerfile: ci/docker/conda-python-pandas.dockerfile - conda-python-dask: - image: org/conda-python-dask - ubuntu-cpp: - image: org/ubuntu-cpp - build: - context: . 
- dockerfile: ci/docker/ubuntu-${UBUNTU}-cpp.dockerfile - ubuntu-cpp-cmake32: - image: org/ubuntu-cpp-cmake32 - ubuntu-c-glib: - image: org/ubuntu-c-glib - ubuntu-ruby: - image: org/ubuntu-ruby - ubuntu-cuda: - image: org/ubuntu-cuda - environment: - CUDA_ENV: 1 - OTHER_ENV: 2 - volumes: - - /host:/container - command: /bin/bash -c "echo 1 > /tmp/dummy && cat /tmp/dummy" -""" - -arrow_compose_env = { - 'UBUNTU': '20.04', # overridden below - 'PYTHON': '3.6', - 'PANDAS': 'latest', - 'DASK': 'latest', # overridden below -} - - -def create_config(directory, yml_content, env_content=None): - env_path = directory / '.env' - config_path = directory / 'docker-compose.yml' - - with config_path.open('w') as fp: - fp.write(yml_content) - - if env_content is not None: - with env_path.open('w') as fp: - for k, v in env_content.items(): - fp.write("{}={}\n".format(k, v)) - - return config_path - - -def format_run(args): - cmd = ["run", "--rm"] - if isinstance(args, str): - return " ".join(cmd + [args]) - else: - return cmd + args - - -@pytest.fixture -def arrow_compose_path(tmpdir): - return create_config(tmpdir, arrow_compose_yml, arrow_compose_env) - - -def test_config_validation(tmpdir): - config_path = create_config(tmpdir, missing_service_compose_yml) - msg = "`sub-foo` is defined in `x-hierarchy` bot not in `services`" - with pytest.raises(ValueError, match=msg): - DockerCompose(config_path) - - config_path = create_config(tmpdir, missing_node_compose_yml) - msg = "`sub-bar` is defined in `services` but not in `x-hierarchy`" - with pytest.raises(ValueError, match=msg): - DockerCompose(config_path) - - config_path = create_config(tmpdir, ok_compose_yml) - DockerCompose(config_path) # no issue - - -def assert_docker_calls(compose, expected_args): - base_command = ['docker'] - expected_commands = [] - for args in expected_args: - if isinstance(args, str): - args = re.split(r"\s", args) - expected_commands.append(base_command + args) - return assert_subprocess_calls(expected_commands, check=True) - - -def assert_compose_calls(compose, expected_args, env=mock.ANY): - base_command = ['docker-compose', '--file', str(compose.config.path)] - expected_commands = [] - for args in expected_args: - if isinstance(args, str): - args = re.split(r"\s", args) - expected_commands.append(base_command + args) - return assert_subprocess_calls(expected_commands, check=True, env=env) - - -def test_arrow_example_validation_passes(arrow_compose_path): - DockerCompose(arrow_compose_path) - - -def test_compose_default_params_and_env(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict( - UBUNTU='18.04', - DASK='master' - )) - assert compose.config.dotenv == arrow_compose_env - assert compose.config.params == { - 'UBUNTU': '18.04', - 'DASK': 'master', - } - - -def test_forwarding_env_variables(arrow_compose_path): - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - "build conda-cpp", - ] - expected_env = PartialEnv( - MY_CUSTOM_VAR_A='a', - MY_CUSTOM_VAR_B='b' - ) - with override_env({'MY_CUSTOM_VAR_A': 'a', 'MY_CUSTOM_VAR_B': 'b'}): - compose = DockerCompose(arrow_compose_path) - with assert_compose_calls(compose, expected_calls, env=expected_env): - assert os.environ['MY_CUSTOM_VAR_A'] == 'a' - assert os.environ['MY_CUSTOM_VAR_B'] == 'b' - compose.pull('conda-cpp') - compose.build('conda-cpp') - - -def test_compose_pull(arrow_compose_path): - compose = DockerCompose(arrow_compose_path) - - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - ] - with 
assert_compose_calls(compose, expected_calls): - compose.clear_pull_memory() - compose.pull('conda-cpp') - - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - "pull --ignore-pull-failures conda-python", - "pull --ignore-pull-failures conda-python-pandas" - ] - with assert_compose_calls(compose, expected_calls): - compose.clear_pull_memory() - compose.pull('conda-python-pandas') - - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - "pull --ignore-pull-failures conda-python", - ] - with assert_compose_calls(compose, expected_calls): - compose.clear_pull_memory() - compose.pull('conda-python-pandas', pull_leaf=False) - - -def test_compose_pull_params(arrow_compose_path): - expected_calls = [ - "pull --ignore-pull-failures conda-cpp", - "pull --ignore-pull-failures conda-python", - ] - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) - expected_env = PartialEnv(PYTHON='3.6', PANDAS='latest') - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.clear_pull_memory() - compose.pull('conda-python-pandas', pull_leaf=False) - - -def test_compose_build(arrow_compose_path): - compose = DockerCompose(arrow_compose_path) - - expected_calls = [ - "build conda-cpp", - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp') - - expected_calls = [ - "build --no-cache conda-cpp" - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp', use_cache=False) - - expected_calls = [ - "build conda-cpp", - "build conda-python", - "build conda-python-pandas" - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas') - - expected_calls = [ - "build --no-cache conda-cpp", - "build --no-cache conda-python", - "build --no-cache conda-python-pandas", - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas', use_cache=False) - - expected_calls = [ - "build conda-cpp", - "build conda-python", - "build --no-cache conda-python-pandas", - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-python-pandas', use_cache=True, - use_leaf_cache=False) - - -@mock.patch.dict(os.environ, {"BUILDKIT_INLINE_CACHE": "1"}) -def test_compose_buildkit_inline_cache(arrow_compose_path): - compose = DockerCompose(arrow_compose_path) - - expected_calls = [ - "build --build-arg BUILDKIT_INLINE_CACHE=1 conda-cpp", - ] - with assert_compose_calls(compose, expected_calls): - compose.build('conda-cpp') - - -def test_compose_build_params(arrow_compose_path): - expected_calls = [ - "build ubuntu-cpp", - ] - - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) - expected_env = PartialEnv(UBUNTU="18.04") - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('ubuntu-cpp') - - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='16.04')) - expected_env = PartialEnv(UBUNTU="16.04") - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('ubuntu-cpp') - - expected_calls = [ - "build --no-cache conda-cpp", - "build --no-cache conda-python", - "build --no-cache conda-python-pandas", - ] - compose = DockerCompose(arrow_compose_path, params=dict(UBUNTU='18.04')) - expected_env = PartialEnv(PYTHON='3.6', PANDAS='latest') - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.build('conda-python-pandas', use_cache=False) - - -def test_compose_run(arrow_compose_path): - expected_calls = [ - 
format_run("conda-cpp"), - ] - compose = DockerCompose(arrow_compose_path) - with assert_compose_calls(compose, expected_calls): - compose.run('conda-cpp') - - expected_calls = [ - format_run("conda-python") - ] - expected_env = PartialEnv(PYTHON='3.6') - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python') - - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.8')) - expected_env = PartialEnv(PYTHON='3.8') - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python') - - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.8')) - for command in ["bash", "echo 1"]: - expected_calls = [ - format_run(["conda-python", command]), - ] - expected_env = PartialEnv(PYTHON='3.8') - with assert_compose_calls(compose, expected_calls, env=expected_env): - compose.run('conda-python', command) - - expected_calls = [ - ( - format_run("-e CONTAINER_ENV_VAR_A=a -e CONTAINER_ENV_VAR_B=b " - "conda-python") - ) - ] - compose = DockerCompose(arrow_compose_path) - expected_env = PartialEnv(PYTHON='3.6') - with assert_compose_calls(compose, expected_calls, env=expected_env): - env = collections.OrderedDict([ - ("CONTAINER_ENV_VAR_A", "a"), - ("CONTAINER_ENV_VAR_B", "b") - ]) - compose.run('conda-python', env=env) - - expected_calls = [ - ( - format_run("--volume /host/build:/build --volume " - "/host/ccache:/ccache:delegated conda-python") - ) - ] - compose = DockerCompose(arrow_compose_path) - with assert_compose_calls(compose, expected_calls): - volumes = ("/host/build:/build", "/host/ccache:/ccache:delegated") - compose.run('conda-python', volumes=volumes) - - -def test_compose_push(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict(PYTHON='3.8')) - expected_env = PartialEnv(PYTHON="3.8") - expected_calls = [ - mock.call(["docker", "login", "-u", "user", "-p", "pass"], check=True), - ] - for image in ["conda-cpp", "conda-python", "conda-python-pandas"]: - expected_calls.append( - mock.call(["docker-compose", "--file", str(compose.config.path), - "push", image], check=True, env=expected_env) - ) - with assert_subprocess_calls(expected_calls): - compose.push('conda-python-pandas', user='user', password='pass') - - -def test_compose_error(arrow_compose_path): - compose = DockerCompose(arrow_compose_path, params=dict( - PYTHON='3.8', - PANDAS='master' - )) - - error = subprocess.CalledProcessError(99, []) - with mock.patch('subprocess.run', side_effect=error): - with pytest.raises(RuntimeError) as exc: - compose.run('conda-cpp') - - exception_message = str(exc.value) - assert "exited with a non-zero exit code 99" in exception_message - assert "PANDAS: latest" in exception_message - assert "export PANDAS=master" in exception_message - - -def test_image_with_gpu(arrow_compose_path): - compose = DockerCompose(arrow_compose_path) - - expected_calls = [ - [ - "run", "--rm", "--gpus", "all", - "-e", "CUDA_ENV=1", - "-e", "OTHER_ENV=2", - "-v", "/host:/container:rw", - "org/ubuntu-cuda", - '/bin/bash -c "echo 1 > /tmp/dummy && cat /tmp/dummy"' - ] - ] - with assert_docker_calls(compose, expected_calls): - compose.run('ubuntu-cuda') - - -def test_listing_images(arrow_compose_path): - compose = DockerCompose(arrow_compose_path) - assert sorted(compose.images()) == [ - 'conda-cpp', - 'conda-python', - 'conda-python-dask', - 'conda-python-pandas', - 'ubuntu-c-glib', - 'ubuntu-cpp', - 'ubuntu-cpp-cmake32', - 'ubuntu-cuda', - 'ubuntu-ruby', - ] diff --git 
a/dev/archery/archery/tests/test_release.py b/dev/archery/archery/tests/test_release.py deleted file mode 100644 index 75aac89212325..0000000000000 --- a/dev/archery/archery/tests/test_release.py +++ /dev/null @@ -1,333 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import pytest - -from archery.release import ( - Release, MajorRelease, MinorRelease, PatchRelease, - Jira, Version, Issue, CommitTitle, Commit -) -from archery.testing import DotDict - - -# subset of issues per revision -_issues = { - "1.0.1": [ - Issue("ARROW-9684", type="Bug", summary="[C++] Title"), - Issue("ARROW-9667", type="New Feature", summary="[Crossbow] Title"), - Issue("ARROW-9659", type="Bug", summary="[C++] Title"), - Issue("ARROW-9644", type="Bug", summary="[C++][Dataset] Title"), - Issue("ARROW-9643", type="Bug", summary="[C++] Title"), - Issue("ARROW-9609", type="Bug", summary="[C++] Title"), - Issue("ARROW-9606", type="Bug", summary="[C++][Dataset] Title") - ], - "1.0.0": [ - Issue("ARROW-300", type="New Feature", summary="[Format] Title"), - Issue("ARROW-4427", type="Task", summary="[Doc] Title"), - Issue("ARROW-5035", type="Improvement", summary="[C#] Title"), - Issue("ARROW-8473", type="Bug", summary="[Rust] Title"), - Issue("ARROW-8472", type="Bug", summary="[Go][Integration] Title"), - Issue("ARROW-8471", type="Bug", summary="[C++][Integration] Title"), - Issue("ARROW-8974", type="Improvement", summary="[C++] Title"), - Issue("ARROW-8973", type="New Feature", summary="[Java] Title") - ], - "0.17.1": [ - Issue("ARROW-8684", type="Bug", summary="[Python] Title"), - Issue("ARROW-8657", type="Bug", summary="[C++][Parquet] Title"), - Issue("ARROW-8641", type="Bug", summary="[Python] Title"), - Issue("ARROW-8609", type="Bug", summary="[C++] Title"), - ], - "0.17.0": [ - Issue("ARROW-2882", type="New Feature", summary="[C++][Python] Title"), - Issue("ARROW-2587", type="Bug", summary="[Python] Title"), - Issue("ARROW-2447", type="Improvement", summary="[C++] Title"), - Issue("ARROW-2255", type="Bug", summary="[Integration] Title"), - Issue("ARROW-1907", type="Bug", summary="[C++/Python] Title"), - Issue("ARROW-1636", type="New Feature", summary="[Format] Title") - ] -} - - -class FakeJira(Jira): - - def __init__(self): - pass - - def project_versions(self, project='ARROW'): - return [ - Version.parse("3.0.0", released=False), - Version.parse("2.0.0", released=False), - Version.parse("1.1.0", released=False), - Version.parse("1.0.1", released=False), - Version.parse("1.0.0", released=True), - Version.parse("0.17.1", released=True), - Version.parse("0.17.0", released=True), - Version.parse("0.16.0", released=True), - Version.parse("0.15.2", released=True), - Version.parse("0.15.1", released=True), - Version.parse("0.15.0", released=True), - ] - - def project_issues(self, 
version, project='ARROW'): - return _issues[str(version)] - - -@pytest.fixture -def fake_jira(): - return FakeJira() - - -def test_version(fake_jira): - v = Version.parse("1.2.5") - assert str(v) == "1.2.5" - assert v.major == 1 - assert v.minor == 2 - assert v.patch == 5 - assert v.released is False - assert v.release_date is None - - v = Version.parse("1.0.0", released=True, release_date="2020-01-01") - assert str(v) == "1.0.0" - assert v.major == 1 - assert v.minor == 0 - assert v.patch == 0 - assert v.released is True - assert v.release_date == "2020-01-01" - - -def test_issue(fake_jira): - i = Issue("ARROW-1234", type='Bug', summary="title") - assert i.key == "ARROW-1234" - assert i.type == "Bug" - assert i.summary == "title" - assert i.project == "ARROW" - assert i.number == 1234 - - i = Issue("PARQUET-1111", type='Improvement', summary="another title") - assert i.key == "PARQUET-1111" - assert i.type == "Improvement" - assert i.summary == "another title" - assert i.project == "PARQUET" - assert i.number == 1111 - - fake_jira_issue = DotDict({ - 'key': 'ARROW-2222', - 'fields': { - 'issuetype': { - 'name': 'Feature' - }, - 'summary': 'Issue title' - } - }) - i = Issue.from_jira(fake_jira_issue) - assert i.key == "ARROW-2222" - assert i.type == "Feature" - assert i.summary == "Issue title" - assert i.project == "ARROW" - assert i.number == 2222 - - -def test_commit_title(): - t = CommitTitle.parse( - "ARROW-9598: [C++][Parquet] Fix writing nullable structs" - ) - assert t.project == "ARROW" - assert t.issue == "ARROW-9598" - assert t.components == ["C++", "Parquet"] - assert t.summary == "Fix writing nullable structs" - - t = CommitTitle.parse( - "ARROW-8002: [C++][Dataset][R] Support partitioned dataset writing" - ) - assert t.project == "ARROW" - assert t.issue == "ARROW-8002" - assert t.components == ["C++", "Dataset", "R"] - assert t.summary == "Support partitioned dataset writing" - - t = CommitTitle.parse( - "ARROW-9600: [Rust][Arrow] pin older version of proc-macro2 during " - "build" - ) - assert t.project == "ARROW" - assert t.issue == "ARROW-9600" - assert t.components == ["Rust", "Arrow"] - assert t.summary == "pin older version of proc-macro2 during build" - - t = CommitTitle.parse("[Release] Update versions for 1.0.0") - assert t.project is None - assert t.issue is None - assert t.components == ["Release"] - assert t.summary == "Update versions for 1.0.0" - - t = CommitTitle.parse("[Python][Doc] Fix rst role dataset.rst (#7725)") - assert t.project is None - assert t.issue is None - assert t.components == ["Python", "Doc"] - assert t.summary == "Fix rst role dataset.rst (#7725)" - - t = CommitTitle.parse( - "PARQUET-1882: [C++] Buffered Reads should allow for 0 length" - ) - assert t.project == 'PARQUET' - assert t.issue == 'PARQUET-1882' - assert t.components == ["C++"] - assert t.summary == "Buffered Reads should allow for 0 length" - - t = CommitTitle.parse( - "ARROW-9340 [R] Use CRAN version of decor package " - "\nsomething else\n" - "\nwhich should be truncated" - ) - assert t.project == 'ARROW' - assert t.issue == 'ARROW-9340' - assert t.components == ["R"] - assert t.summary == "Use CRAN version of decor package " - - -def test_release_basics(fake_jira): - r = Release.from_jira("1.0.0", jira=fake_jira) - assert isinstance(r, MajorRelease) - assert r.is_released is True - assert r.branch == 'master' - assert r.tag == 'apache-arrow-1.0.0' - - r = Release.from_jira("1.1.0", jira=fake_jira) - assert isinstance(r, MinorRelease) - assert r.is_released is False - assert 
r.branch == 'maint-1.x.x' - assert r.tag == 'apache-arrow-1.1.0' - - # minor releases before 1.0 are treated as major releases - r = Release.from_jira("0.17.0", jira=fake_jira) - assert isinstance(r, MajorRelease) - assert r.is_released is True - assert r.branch == 'master' - assert r.tag == 'apache-arrow-0.17.0' - - r = Release.from_jira("0.17.1", jira=fake_jira) - assert isinstance(r, PatchRelease) - assert r.is_released is True - assert r.branch == 'maint-0.17.x' - assert r.tag == 'apache-arrow-0.17.1' - - -def test_previous_and_next_release(fake_jira): - r = Release.from_jira("3.0.0", jira=fake_jira) - assert isinstance(r.previous, MajorRelease) - assert r.previous.version == Version.parse("2.0.0") - with pytest.raises(ValueError, match="There is no upcoming release set"): - assert r.next - - r = Release.from_jira("2.0.0", jira=fake_jira) - assert isinstance(r.previous, MajorRelease) - assert isinstance(r.next, MajorRelease) - assert r.previous.version == Version.parse("1.0.0") - assert r.next.version == Version.parse("3.0.0") - - r = Release.from_jira("1.1.0", jira=fake_jira) - assert isinstance(r.previous, MajorRelease) - assert isinstance(r.next, MajorRelease) - assert r.previous.version == Version.parse("1.0.0") - assert r.next.version == Version.parse("2.0.0") - - r = Release.from_jira("1.0.0", jira=fake_jira) - assert isinstance(r.next, MajorRelease) - assert isinstance(r.previous, MajorRelease) - assert r.previous.version == Version.parse("0.17.0") - assert r.next.version == Version.parse("2.0.0") - - r = Release.from_jira("0.17.0", jira=fake_jira) - assert isinstance(r.previous, MajorRelease) - assert r.previous.version == Version.parse("0.16.0") - - r = Release.from_jira("0.15.2", jira=fake_jira) - assert isinstance(r.previous, PatchRelease) - assert isinstance(r.next, MajorRelease) - assert r.previous.version == Version.parse("0.15.1") - assert r.next.version == Version.parse("0.16.0") - - r = Release.from_jira("0.15.1", jira=fake_jira) - assert isinstance(r.previous, MajorRelease) - assert isinstance(r.next, PatchRelease) - assert r.previous.version == Version.parse("0.15.0") - assert r.next.version == Version.parse("0.15.2") - - -def test_release_issues(fake_jira): - # major release issues - r = Release.from_jira("1.0.0", jira=fake_jira) - assert r.issues.keys() == set([ - "ARROW-300", - "ARROW-4427", - "ARROW-5035", - "ARROW-8473", - "ARROW-8472", - "ARROW-8471", - "ARROW-8974", - "ARROW-8973" - ]) - # minor release issues - r = Release.from_jira("0.17.0", jira=fake_jira) - assert r.issues.keys() == set([ - "ARROW-2882", - "ARROW-2587", - "ARROW-2447", - "ARROW-2255", - "ARROW-1907", - "ARROW-1636", - ]) - # patch release issues - r = Release.from_jira("1.0.1", jira=fake_jira) - assert r.issues.keys() == set([ - "ARROW-9684", - "ARROW-9667", - "ARROW-9659", - "ARROW-9644", - "ARROW-9643", - "ARROW-9609", - "ARROW-9606" - ]) - - -@pytest.mark.parametrize(('version', 'ncommits'), [ - ("1.0.0", 771), - ("0.17.1", 27), - ("0.17.0", 569), - ("0.15.1", 41) -]) -def test_release_commits(fake_jira, version, ncommits): - r = Release.from_jira(version, jira=fake_jira) - assert len(r.commits) == ncommits - for c in r.commits: - assert isinstance(c, Commit) - assert isinstance(c.title, CommitTitle) - assert c.url.endswith(c.hexsha) - - -def test_maintenance_patch_selection(fake_jira): - r = Release.from_jira("0.17.1", jira=fake_jira) - - shas_to_pick = [ - c.hexsha for c in r.commits_to_pick(exclude_already_applied=False) - ] - expected = [ - 
'8939b4bd446ee406d5225c79d563a27d30fd7d6d', - 'bcef6c95a324417e85e0140f9745d342cd8784b3', - '6002ec388840de5622e39af85abdc57a2cccc9b2', - '9123dadfd123bca7af4eaa9455f5b0d1ca8b929d', - ] - assert shas_to_pick == expected diff --git a/dev/archery/archery/tests/test_testing.py b/dev/archery/archery/tests/test_testing.py deleted file mode 100644 index 117b9288d74b6..0000000000000 --- a/dev/archery/archery/tests/test_testing.py +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import subprocess - -import pytest - -from archery.testing import PartialEnv, assert_subprocess_calls - - -def test_partial_env(): - assert PartialEnv(a=1, b=2) == {'a': 1, 'b': 2, 'c': 3} - assert PartialEnv(a=1) == {'a': 1, 'b': 2, 'c': 3} - assert PartialEnv(a=1, b=2) == {'a': 1, 'b': 2} - assert PartialEnv(a=1, b=2) != {'b': 2, 'c': 3} - assert PartialEnv(a=1, b=2) != {'a': 1, 'c': 3} - - -def test_assert_subprocess_calls(): - expected_calls = [ - "echo Hello", - ["echo", "World"] - ] - with assert_subprocess_calls(expected_calls): - subprocess.run(['echo', 'Hello']) - subprocess.run(['echo', 'World']) - - expected_env = PartialEnv( - CUSTOM_ENV_A='a', - CUSTOM_ENV_C='c' - ) - with assert_subprocess_calls(expected_calls, env=expected_env): - env = { - 'CUSTOM_ENV_A': 'a', - 'CUSTOM_ENV_B': 'b', - 'CUSTOM_ENV_C': 'c' - } - subprocess.run(['echo', 'Hello'], env=env) - subprocess.run(['echo', 'World'], env=env) - - with pytest.raises(AssertionError): - with assert_subprocess_calls(expected_calls, env=expected_env): - env = { - 'CUSTOM_ENV_B': 'b', - 'CUSTOM_ENV_C': 'c' - } - subprocess.run(['echo', 'Hello'], env=env) - subprocess.run(['echo', 'World'], env=env) diff --git a/dev/archery/archery/utils/__init__.py b/dev/archery/archery/utils/__init__.py deleted file mode 100644 index 13a83393a9124..0000000000000 --- a/dev/archery/archery/utils/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
diff --git a/dev/archery/archery/utils/cache.py b/dev/archery/archery/utils/cache.py deleted file mode 100644 index d92c5f32e270b..0000000000000 --- a/dev/archery/archery/utils/cache.py +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pathlib import Path -import os -from urllib.request import urlopen - -from .logger import logger - -ARCHERY_CACHE_DIR = Path.home() / ".cache" / "archery" - - -class Cache: - """ Cache stores downloaded objects, notably apache-rat.jar. """ - - def __init__(self, path=ARCHERY_CACHE_DIR): - self.root = path - - if not path.exists(): - os.makedirs(path) - - def key_path(self, key): - """ Return the full path of a key. """ - return self.root/key - - def get(self, key): - """ Return the full path of a key if cached, None otherwise. """ - path = self.key_path(key) - return path if path.exists() else None - - def delete(self, key): - """ Remove a key (and the file) from the cache. """ - path = self.get(key) - if path: - path.unlink() - - def get_or_insert(self, key, create): - """ - Get or Insert a key from the cache. If the key is not found, the - `create` closure will be evaluated. - - The `create` closure takes a single parameter, the path where the - object should be store. The file should only be created upon success. - """ - path = self.key_path(key) - - if not path.exists(): - create(path) - - return path - - def get_or_insert_from_url(self, key, url): - """ - Get or Insert a key from the cache. If the key is not found, the file - is downloaded from `url`. - """ - def download(path): - """ Tiny wrapper that download a file and save as key. """ - logger.debug("Downloading {} as {}".format(url, path)) - conn = urlopen(url) - # Ensure the download is completed before writing to disks. - content = conn.read() - with open(path, "wb") as path_fd: - path_fd.write(content) - - return self.get_or_insert(key, download) diff --git a/dev/archery/archery/utils/cmake.py b/dev/archery/archery/utils/cmake.py deleted file mode 100644 index f93895b1a09ce..0000000000000 --- a/dev/archery/archery/utils/cmake.py +++ /dev/null @@ -1,215 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import re -from shutil import rmtree, which - -from .command import Command, default_bin - - -class CMake(Command): - def __init__(self, cmake_bin=None): - self.bin = default_bin(cmake_bin, "cmake") - - @staticmethod - def default_generator(): - """ Infer default generator. - - Gives precedence to ninja if there exists an executable named `ninja` - in the search path. - """ - found_ninja = which("ninja") - return "Ninja" if found_ninja else "Unix Makefiles" - - -cmake = CMake() - - -class CMakeDefinition: - """ CMakeDefinition captures the cmake invocation arguments. - - It allows creating build directories with the same definition, e.g. - ``` - build_1 = cmake_def.build("/tmp/build-1") - build_2 = cmake_def.build("/tmp/build-2") - - ... - - build1.all() - build2.all() - """ - - def __init__(self, source, build_type="release", generator=None, - definitions=None, env=None): - """ Initialize a CMakeDefinition - - Parameters - ---------- - source : str - Source directory where the top-level CMakeLists.txt is - located. This is usually the root of the project. - generator : str, optional - definitions: list(str), optional - env : dict(str,str), optional - Environment to use when invoking cmake. This can be required to - work around cmake deficiencies, e.g. CC and CXX. - """ - self.source = os.path.abspath(source) - self.build_type = build_type - self.generator = generator if generator else cmake.default_generator() - self.definitions = definitions if definitions else [] - self.env = env - - @property - def arguments(self): - """" Return the arguments to cmake invocation. """ - arguments = [ - "-G{}".format(self.generator), - ] + self.definitions + [ - self.source - ] - return arguments - - def build(self, build_dir, force=False, cmd_kwargs=None, **kwargs): - """ Invoke cmake into a build directory. - - Parameters - ---------- - build_dir : str - Directory in which the CMake build will be instantiated. - force : bool - If the build folder exists, delete it before. Otherwise if it's - present, an error will be returned. - """ - if os.path.exists(build_dir): - # Extra safety to ensure we're deleting a build folder. - if not CMakeBuild.is_build_dir(build_dir): - raise FileExistsError( - "{} is not a cmake build".format(build_dir) - ) - if not force: - raise FileExistsError( - "{} exists use force=True".format(build_dir) - ) - rmtree(build_dir) - - os.mkdir(build_dir) - - cmd_kwargs = cmd_kwargs if cmd_kwargs else {} - cmake(*self.arguments, cwd=build_dir, env=self.env, **cmd_kwargs) - return CMakeBuild(build_dir, self.build_type, definition=self, - **kwargs) - - def __repr__(self): - return "CMakeDefinition[source={}]".format(self.source) - - -CMAKE_BUILD_TYPE_RE = re.compile("CMAKE_BUILD_TYPE:STRING=([a-zA-Z]+)") - - -class CMakeBuild(CMake): - """ CMakeBuild represents a build directory initialized by cmake. - - The build instance can be used to build/test/install. It alleviates the - user to know which generator is used. - """ - - def __init__(self, build_dir, build_type, definition=None): - """ Initialize a CMakeBuild. 
- - The caller must ensure that cmake was invoked in the build directory. - - Parameters - ---------- - definition : CMakeDefinition - The definition to build from. - build_dir : str - The build directory to setup into. - """ - assert CMakeBuild.is_build_dir(build_dir) - super().__init__() - self.build_dir = os.path.abspath(build_dir) - self.build_type = build_type - self.definition = definition - - @property - def binaries_dir(self): - return os.path.join(self.build_dir, self.build_type) - - def run(self, *argv, verbose=False, **kwargs): - cmake_args = ["--build", self.build_dir, "--"] - extra = [] - if verbose: - extra.append("-v" if self.bin.endswith("ninja") else "VERBOSE=1") - # Commands must be ran under the build directory - return super().run(*cmake_args, *extra, - *argv, **kwargs, cwd=self.build_dir) - - def all(self): - return self.run("all") - - def clean(self): - return self.run("clean") - - def install(self): - return self.run("install") - - def test(self): - return self.run("test") - - @staticmethod - def is_build_dir(path): - """ Indicate if a path is CMake build directory. - - This method only checks for the existence of paths and does not do any - validation whatsoever. - """ - cmake_cache = os.path.join(path, "CMakeCache.txt") - cmake_files = os.path.join(path, "CMakeFiles") - return os.path.exists(cmake_cache) and os.path.exists(cmake_files) - - @staticmethod - def from_path(path): - """ Instantiate a CMakeBuild from a path. - - This is used to recover from an existing physical directory (created - with or without CMakeBuild). - - Note that this method is not idempotent as the original definition will - be lost. Only build_type is recovered. - """ - if not CMakeBuild.is_build_dir(path): - raise ValueError("Not a valid CMakeBuild path: {}".format(path)) - - build_type = None - # Infer build_type by looking at CMakeCache.txt and looking for a magic - # definition - cmake_cache_path = os.path.join(path, "CMakeCache.txt") - with open(cmake_cache_path, "r") as cmake_cache: - candidates = CMAKE_BUILD_TYPE_RE.findall(cmake_cache.read()) - build_type = candidates[0].lower() if candidates else "release" - - return CMakeBuild(path, build_type) - - def __repr__(self): - return ("CMakeBuild[" - "build = {}," - "build_type = {}," - "definition = {}]".format(self.build_dir, - self.build_type, - self.definition)) diff --git a/dev/archery/archery/utils/command.py b/dev/archery/archery/utils/command.py deleted file mode 100644 index 84d2842073f38..0000000000000 --- a/dev/archery/archery/utils/command.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import shlex -import shutil -import subprocess - -from .logger import logger, ctx - - -def default_bin(name, default): - assert(default) - env_name = "ARCHERY_{0}_BIN".format(default.upper()) - return name if name else os.environ.get(env_name, default) - - -# Decorator running a command and returning stdout -class capture_stdout: - def __init__(self, strip=False, listify=False): - self.strip = strip - self.listify = listify - - def __call__(self, f): - def strip_it(x): - return x.strip() if self.strip else x - - def list_it(x): - return x.decode('utf-8').splitlines() if self.listify else x - - def wrapper(*argv, **kwargs): - # Ensure stdout is captured - kwargs["stdout"] = subprocess.PIPE - return list_it(strip_it(f(*argv, **kwargs).stdout)) - return wrapper - - -class Command: - """ A runnable command. - - Class inheriting from the Command class must provide the bin - property/attribute. - """ - - def __init__(self, bin): - self.bin = bin - - def run(self, *argv, **kwargs): - assert hasattr(self, "bin") - invocation = shlex.split(self.bin) - invocation.extend(argv) - - for key in ["stdout", "stderr"]: - # Preserve caller intention, otherwise silence - if key not in kwargs and ctx.quiet: - kwargs[key] = subprocess.PIPE - - # Prefer safe by default - if "check" not in kwargs: - kwargs["check"] = True - - logger.debug("Executing `{}`".format(invocation)) - return subprocess.run(invocation, **kwargs) - - @property - def available(self): - """ Indicate if the command binary is found in PATH. """ - binary = shlex.split(self.bin)[0] - return shutil.which(binary) is not None - - def __call__(self, *argv, **kwargs): - return self.run(*argv, **kwargs) - - -class CommandStackMixin: - def run(self, *argv, **kwargs): - stacked_args = self.argv + argv - return super(CommandStackMixin, self).run(*stacked_args, **kwargs) - - -class Bash(Command): - def __init__(self, bash_bin=None): - self.bin = default_bin(bash_bin, "bash") diff --git a/dev/archery/archery/utils/git.py b/dev/archery/archery/utils/git.py deleted file mode 100644 index 798bc5d7096fb..0000000000000 --- a/dev/archery/archery/utils/git.py +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from .command import Command, capture_stdout, default_bin -from ..compat import _stringify_path - - -# Decorator prepending argv with the git sub-command found with the method -# name. 
-def git_cmd(fn): - # function name is the subcommand - sub_cmd = fn.__name__.replace("_", "-") - - def wrapper(self, *argv, **kwargs): - return fn(self, sub_cmd, *argv, **kwargs) - return wrapper - - -class Git(Command): - def __init__(self, git_bin=None): - self.bin = default_bin(git_bin, "git") - - def run_cmd(self, cmd, *argv, git_dir=None, **kwargs): - """ Inject flags before sub-command in argv. """ - opts = [] - if git_dir is not None: - opts.extend(["-C", _stringify_path(git_dir)]) - - return self.run(*opts, cmd, *argv, **kwargs) - - @capture_stdout(strip=False) - @git_cmd - def archive(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @git_cmd - def clone(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @git_cmd - def fetch(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @git_cmd - def checkout(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - def dirty(self, **kwargs): - return len(self.status("--short", **kwargs)) > 0 - - @git_cmd - def log(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @capture_stdout(strip=True, listify=True) - @git_cmd - def ls_files(self, *argv, listify=False, **kwargs): - stdout = self.run_cmd(*argv, **kwargs) - return stdout - - @capture_stdout(strip=True) - @git_cmd - def rev_parse(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @capture_stdout(strip=True) - @git_cmd - def status(self, *argv, **kwargs): - return self.run_cmd(*argv, **kwargs) - - @capture_stdout(strip=True) - def head(self, **kwargs): - """ Return commit pointed by HEAD. """ - return self.rev_parse("HEAD", **kwargs) - - @capture_stdout(strip=True) - def current_branch(self, **kwargs): - return self.rev_parse("--abbrev-ref", "HEAD", **kwargs) - - def repository_root(self, git_dir=None, **kwargs): - """ Locates the repository's root path from a subdirectory. """ - stdout = self.rev_parse("--show-toplevel", git_dir=git_dir, **kwargs) - return stdout.decode('utf-8') - - -git = Git() diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py deleted file mode 100644 index 3b94d0139c07e..0000000000000 --- a/dev/archery/archery/utils/lint.py +++ /dev/null @@ -1,387 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import gzip -import os -from pathlib import Path - -import click - -from .command import Bash, Command, default_bin -from .cmake import CMake -from .git import git -from .logger import logger -from ..lang.cpp import CppCMakeDefinition, CppConfiguration -from ..lang.rust import Cargo -from ..lang.python import Autopep8, Flake8, NumpyDoc -from .rat import Rat, exclusion_from_globs -from .tmpdir import tmpdir - - -class LintValidationException(Exception): - pass - - -class LintResult: - def __init__(self, success, reason=None): - self.success = success - - def ok(self): - if not self.success: - raise LintValidationException - - @staticmethod - def from_cmd(command_result): - return LintResult(command_result.returncode == 0) - - -def cpp_linter(src, build_dir, clang_format=True, cpplint=True, - clang_tidy=False, iwyu=False, iwyu_all=False, - fix=False): - """ Run clang-format, cpplint and clang-tidy on cpp/ codebase. """ - logger.info("Running C++ linters") - - cmake = CMake() - if not cmake.available: - logger.error("cpp linter requested but cmake binary not found.") - return - - # A cmake build directory is required to populate `compile_commands.json` - # which in turn is required by clang-tidy. It also provides a convenient - # way to hide clang-format/clang-tidy invocation via the Generate - # (ninja/make) targets. - - # ARROW_LINT_ONLY exits early but ignore building compile_command.json - lint_only = not (iwyu or clang_tidy) - cmake_args = {"with_python": False, "with_lint_only": lint_only} - cmake_def = CppCMakeDefinition(src.cpp, CppConfiguration(**cmake_args)) - - build = cmake_def.build(build_dir) - if clang_format: - target = "format" if fix else "check-format" - yield LintResult.from_cmd(build.run(target, check=False)) - - if cpplint: - yield LintResult.from_cmd(build.run("lint", check=False)) - yield LintResult.from_cmd(build.run("lint_cpp_cli", check=False)) - - if clang_tidy: - yield LintResult.from_cmd(build.run("check-clang-tidy", check=False)) - - if iwyu: - if iwyu_all: - iwyu_cmd = "iwyu-all" - else: - iwyu_cmd = "iwyu" - yield LintResult.from_cmd(build.run(iwyu_cmd, check=False)) - - -class CMakeFormat(Command): - def __init__(self, cmake_format_bin): - self.bin = cmake_format_bin - - -def cmake_linter(src, fix=False): - """ Run cmake-format.py on all CMakeFiles.txt """ - logger.info("Running cmake-format linters") - - if not fix: - logger.warn("run-cmake-format modifies files, regardless of --fix") - - arrow_cmake_format = os.path.join(src.path, "run-cmake-format.py") - cmake_format = CMakeFormat(cmake_format_bin=arrow_cmake_format) - yield LintResult.from_cmd(cmake_format("--check")) - - -def python_linter(src, fix=False): - """Run Python linters on python/pyarrow, python/examples, setup.py - and dev/. """ - setup_py = os.path.join(src.python, "setup.py") - setup_cfg = os.path.join(src.python, "setup.cfg") - - logger.info("Running Python formatter (autopep8)") - - autopep8 = Autopep8() - if not autopep8.available: - logger.error( - "Python formatter requested but autopep8 binary not found. 
" - "Please run `pip install -r dev/archery/requirements-lint.txt`") - return - - # Gather files for autopep8 - patterns = ["python/pyarrow/**/*.py", - "python/pyarrow/**/*.pyx", - "python/pyarrow/**/*.pxd", - "python/pyarrow/**/*.pxi", - "python/examples/**/*.py", - "dev/archery/**/*.py", - ] - files = [setup_py] - for pattern in patterns: - files += list(map(str, Path(src.path).glob(pattern))) - - args = ['--global-config', setup_cfg, '--ignore-local-config'] - if fix: - args += ['-j0', '--in-place'] - args += sorted(files) - yield LintResult.from_cmd(autopep8(*args)) - else: - # XXX `-j0` doesn't work well with `--exit-code`, so instead - # we capture the diff and check whether it's empty - # (https://github.com/hhatto/autopep8/issues/543) - args += ['-j0', '--diff'] - args += sorted(files) - diff = autopep8.run_captured(*args) - if diff: - print(diff.decode('utf8')) - yield LintResult(success=False) - else: - yield LintResult(success=True) - - # Run flake8 after autopep8 (the latter may have modified some files) - logger.info("Running Python linter (flake8)") - - flake8 = Flake8() - if not flake8.available: - logger.error( - "Python linter requested but flake8 binary not found. " - "Please run `pip install -r dev/archery/requirements-lint.txt`") - return - - flake8_exclude = ['.venv*'] - - yield LintResult.from_cmd( - flake8("--extend-exclude=" + ','.join(flake8_exclude), - setup_py, src.pyarrow, os.path.join(src.python, "examples"), - src.dev, check=False)) - config = os.path.join(src.python, ".flake8.cython") - yield LintResult.from_cmd( - flake8("--config=" + config, src.pyarrow, check=False)) - - -def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None): - """Run numpydoc linter on python. - - Pyarrow must be available for import. 
- """ - logger.info("Running Python docstring linters") - # by default try to run on all pyarrow package - symbols = symbols or { - 'pyarrow', - 'pyarrow.compute', - 'pyarrow.csv', - 'pyarrow.dataset', - 'pyarrow.feather', - 'pyarrow.flight', - 'pyarrow.fs', - 'pyarrow.gandiva', - 'pyarrow.ipc', - 'pyarrow.json', - 'pyarrow.orc', - 'pyarrow.parquet', - 'pyarrow.plasma', - 'pyarrow.types', - } - try: - numpydoc = NumpyDoc(symbols) - except RuntimeError as e: - logger.error(str(e)) - yield LintResult(success=False) - return - - results = numpydoc.validate( - # limit the validation scope to the pyarrow package - from_package='pyarrow', - allow_rules=allow_rules, - disallow_rules=disallow_rules - ) - - if len(results) == 0: - yield LintResult(success=True) - return - - number_of_violations = 0 - for obj, result in results: - errors = result['errors'] - - # inspect doesn't play nice with cython generated source code, - # to use a hacky way to represent a proper __qualname__ - doc = getattr(obj, '__doc__', '') - name = getattr(obj, '__name__', '') - qualname = getattr(obj, '__qualname__', '') - module = getattr(obj, '__module__', '') - instance = getattr(obj, '__self__', '') - if instance: - klass = instance.__class__.__name__ - else: - klass = '' - - try: - cython_signature = doc.splitlines()[0] - except Exception: - cython_signature = '' - - desc = '.'.join(filter(None, [module, klass, qualname or name])) - - click.echo() - click.echo(click.style(desc, bold=True, fg='yellow')) - if cython_signature: - qualname_with_signature = '.'.join([module, cython_signature]) - click.echo( - click.style( - '-> {}'.format(qualname_with_signature), - fg='yellow' - ) - ) - - for error in errors: - number_of_violations += 1 - click.echo('{}: {}'.format(*error)) - - msg = 'Total number of docstring violations: {}'.format( - number_of_violations - ) - click.echo() - click.echo(click.style(msg, fg='red')) - - yield LintResult(success=False) - - -def rat_linter(src, root): - """Run apache-rat license linter.""" - logger.info("Running apache-rat linter") - - if src.git_dirty: - logger.warn("Due to the usage of git-archive, uncommitted files will" - " not be checked for rat violations. ") - - exclusion = exclusion_from_globs( - os.path.join(src.dev, "release", "rat_exclude_files.txt")) - - # Creates a git-archive of ArrowSources, apache-rat expects a gzip - # compressed tar archive. 
- archive_path = os.path.join(root, "apache-arrow.tar.gz") - src.archive(archive_path, compressor=gzip.compress) - report = Rat().report(archive_path) - - violations = list(report.validate(exclusion=exclusion)) - for violation in violations: - print("apache-rat license violation: {}".format(violation)) - - yield LintResult(len(violations) == 0) - - -def r_linter(src): - """Run R linter.""" - logger.info("Running R linter") - r_lint_sh = os.path.join(src.r, "lint.sh") - yield LintResult.from_cmd(Bash().run(r_lint_sh, check=False)) - - -def rust_linter(src): - """Run Rust linter.""" - logger.info("Running Rust linter") - cargo = Cargo() - - if not cargo.available: - logger.error("Rust linter requested but cargo executable not found.") - return - - yield LintResult.from_cmd(cargo.run("+stable", "fmt", "--all", "--", - "--check", cwd=src.rust, - check=False)) - - -class Hadolint(Command): - def __init__(self, hadolint_bin=None): - self.bin = default_bin(hadolint_bin, "hadolint") - - -def is_docker_image(path): - dirname = os.path.dirname(path) - filename = os.path.basename(path) - - excluded = dirname.startswith( - "dev") or dirname.startswith("python/manylinux") - - return filename.startswith("Dockerfile") and not excluded - - -def docker_linter(src): - """Run Hadolint docker linter.""" - logger.info("Running Docker linter") - - hadolint = Hadolint() - - if not hadolint.available: - logger.error( - "hadolint linter requested but hadolint binary not found.") - return - - for path in git.ls_files(git_dir=src.path): - if is_docker_image(path): - yield LintResult.from_cmd(hadolint.run(path, check=False, - cwd=src.path)) - - -def linter(src, fix=False, *, clang_format=False, cpplint=False, - clang_tidy=False, iwyu=False, iwyu_all=False, - python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, rust=False, docker=False): - """Run all linters.""" - with tmpdir(prefix="arrow-lint-") as root: - build_dir = os.path.join(root, "cpp-build") - - # Linters yield LintResult without raising exceptions on failure. - # This allows running all linters in one pass and exposing all - # errors to the user. - results = [] - - if clang_format or cpplint or clang_tidy or iwyu: - results.extend(cpp_linter(src, build_dir, - clang_format=clang_format, - cpplint=cpplint, - clang_tidy=clang_tidy, - iwyu=iwyu, - iwyu_all=iwyu_all, - fix=fix)) - - if python: - results.extend(python_linter(src, fix=fix)) - - if numpydoc: - results.extend(python_numpydoc()) - - if cmake_format: - results.extend(cmake_linter(src, fix=fix)) - - if rat: - results.extend(rat_linter(src, root)) - - if r: - results.extend(r_linter(src)) - - if rust: - results.extend(rust_linter(src)) - - if docker: - results.extend(docker_linter(src)) - - # Raise error if one linter failed, ensuring calling code can exit with - # non-zero. - for result in results: - result.ok() diff --git a/dev/archery/archery/utils/logger.py b/dev/archery/archery/utils/logger.py deleted file mode 100644 index 9d0feda88e6ea..0000000000000 --- a/dev/archery/archery/utils/logger.py +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import logging - -""" Global logger. """ -logger = logging.getLogger("archery") - - -class LoggingContext: - def __init__(self, quiet=False): - self.quiet = quiet - - -ctx = LoggingContext() diff --git a/dev/archery/archery/utils/rat.py b/dev/archery/archery/utils/rat.py deleted file mode 100644 index e7fe19a7ea8c4..0000000000000 --- a/dev/archery/archery/utils/rat.py +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import fnmatch -import re -from xml.etree import ElementTree - -from ..lang.java import Jar -from .cache import Cache -from .command import capture_stdout - -RAT_VERSION = 0.13 -RAT_JAR_FILENAME = "apache-rat-{}.jar".format(RAT_VERSION) -RAT_URL_ = "https://repo1.maven.org/maven2/org/apache/rat/apache-rat" -RAT_URL = "/".join([RAT_URL_, str(RAT_VERSION), RAT_JAR_FILENAME]) - - -class Rat(Jar): - def __init__(self): - jar = Cache().get_or_insert_from_url(RAT_JAR_FILENAME, RAT_URL) - Jar.__init__(self, jar) - - @capture_stdout(strip=False) - def run_report(self, archive_path, **kwargs): - return self.run("--xml", archive_path, **kwargs) - - def report(self, archive_path, **kwargs): - return RatReport(self.run_report(archive_path, **kwargs)) - - -def exclusion_from_globs(exclusions_path): - with open(exclusions_path, 'r') as exclusions_fd: - exclusions = [e.strip() for e in exclusions_fd] - return lambda path: any([fnmatch.fnmatch(path, e) for e in exclusions]) - - -class RatReport: - def __init__(self, xml): - self.xml = xml - self.tree = ElementTree.fromstring(xml) - - def __repr__(self): - return "RatReport({})".format(self.xml) - - def validate(self, exclusion=None): - for r in self.tree.findall('resource'): - approvals = r.findall('license-approval') - if not approvals or approvals[0].attrib['name'] == 'true': - continue - - clean_name = re.sub('^[^/]+/', '', r.attrib['name']) - - if exclusion and exclusion(clean_name): - continue - - yield clean_name diff --git a/dev/archery/archery/utils/report.py b/dev/archery/archery/utils/report.py deleted file mode 100644 index 6c7587ddd8729..0000000000000 --- a/dev/archery/archery/utils/report.py +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from abc import ABCMeta, abstractmethod -import datetime - -import jinja2 - - -def markdown_escape(s): - for char in ('*', '#', '_', '~', '`', '>'): - s = s.replace(char, '\\' + char) - return s - - -class Report(metaclass=ABCMeta): - - def __init__(self, **kwargs): - for field in self.fields: - if field not in kwargs: - raise ValueError('Missing keyword argument {}'.format(field)) - self._data = kwargs - - def __getattr__(self, key): - return self._data[key] - - @abstractmethod - def fields(self): - pass - - @property - @abstractmethod - def templates(self): - pass - - -class JinjaReport(Report): - - def __init__(self, **kwargs): - self.env = jinja2.Environment( - loader=jinja2.PackageLoader('archery', 'templates') - ) - self.env.filters['md'] = markdown_escape - self.env.globals['today'] = datetime.date.today - super().__init__(**kwargs) - - def render(self, template_name): - template_path = self.templates[template_name] - template = self.env.get_template(template_path) - return template.render(**self._data) diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py deleted file mode 100644 index 1ae0fe025049a..0000000000000 --- a/dev/archery/archery/utils/source.py +++ /dev/null @@ -1,205 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -from pathlib import Path -import subprocess - -from .git import git - - -class InvalidArrowSource(Exception): - pass - - -class ArrowSources: - """ ArrowSources is a companion class representing a directory containing - Apache Arrow's sources. - """ - # Note that WORKSPACE is a reserved git revision name by this module to - # reference the current git workspace. In other words, this indicates to - # ArrowSources.at_revision that no cloning/checkout is required. - WORKSPACE = "WORKSPACE" - - def __init__(self, path): - """ Initialize an ArrowSources - - The caller must ensure that path is valid arrow source directory (can - be checked with ArrowSources.valid) - - Parameters - ---------- - path : src - """ - self.path = Path(path) - - @property - def archery(self): - """ Returns the archery directory of an Arrow sources. 
""" - return self.dev / "archery" - - @property - def cpp(self): - """ Returns the cpp directory of an Arrow sources. """ - return self.path / "cpp" - - @property - def dev(self): - """ Returns the dev directory of an Arrow sources. """ - return self.path / "dev" - - @property - def python(self): - """ Returns the python directory of an Arrow sources. """ - return self.path / "python" - - @property - def pyarrow(self): - """ Returns the python/pyarrow directory of an Arrow sources. """ - return self.python / "pyarrow" - - @property - def r(self): - """ Returns the r directory of an Arrow sources. """ - return self.path / "r" - - @property - def rust(self): - """ Returns the rust directory of an Arrow sources. """ - return self.path / "rust" - - @property - def git_backed(self): - """ Indicate if the sources are backed by git. """ - return (self.path / ".git").exists() - - @property - def git_dirty(self): - """ Indicate if the sources is a dirty git directory. """ - return self.git_backed and git.dirty(git_dir=self.path) - - def archive(self, path, dereference=False, compressor=None, revision=None): - """ Saves a git archive at path. """ - if not self.git_backed: - raise ValueError("{} is not backed by git".format(self)) - - rev = revision if revision else "HEAD" - archive = git.archive("--prefix=apache-arrow/", rev, - git_dir=self.path) - - # TODO(fsaintjacques): fix dereference for - - if compressor: - archive = compressor(archive) - - with open(path, "wb") as archive_fd: - archive_fd.write(archive) - - def at_revision(self, revision, clone_dir): - """ Return a copy of the current sources for a specified git revision. - - This method may return the current object if no checkout is required. - The caller is responsible to remove the cloned repository directory. - - The user can use the special WORKSPACE token to mean the current git - workspace (no checkout performed). - - The second value of the returned tuple indicates if a clone was - performed. - - Parameters - ---------- - revision : str - Revision to checkout sources at. - clone_dir : str - Path to checkout the local clone. - """ - if not self.git_backed: - raise ValueError("{} is not backed by git".format(self)) - - if revision == ArrowSources.WORKSPACE: - return self, False - - # A local clone is required to leave the current sources intact such - # that builds depending on said sources are not invalidated (or worse - # slightly affected when re-invoking the generator). - # "--local" only works when dest dir is on same volume of source dir. - # "--shared" works even if dest dir is on different volume. - git.clone("--shared", self.path, clone_dir) - - # Revision can reference "origin/" (or any remotes) that are not found - # in the local clone. Thus, revisions are dereferenced in the source - # repository. - original_revision = git.rev_parse(revision) - - git.checkout(original_revision, git_dir=clone_dir) - - return ArrowSources(clone_dir), True - - @staticmethod - def find(path=None): - """ Infer Arrow sources directory from various method. - - The following guesses are done in order until a valid match is found: - - 1. Checks the given optional parameter. - - 2. Checks if the environment variable `ARROW_SRC` is defined and use - this. - - 3. Checks if the current working directory (cwd) is an Arrow source - directory. - - 4. Checks if this file (cli.py) is still in the original source - repository. If so, returns the relative path to the source - directory. 
- """ - - # Explicit via environment - env = os.environ.get("ARROW_SRC") - - # Implicit via cwd - cwd = Path.cwd() - - # Implicit via current file - try: - this = Path(__file__).parents[4] - except IndexError: - this = None - - # Implicit via git repository (if archery is installed system wide) - try: - repo = git.repository_root(git_dir=cwd) - except subprocess.CalledProcessError: - # We're not inside a git repository. - repo = None - - paths = list(filter(None, [path, env, cwd, this, repo])) - for p in paths: - try: - return ArrowSources(p) - except InvalidArrowSource: - pass - - searched_paths = "\n".join([" - {}".format(p) for p in paths]) - raise InvalidArrowSource( - "Unable to locate Arrow's source directory. " - "Searched paths are:\n{}".format(searched_paths) - ) - - def __repr__(self): - return self.path diff --git a/dev/archery/archery/utils/tmpdir.py b/dev/archery/archery/utils/tmpdir.py deleted file mode 100644 index 07d7355c87fb8..0000000000000 --- a/dev/archery/archery/utils/tmpdir.py +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from contextlib import contextmanager -from tempfile import mkdtemp, TemporaryDirectory - - -@contextmanager -def tmpdir(preserve=False, prefix="arrow-archery-"): - if preserve: - yield mkdtemp(prefix=prefix) - else: - with TemporaryDirectory(prefix=prefix) as tmp: - yield tmp diff --git a/dev/archery/conftest.py b/dev/archery/conftest.py deleted file mode 100644 index 06a643bea5645..0000000000000 --- a/dev/archery/conftest.py +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import pathlib - -import pytest - - -def pytest_addoption(parser): - parser.addoption( - "--enable-integration", - action="store_true", - default=False, - help="run slow tests" - ) - - -def pytest_configure(config): - config.addinivalue_line( - "markers", - ( - "integration: mark test as integration tests involving more " - "extensive setup (only used for crossbow at the moment)" - ) - ) - - -def pytest_collection_modifyitems(config, items): - if config.getoption("--enable-integration"): - return - marker = pytest.mark.skip(reason="need --enable-integration option to run") - for item in items: - if "integration" in item.keywords: - item.add_marker(marker) - - -@pytest.fixture -def load_fixture(request): - current_test_directory = pathlib.Path(request.node.fspath).parent - - def decoder(path): - with path.open('r') as fp: - if path.suffix == '.json': - import json - return json.load(fp) - elif path.suffix == '.yaml': - import yaml - return yaml.load(fp) - else: - return fp.read() - - def loader(name, decoder=decoder): - path = current_test_directory / 'fixtures' / name - return decoder(path) - - return loader diff --git a/dev/archery/generate_files_for_endian_test.sh b/dev/archery/generate_files_for_endian_test.sh deleted file mode 100755 index 54019ea570e2a..0000000000000 --- a/dev/archery/generate_files_for_endian_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This script generates json and arrow files of each type (e.g. 
primitive) for integration endian test -# Usage: generate_files_for_endian_test.sh -# ARROW_CPP_EXE_PATH : where Arrow C++ binaries can be found -# TMP_DIR : where files will be generated - -set -e - -: ${ARROW_CPP_EXE_PATH:=/arrow/cpp/build/debug/} -: ${TMP_DIR:=/tmp/arrow} - -json_dir=$TMP_DIR/arrow.$$ -mkdir -p $json_dir - -archery integration --stop-on-error --with-cpp=1 --tempdir=$json_dir - -for f in $json_dir/*.json ; do - $ARROW_CPP_EXE_PATH/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; -done -for f in $json_dir/*.arrow_file ; do - $ARROW_CPP_EXE_PATH/arrow-file-to-stream $f > ${f%.*}.stream; -done -for f in $json_dir/*.json ; do - gzip $f ; -done -echo "The files are under $json_dir" diff --git a/dev/archery/requirements-lint.txt b/dev/archery/requirements-lint.txt deleted file mode 100644 index fc7f339ed4dbe..0000000000000 --- a/dev/archery/requirements-lint.txt +++ /dev/null @@ -1,3 +0,0 @@ -autopep8 -flake8 -cmake_format==0.5.2 diff --git a/dev/archery/requirements.txt b/dev/archery/requirements.txt deleted file mode 100644 index 0e1258adbb63f..0000000000000 --- a/dev/archery/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -click -pygithub -python-dotenv -ruamel.yaml diff --git a/dev/archery/setup.py b/dev/archery/setup.py deleted file mode 100755 index 0537e8b4d311a..0000000000000 --- a/dev/archery/setup.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
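# A rough Python rendering of the generate_files_for_endian_test.sh flow
# above, shown only to clarify the sequence of tool invocations. The binary
# names, flags, and default paths come from that script; anything else here
# (the per-file loop structure, helper variables) is an illustrative sketch
# and may need adjusting for a local build.
import glob
import os
import subprocess

ARROW_CPP_EXE_PATH = os.environ.get("ARROW_CPP_EXE_PATH", "/arrow/cpp/build/debug/")
TMP_DIR = os.environ.get("TMP_DIR", "/tmp/arrow")
json_dir = os.path.join(TMP_DIR, "arrow.%d" % os.getpid())
os.makedirs(json_dir, exist_ok=True)

# 1. Generate the integration JSON files with archery.
subprocess.run(["archery", "integration", "--stop-on-error",
                "--with-cpp=1", "--tempdir=" + json_dir], check=True)

# 2. For each JSON file: convert to an Arrow file, dump it as a stream,
#    then gzip the JSON (same steps as the three shell loops).
for f in glob.glob(os.path.join(json_dir, "*.json")):
    base = os.path.splitext(f)[0]
    subprocess.run([os.path.join(ARROW_CPP_EXE_PATH, "arrow-json-integration-test"),
                    "-mode", "JSON_TO_ARROW", "-json", f,
                    "-arrow", base + ".arrow_file", "-integration", "true"],
                   check=True)
    with open(base + ".stream", "wb") as out:
        subprocess.run([os.path.join(ARROW_CPP_EXE_PATH, "arrow-file-to-stream"),
                        base + ".arrow_file"], stdout=out, check=True)
    subprocess.run(["gzip", f], check=True)
print("The files are under", json_dir)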
- -import functools -import operator -import sys -from setuptools import setup - -if sys.version_info < (3, 6): - sys.exit('Python < 3.6 is not supported') - -# For pathlib.Path compatibility -jinja_req = 'jinja2>=2.11' - -extras = { - 'benchmark': ['pandas'], - 'docker': ['ruamel.yaml', 'python-dotenv'], - 'release': [jinja_req, 'jira', 'semver', 'gitpython'], - 'crossbow': ['github3.py', jinja_req, 'pygit2', 'ruamel.yaml', - 'setuptools_scm'], -} -extras['bot'] = extras['crossbow'] + ['pygithub', 'jira'] -extras['all'] = list(set(functools.reduce(operator.add, extras.values()))) - -setup( - name='archery', - version="0.1.0", - description='Apache Arrow Developers Tools', - url='http://github.com/apache/arrow', - maintainer='Arrow Developers', - maintainer_email='dev@arrow.apache.org', - packages=[ - 'archery', - 'archery.benchmark', - 'archery.integration', - 'archery.lang', - 'archery.utils' - ], - include_package_data=True, - install_requires=['click>=7'], - tests_require=['pytest', 'responses'], - extras_require=extras, - entry_points=''' - [console_scripts] - archery=archery.cli:archery - ''' -) diff --git a/dev/benchmarking/.env b/dev/benchmarking/.env deleted file mode 100644 index 7485f5866d7a2..0000000000000 --- a/dev/benchmarking/.env +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -PG_USER=benchmark -PG_PASS=benchmark diff --git a/dev/benchmarking/.gitignore b/dev/benchmarking/.gitignore deleted file mode 100644 index cda00d658189d..0000000000000 --- a/dev/benchmarking/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/machine.json diff --git a/dev/benchmarking/Dockerfile b/dev/benchmarking/Dockerfile deleted file mode 100644 index f470333979ca4..0000000000000 --- a/dev/benchmarking/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -FROM postgres:11-alpine - -# Any `.sh` and `.sql` files copied to the entrypoint directory -# will be run during startup. 
See `docker-entrypoint.sh` in -# https://github.com/docker-library/postgres/blob/master/11/alpine/ -COPY ddl/* /docker-entrypoint-initdb.d/ diff --git a/dev/benchmarking/README.md b/dev/benchmarking/README.md deleted file mode 100644 index 0c49baf3a9f2f..0000000000000 --- a/dev/benchmarking/README.md +++ /dev/null @@ -1,256 +0,0 @@ - - -> NOTE: For those deploying this database, Postgres does not by default use -> UTF-8, however it is [required for the jsonb][pg-jsonb] format used in -> some columns to always work. This [stackoverflow post][so-utf8] describes -> how to do it for Amazon RDS. This [section of the docs][pg-charset] -> states how to do it in general, i.e.: `initdb -E UTF8`. - -# Benchmark database - -This directory contains files related to the benchmark database. - -- 'ddl/\*.sql' contains the database definition. -- 'examples/' contain code to test the database and demonstrate its use. -- 'Dockerfile' and 'docker-compose.yml' are for developing benchmarks - against a testing database. -- An auto-generated summary of views in the [Data model][./data_model.rst]. - -## Setup - -To create a 'machine.json' file that will uniquely identify a computer for -benchmark submission, run the provided shell script and fill in the prompts -to identify the GPU. - -> NOTE: this does not work on VMs or Windows. - -```shell -./make_machine_json.sh -``` - -Submit the machine details via http using the command - -> NOTE: This will only work if we have selected graphql as a client -> and have it running in production or if during development -> you have run `docker-compose up` to create and run both a -> database Docker container and graphql client Docker container. - -```shell -./graphql_submit.sh machine machine.json localhost:5000/graphql -``` - -or submit after starting up the psql client from this directory, using - -``` -\set content `cat machine.json` -SELECT ingest_machine_view(:'content'::jsonb); -``` - -> NOTE: If you don't have a "machine.json" file generated, -> use the example file "examples/machine.json" instead. - -## Local testing - -There is a file named "[.env][.env]" in this directory that is used by -`docker-compose` to set up the postgres user and password for the -local containers. Currently the name and password are both -`benchmark`. This will be the password for the psql client as well. - -The Postgres Alpine image runs any added '\*.sql' and '\*.sh' scripts placed -in '/docker-entrypoint-initdb.d/' during its startup script, so the local -database will be set up automatically once the container is running. - -To start the containers, be sure to have [Docker installed][docker], -and then run the following from this directory (arrow/dev/benchmarking). - -``` -docker-compose up -``` - -This will start a process that will show logs from both the running -Postgres container and the running GraphQL container. -To stop the running containers gracefully, background the process -and run - -``` -docker-compose down -fg # To re-foreground the backgrounded process while it exits -``` - -You will still have the container images "benchmarking_pg", -"graphile/postgraphile", and "postgres:11-alpine" on your -computer. You should keep them if you want to run this again. -If you don't, then remove them with the command: - -``` -docker rmi benchmarking_pg postgres:11-alpine graphile/postgraphile -``` - -### Postgres client - -The `psql` shell client is bundled with the PostgreSQL core distribution -available from the [Postgres download page][postgres-downloads]. 
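If you would rather not install the full PostgreSQL distribution just to submit
records, the same `ingest_machine_view` call shown in the Setup section can be
issued from Python. The sketch below assumes the `psycopg2` driver (it is not
listed in this directory's requirements) and the `benchmark`/`benchmark`
credentials from the `.env` file:

```python
import json
import psycopg2  # assumed to be installed separately

with open("machine.json") as f:
    machine = json.load(f)

# Same host, port, and user as the psql example; the database name defaults
# to the user name, matching the container setup.
conn = psycopg2.connect(host="localhost", port=5432,
                        user="benchmark", password="benchmark")
with conn, conn.cursor() as cur:
    # Equivalent to: SELECT ingest_machine_view(:'content'::jsonb);
    cur.execute("SELECT ingest_machine_view(%s::jsonb);", (json.dumps(machine),))
conn.close()
```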
-Using the `PG_USER` defined in the `.env` file (currently "benchmark"), -the command to connect to the container is: - -```shell -psql -h localhost -p 5432 -U benchmark -``` - -There is an example script in [examples/example.sql](examples/example.sql) that -runs some queries against the database. To run it in the psql client, type -the following in the psql command-line interface: - -``` -\i examples/example.sql -``` - -#### Bulk ingestion using CSV - -An example CSV file for bulk ingestion is in -[examples/benchmark_run_example.csv](examples/benchmark_run_example.csv). -The columns are listed in the same order as they are defined, to avoid having -to explicitly name every column in ingestion. The "id" column is left empty -and will be automatically assigned on insert. - -To ingest the example CSV file from the command line, -use the command below: - -```shell -CSV='examples/benchmark_run_example.csv' && \ -psql -U benchmark -h localhost -p 5432 \ - -c "\copy benchmark_run_view FROM '${CSV}' WITH (FORMAT csv, HEADER);" -``` - -#### Bulk ingestion using JSON - -To ingest the example JSON file using the psql client, use the command below. - -``` -\set content `cat examples/benchmark_example.json` -SELECT ingest_benchmark_view(:'content'::jsonb); -``` - -### HTTP client - -This section requires an actual HTTP client to be up, either -for the production database or via the testing setup. -(See the [local testing section](#local-testing) for how to set it up). - -The 'graphile/postgraphile' container provides an HTTP interface -to the database via two url routes: - -- A GraphiQL page ([localhost:5000/graphiql][graphiql]) - to aid visual exploration of the data model. - (The `--watch` flag on the command line. Not recommended for production.) -- An endpoint that receives POST requests only (localhost:5000/graphql). - -#### Ingestion - -The script [graphql_submit.sh](./graphql_submit.sh) simplifies submission -to the database via curl. Examples: - -```shell -./graphql_submit.sh benchmarks examples/benchmark_example.json -./graphql_submit.sh runs examples/benchmark_run_example.json -``` - -#### Querying - -The output of the query is a JSON object that is hard to read on the command line. -Here is an example query in the shell: - -```shell -curl -X POST \ - -H "Content-Type: application/json" \ - --data '{"query": "{projectDetails{ projectName }}"}' \ - localhost:5000/graphql -``` - -which (if you have previously run the "examples.sql" command) yields - -``` -{"data":{"projectDetails":{"projectName":"Apache Arrow"}}} -``` - -Here is an example query using Python: - -```python -import json -import requests - -uri = "http://localhost:5000/graphql" -query = json.load(open("examples/graphql_query_environment_view.json")) -response = requests.post(uri, json=query) -message = "{benchmarkLanguage}: {languageImplementationVersion}, {dependencies}" - -for row in response.json()['data']['allEnvironmentViews']['edges']: - print(message.format(**row['node'])) - -# result: -# -# Python: CPython 2.7, {"six":"","numpy":"1.14","other_lib":"1.0"} -# Python: CPython 2.7, {"six":"","numpy":"1.15","other_lib":"1.0"} -# Python: CPython 3.6, {"boost":"1.42","numpy":"1.15"} -``` - -## Deployment - -(work in progress). - -> NOTE: For those deploying this database, Postgres does not by default use -> UTF-8, however it is [required for the jsonb][pg-jsonb] format used in -> some columns to always work. This [stackoverflow post][so-utf8] describes -> how to do it for Amazon RDS. 
This [section of the docs][pg-charset] -> states how to do it in general, i.e.: `initdb -E UTF8`. - -## Quick reference - -- String variables `'have single quotes'` -- Arrays `'{"have", "curly", "braces"}'::text[]` or `'{1, 2, 3}'::integer[]` -- JSONb `'{"has":"this", "format":42}'::jsonb` -- Elements inserted using JSON-formatted strings can use standard - JSON-formatted arrays (`[1, 2, 3]`) and do not have to use the above - string formats. -- When comparing nullable values use `x IS NOT DISTINCT FROM y` rather than `x = y` -- An auto-generated summary of the [Data model][./data_model.rst]. - -## Data model documentation - -To recreate the data model documentation, -(1) install the [psql client][postgres-downloads] -(sorry you need to download the whole thing), -(2) start the docker container using `docker-compose up`, -(3) and then run these scripts: - -``` -./make_dotfile.sh -./make_data_model_rst.sh -``` - -[pg-jsonb]: https://www.postgresql.org/docs/11/datatype-json.html#id-1.5.7.22.3 -[so-utf8]: https://stackoverflow.com/a/33557023 -[pg-charset]: https://www.postgresql.org/docs/9.3/multibyte.html#AEN34424 -[docker]: https://www.docker.com/get-started -[citext-limitations]: https://www.postgresql.org/docs/11/citext.html#id-1.11.7.17.7 -[postgres-downloads]: https://www.postgresql.org/download/ -[graphiql]: http://localhost:5000/graphiql -[postgraphile-lambda]: https://github.com/graphile/postgraphile-lambda-example -[postgraphile-cli]: https://www.graphile.org/postgraphile/usage-cli/ diff --git a/dev/benchmarking/data_model.dot b/dev/benchmarking/data_model.dot deleted file mode 100644 index d311acd4e5f1c..0000000000000 --- a/dev/benchmarking/data_model.dot +++ /dev/null @@ -1,219 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements.See the NOTICE file - distributed with this work for additional information - regarding copyright ownership.The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License.You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied.See the License for the - specific language governing permissions and limitations - under the License. -*/ - -/* - WARNING - This is an auto-generated file. Please do not edit. - - To reproduce, please run :code:`./make_data_model_rst.sh`. - (This requires you have the - `psql client `_ - and have started the docker containers using - :code:`docker-compose up`). -*/ -digraph database { - concentrate = true; - rankdir = LR; - ratio = ".75"; - node [shape = none, fontsize="11", fontname="Helvetica"]; - edge [fontsize="8", fontname="Helvetica"]; -legend -[fontsize = "14" -label = -< - - - - - - -
-  Legend
-  pk = primary key
-  fk = foreign key
-  u = unique*
-  o = optional
-  * multiple uniques in the same table are a unique group
-  >
-];
-benchmark
-[label =
-  <
-  benchmark
-  benchmark_id (pk)
-  benchmark_language_id (pk)
-  benchmark_name (u)
-  parameter_names (o)
-  benchmark_description
-  benchmark_version (u)
-  unit_id (fk)
-  >
-];
-benchmark_language
-[label =
-  <
-  benchmark_language
-  benchmark_language_id (pk)
-  benchmark_language (u)
-  >
-];
-benchmark_run
-[label =
-  <
-  benchmark_run
-  benchmark_run_id (pk)
-  parameter_values (u)
-  value
-  git_commit_timestamp (u)
-  git_hash
-  val_min (o)
-  val_q1 (o)
-  val_q3 (o)
-  val_max (o)
-  std_dev
-  n_obs
-  run_timestamp (u)
-  run_metadata (o)
-  run_notes (o)
-  machine_id (u) (fk)
-  environment_id (u) (fk)
-  language_implementation_version_id (fk)
-  benchmark_language_id (fk)
-  benchmark_id (u) (fk)
-  >
-];
-benchmark_type
-[label =
-  <
-  benchmark_type
-  benchmark_type_id (pk)
-  benchmark_type (u)
-  lessisbetter
-  >
-];
-cpu
-[label =
-  <
-  cpu
-  cpu_id (pk)
-  cpu_model_name (u)
-  cpu_core_count
-  cpu_thread_count
-  cpu_frequency_max_hz
-  cpu_frequency_min_hz
-  cpu_l1d_cache_bytes
-  cpu_l1i_cache_bytes
-  cpu_l2_cache_bytes
-  cpu_l3_cache_bytes
-  >
-];
-dependencies
-[label =
-  <
-  dependencies
-  dependencies_id (pk)
-  dependencies (u)
-  >
-];
-gpu
-[label =
-  <
-  gpu
-  gpu_id (pk)
-  gpu_information (u)
-  gpu_part_number
-  gpu_product_name
-  >
-];
-language_implementation_version
-[label =
-  <
-  language_implementation_version
-  language_implementation_version_id (pk)
-  benchmark_language_id (pk)
-  language_implementation_version (u)
-  >
-];
-machine
-[label =
-  <
-  machine
-  machine_id (pk)
-  machine_name
-  mac_address (u)
-  memory_bytes
-  cpu_actual_frequency_hz
-  machine_other_attributes (o)
-  cpu_id (fk)
-  gpu_id (fk)
-  os_id (fk)
-  >
-];
-os
-[label =
-  <
-  os
-  os_id (pk)
-  os_name (u)
-  architecture_name (u)
-  kernel_name (u)
-  >
-];
-project
-[label =
-  <
-  project
-  project_id (pk)
-  project_name (u)
-  project_url (u)
-  repo_url (u)
-  last_changed
-  >
-];
-unit
-[label =
-  <
-  unit
-  unit_id (pk)
-  units (u)
-  benchmark_type_id (fk)
-  >
-];
-environment
-[label =
-  <
-  environment
-  environment_id (pk)
-  language_implementation_version_id (pk)
-  benchmark_language_id (pk)
-  dependencies_id (u) (fk)
> -]; -machine:cpu_id -> cpu:cpu_id; -machine:gpu_id -> gpu:gpu_id; -machine:os_id -> os:os_id; -benchmark:benchmark_language_id -> benchmark_language:benchmark_language_id; -environment:benchmark_language_id -> benchmark_language:benchmark_language_id; -language_implementation_version:benchmark_language_id -> benchmark_language:benchmark_language_id; -environment:dependencies_id -> dependencies:dependencies_id; -environment:benchmark_language_id -> language_implementation_version:benchmark_language_id; -environment:language_implementation_version_id -> language_implementation_version:language_implementation_version_id; -unit:benchmark_type_id -> benchmark_type:benchmark_type_id; -benchmark_run:machine_id -> machine:machine_id; -benchmark:unit_id -> unit:unit_id; -benchmark_run:language_implementation_version_id -> environment:language_implementation_version_id; -benchmark_run:benchmark_language_id -> environment:benchmark_language_id; -benchmark_run:environment_id -> environment:environment_id; -benchmark_run:benchmark_language_id -> benchmark:benchmark_language_id; -benchmark_run:benchmark_id -> benchmark:benchmark_id; -} - diff --git a/dev/benchmarking/data_model.rst b/dev/benchmarking/data_model.rst deleted file mode 100644 index d0f3dc7fc996a..0000000000000 --- a/dev/benchmarking/data_model.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - - -.. WARNING -.. This is an auto-generated file. Please do not edit. - -.. To reproduce, please run :code:`./make_data_model_rst.sh`. -.. (This requires you have the -.. `psql client `_ -.. and have started the docker containers using -.. :code:`docker-compose up`). - - -.. _benchmark-data-model: - -Benchmark data model -==================== - - -.. graphviz:: data_model.dot - - -.. _benchmark-ingestion: - -Benchmark ingestion helper functions -==================================== - -ingest_benchmark_run_view -------------------------- - -:code:`ingest_benchmark_run_view(from_jsonb jsonb)` - -The argument is a JSON object. NOTE: key names must be entirely -lowercase, or the insert will fail. Extra key-value pairs are ignored. 
-Example:: - - [ - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 100, "arg1": 5}, - "value": 2.5, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 1, - "val_q1": 2, - "val_q3": 3, - "val_max": 4, - "std_dev": 1.41, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:05 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"} - }, - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 1000, "arg1": 5}, - "value": 5, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "std_dev": 3.14, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:10 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"} - } - ] -To identify which columns in "benchmark_run_view" are required, -please see the view documentation in :ref:`benchmark-data-model`. - - - -back to `Benchmark data model `_ - - -ingest_benchmark_view ---------------------- - -:code:`ingest_benchmark_view(from_jsonb jsonb)` - -The argument is a JSON object. NOTE: key names must be entirely -lowercase, or the insert will fail. Extra key-value pairs are ignored. -Example:: - - [ - { - "benchmark_name": "Benchmark 1", - "parameter_names": ["arg0", "arg1", "arg2"], - "benchmark_description": "First benchmark", - "benchmark_type": "Time", - "units": "miliseconds", - "lessisbetter": true, - "benchmark_version": "second version", - "benchmark_language": "Python" - }, - { - "benchmark_name": "Benchmark 2", - "parameter_names": ["arg0", "arg1"], - "benchmark_description": "Description 2.", - "benchmark_type": "Time", - "units": "nanoseconds", - "lessisbetter": true, - "benchmark_version": "second version", - "benchmark_language": "Python" - } - ] - -To identify which columns in "benchmark_view" are required, -please see the view documentation in :ref:`benchmark-data-model`. - - - -back to `Benchmark data model `_ - - -ingest_benchmark_runs_with_context ----------------------------------- - -:code:`ingest_benchmark_runs_with_context(from_jsonb jsonb)` - -The argument is a JSON object. NOTE: key names must be entirely -lowercase, or the insert will fail. Extra key-value pairs are ignored. -The object contains three key-value pairs:: - - {"context": { - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 3.6", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}, - "git_commit_timestamp": "2019-02-14 22:42:22 +0100", - "git_hash": "123456789abcde", - "run_timestamp": "2019-02-14 03:00:40 -0600", - "extra stuff": "does not hurt anything and will not be added." - }, - "benchmark_version": { - "Benchmark Name 1": "Any string can be a version.", - "Benchmark Name 2": "A git hash can be a version.", - "An Unused Benchmark Name": "Will be ignored." - }, - "benchmarks": [ - { - "benchmark_name": "Benchmark Name 1", - "parameter_values": {"argument1": 1, "argument2": "value2"}, - "value": 42, - "val_min": 41.2, - "val_q1": 41.5, - "val_q3": 42.5, - "val_max": 42.8, - "std_dev": 0.5, - "n_obs": 100, - "run_metadata": {"any": "key-value pairs"}, - "run_notes": "Any relevant notes." 
- }, - { - "benchmark_name": "Benchmark Name 2", - "parameter_values": {"not nullable": "Use {} if no params."}, - "value": 8, - "std_dev": 1, - "n_obs": 2, - } - ] - } - -- The entry for "context" contains the machine, environment, and timestamp - information common to all of the runs -- The entry for "benchmark_version" maps benchmark - names to their version strings. (Which can be a git hash, - the entire code string, a number, or any other string of your choice.) -- The entry for "benchmarks" is a list of benchmark run data - for the given context and benchmark versions. The first example - benchmark run entry contains all possible values, even - nullable ones, and the second entry omits all nullable values. - - - - -back to `Benchmark data model `_ - - -ingest_machine_view -------------------- - -:code:`ingest_machine_view(from_jsonb jsonb)` - -The argument is a JSON object. NOTE: key names must be entirely -lowercase, or the insert will fail. Extra key-value pairs are ignored. -Example:: - - { - "mac_address": "0a:00:2d:01:02:03", - "machine_name": "Yet-Another-Machine-Name", - "memory_bytes": 8589934592, - "cpu_actual_frequency_hz": 2300000000, - "os_name": "OSX", - "architecture_name": "x86_64", - "kernel_name": "18.2.0", - "cpu_model_name": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz", - "cpu_core_count": 2, - "cpu_thread_count": 4, - "cpu_frequency_max_hz": 2300000000, - "cpu_frequency_min_hz": 2300000000, - "cpu_l1d_cache_bytes": 32768, - "cpu_l1i_cache_bytes": 32768, - "cpu_l2_cache_bytes": 262144, - "cpu_l3_cache_bytes": 4194304, - "machine_other_attributes": {"just": "an example"}, - "gpu_information": "", - "gpu_part_number": "", - "gpu_product_name": "" - } - -To identify which columns in "machine_view" are required, -please see the view documentation in :ref:`benchmark-data-model`. - - - -back to `Benchmark data model `_ - - - -.. _benchmark-views: - -Benchmark views -=============== - - -benchmark_run_view ------------------- - -Each benchmark run. - -- Each entry is unique on the machine, environment, benchmark, - and git commit timestamp. - -=============================== =========== ======== =========== =========== -Column Type Nullable Default Description -=============================== =========== ======== =========== =========== -benchmark_run_id int8 not null serial primary key -benchmark_name citext not null unique -benchmark_version citext not null unique -parameter_values jsonb not null '{}'::jsonb unique -value numeric not null -git_commit_timestamp timestamptz not null unique -git_hash text not null -val_min numeric -val_q1 numeric -val_q3 numeric -val_max numeric -std_dev numeric not null -n_obs int4 not null -run_timestamp timestamptz not null unique -run_metadata jsonb -run_notes text -mac_address macaddr not null unique -benchmark_language citext not null unique -language_implementation_version citext not null ''::citext unique -dependencies jsonb not null '{}'::jsonb unique -=============================== =========== ======== =========== =========== - -back to `Benchmark data model `_ - -benchmark_view --------------- - -The details about a particular benchmark. 
- -- "benchmark_name" is unique for a given "benchmark_language" -- Each entry is unique on - ("benchmark_language", "benchmark_name", "benchmark_version") - -===================== ====== ======== ======= =========== -Column Type Nullable Default Description -===================== ====== ======== ======= =========== -benchmark_id int4 not null serial primary key -benchmark_name citext not null unique -parameter_names _text -benchmark_description text not null -benchmark_type citext not null unique -units citext not null unique -lessisbetter bool not null -benchmark_version citext not null unique -benchmark_language citext not null unique -===================== ====== ======== ======= =========== - -back to `Benchmark data model `_ - -environment_view ----------------- - -The build environment used for a reported benchmark run. -(Will be inferred from each "benchmark_run" if not explicitly added). - -- Each entry is unique on - ("benchmark_language", "language_implementation_version", "dependencies") -- "benchmark_language" is unique in the "benchmark_language" table -- "benchmark_language" plus "language_implementation_version" is unique in - the "language_implementation_version" table -- "dependencies" is unique in the "dependencies" table - -=============================== ====== ======== =========== =========== -Column Type Nullable Default Description -=============================== ====== ======== =========== =========== -environment_id int4 not null serial primary key -benchmark_language citext not null unique -language_implementation_version citext not null ''::citext unique -dependencies jsonb not null '{}'::jsonb unique -=============================== ====== ======== =========== =========== - -back to `Benchmark data model `_ - -machine_view ------------- - -The machine environment (CPU, GPU, OS) used for each benchmark run. - -- "mac_address" is unique in the "machine" table -- "gpu_part_number" is unique in the "gpu" (graphics processing unit) table - Empty string (''), not null, is used for machines that won't use the GPU -- "cpu_model_name" is unique in the "cpu" (central processing unit) table -- "os_name", "os_architecture_name", and "os_kernel_name" - are unique in the "os" (operating system) table -- "machine_other_attributes" is a key-value store for any other relevant - data, e.g. 
'{"hard_disk_type": "solid state"}' - -======================== ======= ======== ========== =========== -Column Type Nullable Default Description -======================== ======= ======== ========== =========== -machine_id int4 not null serial primary key -mac_address macaddr not null unique -machine_name citext not null -memory_bytes int8 not null -cpu_actual_frequency_hz int8 not null -os_name citext not null unique -architecture_name citext not null unique -kernel_name citext not null ''::citext unique -cpu_model_name citext not null unique -cpu_core_count int4 not null -cpu_thread_count int4 not null -cpu_frequency_max_hz int8 not null -cpu_frequency_min_hz int8 not null -cpu_l1d_cache_bytes int4 not null -cpu_l1i_cache_bytes int4 not null -cpu_l2_cache_bytes int4 not null -cpu_l3_cache_bytes int4 not null -gpu_information citext not null ''::citext unique -gpu_part_number citext not null ''::citext -gpu_product_name citext not null ''::citext -machine_other_attributes jsonb -======================== ======= ======== ========== =========== - -back to `Benchmark data model `_ - - diff --git a/dev/benchmarking/ddl/0_setup.sql b/dev/benchmarking/ddl/0_setup.sql deleted file mode 100644 index ec10446412434..0000000000000 --- a/dev/benchmarking/ddl/0_setup.sql +++ /dev/null @@ -1,23 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - -CREATE EXTENSION IF NOT EXISTS "citext"; -- type for case-insensitive text - --- For future fine-grained control over function execution by user group. -ALTER DEFAULT PRIVILEGES REVOKE EXECUTE ON functions FROM public; diff --git a/dev/benchmarking/ddl/1_00_table_public_project.sql b/dev/benchmarking/ddl/1_00_table_public_project.sql deleted file mode 100644 index c52d66cfd950d..0000000000000 --- a/dev/benchmarking/ddl/1_00_table_public_project.sql +++ /dev/null @@ -1,45 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
-*/ - - --- PROJECT -CREATE TABLE IF NOT EXISTS public.project -( - project_id SERIAL PRIMARY KEY - , project_name citext NOT NULL - , project_url text NOT NULL - , repo_url text NOT NULL - , last_changed timestamp (0) without time zone NOT NULL DEFAULT now() -); -COMMENT ON TABLE public.project - IS 'Project name and relevant URLs.'; -COMMENT ON COLUMN public.project.project_url - IS 'Homepage URL.'; -COMMENT ON COLUMN public.project.repo_url - IS 'Git repo URL to link stored commit hashes to code in a webpage.'; -COMMENT ON COLUMN public.project.last_changed - IS 'New project details are added with a new timestamp. ' - 'The project details with the newest timestamp will be used.'; - --- CONSTRAINTS -CREATE UNIQUE INDEX project_unique_index_on_project_name_urls - ON public.project(project_name, project_url, repo_url); -COMMENT ON INDEX - public.project_unique_index_on_project_name_urls - IS 'Enforce uniqueness of project name and urls.'; diff --git a/dev/benchmarking/ddl/1_01_table_public_cpu.sql b/dev/benchmarking/ddl/1_01_table_public_cpu.sql deleted file mode 100644 index df1a9e757d251..0000000000000 --- a/dev/benchmarking/ddl/1_01_table_public_cpu.sql +++ /dev/null @@ -1,63 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- CPU -CREATE TABLE IF NOT EXISTS public.cpu -( - cpu_id SERIAL PRIMARY KEY - , cpu_model_name citext NOT NULL UNIQUE - , cpu_core_count integer NOT NULL - , cpu_thread_count integer NOT NULL - , cpu_frequency_max_Hz bigint NOT NULL - , cpu_frequency_min_Hz bigint NOT NULL - , cpu_L1d_cache_bytes integer NOT NULL - , cpu_L1i_cache_bytes integer NOT NULL - , cpu_L2_cache_bytes integer NOT NULL - , cpu_L3_cache_bytes integer NOT NULL -); -COMMENT ON TABLE public.cpu - IS 'CPU model and its specifications.'; -COMMENT ON COLUMN public.cpu.cpu_id - IS 'The primary key for the CPU table. 
' - 'NOTE: This is a synthetic primary key and not meant to represent a ' - 'processor instruction to read capabilities.'; -COMMENT ON COLUMN public.cpu.cpu_model_name - IS 'The output of `sysctl -n machdep.cpu.brand_stringp`.'; -COMMENT ON COLUMN public.cpu.cpu_core_count - IS 'The output of `sysctl -n hw.physicalcpu`.'; -COMMENT ON COLUMN public.cpu.cpu_thread_count - IS 'The output of `sysctl -n hw.logicalcpu`.'; -COMMENT ON COLUMN public.cpu.cpu_frequency_max_Hz - IS 'The output of `sysctl -n hw.cpufrequency_max`.'; -COMMENT ON COLUMN public.cpu.cpu_frequency_min_Hz - IS 'The output of `sysctl -n hw.cpufrequency_min`.'; -COMMENT ON COLUMN public.cpu.cpu_L1d_cache_bytes - IS 'The output of `sysctl -n hw.l1dcachesize`.'; -COMMENT ON COLUMN public.cpu.cpu_L1i_cache_bytes - IS 'The output of `sysctl -n hw.l1icachesize`.'; -COMMENT ON COLUMN public.cpu.cpu_L2_cache_bytes - IS 'The output of `sysctl -n hw.l2cachesize`.'; -COMMENT ON COLUMN public.cpu.cpu_L3_cache_bytes - IS 'The output of `sysctl -n hw.l3cachesize`.'; - --- CONSTRAINTS -ALTER TABLE public.cpu - ADD CONSTRAINT cpu_check_cpu_model_name_length - CHECK (char_length(cpu_model_name) < 255); diff --git a/dev/benchmarking/ddl/1_02_table_public_gpu.sql b/dev/benchmarking/ddl/1_02_table_public_gpu.sql deleted file mode 100644 index 564af19de7a6e..0000000000000 --- a/dev/benchmarking/ddl/1_02_table_public_gpu.sql +++ /dev/null @@ -1,43 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- GPU -CREATE TABLE IF NOT EXISTS public.gpu -( - gpu_id SERIAL PRIMARY KEY - , gpu_information citext UNIQUE NOT NULL DEFAULT '' - , gpu_part_number citext NOT NULL DEFAULT '' - , gpu_product_name citext NOT NULL DEFAULT '' -); -COMMENT ON TABLE public.gpu IS 'GPU specifications.'; -COMMENT ON COLUMN public.gpu.gpu_information - IS 'The output of `nvidia-smi -q` (on Linux or Windows), or `cuda-smi` ' - 'or `kextstat | grep -i cuda` on OSX, or another command; anything ' - 'that gets a string to uniquely identify the GPU.'; - --- CONSTRAINTS -CREATE INDEX gpu_index_on_part_number - ON public.gpu (gpu_part_number); - -CREATE INDEX gpu_index_on_product_name - ON public.gpu (gpu_product_name); - -CREATE INDEX gpu_index_on_product_name_and_part_number - ON public.gpu (gpu_product_name, gpu_part_number); diff --git a/dev/benchmarking/ddl/1_03_table_public_os.sql b/dev/benchmarking/ddl/1_03_table_public_os.sql deleted file mode 100644 index 7b03d82f48748..0000000000000 --- a/dev/benchmarking/ddl/1_03_table_public_os.sql +++ /dev/null @@ -1,57 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. 
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- OS -CREATE TABLE IF NOT EXISTS public.os -( - os_id SERIAL PRIMARY KEY - , os_name citext NOT NULL - , architecture_name citext NOT NULL - , kernel_name citext NOT NULL DEFAULT '' -); --- @name os. forces retention of an 's' in the Graphile GraphQL api. -COMMENT ON TABLE public.os - IS E'@name os.\nOperating system name and kernel (version).'; -COMMENT ON COLUMN public.os.os_name - IS 'Operating system name. For example, OSX, Ubuntu, Windows`.'; -COMMENT ON COLUMN public.os.architecture_name - IS 'Operating system architecture; the output of `uname -m`.'; -COMMENT ON COLUMN public.os.kernel_name - IS 'Operating system kernel, or NULL. ' - 'On Linux/OSX, the output of `uname -r`. ' - 'On Windows, the output of `ver`.'; - --- CONSTRAINTS -ALTER TABLE public.os - ADD CONSTRAINT os_check_os_name_length - CHECK (char_length(os_name) < 63); - -ALTER TABLE public.os - ADD CONSTRAINT os_check_architecture_name_length - CHECK (char_length(architecture_name) < 63); - -ALTER TABLE public.os - ADD CONSTRAINT os_check_kernel_name_length - CHECK (char_length(kernel_name) < 63); - -CREATE UNIQUE INDEX os_unique_index - ON public.os(os_name, architecture_name, kernel_name); -COMMENT ON INDEX public.os_unique_index - IS 'Enforce uniqueness of os, architecture, and kernel names.'; diff --git a/dev/benchmarking/ddl/1_04_table_public_benchmark_language.sql b/dev/benchmarking/ddl/1_04_table_public_benchmark_language.sql deleted file mode 100644 index 2e35536770932..0000000000000 --- a/dev/benchmarking/ddl/1_04_table_public_benchmark_language.sql +++ /dev/null @@ -1,35 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- BENCHMARK_LANGUAGE -CREATE TABLE IF NOT EXISTS public.benchmark_language -( - benchmark_language_id SERIAL PRIMARY KEY - , benchmark_language citext NOT NULL UNIQUE -); -COMMENT ON TABLE public.benchmark_language - IS 'The language the benchmark was written in (and presumably for).'; -COMMENT ON COLUMN public.benchmark_language.benchmark_language - IS 'The benchmark language. 
For example: Python'; - --- CONSTRAINTS -ALTER TABLE public.benchmark_language - ADD CONSTRAINT benchmark_language_check_language_length - CHECK (char_length(benchmark_language) < 63); diff --git a/dev/benchmarking/ddl/1_05_table_public_dependencies.sql b/dev/benchmarking/ddl/1_05_table_public_dependencies.sql deleted file mode 100644 index 3744a0c35a873..0000000000000 --- a/dev/benchmarking/ddl/1_05_table_public_dependencies.sql +++ /dev/null @@ -1,31 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- DEPENDENCIES -CREATE TABLE IF NOT EXISTS public.dependencies -( - dependencies_id SERIAL PRIMARY KEY - , dependencies jsonb UNIQUE NOT NULL DEFAULT '{}'::jsonb -); -COMMENT ON TABLE public.dependencies - IS E'@name dependencies.\n' - 'A JSON object mapping dependencies to their versions.'; -COMMENT ON COLUMN public.dependencies.dependencies - IS 'For example: ''{"boost": "1.69", "conda": "", "numpy": "1.15"}''.'; diff --git a/dev/benchmarking/ddl/1_06_table_public_language_implementation_version.sql b/dev/benchmarking/ddl/1_06_table_public_language_implementation_version.sql deleted file mode 100644 index f7d26e4e2d2e5..0000000000000 --- a/dev/benchmarking/ddl/1_06_table_public_language_implementation_version.sql +++ /dev/null @@ -1,46 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- LANGUAGE_IMPLEMENTATION_VERSION -CREATE TABLE IF NOT EXISTS public.language_implementation_version -( - language_implementation_version_id SERIAL - , language_implementation_version citext NOT NULL DEFAULT '' - , benchmark_language_id integer NOT NULL - , PRIMARY KEY (language_implementation_version_id, benchmark_language_id) - , FOREIGN KEY (benchmark_language_id) REFERENCES public.benchmark_language -); -COMMENT ON TABLE public.language_implementation_version - IS 'The benchmark language implementation or compiler version, e.g. 
' - '''CPython 2.7'' or ''PyPy x.y'' or ''gcc 7.3.0'' or ' - '''gcc (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0''.'; -COMMENT ON COLUMN public.language_implementation_version.language_implementation_version - IS 'The version number used in the benchmark environment (e.g. ''2.7'').'; - --- CONSTRAINTS -ALTER TABLE public.language_implementation_version - ADD CONSTRAINT language_implementation_version_check_version_length - CHECK (char_length(language_implementation_version) < 255); - -CREATE UNIQUE INDEX language_implementation_version_unique_index - ON public.language_implementation_version - (benchmark_language_id, language_implementation_version); -COMMENT ON INDEX language_implementation_version_unique_index - IS 'Enforce unique implementation versions for the languages.'; diff --git a/dev/benchmarking/ddl/1_07_table_public_benchmark_type.sql b/dev/benchmarking/ddl/1_07_table_public_benchmark_type.sql deleted file mode 100644 index 1143cdb0015d4..0000000000000 --- a/dev/benchmarking/ddl/1_07_table_public_benchmark_type.sql +++ /dev/null @@ -1,39 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- BENCHMARK_TYPE -CREATE TABLE IF NOT EXISTS public.benchmark_type -( - benchmark_type_id SERIAL PRIMARY KEY - , benchmark_type citext NOT NULL UNIQUE - , lessisbetter boolean NOT NULL -); -COMMENT ON TABLE public.benchmark_type - IS 'The type of benchmark. For example "time", "mem", "peakmem", "track"'; -COMMENT ON COLUMN public.benchmark_type.benchmark_type - IS 'The type of units, so ''time'' for seconds, miliseconds, or ' - '''mem'' for kilobytes, megabytes.'; -COMMENT ON COLUMN public.benchmark_type.lessisbetter - IS 'True if a smaller benchmark value is better.'; - --- CONSTRAINTS -ALTER TABLE public.benchmark_type - ADD CONSTRAINT benchmark_type_check_benchmark_type_char_length - CHECK (char_length(benchmark_type) < 63); diff --git a/dev/benchmarking/ddl/1_08_table_public_machine.sql b/dev/benchmarking/ddl/1_08_table_public_machine.sql deleted file mode 100644 index 8f219d3e0cfa4..0000000000000 --- a/dev/benchmarking/ddl/1_08_table_public_machine.sql +++ /dev/null @@ -1,69 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. 
See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- MACHINE -CREATE TABLE IF NOT EXISTS public.machine -( - machine_id SERIAL PRIMARY KEY - , machine_name citext NOT NULL - , mac_address macaddr NOT NULL - , memory_bytes bigint NOT NULL - , cpu_actual_frequency_Hz bigint NOT NULL - , machine_other_attributes jsonb - , cpu_id integer NOT NULL - , gpu_id integer NOT NULL - , os_id integer NOT NULL - , FOREIGN KEY (cpu_id) REFERENCES public.cpu - , FOREIGN KEY (gpu_id) REFERENCES public.gpu - , FOREIGN KEY (os_id) REFERENCES public.os -); -COMMENT ON TABLE public.machine - IS 'Unique identifiers for a machine.'; -COMMENT ON COLUMN public.machine.machine_name - IS 'A machine name of your choice.'; -COMMENT ON COLUMN public.machine.mac_address - IS 'The mac_address of a physical network interface to uniquely ' - 'identify a computer. Postgres accepts standard formats, including ' - '''08:00:2b:01:02:03'', ''08-00-2b-01-02-03'', ''08002b:010203'''; -COMMENT ON COLUMN public.machine.memory_bytes - IS 'The output of `sysctl -n hw.memsize`.'; -COMMENT ON COLUMN public.machine.cpu_actual_frequency_Hz - IS 'The output of `sysctl -n hw.cpufrequency`.'; -COMMENT ON COLUMN public.machine.machine_other_attributes - IS 'Additional attributes of interest, as a JSON object. ' - 'For example: ''{"hard_disk_type": "solid state"}''::jsonb.'; - --- CONSTRAINTS -CREATE UNIQUE INDEX machine_index_on_mac_address - ON public.machine(mac_address); -COMMENT ON INDEX machine_index_on_mac_address - IS 'Enforce unique mac address'; - -CREATE INDEX machine_index_on_cpu_id - ON public.machine(cpu_id); - -CREATE INDEX machine_index_on_gpu_id - ON public.machine(gpu_id); - -CREATE INDEX machine_index_on_os_id - ON public.machine(os_id); - -CREATE INDEX machine_index_on_cpu_gpu_os_id - ON public.machine(cpu_id, gpu_id, os_id); diff --git a/dev/benchmarking/ddl/1_09_table_public_unit.sql b/dev/benchmarking/ddl/1_09_table_public_unit.sql deleted file mode 100644 index a8cf576696d10..0000000000000 --- a/dev/benchmarking/ddl/1_09_table_public_unit.sql +++ /dev/null @@ -1,37 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
-*/ - - --- UNIT -CREATE TABLE IF NOT EXISTS public.unit -( - unit_id SERIAL PRIMARY KEY - , units citext NOT NULL UNIQUE - , benchmark_type_id integer NOT NULL - , FOREIGN KEY (benchmark_type_id) - REFERENCES public.benchmark_type(benchmark_type_id) -); -COMMENT ON TABLE public.unit IS 'The actual units for a reported benchmark.'; -COMMENT ON COLUMN public.unit.units - IS 'For example: nanoseconds, microseconds, bytes, megabytes.'; - --- CONSTRAINTS -ALTER TABLE public.unit - ADD CONSTRAINT unit_check_units_string_length - CHECK (char_length(units) < 63); diff --git a/dev/benchmarking/ddl/1_10_table_public_environment.sql b/dev/benchmarking/ddl/1_10_table_public_environment.sql deleted file mode 100644 index e3a6d23957f2d..0000000000000 --- a/dev/benchmarking/ddl/1_10_table_public_environment.sql +++ /dev/null @@ -1,51 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- ENVIRONMENT -CREATE TABLE IF NOT EXISTS public.environment -( - environment_id SERIAL - , language_implementation_version_id integer NOT NULL - , benchmark_language_id integer NOT NULL - , dependencies_id integer NOT NULL - , PRIMARY KEY - (environment_id, benchmark_language_id, language_implementation_version_id) - , FOREIGN KEY - (benchmark_language_id) - REFERENCES public.benchmark_language - , FOREIGN KEY - (language_implementation_version_id, benchmark_language_id) - REFERENCES public.language_implementation_version( - language_implementation_version_id - , benchmark_language_id - ) - , FOREIGN KEY - (dependencies_id) - REFERENCES public.dependencies -); -COMMENT ON TABLE public.environment - IS 'Identifies a build environment for a specific suite of benchmarks.'; - --- CONSTRAINTS -CREATE UNIQUE INDEX environment_unique_index - ON public.environment - (benchmark_language_id, language_implementation_version_id, dependencies_id); -COMMENT ON INDEX environment_unique_index - IS 'Enforce unique combinations of language version and dependencies.'; diff --git a/dev/benchmarking/ddl/1_11_table_public_benchmark.sql b/dev/benchmarking/ddl/1_11_table_public_benchmark.sql deleted file mode 100644 index 18895823df68c..0000000000000 --- a/dev/benchmarking/ddl/1_11_table_public_benchmark.sql +++ /dev/null @@ -1,54 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- BENCHMARK -CREATE TABLE IF NOT EXISTS public.benchmark -( - benchmark_id SERIAL - , benchmark_name citext NOT NULL - , parameter_names text[] - , benchmark_description text NOT NULL - , benchmark_version citext NOT NULL - , unit_id integer NOT NULL - , benchmark_language_id integer NOT NULL - , PRIMARY KEY (benchmark_id, benchmark_language_id) - , FOREIGN KEY (benchmark_language_id) REFERENCES public.benchmark_language - , FOREIGN KEY (unit_id) REFERENCES public.unit -); -COMMENT ON TABLE public.benchmark - IS 'Identifies an individual benchmark.'; -COMMENT ON COLUMN public.benchmark.parameter_names - IS 'A list of strings identifying the parameter names in the benchmark.'; -COMMENT ON COLUMN public.benchmark.benchmark_version - IS 'Can be any string. In Airspeed Velocity, the version is ' - 'by default the hash of the entire code string for the benchmark.'; - --- CONSTRAINTS -CREATE INDEX benchmark_index_on_benchmark_language_id - ON public.benchmark(benchmark_language_id); - -CREATE INDEX benchmark_index_on_unit_id - ON public.benchmark(unit_id); - -CREATE UNIQUE INDEX benchmark_unique_index_on_language_benchmark_version - ON public.benchmark - (benchmark_language_id, benchmark_name, benchmark_version); -COMMENT ON INDEX public.benchmark_unique_index_on_language_benchmark_version - IS 'Enforce uniqueness of benchmark name and version for a given language.'; diff --git a/dev/benchmarking/ddl/1_12_table_public_benchmark_run.sql b/dev/benchmarking/ddl/1_12_table_public_benchmark_run.sql deleted file mode 100644 index 20b9ef0bb9639..0000000000000 --- a/dev/benchmarking/ddl/1_12_table_public_benchmark_run.sql +++ /dev/null @@ -1,112 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
-*/ - - --- BENCHMARK_RUN -CREATE TABLE IF NOT EXISTS public.benchmark_run -( - benchmark_run_id BIGSERIAL PRIMARY KEY - , parameter_values jsonb NOT NULL DEFAULT '{}'::jsonb - , value numeric NOT NULL - , git_commit_timestamp timestamp (0) with time zone NOT NULL - , git_hash text NOT NULL - , val_min numeric - , val_q1 numeric - , val_q3 numeric - , val_max numeric - , std_dev numeric NOT NULL - , n_obs integer NOT NULL - , run_timestamp timestamp (0) with time zone NOT NULL - , run_metadata jsonb - , run_notes text - , machine_id integer NOT NULL - , environment_id integer NOT NULL - , language_implementation_version_id integer NOT NULL - , benchmark_language_id integer NOT NULL - , benchmark_id integer NOT NULL - , FOREIGN KEY (machine_id) REFERENCES public.machine - , FOREIGN KEY - (environment_id, benchmark_language_id, language_implementation_version_id) - REFERENCES public.environment - , FOREIGN KEY (benchmark_id, benchmark_language_id) - REFERENCES public.benchmark(benchmark_id, benchmark_language_id) -); -COMMENT ON TABLE public.benchmark_run - IS 'One row per benchmark run.'; -COMMENT ON COLUMN public.benchmark_run.parameter_values - IS 'A JSON object mapping the parameter names from ' - '"benchmark.parameter_names" to values.'; -COMMENT ON COLUMN public.benchmark_run.value - IS 'The average value from the benchmark run.'; -COMMENT ON COLUMN public.benchmark_run.git_commit_timestamp - IS 'Get this using `git show -s --date=local --format="%ci" `. ' - 'ISO 8601 is recommended, e.g. ''2019-01-30 03:12 -0600''.'; -COMMENT ON COLUMN public.benchmark_run.git_hash - IS 'The commit hash of the codebase currently being benchmarked.'; -COMMENT ON COLUMN public.benchmark_run.val_min - IS 'The smallest benchmark run value for this run.'; -COMMENT ON COLUMN public.benchmark_run.val_q1 - IS 'The first quartile of the benchmark run values for this run.'; -COMMENT ON COLUMN public.benchmark_run.val_q3 - IS 'The third quartile of the benchmark run values for this run.'; -COMMENT ON COLUMN public.benchmark_run.val_max - IS 'The largest benchmark run value for this run.'; -COMMENT ON COLUMN public.benchmark_run.std_dev - IS 'The standard deviation of the run values for this benchmark run.'; -COMMENT ON COLUMN public.benchmark_run.n_obs - IS 'The number of observations for this benchmark run.'; -COMMENT ON COLUMN public.benchmark_run.run_metadata - IS 'Additional metadata of interest, as a JSON object. ' - 'For example: ''{"ci_99": [2.7e-06, 3.1e-06]}''::jsonb.'; -COMMENT ON COLUMN public.benchmark_run.run_notes - IS 'Additional notes of interest, as a text string. 
'; - --- CONSTRAINTS -ALTER TABLE public.benchmark_run - ADD CONSTRAINT benchmark_run_check_std_dev_nonnegative - CHECK (std_dev >= 0); - -ALTER TABLE public.benchmark_run - ADD CONSTRAINT benchmark_run_check_n_obs_positive - CHECK (n_obs > 0); - -CREATE INDEX benchmark_run_index_on_environment_id - ON public.benchmark_run(environment_id); - -CREATE INDEX benchmark_run_index_on_machine_id - ON public.benchmark_run(machine_id); - -CREATE INDEX benchmark_run_index_on_benchmark_id - ON public.benchmark_run(benchmark_id, benchmark_language_id); - -CREATE INDEX benchmark_run_index_on_benchmark_environment_time - ON public.benchmark_run - (benchmark_id, environment_id, git_commit_timestamp); -COMMENT ON INDEX - public.benchmark_run_index_on_benchmark_environment_time - IS 'Index to improve sorting by benchmark, environment, and timestamp.'; - -CREATE UNIQUE INDEX - benchmark_run_unique_index_on_env_benchmark_timestamp_params - ON public.benchmark_run - (machine_id, environment_id, benchmark_id, git_commit_timestamp, parameter_values, run_timestamp); -COMMENT ON INDEX - public.benchmark_run_unique_index_on_env_benchmark_timestamp_params - IS 'Enforce uniqueness of benchmark run for a given machine, ' - 'environment, benchmark, git commit timestamp, and parameter values.'; diff --git a/dev/benchmarking/ddl/2_00_views.sql b/dev/benchmarking/ddl/2_00_views.sql deleted file mode 100644 index cbd295e506d8b..0000000000000 --- a/dev/benchmarking/ddl/2_00_views.sql +++ /dev/null @@ -1,324 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - --- NOTE: --- The function for documentation depends on view columns --- being named exactly the same as in the table view. 
- --- MACHINE_VIEW -CREATE OR REPLACE VIEW public.machine_view AS - SELECT - machine.machine_id - , mac_address - , machine_name - , memory_bytes - , cpu_actual_frequency_Hz - , os_name - , architecture_name - , kernel_name - , cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_Hz - , cpu_frequency_min_Hz - , cpu_L1d_cache_bytes - , cpu_L1i_cache_bytes - , cpu_L2_cache_bytes - , cpu_L3_cache_bytes - , gpu_information - , gpu_part_number - , gpu_product_name - , machine_other_attributes - FROM public.machine AS machine - JOIN public.cpu AS cpu ON machine.cpu_id = cpu.cpu_id - JOIN public.gpu AS gpu ON machine.gpu_id = gpu.gpu_id - JOIN public.os AS os ON machine.os_id = os.os_id; -COMMENT ON VIEW public.machine_view IS -E'The machine environment (CPU, GPU, OS) used for each benchmark run.\n\n' - '- "mac_address" is unique in the "machine" table\n' - '- "gpu_part_number" is unique in the "gpu" (graphics processing unit) table\n' - ' Empty string (''''), not null, is used for machines that won''t use the GPU\n' - '- "cpu_model_name" is unique in the "cpu" (central processing unit) table\n' - '- "os_name", "os_architecture_name", and "os_kernel_name"\n' - ' are unique in the "os" (operating system) table\n' - '- "machine_other_attributes" is a key-value store for any other relevant\n' - ' data, e.g. ''{"hard_disk_type": "solid state"}'''; - - --- LANGUAGE_IMPLEMENTATION_VERSION_VIEW -CREATE OR REPLACE VIEW public.language_implementation_version_view AS - SELECT - lv.language_implementation_version_id - , bl.benchmark_language - , lv.language_implementation_version - FROM public.language_implementation_version AS lv - JOIN public.benchmark_language AS bl - ON lv.benchmark_language_id = bl.benchmark_language_id; - --- ENVIRONMENT_VIEW -CREATE OR REPLACE VIEW public.environment_view AS - SELECT - env.environment_id - , benchmark_language - , language_implementation_version - , dependencies - FROM public.environment AS env - JOIN public.benchmark_language AS language - ON env.benchmark_language_id = language.benchmark_language_id - JOIN public.language_implementation_version AS version - ON env.language_implementation_version_id = version.language_implementation_version_id - JOIN public.dependencies AS deps - ON env.dependencies_id = deps.dependencies_id; -COMMENT ON VIEW public.environment_view IS -E'The build environment used for a reported benchmark run.\n' - '(Will be inferred from each "benchmark_run" if not explicitly added).\n\n' - '- Each entry is unique on\n' - ' ("benchmark_language", "language_implementation_version", "dependencies")\n' - '- "benchmark_language" is unique in the "benchmark_language" table\n' - '- "benchmark_language" plus "language_implementation_version" is unique in\n' - ' the "language_implementation_version" table\n' - '- "dependencies" is unique in the "dependencies" table'; - --- UNIT_VIEW -CREATE OR REPLACE VIEW public.unit_view AS - SELECT - unit.unit_id - , units - , benchmark_type - , lessisbetter - FROM public.unit AS unit - JOIN public.benchmark_type AS bt - ON unit.benchmark_type_id = bt.benchmark_type_id; - --- BENCHMARK_VIEW -CREATE OR REPLACE VIEW public.benchmark_view AS - SELECT - b.benchmark_id - , benchmark_name - , parameter_names - , benchmark_description - , benchmark_type - , units - , lessisbetter - , benchmark_version - , benchmark_language - FROM public.benchmark AS b - JOIN public.benchmark_language AS benchmark_language - ON b.benchmark_language_id = benchmark_language.benchmark_language_id - JOIN public.unit AS 
unit - ON b.unit_id = unit.unit_id - JOIN public.benchmark_type AS benchmark_type - ON unit.benchmark_type_id = benchmark_type.benchmark_type_id; -COMMENT ON VIEW public.benchmark_view IS -E'The details about a particular benchmark.\n\n' - '- "benchmark_name" is unique for a given "benchmark_language"\n' - '- Each entry is unique on\n' - ' ("benchmark_language", "benchmark_name", "benchmark_version")'; - --- BENCHMARK_RUN_VIEW -CREATE OR REPLACE VIEW public.benchmark_run_view AS - SELECT - run.benchmark_run_id - -- benchmark_view (name, version, language only) - , benchmark_name - , benchmark_version - -- datum - , parameter_values - , value - , git_commit_timestamp - , git_hash - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_timestamp - , run_metadata - , run_notes - -- machine_view (mac address only) - , mac_address - -- environment_view - , env.benchmark_language - , language_implementation_version - , dependencies - FROM public.benchmark_run AS run - JOIN public.benchmark_view AS benchmark - ON run.benchmark_id = benchmark.benchmark_id - JOIN public.machine_view AS machine - ON run.machine_id = machine.machine_id - JOIN public.environment_view AS env - ON run.environment_id = env.environment_id; -COMMENT ON VIEW public.benchmark_run_view IS -E'Each benchmark run.\n\n' - '- Each entry is unique on the machine, environment, benchmark,\n' - ' and git commit timestamp.'; - --- FULL_BENCHMARK_RUN_VIEW -CREATE OR REPLACE VIEW public.full_benchmark_run_view AS - SELECT - run.benchmark_run_id - -- benchmark_view - , benchmark_name - , parameter_names - , benchmark_description - , benchmark_type - , units - , lessisbetter - , benchmark_version - -- datum - , parameter_values - , value - , git_commit_timestamp - , git_hash - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_timestamp - , run_metadata - , run_notes - -- machine_view - , machine_name - , mac_address - , memory_bytes - , cpu_actual_frequency_Hz - , os_name - , architecture_name - , kernel_name - , cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_Hz - , cpu_frequency_min_Hz - , cpu_L1d_cache_bytes - , cpu_L1i_cache_bytes - , cpu_L2_cache_bytes - , cpu_L3_cache_bytes - , gpu_information - , gpu_part_number - , gpu_product_name - , machine_other_attributes - -- environment_view - , env.benchmark_language - , env.language_implementation_version - , dependencies - FROM public.benchmark_run AS run - JOIN public.benchmark_view AS benchmark - ON run.benchmark_id = benchmark.benchmark_id - JOIN public.machine_view AS machine - ON run.machine_id = machine.machine_id - JOIN public.environment_view AS env - ON run.environment_id = env.environment_id; - --- SUMMARIZED_TABLES_VIEW -CREATE VIEW public.summarized_tables_view AS - WITH chosen AS ( - SELECT - cls.oid AS id - , cls.relname as tbl_name - FROM pg_catalog.pg_class AS cls - JOIN pg_catalog.pg_namespace AS ns ON cls.relnamespace = ns.oid - WHERE - cls.relkind = 'r' - AND ns.nspname = 'public' - ), all_constraints AS ( - SELECT - chosen.id AS tbl_id - , chosen.tbl_name - , unnest(conkey) AS col_id - , 'foreign key' AS col_constraint - FROM pg_catalog.pg_constraint - JOIN chosen ON chosen.id = conrelid - WHERE contype = 'f' - - UNION - - SELECT - chosen.id - , chosen.tbl_name - , unnest(indkey) - , 'unique' - FROM pg_catalog.pg_index i - JOIN chosen ON chosen.id = i.indrelid - WHERE i.indisunique AND NOT i.indisprimary - - UNION - - SELECT - chosen.id - , chosen.tbl_name - , unnest(indkey) - , 'primary key' - 
FROM pg_catalog.pg_index i - JOIN chosen on chosen.id = i.indrelid - WHERE i.indisprimary - ), gathered_constraints AS ( - SELECT - tbl_id - , tbl_name - , col_id - , string_agg(col_constraint, ', ' ORDER BY col_constraint) - AS col_constraint - FROM all_constraints - GROUP BY tbl_id, tbl_name, col_id - ) - SELECT - chosen.tbl_name AS table_name - , columns.attnum AS column_number - , columns.attname AS column_name - , typ.typname AS type_name - , CASE - WHEN columns.attnotnull - THEN 'not null' - ELSE '' - END AS nullable - , CASE - WHEN defaults.adsrc like 'nextval%' - THEN 'serial' - ELSE defaults.adsrc - END AS default_value - , CASE - WHEN gc.col_constraint = '' OR gc.col_constraint IS NULL - THEN cnstrnt.consrc - WHEN cnstrnt.consrc IS NULL - THEN gc.col_constraint - ELSE gc.col_constraint || ', ' || cnstrnt.consrc - END AS description - FROM pg_catalog.pg_attribute AS columns - JOIN chosen ON columns.attrelid = chosen.id - JOIN pg_catalog.pg_type AS typ - ON typ.oid = columns.atttypid - LEFT JOIN gathered_constraints AS gc - ON gc.col_id = columns.attnum - AND gc.tbl_id = columns.attrelid - LEFT JOIN pg_attrdef AS defaults - ON defaults.adrelid = chosen.id - AND defaults.adnum = columns.attnum - LEFT JOIN pg_catalog.pg_constraint AS cnstrnt - ON cnstrnt.conrelid = columns.attrelid - AND columns.attrelid = ANY(cnstrnt.conkey) - WHERE - columns.attnum > 0 - ORDER BY table_name, column_number; -COMMENT ON VIEW public.summarized_tables_view - IS 'A summary of all columns from all tables in the public schema, ' - ' identifying nullability, primary/foreign keys, and data type.'; diff --git a/dev/benchmarking/ddl/3_00_functions_helpers.sql b/dev/benchmarking/ddl/3_00_functions_helpers.sql deleted file mode 100644 index b10b69a4e914e..0000000000000 --- a/dev/benchmarking/ddl/3_00_functions_helpers.sql +++ /dev/null @@ -1,643 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- PROJECT_DETAILS -CREATE TYPE public.type_project_details AS ( - project_name text - , project_url text - , repo_url text -); - -CREATE OR REPLACE FUNCTION public.project_details() -RETURNS public.type_project_details AS -$$ - SELECT project_name, project_url, repo_url - FROM public.project - ORDER BY last_changed DESC - LIMIT 1 -$$ -LANGUAGE sql STABLE; -COMMENT ON FUNCTION public.project_details() -IS 'Get the current project name, url, and repo url.'; - - --------------------------- GET-OR-SET FUNCTIONS -------------------------- --- The following functions have the naming convention "get__id". --- All of them attempt to SELECT the desired row given the column --- values, and if it does not exist will INSERT it. --- --- When functions are overloaded with fewer columns, it is to allow --- selection only, given columns that comprise a unique index. 
- --- GET_CPU_ID -CREATE OR REPLACE FUNCTION public.get_cpu_id( - cpu_model_name citext - , cpu_core_count integer - , cpu_thread_count integer - , cpu_frequency_max_Hz bigint - , cpu_frequency_min_Hz bigint - , cpu_L1d_cache_bytes integer - , cpu_L1i_cache_bytes integer - , cpu_L2_cache_bytes integer - , cpu_L3_cache_bytes integer -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT cpu_id INTO result FROM public.cpu AS cpu - WHERE cpu.cpu_model_name = $1 - AND cpu.cpu_core_count = $2 - AND cpu.cpu_thread_count = $3 - AND cpu.cpu_frequency_max_Hz = $4 - AND cpu.cpu_frequency_min_Hz = $5 - AND cpu.cpu_L1d_cache_bytes = $6 - AND cpu.cpu_L1i_cache_bytes = $7 - AND cpu.cpu_L2_cache_bytes = $8 - AND cpu.cpu_L3_cache_bytes = $9; - - IF result IS NULL THEN - INSERT INTO public.cpu( - cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_Hz - , cpu_frequency_min_Hz - , cpu_L1d_cache_bytes - , cpu_L1i_cache_bytes - , cpu_L2_cache_bytes - , cpu_L3_cache_bytes - ) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) - RETURNING cpu_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_cpu_id( - citext - , integer - , integer - , bigint -- cpu_frequency_max_Hz - , bigint -- cpu_frequency_min_Hz - , integer - , integer - , integer - , integer -) -IS 'Insert or select CPU data, returning "cpu.cpu_id".'; - --- GET_GPU_ID -CREATE OR REPLACE FUNCTION public.get_gpu_id( - gpu_information citext DEFAULT NULL - , gpu_part_number citext DEFAULT NULL - , gpu_product_name citext DEFAULT NULL -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT gpu_id INTO result FROM public.gpu AS gpu - WHERE - gpu.gpu_information = COALESCE($1, '') - AND gpu.gpu_part_number = COALESCE($2, '') - AND gpu.gpu_product_name = COALESCE($3, ''); - - IF result IS NULL THEN - INSERT INTO public.gpu( - gpu_information - , gpu_part_number - , gpu_product_name - ) - VALUES (COALESCE($1, ''), COALESCE($2, ''), COALESCE($3, '')) - RETURNING gpu_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_gpu_id(citext, citext, citext) -IS 'Insert or select GPU data, returning "gpu.gpu_id".'; - --- GET_OS_ID -CREATE OR REPLACE FUNCTION public.get_os_id( - os_name citext - , architecture_name citext - , kernel_name citext DEFAULT '' -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT os_id INTO result FROM public.os AS os - WHERE os.os_name = $1 - AND os.architecture_name = $2 - AND os.kernel_name = COALESCE($3, ''); - - IF result is NULL THEN - INSERT INTO public.os(os_name, architecture_name, kernel_name) - VALUES ($1, $2, COALESCE($3, '')) - RETURNING os_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_os_id(citext, citext, citext) -IS 'Insert or select OS data, returning "os.os_id".'; - --- GET_MACHINE_ID (full signature) -CREATE OR REPLACE FUNCTION public.get_machine_id( - mac_address macaddr - , machine_name citext - , memory_bytes bigint - , cpu_actual_frequency_Hz bigint - -- os - , os_name citext - , architecture_name citext - , kernel_name citext - -- cpu - , cpu_model_name citext - , cpu_core_count integer - , cpu_thread_count integer - , cpu_frequency_max_Hz bigint - , cpu_frequency_min_Hz bigint - , L1d_cache_bytes integer - , L1i_cache_bytes integer - , L2_cache_bytes integer - , L3_cache_bytes integer - -- gpu - , gpu_information citext DEFAULT '' - , gpu_part_number citext DEFAULT NULL - , 
gpu_product_name citext DEFAULT NULL - -- nullable machine attributes - , machine_other_attributes jsonb DEFAULT NULL -) -RETURNS integer AS -$$ - DECLARE - found_cpu_id integer; - found_gpu_id integer; - found_os_id integer; - result integer; - BEGIN - -- Can't bypass looking up all the values because of unique constraint. - SELECT public.get_cpu_id( - cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_Hz - , cpu_frequency_min_Hz - , L1d_cache_bytes - , L1i_cache_bytes - , L2_cache_bytes - , L3_cache_bytes - ) INTO found_cpu_id; - - SELECT public.get_gpu_id( - gpu_information - , gpu_part_number - , gpu_product_name - ) INTO found_gpu_id; - - SELECT public.get_os_id( - os_name - , architecture_name - , kernel_name - ) INTO found_os_id; - - SELECT machine_id INTO result FROM public.machine AS m - WHERE m.os_id = found_os_id - AND m.cpu_id = found_cpu_id - AND m.gpu_id = found_gpu_id - AND m.mac_address = $1 - AND m.machine_name = $2 - AND m.memory_bytes = $3 - AND m.cpu_actual_frequency_Hz = $4; - - IF result IS NULL THEN - INSERT INTO public.machine( - os_id - , cpu_id - , gpu_id - , mac_address - , machine_name - , memory_bytes - , cpu_actual_frequency_Hz - , machine_other_attributes - ) - VALUES (found_os_id, found_cpu_id, found_gpu_id, $1, $2, $3, $4, $20) - RETURNING machine_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_machine_id( - macaddr - , citext - , bigint -- memory_bytes - , bigint -- cpu_frequency_actual_Hz - -- os - , citext - , citext - , citext - -- cpu - , citext - , integer - , integer - , bigint -- cpu_frequency_max_Hz - , bigint -- cpu_frequency_min_Hz - , integer - , integer - , integer - , integer - -- gpu - , citext - , citext - , citext - -- nullable machine attributes - , jsonb -) -IS 'Insert or select machine data, returning "machine.machine_id".'; - --- GET_MACHINE_ID (given unique mac_address) -CREATE OR REPLACE FUNCTION public.get_machine_id(mac_address macaddr) -RETURNS integer AS -$$ - SELECT machine_id FROM public.machine AS m - WHERE m.mac_address = $1; -$$ -LANGUAGE sql STABLE; -COMMENT ON FUNCTION public.get_machine_id(macaddr) -IS 'Select machine_id given its mac address, returning "machine.machine_id".'; - --- GET_BENCHMARK_LANGUAGE_ID -CREATE OR REPLACE FUNCTION public.get_benchmark_language_id(language citext) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT benchmark_language_id INTO result - FROM public.benchmark_language AS bl - WHERE bl.benchmark_language = language; - - IF result IS NULL THEN - INSERT INTO public.benchmark_language(benchmark_language) - VALUES (language) - RETURNING benchmark_language_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_benchmark_language_id(citext) -IS 'Insert or select benchmark_language returning ' - '"benchmark_language.benchmark_language_id".'; - --- GET_LANGUAGE_IMPLEMENTATION_VERSION_ID -CREATE OR REPLACE FUNCTION public.get_language_implementation_version_id( - language citext - , language_implementation_version citext DEFAULT '' -) -RETURNS integer AS -$$ - DECLARE - language_id integer; - result integer; - BEGIN - SELECT public.get_benchmark_language_id($1) INTO language_id; - - SELECT language_implementation_version_id INTO result FROM public.language_implementation_version AS lv - WHERE lv.benchmark_language_id = language_id - AND lv.language_implementation_version = COALESCE($2, ''); - - IF result IS NULL THEN - INSERT INTO - 
public.language_implementation_version(benchmark_language_id, language_implementation_version) - VALUES (language_id, COALESCE($2, '')) - RETURNING language_implementation_version_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_language_implementation_version_id(citext, citext) -IS 'Insert or select language and version data, ' - 'returning "language_implementation_version.language_implementation_version_id".'; - -CREATE OR REPLACE FUNCTION public.get_language_implementation_version_id( - -- overload for when language_id is known - language_id integer - , language_implementation_version citext DEFAULT '' -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT language_implementation_version_id INTO result FROM public.language_implementation_version AS lv - WHERE lv.benchmark_language_id = language_id - AND lv.language_implementation_version = COALESCE($2, ''); - - IF result IS NULL THEN - INSERT INTO - public.language_implementation_version(benchmark_language_id, language_implementation_version) - VALUES (language_id, COALESCE($2, '')) - RETURNING language_implementation_version_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; - --- GET_LANGUAGE_DEPENDENCY_LOOKUP_ID -CREATE OR REPLACE FUNCTION public.get_dependencies_id( - dependencies jsonb DEFAULT '{}'::jsonb -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT dependencies_id INTO result - FROM public.dependencies AS ldl - WHERE ldl.dependencies = COALESCE($1, '{}'::jsonb); - - IF result IS NULL THEN - INSERT INTO - public.dependencies(dependencies) - VALUES (COALESCE($1, '{}'::jsonb)) - RETURNING dependencies_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_dependencies_id(jsonb) -IS 'Insert or select dependencies, returning "dependencies.dependencies_id".'; - --- GET_ENVIRONMENT_ID -CREATE OR REPLACE FUNCTION public.get_environment_id( - language citext, - language_implementation_version citext DEFAULT '', - dependencies jsonb DEFAULT '{}'::jsonb -) -RETURNS integer AS -$$ - DECLARE - found_language_id integer; - found_version_id integer; - found_dependencies_id integer; - result integer; - BEGIN - SELECT public.get_benchmark_language_id($1) INTO found_language_id; - SELECT - public.get_language_implementation_version_id(found_language_id, $2) - INTO found_version_id; - SELECT - public.get_dependencies_id ($3) - INTO found_dependencies_id; - - SELECT environment_id INTO result FROM public.environment AS e - WHERE e.benchmark_language_id = found_language_id - AND e.language_implementation_version_id = found_version_id - AND e.dependencies_id = found_dependencies_id; - - IF result IS NULL THEN - INSERT INTO - public.environment( - benchmark_language_id - , language_implementation_version_id - , dependencies_id - ) - VALUES (found_language_id, found_version_id, found_dependencies_id) - RETURNING environment_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_environment_id(citext, citext, jsonb) -IS 'Insert or select language, language version, and dependencies, ' - 'returning "environment.environment_id".'; - --- GET_BENCHMARK_TYPE_ID (full signature) -CREATE OR REPLACE FUNCTION public.get_benchmark_type_id( - benchmark_type citext - , lessisbetter boolean -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT benchmark_type_id INTO result FROM public.benchmark_type AS bt - WHERE 
bt.benchmark_type = $1 - AND bt.lessisbetter = $2; - - IF result IS NULL THEN - INSERT INTO public.benchmark_type(benchmark_type, lessisbetter) - VALUES($1, $2) - RETURNING benchmark_type_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_benchmark_type_id(citext, boolean) -IS 'Insert or select benchmark type and lessisbetter, ' - 'returning "benchmark_type.benchmark_type_id".'; - --- GET_BENCHMARK_TYPE_ID (given unique benchmark_type string only) -CREATE OR REPLACE FUNCTION public.get_benchmark_type_id( - benchmark_type citext -) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - SELECT benchmark_type_id INTO result FROM public.benchmark_type AS bt - WHERE bt.benchmark_type = $1; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_benchmark_type_id(citext) -IS 'Select benchmark_type_id given benchmark type (e.g. ''time''), ' - 'returning "benchmark_type.benchmark_type_id".'; - --- GET_UNIT_ID (full signature) -CREATE OR REPLACE FUNCTION public.get_unit_id( - benchmark_type citext - , units citext - , lessisbetter boolean DEFAULT NULL -) -RETURNS integer AS -$$ - DECLARE - found_benchmark_type_id integer; - result integer; - BEGIN - - IF ($3 IS NOT NULL) -- if lessisbetter is not null - THEN - SELECT public.get_benchmark_type_id($1, $3) - INTO found_benchmark_type_id; - ELSE - SELECT public.get_benchmark_type_id($1) - INTO found_benchmark_type_id; - END IF; - - SELECT unit_id INTO result FROM public.unit AS u - WHERE u.benchmark_type_id = found_benchmark_type_id - AND u.units = $2; - - IF result IS NULL THEN - INSERT INTO public.unit(benchmark_type_id, units) - VALUES(found_benchmark_type_id, $2) - RETURNING unit_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_unit_id(citext, citext, boolean) -IS 'Insert or select benchmark type (e.g. ''time''), ' - 'units string (e.g. 
''miliseconds''), ' - 'and "lessisbetter" (true if smaller benchmark values are better), ' - 'returning "unit.unit_id".'; - --- GET_UNIT_ID (given unique units string only) -CREATE OR REPLACE FUNCTION public.get_unit_id(units citext) -RETURNS integer AS -$$ - SELECT unit_id FROM public.unit AS u - WHERE u.units = units; -$$ -LANGUAGE sql STABLE; -COMMENT ON FUNCTION public.get_unit_id(citext) -IS 'Select unit_id given unit name, returning "unit.unit_id".'; - --- GET_BENCHMARK_ID (full signature) -CREATE OR REPLACE FUNCTION public.get_benchmark_id( - benchmark_language citext - , benchmark_name citext - , parameter_names text[] - , benchmark_description text - , benchmark_version citext - , benchmark_type citext - , units citext - , lessisbetter boolean -) -RETURNS integer AS -$$ - DECLARE - found_benchmark_language_id integer; - found_unit_id integer; - result integer; - BEGIN - SELECT public.get_benchmark_language_id( - benchmark_language - ) INTO found_benchmark_language_id; - - SELECT public.get_unit_id( - benchmark_type - , units - , lessisbetter - ) INTO found_unit_id; - - SELECT benchmark_id INTO result FROM public.benchmark AS b - WHERE b.benchmark_language_id = found_benchmark_language_id - AND b.benchmark_name = $2 - -- handle nullable "parameter_names" - AND b.parameter_names IS NOT DISTINCT FROM $3 - AND b.benchmark_description = $4 - AND b.benchmark_version = $5 - AND b.unit_id = found_unit_id; - - IF result IS NULL THEN - INSERT INTO public.benchmark( - benchmark_language_id - , benchmark_name - , parameter_names - , benchmark_description - , benchmark_version - , unit_id - ) - VALUES (found_benchmark_language_id, $2, $3, $4, $5, found_unit_id) - RETURNING benchmark_id INTO result; - END IF; - - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.get_benchmark_id( - citext - , citext - , text[] - , text - , citext - , citext - , citext - , boolean -) -IS 'Insert/select benchmark given data, returning "benchmark.benchmark_id".'; - --- GET_BENCHMARK_ID (by unique columns) -CREATE OR REPLACE FUNCTION public.get_benchmark_id( - benchmark_language citext - , benchmark_name citext - , benchmark_version citext -) -RETURNS integer AS -$$ - WITH language AS ( - SELECT public.get_benchmark_language_id(benchmark_language) AS id - ) - SELECT b.benchmark_id - FROM public.benchmark AS b - JOIN language ON b.benchmark_language_id = language.id - WHERE b.benchmark_name = benchmark_name - AND benchmark_version = benchmark_version -$$ -LANGUAGE sql STABLE; -COMMENT ON FUNCTION public.get_benchmark_id(citext, citext, citext) -IS 'Select existing benchmark given unique columns, ' - 'returning "benchmark.benchmark_id".'; diff --git a/dev/benchmarking/ddl/3_01_functions_triggers.sql b/dev/benchmarking/ddl/3_01_functions_triggers.sql deleted file mode 100644 index b6ce4741ac0fd..0000000000000 --- a/dev/benchmarking/ddl/3_01_functions_triggers.sql +++ /dev/null @@ -1,574 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --------------------------- TRIGGER FUNCTIONS -------------------------- --- Views that do not select from a single table or view are not --- automatically updatable. These trigger functions are intended --- to be run instead of INSERT into the complicated views. - - --- LANGUAGE_IMPLEMENTATION_VERSION_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.language_implementation_version_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - language_id integer; - result integer; - BEGIN - IF NEW.benchmark_language IS NULL THEN - RAISE EXCEPTION 'Column "benchmark_language" cannot be NULL.'; - END IF; - IF NEW.language_implementation_version IS NULL THEN - RAISE EXCEPTION - 'Column "language_implementation_version" cannot be NULL (use '''' instead).'; - END IF; - - SELECT public.get_benchmark_language_id(NEW.benchmark_language) - INTO language_id; - - SELECT language_implementation_version_id INTO result FROM public.language_implementation_version AS lv - WHERE lv.benchmark_language_id = language_id - AND lv.language_implementation_version = NEW.language_implementation_version; - - IF result IS NOT NULL THEN - -- row already exists - RETURN NULL; - ELSE - INSERT INTO - public.language_implementation_version( - benchmark_language_id - , language_implementation_version - ) - VALUES (language_id, NEW.language_implementation_version) - RETURNING language_implementation_version_id INTO NEW.language_implementation_version_id; - END IF; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- ENVIRONMENT_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.environment_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_language_id integer; - found_version_id integer; - found_dependencies_id integer; - result integer; - BEGIN - IF NEW.benchmark_language IS NULL - THEN - RAISE EXCEPTION 'Column "benchmark_language" cannot be NULL.'; - END IF; - IF NEW.language_implementation_version IS NULL THEN - RAISE EXCEPTION - 'Column "language_implementation_version" cannot be NULL (use '''' instead).'; - END IF; - - SELECT public.get_benchmark_language_id(NEW.benchmark_language) - INTO found_language_id; - - SELECT public.get_language_implementation_version_id( - found_language_id - , NEW.language_implementation_version - ) - INTO found_version_id; - - SELECT public.get_dependencies_id(NEW.dependencies) - INTO found_dependencies_id; - - SELECT environment_id INTO result FROM public.environment AS e - WHERE e.benchmark_language_id = found_language_id - AND e.language_implementation_version_id = found_version_id - AND e.dependencies_id = found_dependencies_id; - - IF result IS NOT NULL THEN - -- row already exists - RETURN NULL; - ELSE - INSERT INTO - public.environment( - benchmark_language_id - , language_implementation_version_id - , dependencies_id - ) - VALUES (found_language_id, found_version_id, found_dependencies_id) - RETURNING environment_id INTO NEW.environment_id; - END IF; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- MACHINE_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.machine_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_cpu_id integer; - found_gpu_id integer; - 
found_os_id integer; - result integer; - BEGIN - IF ( - NEW.machine_name IS NULL - OR NEW.memory_bytes IS NULL - OR NEW.cpu_model_name IS NULL - OR NEW.cpu_core_count IS NULL - OR NEW.cpu_thread_count IS NULL - OR NEW.cpu_frequency_max_Hz IS NULL - OR NEW.cpu_frequency_min_Hz IS NULL - OR NEW.cpu_L1d_cache_bytes IS NULL - OR NEW.cpu_L1i_cache_bytes IS NULL - OR NEW.cpu_L2_cache_bytes IS NULL - OR NEW.cpu_L3_cache_bytes IS NULL - OR NEW.os_name IS NULL - OR NEW.architecture_name IS NULL - ) - THEN - RAISE EXCEPTION 'None of the columns in "machine_view" can be NULL. ' - 'all columns in table "gpu" will default to the empty string '''', ' - 'as will blank "os.kernel_name". This is to allow uniqueness ' - 'constraints to work. Thank you!.'; - END IF; - - SELECT public.get_cpu_id( - NEW.cpu_model_name - , NEW.cpu_core_count - , NEW.cpu_thread_count - , NEW.cpu_frequency_max_Hz - , NEW.cpu_frequency_min_Hz - , NEW.cpu_L1d_cache_bytes - , NEW.cpu_L1i_cache_bytes - , NEW.cpu_L2_cache_bytes - , NEW.cpu_L3_cache_bytes - ) INTO found_cpu_id; - - SELECT public.get_gpu_id( - NEW.gpu_information - , NEW.gpu_part_number - , NEW.gpu_product_name - ) INTO found_gpu_id; - - SELECT public.get_os_id( - NEW.os_name - , NEW.architecture_name - , NEW.kernel_name - ) INTO found_os_id; - - SELECT machine_id INTO result FROM public.machine AS m - WHERE m.os_id = found_os_id - AND m.cpu_id = found_cpu_id - AND m.gpu_id = found_gpu_id - AND m.machine_name = NEW.machine_name - AND m.memory_bytes = NEW.memory_bytes - AND m.cpu_actual_frequency_Hz = NEW.cpu_actual_frequency_Hz; - - IF result IS NOT NULL THEN - -- row already exists - RETURN NULL; - ELSE - INSERT INTO public.machine( - os_id - , cpu_id - , gpu_id - , machine_name - , mac_address - , memory_bytes - , cpu_actual_frequency_Hz - , machine_other_attributes - ) - VALUES ( - found_os_id - , found_cpu_id - , found_gpu_id - , NEW.machine_name - , NEW.mac_address - , NEW.memory_bytes - , NEW.cpu_actual_frequency_Hz - , NEW.machine_other_attributes - ) - RETURNING machine_id INTO NEW.machine_id; - END IF; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- UNIT_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.unit_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_benchmark_type_id integer; - result integer; - BEGIN - IF (NEW.benchmark_type IS NULL OR NEW.units IS NULL) - THEN - RAISE EXCEPTION E'"benchmark_type" and "units" cannot be NULL.\n' - 'Further, if the "benchmark_type" has never been defined, ' - '"lessisbetter" must be defined or there will be an error.'; - END IF; - - -- It's OK for "lessisbetter" = NULL if "benchmark_type" already exists. 
- SELECT public.get_benchmark_type_id(NEW.benchmark_type, NEW.lessisbetter) - INTO found_benchmark_type_id; - - SELECT unit_id INTO result FROM public.unit AS u - WHERE u.benchmark_type_id = found_benchmark_type_id - AND u.units = NEW.units; - - IF result IS NOT NULL THEN - -- row already exists - RETURN NULL; - ELSE - INSERT INTO public.unit ( - benchmark_type_id - , units - ) - VALUES ( - found_benchmark_type_id - , NEW.units - ) - RETURNING unit_id INTO NEW.unit_id; - END IF; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- BENCHMARK_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.benchmark_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_benchmark_language_id integer; - found_units_id integer; - result integer; - BEGIN - IF ( - NEW.benchmark_name IS NULL - OR NEW.benchmark_version IS NULL - OR NEW.benchmark_language IS NULL - OR NEW.benchmark_type IS NULL - OR NEW.benchmark_description IS NULL - OR NEW.units IS NULL - ) - THEN - RAISE EXCEPTION 'The only nullable column in this view is ' - '"benchmark.parameter_names".'; - END IF; - - SELECT public.get_benchmark_language_id( - NEW.benchmark_language - ) INTO found_benchmark_language_id; - - SELECT public.get_unit_id(NEW.units) INTO found_units_id; - - SELECT benchmark_id INTO result FROM public.benchmark AS b - WHERE b.benchmark_language_id = found_benchmark_language_id - AND b.benchmark_name = NEW.benchmark_name - -- handle nullable "parameter_names" - AND b.parameter_names IS NOT DISTINCT FROM NEW.parameter_names - AND b.benchmark_description = NEW.benchmark_description - AND b.benchmark_version = NEW.benchmark_version - AND b.unit_id = found_units_id; - - IF result IS NOT NULL THEN - -- row already exists - RETURN NULL; - ELSE - INSERT INTO public.benchmark( - benchmark_language_id - , benchmark_name - , parameter_names - , benchmark_description - , benchmark_version - , unit_id - ) - VALUES ( - found_benchmark_language_id - , NEW.benchmark_name - , NEW.parameter_names - , NEW.benchmark_description - , NEW.benchmark_version - , found_units_id - ) - RETURNING benchmark_id INTO NEW.benchmark_id; - END IF; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- BENCHMARK_RUN_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.benchmark_run_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_benchmark_id integer; - found_benchmark_language_id integer; - found_machine_id integer; - found_environment_id integer; - found_language_implementation_version_id integer; - BEGIN - IF ( - NEW.benchmark_name IS NULL - OR NEW.benchmark_version IS NULL - OR NEW.benchmark_language IS NULL - OR NEW.value IS NULL - OR NEW.run_timestamp IS NULL - OR NEW.git_commit_timestamp IS NULL - OR NEW.git_hash IS NULL - OR NEW.language_implementation_version IS NULL - OR NEW.mac_address IS NULL - ) - THEN - RAISE EXCEPTION 'Only the following columns can be NULL: ' - '"parameter_names", "val_min", "val_q1", "val_q3", "val_max".'; - END IF; - - SELECT public.get_benchmark_id( - NEW.benchmark_language - , NEW.benchmark_name - , NEW.benchmark_version - ) INTO found_benchmark_id; - - SELECT public.get_benchmark_language_id( - NEW.benchmark_language - ) INTO found_benchmark_language_id; - - SELECT public.get_machine_id( - NEW.mac_address - ) INTO found_machine_id; - - SELECT public.get_environment_id( - NEW.benchmark_language - , NEW.language_implementation_version - , NEW.dependencies - ) INTO found_environment_id; - - SELECT public.get_language_implementation_version_id( - found_benchmark_language_id, - NEW.language_implementation_version - ) INTO 
found_language_implementation_version_id; - - INSERT INTO public.benchmark_run ( - parameter_values - , value - , git_commit_timestamp - , git_hash - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_timestamp - , run_metadata - , run_notes - , machine_id - , benchmark_language_id - , language_implementation_version_id - , environment_id - , benchmark_id - ) - VALUES ( - COALESCE(NEW.parameter_values, '{}'::jsonb) - , NEW.value - , NEW.git_commit_timestamp - , NEW.git_hash - , NEW.val_min - , NEW.val_q1 - , NEW.val_q3 - , NEW.val_max - , NEW.std_dev - , NEW.n_obs - , NEW.run_timestamp - , NEW.run_metadata - , NEW.run_notes - , found_machine_id - , found_benchmark_language_id - , found_language_implementation_version_id - , found_environment_id - , found_benchmark_id - ) returning benchmark_run_id INTO NEW.benchmark_run_id; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; - --- FULL_BENCHMARK_RUN_VIEW_INSERT_ROW -CREATE OR REPLACE FUNCTION public.full_benchmark_run_view_insert_row() -RETURNS trigger AS -$$ - DECLARE - found_benchmark_id integer; - found_benchmark_language_id integer; - found_machine_id integer; - found_environment_id integer; - found_language_implementation_version_id integer; - BEGIN - IF ( - NEW.value IS NULL - OR NEW.git_hash IS NULL - OR NEW.git_commit_timestamp IS NULL - OR NEW.run_timestamp IS NULL - -- benchmark - OR NEW.benchmark_name IS NULL - OR NEW.benchmark_description IS NULL - OR NEW.benchmark_version IS NULL - OR NEW.benchmark_language IS NULL - -- unit - OR NEW.benchmark_type IS NULL - OR NEW.units IS NULL - OR NEW.lessisbetter IS NULL - -- machine - OR NEW.machine_name IS NULL - OR NEW.memory_bytes IS NULL - OR NEW.cpu_model_name IS NULL - OR NEW.cpu_core_count IS NULL - OR NEW.os_name IS NULL - OR NEW.architecture_name IS NULL - OR NEW.kernel_name IS NULL - OR NEW.cpu_model_name IS NULL - OR NEW.cpu_core_count IS NULL - OR NEW.cpu_thread_count IS NULL - OR NEW.cpu_frequency_max_Hz IS NULL - OR NEW.cpu_frequency_min_Hz IS NULL - OR NEW.cpu_L1d_cache_bytes IS NULL - OR NEW.cpu_L1i_cache_bytes IS NULL - OR NEW.cpu_L2_cache_bytes IS NULL - OR NEW.cpu_L3_cache_bytes IS NULL - ) - THEN - RAISE EXCEPTION 'Only the following columns can be NULL: ' - '"machine_other_attributes", "parameter_names", "val_min", ' - '"val_q1", "val_q3", "val_max", "run_metadata", "run_notes". 
' - 'If "gpu_information", "gpu_part_number", "gpu_product_name", or ' - '"kernel_name" are null, they will be silently turned into an ' - 'empty string ('''').'; - END IF; - - SELECT public.get_benchmark_id( - NEW.benchmark_language - , NEW.benchmark_name - , NEW.parameter_names - , NEW.benchmark_description - , NEW.benchmark_version - , NEW.benchmark_type - , NEW.units - , NEW.lessisbetter - ) INTO found_benchmark_id; - - SELECT public.get_benchmark_language_id( - NEW.benchmark_language - ) INTO found_benchmark_language_id; - - SELECT public.get_machine_id( - NEW.mac_address - , NEW.machine_name - , NEW.memory_bytes - , NEW.cpu_actual_frequency_Hz - -- os - , NEW.os_name - , NEW.architecture_name - , NEW.kernel_name - -- cpu - , NEW.cpu_model_name - , NEW.cpu_core_count - , NEW.cpu_thread_count - , NEW.cpu_frequency_max_Hz - , NEW.cpu_frequency_min_Hz - , NEW.cpu_L1d_cache_bytes - , NEW.cpu_L1i_cache_bytes - , NEW.cpu_L2_cache_bytes - , NEW.cpu_L3_cache_bytes - -- gpu - , NEW.gpu_information - , NEW.gpu_part_number - , NEW.gpu_product_name - -- nullable machine attributes - , NEW.machine_other_attributes - ) INTO found_machine_id; - - SELECT public.get_environment_id( - NEW.benchmark_language - , NEW.language_implementation_version - , NEW.dependencies - ) INTO found_environment_id; - - SELECT public.get_language_implementation_version_id( - found_benchmark_language_id, - NEW.language_implementation_version - ) INTO found_language_implementation_version_id; - - INSERT INTO public.benchmark_run ( - parameter_values - , value - , git_commit_timestamp - , git_hash - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_timestamp - , run_metadata - , run_notes - , machine_id - , benchmark_language_id - , language_implementation_version_id - , environment_id - , benchmark_id - ) - VALUES ( - NEW.parameter_values - , NEW.value - , NEW.git_commit_timestamp - , NEW.git_hash - , NEW.val_min - , NEW.val_q1 - , NEW.val_q3 - , NEW.val_max - , NEW.std_dev - , NEW.n_obs - , NEW.run_timestamp - , NEW.run_metadata - , NEW.run_notes - , found_machine_id - , found_benchmark_language_id - , found_language_implementation_version_id - , found_environment_id - , found_benchmark_id - ) returning benchmark_run_id INTO NEW.benchmark_run_id; - - RETURN NEW; - END -$$ -LANGUAGE plpgsql; diff --git a/dev/benchmarking/ddl/3_02_functions_ingestion.sql b/dev/benchmarking/ddl/3_02_functions_ingestion.sql deleted file mode 100644 index 000c61d00e7b0..0000000000000 --- a/dev/benchmarking/ddl/3_02_functions_ingestion.sql +++ /dev/null @@ -1,323 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
-*/ - - --------------------------- IMPORT HELPERS -------------------------- --- Load from JSON (from https://stackoverflow.com/a/48396608) --- How to use it in the psql client: --- \set content `cat /examples/machine.json` --- select ingest_machine(:'content'::jsonb); --- INGEST_MACHINE_VIEW -CREATE OR REPLACE FUNCTION public.ingest_machine_view(from_jsonb jsonb) -RETURNS integer AS -$$ - DECLARE - result integer; - BEGIN - INSERT INTO public.machine_view - SELECT * FROM jsonb_populate_record(null::public.machine_view, from_jsonb) - RETURNING machine_id INTO result; - RETURN result; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.ingest_machine_view(jsonb) IS - E'The argument is a JSON object. NOTE: key names must be entirely\n' - 'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n' - 'Example::\n\n' - ' {\n' - ' "mac_address": "0a:00:2d:01:02:03",\n' - ' "machine_name": "Yet-Another-Machine-Name",\n' - ' "memory_bytes": 8589934592,\n' - ' "cpu_actual_frequency_hz": 2300000000,\n' - ' "os_name": "OSX",\n' - ' "architecture_name": "x86_64",\n' - ' "kernel_name": "18.2.0",\n' - ' "cpu_model_name": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz",\n' - ' "cpu_core_count": 2,\n' - ' "cpu_thread_count": 4,\n' - ' "cpu_frequency_max_hz": 2300000000,\n' - ' "cpu_frequency_min_hz": 2300000000,\n' - ' "cpu_l1d_cache_bytes": 32768,\n' - ' "cpu_l1i_cache_bytes": 32768,\n' - ' "cpu_l2_cache_bytes": 262144,\n' - ' "cpu_l3_cache_bytes": 4194304,\n' - ' "machine_other_attributes": {"just": "an example"},\n' - ' "gpu_information": "",\n' - ' "gpu_part_number": "",\n' - ' "gpu_product_name": ""\n' - ' }\n\n' - 'To identify which columns in "machine_view" are required,\n' - 'please see the view documentation in :ref:`benchmark-data-model`.\n'; - --- INGEST_BENCHMARK_VIEW -CREATE OR REPLACE FUNCTION public.ingest_benchmark_view(from_jsonb jsonb) -RETURNS setof integer AS -$$ - BEGIN - RETURN QUERY - INSERT INTO public.benchmark_view - SELECT * FROM jsonb_populate_recordset( - null::public.benchmark_view - , from_jsonb - ) - RETURNING benchmark_id; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.ingest_benchmark_view(jsonb) IS - E'The argument is a JSON object. NOTE: key names must be entirely\n' - 'lowercase, or the insert will fail. 
Extra key-value pairs are ignored.\n' - 'Example::\n\n' - ' [\n' - ' {\n' - ' "benchmark_name": "Benchmark 1",\n' - ' "parameter_names": ["arg0", "arg1", "arg2"],\n' - ' "benchmark_description": "First benchmark",\n' - ' "benchmark_type": "Time",\n' - ' "units": "miliseconds",\n' - ' "lessisbetter": true,\n' - ' "benchmark_version": "second version",\n' - ' "benchmark_language": "Python"\n' - ' },\n' - ' {\n' - ' "benchmark_name": "Benchmark 2",\n' - ' "parameter_names": ["arg0", "arg1"],\n' - ' "benchmark_description": "Description 2.",\n' - ' "benchmark_type": "Time",\n' - ' "units": "nanoseconds",\n' - ' "lessisbetter": true,\n' - ' "benchmark_version": "second version",\n' - ' "benchmark_language": "Python"\n' - ' }\n' - ' ]\n\n' - 'To identify which columns in "benchmark_view" are required,\n' - 'please see the view documentation in :ref:`benchmark-data-model`.\n'; - --- INGEST_BENCHMARK_RUN_VIEW -CREATE OR REPLACE FUNCTION public.ingest_benchmark_run_view(from_jsonb jsonb) -RETURNS setof bigint AS -$$ - BEGIN - RETURN QUERY - INSERT INTO public.benchmark_run_view - SELECT * FROM - jsonb_populate_recordset(null::public.benchmark_run_view, from_jsonb) - RETURNING benchmark_run_id; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.ingest_benchmark_run_view(jsonb) IS - E'The argument is a JSON object. NOTE: key names must be entirely\n' - 'lowercase, or the insert will fail. Extra key-value pairs are ignored.\n' - 'Example::\n\n' - ' [\n' - ' {\n' - ' "benchmark_name": "Benchmark 2",\n' - ' "benchmark_version": "version 0",\n' - ' "parameter_values": {"arg0": 100, "arg1": 5},\n' - ' "value": 2.5,\n' - ' "git_commit_timestamp": "2019-02-08 22:35:53 +0100",\n' - ' "git_hash": "324d3cf198444a",\n' - ' "val_min": 1,\n' - ' "val_q1": 2,\n' - ' "val_q3": 3,\n' - ' "val_max": 4,\n' - ' "std_dev": 1.41,\n' - ' "n_obs": 8,\n' - ' "run_timestamp": "2019-02-14 03:00:05 -0600",\n' - ' "mac_address": "08:00:2b:01:02:03",\n' - ' "benchmark_language": "Python",\n' - ' "language_implementation_version": "CPython 2.7",\n' - ' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}\n' - ' },\n' - ' {\n' - ' "benchmark_name": "Benchmark 2",\n' - ' "benchmark_version": "version 0",\n' - ' "parameter_values": {"arg0": 1000, "arg1": 5},\n' - ' "value": 5,\n' - ' "git_commit_timestamp": "2019-02-08 22:35:53 +0100",\n' - ' "git_hash": "324d3cf198444a",\n' - ' "std_dev": 3.14,\n' - ' "n_obs": 8,\n' - ' "run_timestamp": "2019-02-14 03:00:10 -0600",\n' - ' "mac_address": "08:00:2b:01:02:03",\n' - ' "benchmark_language": "Python",\n' - ' "language_implementation_version": "CPython 2.7",\n' - ' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}\n' - ' }\n' - ' ]\n' - 'To identify which columns in "benchmark_run_view" are required,\n' - 'please see the view documentation in :ref:`benchmark-data-model`.\n'; - --- INGEST_BENCHMARK_RUNS_WITH_CONTEXT -CREATE OR REPLACE FUNCTION public.ingest_benchmark_runs_with_context(from_jsonb jsonb) -RETURNS setof bigint AS -$$ - DECLARE - context_jsonb jsonb; - found_environment_id integer; - found_machine_id integer; - BEGIN - SELECT from_jsonb -> 'context' INTO context_jsonb; - - SELECT public.get_machine_id((context_jsonb ->> 'mac_address')::macaddr) - INTO found_machine_id; - - SELECT get_environment_id( - (context_jsonb ->> 'benchmark_language')::citext - , (context_jsonb ->> 'language_implementation_version')::citext - , context_jsonb -> 'dependencies' - ) INTO found_environment_id; - - RETURN QUERY - WITH run_datum AS ( - SELECT * - FROM 
jsonb_to_recordset(from_jsonb -> 'benchmarks') - AS x( - benchmark_name citext - , parameter_values jsonb - , value numeric - , val_min numeric - , val_q1 numeric - , val_q3 numeric - , val_max numeric - , std_dev numeric - , n_obs integer - , run_timestamp timestamp (0) with time zone - , run_metadata jsonb - , run_notes text - ) - ), benchmark_name_and_id AS ( - SELECT - key AS benchmark_name - , public.get_benchmark_id( - (context_jsonb ->> 'benchmark_language')::citext - , key::citext -- benchmark_name - , value::citext -- benchmark_version - ) AS benchmark_id - FROM jsonb_each_text(from_jsonb -> 'benchmark_version') - ) - INSERT INTO public.benchmark_run ( - benchmark_id - -- run_datum - , parameter_values - , value - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_metadata - , run_notes - -- additional context information - , git_commit_timestamp - , git_hash - , run_timestamp - -- machine - , machine_id - -- environment - , environment_id - , language_implementation_version_id - , benchmark_language_id - ) - SELECT - b.benchmark_id - -- run_datum - , run_datum.parameter_values - , run_datum.value - , run_datum.val_min - , run_datum.val_q1 - , run_datum.val_q3 - , run_datum.val_max - , run_datum.std_dev - , run_datum.n_obs - , run_datum.run_metadata - , run_datum.run_notes - -- additional context information - , (context_jsonb ->> 'git_commit_timestamp')::timestamp (0) with time zone - , context_jsonb ->> 'git_hash' - , (context_jsonb ->> 'run_timestamp')::timestamp (0) with time zone - -- machine - , found_machine_id - -- environment - , e.environment_id - , e.language_implementation_version_id - , e.benchmark_language_id - FROM run_datum - JOIN public.environment AS e - ON e.environment_id = found_environment_id - JOIN benchmark_name_and_id AS b - ON b.benchmark_name = run_datum.benchmark_name - RETURNING benchmark_run_id; - END -$$ -LANGUAGE plpgsql; -COMMENT ON FUNCTION public.ingest_benchmark_runs_with_context(jsonb) IS - E'The argument is a JSON object. NOTE: key names must be entirely\n' - 'lowercase, or the insert will fail. 
Extra key-value pairs are ignored.\n' - 'The object contains three key-value pairs::\n\n' - ' {"context": {\n' - ' "mac_address": "08:00:2b:01:02:03",\n' - ' "benchmark_language": "Python",\n' - ' "language_implementation_version": "CPython 3.6",\n' - ' "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"},\n' - ' "git_commit_timestamp": "2019-02-14 22:42:22 +0100",\n' - ' "git_hash": "123456789abcde",\n' - ' "run_timestamp": "2019-02-14 03:00:40 -0600",\n' - ' "extra stuff": "does not hurt anything and will not be added."\n' - ' },\n' - ' "benchmark_version": {\n' - ' "Benchmark Name 1": "Any string can be a version.",\n' - ' "Benchmark Name 2": "A git hash can be a version.",\n' - ' "An Unused Benchmark Name": "Will be ignored."\n' - ' },\n' - ' "benchmarks": [\n' - ' {\n' - ' "benchmark_name": "Benchmark Name 1",\n' - ' "parameter_values": {"argument1": 1, "argument2": "value2"},\n' - ' "value": 42,\n' - ' "val_min": 41.2,\n' - ' "val_q1": 41.5,\n' - ' "val_q3": 42.5,\n' - ' "val_max": 42.8,\n' - ' "std_dev": 0.5,\n' - ' "n_obs": 100,\n' - ' "run_metadata": {"any": "key-value pairs"},\n' - ' "run_notes": "Any relevant notes."\n' - ' },\n' - ' {\n' - ' "benchmark_name": "Benchmark Name 2",\n' - ' "parameter_values": {"not nullable": "Use {} if no params."},\n' - ' "value": 8,\n' - ' "std_dev": 1,\n' - ' "n_obs": 2,\n' - ' }\n' - ' ]\n' - ' }\n\n' - '- The entry for "context" contains the machine, environment, and timestamp\n' - ' information common to all of the runs\n' - '- The entry for "benchmark_version" maps benchmark\n' - ' names to their version strings. (Which can be a git hash,\n' - ' the entire code string, a number, or any other string of your choice.)\n' - '- The entry for "benchmarks" is a list of benchmark run data\n' - ' for the given context and benchmark versions. The first example\n' - ' benchmark run entry contains all possible values, even\n' - ' nullable ones, and the second entry omits all nullable values.\n\n'; diff --git a/dev/benchmarking/ddl/3_10_functions_documentation.sql b/dev/benchmarking/ddl/3_10_functions_documentation.sql deleted file mode 100644 index 6b2a057909f86..0000000000000 --- a/dev/benchmarking/ddl/3_10_functions_documentation.sql +++ /dev/null @@ -1,395 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
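-- A minimal usage sketch for the ingestion helpers defined above, following
-- the same psql pattern used in examples/example.sql (the JSON file passed
-- to `cat` is one of the examples shipped in dev/benchmarking/examples):
--   \set content `cat examples/benchmark_with_context_example.json`
--   SELECT public.ingest_benchmark_runs_with_context(:'content'::jsonb);
-- The function returns the benchmark_run_id of each row it inserted.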
-*/ - - --- _DOCUMENTATION_INGESTION -CREATE OR REPLACE FUNCTION public._documentation_ingestion() -RETURNS text AS -$$ - WITH ingestion_docs AS ( - SELECT - proname || E'\n' - || rpad('', character_length(proname), '-') - || E'\n\n:code:`' - || proname || '(' - || string_agg(a.argname || ' ' || typname , ', ') - || E')`\n\n' - || description - || E'\n\n\nback to `Benchmark data model `_\n' - AS docs - FROM pg_catalog.pg_proc - JOIN pg_catalog.pg_namespace - ON nspname='public' - AND pg_namespace.oid = pronamespace - AND proname LIKE '%ingest%' - JOIN pg_catalog.pg_description - ON pg_description.objoid=pg_proc.oid, - LATERAL unnest(proargnames, proargtypes) AS a(argname, argtype) - JOIN pg_catalog.pg_type - ON pg_type.oid = a.argtype - GROUP BY proname, description - ) - SELECT - string_agg(docs, E'\n\n') AS docs - FROM ingestion_docs; -$$ -LANGUAGE sql STABLE; - --- _DOCUMENTATION_VIEW_DETAILS -CREATE OR REPLACE FUNCTION public._documentation_view_details(view_name citext) -RETURNS TABLE( - column_name name - , type_name name - , nullable text - , default_value text - , description text -) AS -$$ - WITH view_columns AS ( - SELECT - attname AS column_name - , attnum AS column_order - FROM pg_catalog.pg_attribute - WHERE attrelid=view_name::regclass - ) - SELECT - t.column_name - , type_name - , coalesce(nullable, '') - , coalesce(default_value, '') - , coalesce(description, '') - FROM public.summarized_tables_view AS t - JOIN view_columns AS v ON v.column_name = t.column_name - WHERE t.table_name || '_view' = view_name OR t.column_name NOT LIKE '%_id' - ORDER BY column_order; -$$ -LANGUAGE sql STABLE; - - --- _DOCUMENTATION_VIEW_PIECES -CREATE OR REPLACE FUNCTION public._documentation_view_pieces(view_name citext) -RETURNS TABLE (rst_formatted text) -AS -$$ -DECLARE - column_length integer; - type_length integer; - nullable_length integer; - default_length integer; - description_length integer; - sep text; - border text; -BEGIN - - -- All of the hard-coded constants here are the string length of the table - -- column headers: 'Column', 'Type', 'Nullable', 'Default', 'Description' - SELECT greatest(6, max(character_length(column_name))) - FROM public._documentation_view_details(view_name) INTO column_length; - - SELECT greatest(4, max(character_length(type_name))) - FROM public._documentation_view_details(view_name) INTO type_length; - - SELECT greatest(8, max(character_length(nullable))) - FROM public._documentation_view_details(view_name) INTO nullable_length; - - SELECT greatest(7, max(character_length(default_value))) - FROM public._documentation_view_details(view_name) INTO default_length; - - SELECT greatest(11, max(character_length(description))) - FROM public._documentation_view_details(view_name) INTO description_length; - - SELECT ' ' INTO sep; - - SELECT - concat_ws(sep - , rpad('', column_length, '=') - , rpad('', type_length, '=') - , rpad('', nullable_length, '=') - , rpad('', default_length, '=') - , rpad('', description_length, '=') - ) - INTO border; - - RETURN QUERY - SELECT - border - UNION ALL - SELECT - concat_ws(sep - , rpad('Column', column_length, ' ') - , rpad('Type', type_length, ' ') - , rpad('Nullable', nullable_length, ' ') - , rpad('Default', default_length, ' ') - , rpad('Description', description_length, ' ') - ) - UNION ALL - SELECT border - UNION ALL - SELECT - concat_ws(sep - , rpad(v.column_name, column_length, ' ') - , rpad(v.type_name, type_length, ' ') - , rpad(v.nullable, nullable_length, ' ') - , rpad(v.default_value, default_length, ' ') - , 
rpad(v.description, description_length, ' ') - ) - FROM public._documentation_view_details(view_name) AS v - UNION ALL - SELECT border; - -END -$$ -LANGUAGE plpgsql STABLE; - - --- DOCUMENTATION_FOR -CREATE OR REPLACE FUNCTION public.documentation_for(view_name citext) -RETURNS text AS -$$ - DECLARE - view_description text; - view_table_markup text; - BEGIN - SELECT description FROM pg_catalog.pg_description - WHERE pg_description.objoid = view_name::regclass - INTO view_description; - - SELECT - view_name || E'\n' || rpad('', length(view_name), '-') || E'\n\n' || - view_description || E'\n\n' || - string_agg(rst_formatted, E'\n') - INTO view_table_markup - FROM public._documentation_view_pieces(view_name); - - RETURN view_table_markup; - END -$$ -LANGUAGE plpgsql STABLE; -COMMENT ON FUNCTION public.documentation_for(citext) -IS E'Create an ".rst"-formatted table describing a specific view.\n' - 'Example: SELECT public.documentation_for(''endpoint'');'; - - --- DOCUMENTATION -CREATE OR REPLACE FUNCTION public.documentation(dotfile_name text) -RETURNS TABLE (full_text text) AS -$$ - WITH v AS ( - SELECT - public.documentation_for(relname::citext) - || E'\n\nback to `Benchmark data model `_\n' - AS view_documentation - FROM pg_catalog.pg_trigger - JOIN pg_catalog.pg_class ON pg_trigger.tgrelid = pg_class.oid - WHERE NOT tgisinternal - ) - SELECT - E'\n.. _benchmark-data-model:\n\n' - 'Benchmark data model\n' - '====================\n\n\n' - '.. graphviz:: ' - || dotfile_name - || E'\n\n\n.. _benchmark-ingestion:\n\n' - 'Benchmark ingestion helper functions\n' - '====================================\n\n' - || public._documentation_ingestion() - || E'\n\n\n.. _benchmark-views:\n\n' - 'Benchmark views\n' - '===============\n\n\n' - || string_agg(v.view_documentation, E'\n') - FROM v - GROUP BY True; -$$ -LANGUAGE sql STABLE; -COMMENT ON FUNCTION public.documentation(text) -IS E'Create an ".rst"-formatted file that shows the columns in ' - 'every insertable view in the "public" schema.\n' - 'The text argument is the name of the generated dotfile to be included.\n' - 'Example: SELECT public.documentation(''data_model.dot'');'; - - --- _DOCUMENTATION_DOTFILE_NODE_FOR -CREATE OR REPLACE FUNCTION public._documentation_dotfile_node_for(tablename name) -RETURNS text AS -$$ -DECLARE - result text; -BEGIN - WITH node AS ( - SELECT - tablename::text AS lines - UNION ALL - SELECT - E'[label = \n' - ' <' - UNION ALL - -- table name - SELECT - ' ' - UNION ALL - -- primary keys - SELECT - ' ' - FROM public.summarized_tables_view - WHERE table_name = tablename - AND description LIKE '%primary key%' - UNION ALL - -- columns - SELECT - ' ' - FROM public.summarized_tables_view - WHERE table_name = tablename - AND (description IS NULL OR description not like '%key%') - UNION ALL - -- foreign keys - SELECT - ' ' - FROM public.summarized_tables_view - WHERE table_name = tablename - AND description LIKE '%foreign key%' - AND description NOT LIKE '%primary key%' - UNION ALL - SELECT - E'
' - || tablename - || '
' - || column_name - || ' (pk)
' - || column_name - || CASE WHEN description LIKE '%unique' THEN ' (u)' ELSE '' END - || CASE WHEN nullable <> 'not null' THEN ' (o)' ELSE '' END - || '
' - || column_name - || CASE WHEN description LIKE '%unique' THEN ' (u)' ELSE '' END - || ' (fk)
>\n];' - ) - SELECT - string_agg(lines, E'\n') - INTO result - FROM node; - - RETURN result; -END -$$ -LANGUAGE plpgsql STABLE; - - --- _DOCUMENTATION_DOTFILE_EDGES -CREATE OR REPLACE FUNCTION public._documentation_dotfile_edges() -RETURNS text AS -$$ -DECLARE - result text; -BEGIN - WITH relationship AS ( - SELECT - conrelid AS fk_table_id - , confrelid AS pk_table_id - , unnest(conkey) AS fk_colnum - , unnest(confkey) AS pk_colnum - FROM pg_catalog.pg_constraint - WHERE confkey IS NOT NULL - AND connamespace='public'::regnamespace - ), all_edges AS ( - SELECT - fk_tbl.relname || ':' || fk_col.attname - || ' -> ' - || pk_tbl.relname || ':' || pk_col.attname - || ';' AS lines - FROM relationship - -- foreign key table + column - JOIN pg_catalog.pg_attribute AS fk_col - ON fk_col.attrelid = relationship.fk_table_id - AND fk_col.attnum = relationship.fk_colnum - JOIN pg_catalog.pg_class AS fk_tbl - ON fk_tbl.oid = relationship.fk_table_id - -- primary key table + column - JOIN pg_catalog.pg_attribute AS pk_col - ON pk_col.attrelid = relationship.pk_table_id - AND pk_col.attnum = relationship.pk_colnum - JOIN pg_catalog.pg_class AS pk_tbl - ON pk_tbl.oid = relationship.pk_table_id - ) - SELECT - string_agg(lines, E'\n') - INTO result - FROM all_edges; - - RETURN result; -END -$$ -LANGUAGE plpgsql STABLE; - - --- DOCUMENTATION_DOTFILE -CREATE OR REPLACE FUNCTION public.documentation_dotfile() -RETURNS text AS -$$ -DECLARE - schemaname name := 'public'; - result text; -BEGIN - WITH file_contents AS ( - SELECT - E'digraph database {\n concentrate = true;\n' - ' rankdir = LR;\n' - ' ratio = ".75";\n' - ' node [shape = none, fontsize="11", fontname="Helvetica"];\n' - ' edge [fontsize="8", fontname="Helvetica"];' - AS lines - UNION ALL - SELECT - E'legend\n[fontsize = "14"\nlabel =\n' - '<\n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - ' \n' - '
Legend
pk = primary key
fk = foreign key
u = unique*
o = optional
' - '* multiple uniques in the same table are a unique group
>\n];' - UNION ALL - SELECT - string_agg( - public._documentation_dotfile_node_for(relname), - E'\n' -- Forcing the 'env' table to the end makes a better image - ORDER BY (CASE WHEN relname LIKE 'env%' THEN 'z' ELSE relname END) - ) - FROM pg_catalog.pg_class - WHERE relkind='r' AND relnamespace = schemaname::regnamespace - UNION ALL - SELECT - public._documentation_dotfile_edges() - UNION ALL - SELECT - '}' - ) - SELECT - string_agg(lines, E'\n') AS dotfile - INTO result - FROM file_contents; - RETURN result; -END -$$ -LANGUAGE plpgsql STABLE; -COMMENT ON FUNCTION public.documentation_dotfile() -IS E'Create a Graphviz dotfile of the data model: ' - 'every table in the "public" schema.\n' - 'Example: SELECT public.documentation_dotfile();'; diff --git a/dev/benchmarking/ddl/4_00_triggers.sql b/dev/benchmarking/ddl/4_00_triggers.sql deleted file mode 100644 index 5fb0e50185951..0000000000000 --- a/dev/benchmarking/ddl/4_00_triggers.sql +++ /dev/null @@ -1,61 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - - --- LANGUAGE_IMPLEMENTATION_VERSION_VIEW_TRIGGER_INSERT -CREATE TRIGGER language_implementation_version_view_trigger_insert - INSTEAD OF INSERT ON public.language_implementation_version_view - FOR EACH ROW - EXECUTE FUNCTION public.language_implementation_version_view_insert_row(); - --- ENVIRONMENT_VIEW_TRIGGER_INSERT -CREATE TRIGGER environment_view_trigger_insert - INSTEAD OF INSERT ON public.environment_view - FOR EACH ROW - EXECUTE FUNCTION public.environment_view_insert_row(); - --- MACHINE_VIEW_TRIGGER_INSERT -CREATE TRIGGER machine_view_trigger_insert - INSTEAD OF INSERT ON public.machine_view - FOR EACH ROW - EXECUTE FUNCTION public.machine_view_insert_row(); - --- UNIT_VIEW_TRIGGER_INSERT -CREATE TRIGGER unit_view_trigger_insert - INSTEAD OF INSERT ON public.unit_view - FOR EACH ROW - EXECUTE FUNCTION public.unit_view_insert_row(); - --- BENCHMARK_VIEW_TRIGGER_INSERT -CREATE TRIGGER benchmark_view_trigger_insert - INSTEAD OF INSERT ON public.benchmark_view - FOR EACH ROW - EXECUTE FUNCTION public.benchmark_view_insert_row(); - --- BENCHMARK_RUN_VIEW_TRIGGER_INSERT -CREATE TRIGGER benchmark_run_view_trigger_insert - INSTEAD OF INSERT ON public.benchmark_run_view - FOR EACH ROW - EXECUTE FUNCTION public.benchmark_run_view_insert_row(); - --- FULL_BENCHMARK_RUN_VIEW_TRIGGER_INSERT -CREATE TRIGGER full_benchmark_run_view_trigger_insert - INSTEAD OF INSERT ON public.full_benchmark_run_view - FOR EACH ROW - EXECUTE FUNCTION public.full_benchmark_run_view_insert_row(); diff --git a/dev/benchmarking/ddl/5_00_permissions.sql b/dev/benchmarking/ddl/5_00_permissions.sql deleted file mode 100644 index dd72c40db3130..0000000000000 --- a/dev/benchmarking/ddl/5_00_permissions.sql +++ /dev/null @@ -1,73 +0,0 @@ -/* - Licensed to the Apache Software Foundation 
(ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ ----------------------------- ROLES ---------------------------- --- ARROW_WEB -CREATE ROLE arrow_web login password 'arrow'; -COMMENT ON ROLE arrow_web IS 'Anonymous login user.'; - --- ARROW_ADMIN -CREATE ROLE arrow_admin; -COMMENT ON ROLE arrow_admin - IS 'Can select, insert, update, and delete on all public tables.'; - --- ARROW_ANONYMOUS -CREATE ROLE arrow_anonymous; -COMMENT ON ROLE arrow_anonymous - IS 'Can insert and select on all public tables.'; - -GRANT arrow_anonymous TO arrow_web; - - ----------------------------- PRIVILEGES ---------------------------- -GRANT USAGE ON SCHEMA public TO arrow_anonymous, arrow_admin; - --- ARROW_ADMIN -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO arrow_admin; -GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public to arrow_admin; -GRANT SELECT, UPDATE, INSERT, DELETE ON ALL TABLES IN SCHEMA public - TO arrow_admin; - --- ARROW_ANONYMOUS -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO arrow_anonymous; -GRANT SELECT ON ALL TABLES IN SCHEMA public TO arrow_anonymous; -GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public to arrow_anonymous; -GRANT INSERT ON - public.benchmark - , public.benchmark_language - , public.dependencies - , public.language_implementation_version - , public.benchmark_run - , public.benchmark_type - , public.cpu - , public.environment - , public.environment_view - , public.gpu - , public.machine - , public.machine_view - , public.os - , public.unit - --, public.project -- The only disallowed table is `project`. - , public.benchmark_run_view - , public.benchmark_view - , public.environment_view - , public.full_benchmark_run_view - , public.language_implementation_version_view - , public.machine_view - , public.unit_view -TO arrow_anonymous; diff --git a/dev/benchmarking/docker-compose.yml b/dev/benchmarking/docker-compose.yml deleted file mode 100644 index ca60206bfdfb6..0000000000000 --- a/dev/benchmarking/docker-compose.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -version: '3' -services: - - pg: - build: - context: . 
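      # A minimal sketch of bringing this stack up (the credential values
      # below are placeholders; docker-compose substitutes whatever PG_USER
      # and PG_PASS are set in the calling shell):
      #   PG_USER=benchmark PG_PASS=arrow docker-compose up -d
      # Postgres then listens on localhost:5432 and the PostGraphile GraphQL
      # endpoint on localhost:5000/graphql.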
- dockerfile: Dockerfile - restart: always - ports: - - '5432:5432' - environment: - - POSTGRES_PASSWORD=${PG_PASS} - - POSTGRES_USER=${PG_USER} - - graphile: - image: graphile/postgraphile - restart: always - ports: - - 5000:5000 - depends_on: - - pg - command: - - --connection - - postgres://${PG_USER}:${PG_PASS}@pg:5432/${PG_USER} - - --schema - - public - - --watch diff --git a/dev/benchmarking/examples/benchmark_example.json b/dev/benchmarking/examples/benchmark_example.json deleted file mode 100644 index d6f58c2862ece..0000000000000 --- a/dev/benchmarking/examples/benchmark_example.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "benchmark_name": "Benchmark 1", - "parameter_names": ["arg0", "arg1", "arg2"], - "benchmark_description": "First benchmark", - "benchmark_type": "Time", - "units": "miliseconds", - "lessisbetter": true, - "benchmark_version": "second version", - "benchmark_language": "Python" - }, - { - "benchmark_name": "Benchmark 2", - "parameter_names": ["arg0", "arg1"], - "benchmark_description": "Description 2.", - "benchmark_type": "Time", - "units": "nanoseconds", - "lessisbetter": true, - "benchmark_version": "second version", - "benchmark_language": "Python" - }, - { - "benchmark_name": "Benchmark 3", - "parameter_names": ["arg0"], - "benchmark_description": "Third benchmark", - "benchmark_type": "Memory", - "units": "kilobytes", - "lessisbetter": true, - "benchmark_version": "1", - "benchmark_language": "Python" - } -] diff --git a/dev/benchmarking/examples/benchmark_run_example.csv b/dev/benchmarking/examples/benchmark_run_example.csv deleted file mode 100644 index eab208a1c709e..0000000000000 --- a/dev/benchmarking/examples/benchmark_run_example.csv +++ /dev/null @@ -1,6 +0,0 @@ -benchmark_run_id,benchmark_name,benchmark_version,parameter_values,value,git_commit_timestamp,git_hash,val_min,val_q1,val_q3,val_max,std_dev,n_obs,run_timestamp,run_metadata,run_notes,mac_address,benchmark_language,language_implementation_version,dependencies -,Benchmark 2,version 0,"{""arg0"": 100, ""arg1"": 5}",2.5,2019-01-31 14:31:10 -0600,8136c46d5c60fb,1,2,3,4,1.41,8,2019-02-14 02:00:00 -0600,,,08:00:2b:01:02:03,Python,CPython 2.7,"{""six"": """", ""numpy"": ""1.14"", ""other_lib"": ""1.0""}" -,Benchmark 2,version 0,"{""arg0"": 1000, ""arg1"": 5}",5,2019-01-31 14:31:10 -0600,8136c46d5c60fb,2,4,6,8,3.14,8,2019-02-14 02:01:00 -0600,,,08:00:2b:01:02:03,Python,CPython 2.7,"{""six"": """", ""numpy"": ""1.14"", ""other_lib"": ""1.0""}" -,Benchmark 2,version 0,"{""arg0"": 100, ""arg1"": 5}",2.5,2019-01-31 14:31:10 -0600,8136c46d5c60fb,0.5,1,3,5,3,8,2019-02-14 02:02:00 -0600,,,08:00:2b:01:02:03,Python,CPython 3.6,"{""boost"": ""1.42"", ""numpy"": ""1.15""}" -,Benchmark 2,version 0,"{""arg0"": 1000, ""arg1"": 5}",3,2019-01-31 14:31:10 -0600,8136c46d5c60fb,2,2.5,4,4.5,1.5,8,2019-02-14 02:03:00 -0600,,,08:00:2b:01:02:03,Python,CPython 3.6,"{""boost"": ""1.42"", ""numpy"": ""1.15""}" -,Benchmark 2,version 0,"{""arg0"": 1000, ""arg1"": 10}",3,2019-01-31 14:31:10 -0600,8136c46d5c60fb,1,2,4,5,2,8,2019-02-14 02:03:30 -0600,,,08:00:2b:01:02:03,Python,CPython 2.7,"{""six"": """", ""numpy"": ""1.15"", ""other_lib"": ""1.0""}" diff --git a/dev/benchmarking/examples/benchmark_run_example.json b/dev/benchmarking/examples/benchmark_run_example.json deleted file mode 100644 index 2ded776c9898d..0000000000000 --- a/dev/benchmarking/examples/benchmark_run_example.json +++ /dev/null @@ -1,97 +0,0 @@ -[ - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 100, 
"arg1": 5}, - "value": 2.5, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 1, - "val_q1": 2, - "val_q3": 3, - "val_max": 4, - "std_dev": 1.41, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:05 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"} - }, - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 1000, "arg1": 5}, - "value": 5, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 2, - "val_q1": 4, - "val_q3": 6, - "val_max": 8, - "std_dev": 3.14, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:10 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"} - }, - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 100, "arg1": 5}, - "value": 2.5, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 0.5, - "val_q1": 1, - "val_q3": 3, - "val_max": 5, - "std_dev": 3, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:20 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"boost": "1.42", "numpy": "1.15"} - }, - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 1000, "arg1": 5}, - "value": 3, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 2, - "val_q1": 2.5, - "val_q3": 4, - "val_max": 4.5, - "std_dev": 1.5, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:30 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"boost": "1.42", "numpy": "1.15"} - }, - { - "benchmark_name": "Benchmark 2", - "benchmark_version": "version 0", - "parameter_values": {"arg0": 1000, "arg1": 10}, - "value": 3, - "git_commit_timestamp": "2019-02-08 22:35:53 +0100", - "git_hash": "324d3cf198444a", - "val_min": 1, - "val_q1": 2, - "val_q3": 4, - "val_max": 5, - "std_dev": 2, - "n_obs": 8, - "run_timestamp": "2019-02-14 03:00:40 -0600", - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.15", "other_lib": "1.0"} - } -] diff --git a/dev/benchmarking/examples/benchmark_with_context_example.json b/dev/benchmarking/examples/benchmark_with_context_example.json deleted file mode 100644 index f9e6e31309f68..0000000000000 --- a/dev/benchmarking/examples/benchmark_with_context_example.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "context": { - "mac_address": "08:00:2b:01:02:03", - "benchmark_language": "Python", - "language_implementation_version": "CPython 2.7", - "dependencies": {"six": "", "numpy": "1.14", "other_lib": "1.0"}, - "git_commit_timestamp": "2019-02-14 22:42:22 +0100", - "git_hash": "123456789abcde", - "run_timestamp": "2019-02-25 03:00:40 -0600", - "Extra stuff": "does not hurt anything and won't be added.", - "However": "all of the entries above 'Extra stuff' are required." 
- }, - "benchmark_version": { - "Benchmark 2": "version 0", - "Benchmark 3": "any string is a version. (Benchmark 3 not actually used)" - }, - "benchmarks": [ - { - "benchmark_name": "Benchmark 2", - "parameter_values": {"arg0": 1, "arg1": 5}, - "value": 2.5, - "val_min": 1, - "val_q1": 2, - "val_q3": 3, - "val_max": 4, - "std_dev": 1.41, - "n_obs": 8, - "run_metadata": {"any": "json object is admissible"}, - "run_notes": "This value is an arbitrary-length string." - }, - { - "benchmark_name": "Benchmark 2", - "parameter_values": {"arg0": 2, "arg1": 5}, - "value": 5, - "std_dev": 3.14, - "n_obs": 8 - }, - { - "benchmark_name": "Benchmark 2", - "parameter_values": {"arg0": 3, "arg1": 5}, - "value": 2.5, - "val_min": 0.5, - "val_q1": 1, - "val_q3": 3, - "val_max": 5, - "std_dev": 3, - "n_obs": 8, - "run_notes": "The previous run in this list has the minimal set of keys." - }, - { - "benchmark_name": "Benchmark 2", - "parameter_values": {"arg0": 4, "arg1": 5}, - "value": 3, - "val_min": 2, - "val_q1": 2.5, - "val_q3": 4, - "val_max": 4.5, - "std_dev": 1.5, - "n_obs": 8 - }, - { - "benchmark_name": "Benchmark 2", - "parameter_values": {"arg0": 5, "arg1": 5}, - "value": 3, - "val_min": 1, - "val_q1": 2, - "val_q3": 4, - "val_max": 5, - "std_dev": 2, - "n_obs": 8 - } - ] -} diff --git a/dev/benchmarking/examples/example.sql b/dev/benchmarking/examples/example.sql deleted file mode 100644 index e93269af75bd0..0000000000000 --- a/dev/benchmarking/examples/example.sql +++ /dev/null @@ -1,232 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
-*/ - - --- Example insert into each of the views: -INSERT INTO public.project(project_name, project_url, repo_url) -VALUES ( - 'Apache Arrow' - , 'https://arrow.apache.org/' - , 'https://github.com/apache/arrow'); - -INSERT INTO public.environment_view - (benchmark_language, language_implementation_version, dependencies) -VALUES - ('Python', 'CPython 2.7', '{"six": "", "numpy": "1.14", "other_lib": "1.0"}'), - ('Python', 'CPython 3.6', '{"boost": "1.42", "numpy": "1.15"}'); - -INSERT INTO public.dependencies(dependencies) -VALUES - ('{"boost": "1.68", "numpy": "1.14"}'), - ('{"boost": "1.42", "numpy": "1.16"}'); - -INSERT INTO public.language_implementation_version_view - (benchmark_language, language_implementation_version) -VALUES - ('Python', 'CPython 2.7'), - ('Python', 'CPython 3.6'); - -INSERT INTO public.unit_view - (benchmark_type, units, lessisbetter) -VALUES - ('Memory', 'gigabytes', True), - ('Memory', 'kilobytes', True); - - -\echo 'use \\dv to list the views views'; -\dv - - -SELECT * FROM environment_view; -SELECT * FROM unit_view; - - -INSERT INTO public.machine_view ( - mac_address - , machine_name - , memory_bytes - , cpu_actual_frequency_hz - , os_name - , architecture_name - , kernel_name - , cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_hz - , cpu_frequency_min_hz - , cpu_l1d_cache_bytes - , cpu_l1i_cache_bytes - , cpu_l2_cache_bytes - , cpu_l3_cache_bytes - , machine_other_attributes -) VALUES ( - '08:00:2b:01:02:03' -- mac_address - , 'My-Machine-Name' -- machine_name - , 8589934592 -- memory_bytes - -- All (?) standard mac address formats are allowable: - -- https://www.postgresql.org/docs/11/datatype-net-types.html - , 2300000000 -- cpu_actual_frequency_Hz - , 'OSX' -- os_name - , 'x86_64' -- architecture_name - , '18.2.0' -- kernel - , 'Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz' -- cpu_model_name - , 2 -- cpu_core_count - , 4 -- cpu_thread_count - , 2300000000 -- cpu_frequency_max_Hz - , 2300000000 -- cpu_frequency_min_Hz - , 32768 -- cpu_l1d_cache_bytes - , 32768 -- cpu_l1i_cache_bytes - , 262144 -- cpu_l2_cache_bytes - , 4194304 -- cpu_l3_cache_bytes - , '{"example": "for machine_other_attributes"}'::jsonb -); - - -INSERT INTO public.full_benchmark_run_view ( - benchmark_name - , parameter_names - , benchmark_description - , benchmark_type - , units - , lessisbetter - , benchmark_version - -- datum - , parameter_values - , value - , git_commit_timestamp - , git_hash - , val_min - , val_q1 - , val_q3 - , val_max - , std_dev - , n_obs - , run_timestamp - , run_metadata - , run_notes - -- machine_view - , machine_name - , mac_address - , memory_bytes - , cpu_actual_frequency_hz - , os_name - , architecture_name - , kernel_name - , cpu_model_name - , cpu_core_count - , cpu_thread_count - , cpu_frequency_max_hz - , cpu_frequency_min_hz - , cpu_l1d_cache_bytes - , cpu_l1i_cache_bytes - , cpu_l2_cache_bytes - , cpu_l3_cache_bytes - , machine_other_attributes - -- environment_view - , benchmark_language - , language_implementation_version - , dependencies -) VALUES ( - 'Benchmark 3' - , '{"arg0"}'::text[] - , 'Third benchmark' - , 'Memory' - , 'kilobytes' - , TRUE - , '0' - -- datum - , '{"arg0": 10}'::jsonb - , 0.5 - , '2019-01-31 14:31:10 -0600' - , '8136c46d5c60fb' - , 0.5 - , 0.5 - , 0.5 - , 0.5 - , 0 - , 2 - , '2019-02-14 14:00:00 -0600' - , '{"ci_99": [2.7e-06, 3.1e-06]}'::jsonb - , 'Additional run_notes.' 
- -- machine_view - , 'My-Machine-Name' - , '09-00-2c-01-02-03' - , 8589934592 - , 2300000000 - , 'OSX' - , 'x86_64' - , '18.2.0' - , 'Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz' - , 2 - , 4 - , 2300000000 - , 2300000000 - , 32768 - , 32768 - , 262144 - , 4194304 - , '{"example": "for machine_other_attributes"}'::jsonb - -- environment_view - , 'Python' - , 'CPython 2.7' - , '{"six": "", "numpy": "1.15", "other_lib": "1.0"}'::jsonb -); - - --- Bulk load from CSV. First column is empty; serial "benchmark_run_id" will be assigned. ---\copy benchmark_run_view FROM 'examples/benchmark_run_example.csv' WITH (FORMAT csv, HEADER); - --- Load from JSON ---\set content `cat examples/benchmark_example.json` ---SELECT ingest_benchmark_view(:'content'::jsonb); - -INSERT INTO public.benchmark_view ( - benchmark_name - , parameter_names - , benchmark_description - , benchmark_type - , units - , lessisbetter - , benchmark_version - , benchmark_language - ) VALUES ( - 'Benchmark 1' - , '{"arg0", "arg1", "arg2"}'::text[] - , E'Description.\nNewlines are OK in a string escaped with leading "E".' - , 'Time' - , 'miliseconds' - , TRUE - , 'Hash of code or other way to identify distinct benchmark versions.' - , 'Python' - ), ( - 'Benchmark 2' - , '{"arg0", "arg1"}'::text[] - , 'Description 2.' - , 'Time' - , 'nanoseconds' - , TRUE - , 'version 0' - , 'Python' - ); - - -\x -SELECT * from benchmark_run_view; - -\x diff --git a/dev/benchmarking/examples/example_graphql_mutation.json b/dev/benchmarking/examples/example_graphql_mutation.json deleted file mode 100644 index fec5eed0a68a5..0000000000000 --- a/dev/benchmarking/examples/example_graphql_mutation.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "query": "mutation ($p: CreateProjectInput!){createProject(input:$p){project{id}}}", - "variables": { - "p": { - "project": { - "projectName": "Apache Arrow", - "projectUrl": "https://www.arrow.apache.org", - "repoUrl": "https://www.github.com/apache/arrow" - } - } - } -} diff --git a/dev/benchmarking/examples/graphql_query_environment_view.json b/dev/benchmarking/examples/graphql_query_environment_view.json deleted file mode 100644 index 78804fa918a23..0000000000000 --- a/dev/benchmarking/examples/graphql_query_environment_view.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "query": "{allEnvironmentViews(orderBy: [BENCHMARK_LANGUAGE_ASC, LANGUAGE_IMPLEMENTATION_VERSION_ASC, DEPENDENCIES_ASC]) {edges {node {environmentId, benchmarkLanguage, languageImplementationVersion, dependencies}}}}" -} diff --git a/dev/benchmarking/examples/machine.json b/dev/benchmarking/examples/machine.json deleted file mode 100644 index 2485e2bc1c4e2..0000000000000 --- a/dev/benchmarking/examples/machine.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "mac_address": "0a:00:2d:01:02:03", - "machine_name": "Yet-Another-Machine-Name", - "memory_bytes": 8589934592, - "cpu_actual_frequency_hz": 2300000000, - "os_name": "OSX", - "architecture_name": "x86_64", - "kernel_name": "18.2.0", - "cpu_model_name": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz", - "cpu_core_count": 2, - "cpu_thread_count": 4, - "cpu_frequency_max_hz": 2300000000, - "cpu_frequency_min_hz": 2300000000, - "cpu_l1d_cache_bytes": 32768, - "cpu_l1i_cache_bytes": 32768, - "cpu_l2_cache_bytes": 262144, - "cpu_l3_cache_bytes": 4194304, - "machine_other_attributes": {"just": "an example"}, - "gpu_information": "", - "gpu_part_number": "", - "gpu_product_name": "" -} diff --git a/dev/benchmarking/graphql_submit.sh b/dev/benchmarking/graphql_submit.sh deleted file mode 100755 index 2eaab9cdfa5d4..0000000000000 
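# A minimal sketch of posting one of the example payloads above by hand,
# using the same endpoint default that graphql_submit.sh uses below
# (localhost:5000/graphql); assumes the docker-compose services are running:
#   curl -X POST -H "Content-Type: application/json" \
#        -d @examples/example_graphql_mutation.json \
#        localhost:5000/graphql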
--- a/dev/benchmarking/graphql_submit.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -OPTIONS=("machine" "benchmarks" "runs") - -option=${1-help} -datafile=${2-machine.json} -uri=${3-localhost:5000/graphql} - -help() { - cat < ${1} -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - - -LICENSE -} - -warning() { - cat <<'WARNING' >> ${1} -.. WARNING -.. This is an auto-generated file. Please do not edit. - -.. To reproduce, please run :code:`./make_data_model_rst.sh`. -.. (This requires you have the -.. `psql client `_ -.. and have started the docker containers using -.. :code:`docker-compose up`). - -WARNING -} - -echo "Making ${OUTFILE}" - -license ${OUTFILE} -warning ${OUTFILE} - -PGPASSWORD=arrow \ - psql --tuples-only --username=arrow_web \ - --dbname=benchmark --port=5432 --host=localhost \ - --command="select public.documentation('${DOTFILE}');" \ - | sed "s/ *+$//" | sed "s/^ //" >> ${OUTFILE} diff --git a/dev/benchmarking/make_dotfile.sh b/dev/benchmarking/make_dotfile.sh deleted file mode 100755 index b86dc3eb3c6d3..0000000000000 --- a/dev/benchmarking/make_dotfile.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
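# Once make_dotfile.sh below has written data_model.dot, the diagram can be
# rendered with stock Graphviz (a sketch; any output format Graphviz
# supports will do):
#   dot -Tpng data_model.dot -o data_model.png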
-# - -set -e -OUTFILE=data_model.dot - -license() { - cat <<'LICENSE' > ${1} -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements.See the NOTICE file - distributed with this work for additional information - regarding copyright ownership.The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License.You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied.See the License for the - specific language governing permissions and limitations - under the License. -*/ - -LICENSE -} - -warning() { - cat <<'WARNING' >> ${1} -/* - WARNING - This is an auto-generated file. Please do not edit. - - To reproduce, please run :code:`./make_data_model_rst.sh`. - (This requires you have the - `psql client `_ - and have started the docker containers using - :code:`docker-compose up`). -*/ -WARNING -} - -echo "Making ${OUTFILE}" - -license ${OUTFILE} -warning ${OUTFILE} - -PGPASSWORD=arrow \ - psql --tuples-only --username=arrow_web \ - --dbname=benchmark --port=5432 --host=localhost \ - --command="select public.documentation_dotfile();" \ - | sed "s/ *+$//" | sed "s/^ //" >> ${OUTFILE} diff --git a/dev/benchmarking/make_machine_json.sh b/dev/benchmarking/make_machine_json.sh deleted file mode 100755 index 09bf0ea2d15dc..0000000000000 --- a/dev/benchmarking/make_machine_json.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -set -e -OUTFILE=machine.json - -echo "Making ${OUTFILE}" -echo "** NOTE: This command fails on everything but OSX right now. **" -echo "* also, the intent is to make this script not suck, just not now. *" -echo "Please type GPU details here (or manually modify ${OUTFILE} later)." 
-read -p "GPU information string (or ): " gpu_information -read -p "GPU part number (or ): " gpu_part_number -read -p "GPU product name (or ): " gpu_product_name - - -cat < ${OUTFILE} -{ - "mac_address": "$(ifconfig en1 | awk '/ether/{print $2}')", - "machine_name": "$(uname -n)", - "memory_bytes": $(sysctl -n hw.memsize), - "cpu_actual_frequency_hz": $(sysctl -n hw.cpufrequency), - "os_name": "$(uname -s)", - "architecture_name": "$(uname -m)", - "kernel_name": "$(uname -r)", - "cpu_model_name": "$(sysctl -n machdep.cpu.brand_string)", - "cpu_core_count": $(sysctl -n hw.physicalcpu), - "cpu_thread_count": $(sysctl -n hw.logicalcpu), - "cpu_frequency_max_hz": $(sysctl -n hw.cpufrequency_max), - "cpu_frequency_min_hz": $(sysctl -n hw.cpufrequency_min), - "cpu_l1d_cache_bytes": $(sysctl -n hw.l1dcachesize), - "cpu_l1i_cache_bytes": $(sysctl -n hw.l1icachesize), - "cpu_l2_cache_bytes": $(sysctl -n hw.l2cachesize), - "cpu_l3_cache_bytes": $(sysctl -n hw.l3cachesize), - "gpu_information": "${gpu_information}", - "gpu_part_number": "${gpu_part_number}", - "gpu_product_name": "${gpu_product_name}" -} -MACHINE_JSON - -echo "Machine details saved in ${OUTFILE}" diff --git a/dev/build-ballista-docker-arm64.sh b/dev/build-ballista-docker-arm64.sh deleted file mode 100755 index 5d951773ada41..0000000000000 --- a/dev/build-ballista-docker-arm64.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if [ -z "${DOCKER_REPO}" ]; then - echo "DOCKER_REPO env var must be set" - exit -1 -fi -cargo install cross -cross build --release --target aarch64-unknown-linux-gnu -rm -rf temp-ballista-docker -mkdir temp-ballista-docker -cp target/aarch64-unknown-linux-gnu/release/ballista-executor temp-ballista-docker -cp target/aarch64-unknown-linux-gnu/release/ballista-scheduler temp-ballista-docker -cp target/aarch64-unknown-linux-gnu/release/tpch temp-ballista-docker -mkdir temp-ballista-docker/queries/ -cp benchmarks/queries/*.sql temp-ballista-docker/queries/ -docker buildx build --push -t $DOCKER_REPO/ballista-arm64 --platform=linux/arm64 -f dev/docker/ballista-arm64.Dockerfile temp-ballista-docker -rm -rf temp-ballista-docker \ No newline at end of file diff --git a/dev/build-ballista-docker.sh b/dev/build-ballista-docker.sh deleted file mode 100755 index bc028da9e716a..0000000000000 --- a/dev/build-ballista-docker.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -. ./dev/build-set-env.sh -docker build -t ballista-base:$BALLISTA_VERSION -f dev/docker/ballista-base.dockerfile . -docker build -t ballista:$BALLISTA_VERSION -f dev/docker/ballista.dockerfile . diff --git a/dev/build-set-env.sh b/dev/build-set-env.sh deleted file mode 100755 index 3eb29e7ce1443..0000000000000 --- a/dev/build-set-env.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -export BALLISTA_VERSION=$(awk -F'[ ="]+' '$1 == "version" { print $2 }' ballista/rust/core/Cargo.toml) diff --git a/dev/build-ui.sh b/dev/build-ui.sh deleted file mode 100755 index 070839702500e..0000000000000 --- a/dev/build-ui.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -. ./dev/build-set-env.sh -docker build -t ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ballista-scheduler-ui.dockerfile ballista/ui/scheduler diff --git a/dev/merge.conf.sample b/dev/merge.conf.sample deleted file mode 100644 index c71b211614daf..0000000000000 --- a/dev/merge.conf.sample +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
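# (For reference, the BALLISTA_VERSION extraction in dev/build-set-env.sh
#  above splits each Cargo.toml line on runs of spaces, '=' and '"', so an
#  illustrative line such as `version = "0.5.0-SNAPSHOT"` yields
#  $1 == "version" and $2 == "0.5.0-SNAPSHOT".)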
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Configuration for the merge_arrow_pr.py tool -# Install a copy of this file at ~/.config/arrow/merge.conf - -[jira] -# issues.apache.org JIRA credentials. Sadly, the jira instance doesn't offer -# token credentials. Ensure that the file is properly protected. -username=johnsmith -password=123456 diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py deleted file mode 100755 index 373ceb8e20f3f..0000000000000 --- a/dev/merge_arrow_pr.py +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Utility for creating well-formed pull request merges and pushing them to -# Apache. -# usage: ./merge_arrow_pr.py (see config env vars below) -# -# This utility assumes you already have a local Arrow git clone and that you -# have added remotes corresponding to both (i) the GitHub Apache Arrow mirror -# and (ii) the apache git repo. -# -# There are several pieces of authorization possibly needed via environment -# variables -# -# APACHE_JIRA_USERNAME: your Apache JIRA id -# APACHE_JIRA_PASSWORD: your Apache JIRA password -# ARROW_GITHUB_API_TOKEN: a GitHub API token to use for API requests (to avoid -# rate limiting) - -import configparser -import os -import pprint -import re -import subprocess -import sys -import requests -import getpass - -from six.moves import input -import six - -try: - import jira.client - import jira.exceptions -except ImportError: - print("Could not find jira library. 
" - "Run 'sudo pip install jira' to install.") - print("Exiting without trying to close the associated JIRA.") - sys.exit(1) - -# Remote name which points to the GitHub site -PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache") - -# For testing to avoid accidentally pushing to apache -DEBUG = bool(int(os.environ.get("DEBUG", 0))) - - -if DEBUG: - print("**************** DEBUGGING ****************") - - -# Prefix added to temporary branches -BRANCH_PREFIX = "PR_TOOL" -JIRA_API_BASE = "https://issues.apache.org/jira" - - -def get_json(url, headers=None): - req = requests.get(url, headers=headers) - return req.json() - - -def run_cmd(cmd): - if isinstance(cmd, six.string_types): - cmd = cmd.split(' ') - - try: - output = subprocess.check_output(cmd) - except subprocess.CalledProcessError as e: - # this avoids hiding the stdout / stderr of failed processes - print('Command failed: %s' % cmd) - print('With output:') - print('--------------') - print(e.output) - print('--------------') - raise e - - if isinstance(output, six.binary_type): - output = output.decode('utf-8') - return output - - -original_head = run_cmd("git rev-parse HEAD")[:8] - - -def clean_up(): - print("Restoring head pointer to %s" % original_head) - run_cmd("git checkout %s" % original_head) - - branches = run_cmd("git branch").replace(" ", "").split("\n") - - for branch in [x for x in branches - if x.startswith(BRANCH_PREFIX)]: - print("Deleting local branch %s" % branch) - run_cmd("git branch -D %s" % branch) - - -_REGEX_CI_DIRECTIVE = re.compile(r'\[[^\]]*\]') - - -def strip_ci_directives(commit_message): - # Remove things like '[force ci]', '[skip appveyor]' from the assembled - # commit message - return _REGEX_CI_DIRECTIVE.sub('', commit_message) - - -def fix_version_from_branch(branch, versions): - # Note: Assumes this is a sorted (newest->oldest) list of un-released - # versions - if branch == "master": - return versions[-1] - else: - branch_ver = branch.replace("branch-", "") - return [x for x in versions if x.name.startswith(branch_ver)][-1] - - -# We can merge both ARROW and PARQUET patchesa -SUPPORTED_PROJECTS = ['ARROW', 'PARQUET'] -PR_TITLE_REGEXEN = [(project, re.compile(r'^(' + project + r'-[0-9]+)\b.*$')) - for project in SUPPORTED_PROJECTS] - - -class JiraIssue(object): - - def __init__(self, jira_con, jira_id, project, cmd): - self.jira_con = jira_con - self.jira_id = jira_id - self.project = project - self.cmd = cmd - - try: - self.issue = jira_con.issue(jira_id) - except Exception as e: - self.cmd.fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) - - @property - def current_fix_versions(self): - return self.issue.fields.fixVersions - - def get_candidate_fix_versions(self, merge_branches=('master',)): - # Only suggest versions starting with a number, like 0.x but not JS-0.x - all_versions = self.jira_con.project_versions(self.project) - unreleased_versions = [x for x in all_versions - if not x.raw['released']] - - unreleased_versions = sorted(unreleased_versions, - key=lambda x: x.name, reverse=True) - - mainline_versions = self._filter_mainline_versions(unreleased_versions) - - mainline_non_patch_versions = [] - for v in mainline_versions: - (major, minor, patch) = v.name.split(".") - if patch == "0": - mainline_non_patch_versions.append(v) - - if len(mainline_versions) > len(mainline_non_patch_versions): - # If there is a non-patch release, suggest that instead - mainline_versions = mainline_non_patch_versions - - default_fix_versions = [ - fix_version_from_branch(x, mainline_versions).name - 
for x in merge_branches] - - return all_versions, default_fix_versions - - def _filter_mainline_versions(self, versions): - if self.project == 'PARQUET': - mainline_regex = re.compile(r'cpp-\d.*') - else: - mainline_regex = re.compile(r'\d.*') - - return [x for x in versions if mainline_regex.match(x.name)] - - def resolve(self, fix_versions, comment): - fields = self.issue.fields - cur_status = fields.status.name - - if cur_status == "Resolved" or cur_status == "Closed": - self.cmd.fail("JIRA issue %s already has status '%s'" - % (self.jira_id, cur_status)) - - if DEBUG: - print("JIRA issue %s untouched" % (self.jira_id)) - return - - resolve = [x for x in self.jira_con.transitions(self.jira_id) - if x['name'] == "Resolve Issue"][0] - - # ARROW-6915: do not overwrite existing fix versions corresponding to - # point releases - fix_versions = list(fix_versions) - fix_version_names = set(x['name'] for x in fix_versions) - for version in self.current_fix_versions: - major, minor, patch = version.name.split('.') - if patch != '0' and version.name not in fix_version_names: - fix_versions.append(version.raw) - - self.jira_con.transition_issue(self.jira_id, resolve["id"], - comment=comment, - fixVersions=fix_versions) - - print("Successfully resolved %s!" % (self.jira_id)) - - self.issue = self.jira_con.issue(self.jira_id) - self.show() - - def show(self): - fields = self.issue.fields - print(format_jira_output(self.jira_id, fields.status.name, - fields.summary, fields.assignee, - fields.components)) - - -def format_jira_output(jira_id, status, summary, assignee, components): - if assignee is None: - assignee = "NOT ASSIGNED!!!" - else: - assignee = assignee.displayName - - if len(components) == 0: - components = 'NO COMPONENTS!!!' - else: - components = ', '.join((x.name for x in components)) - - return """=== JIRA {} === -Summary\t\t{} -Assignee\t{} -Components\t{} -Status\t\t{} -URL\t\t{}/{}""".format(jira_id, summary, assignee, components, status, - '/'.join((JIRA_API_BASE, 'browse')), jira_id) - - -class GitHubAPI(object): - - def __init__(self, project_name): - self.github_api = ("https://api.github.com/repos/apache/{0}" - .format(project_name)) - - token = os.environ.get('ARROW_GITHUB_API_TOKEN', None) - if token: - self.headers = {'Authorization': 'token {0}'.format(token)} - else: - self.headers = None - - def get_pr_data(self, number): - return get_json("%s/pulls/%s" % (self.github_api, number), - headers=self.headers) - - -class CommandInput(object): - """ - Interface to input(...) 
to enable unit test mocks to be created - """ - - def fail(self, msg): - clean_up() - raise Exception(msg) - - def prompt(self, prompt): - return input(prompt) - - def getpass(self, prompt): - return getpass.getpass(prompt) - - def continue_maybe(self, prompt): - while True: - result = input("\n%s (y/n): " % prompt) - if result.lower() == "y": - return - elif result.lower() == "n": - self.fail("Okay, exiting") - else: - prompt = "Please input 'y' or 'n'" - - -class PullRequest(object): - - def __init__(self, cmd, github_api, git_remote, jira_con, number): - self.cmd = cmd - self.git_remote = git_remote - self.con = jira_con - self.number = number - self._pr_data = github_api.get_pr_data(number) - try: - self.url = self._pr_data["url"] - self.title = self._pr_data["title"] - self.body = self._pr_data["body"] - self.target_ref = self._pr_data["base"]["ref"] - self.user_login = self._pr_data["user"]["login"] - self.base_ref = self._pr_data["head"]["ref"] - except KeyError: - pprint.pprint(self._pr_data) - raise - self.description = "%s/%s" % (self.user_login, self.base_ref) - - self.jira_issue = self._get_jira() - - def show(self): - print("\n=== Pull Request #%s ===" % self.number) - print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" - % (self.title, self.description, self.target_ref, self.url)) - if self.jira_issue is not None: - self.jira_issue.show() - else: - print("Minor PR. Please ensure it meets guidelines for minor.\n") - - @property - def is_merged(self): - return bool(self._pr_data["merged"]) - - @property - def is_mergeable(self): - return bool(self._pr_data["mergeable"]) - - def _get_jira(self): - jira_id = None - for project, regex in PR_TITLE_REGEXEN: - m = regex.search(self.title) - if m: - jira_id = m.group(1) - break - - if jira_id is None and not self.title.startswith("MINOR:"): - options = ' or '.join('{0}-XXX'.format(project) - for project in SUPPORTED_PROJECTS) - self.cmd.fail("PR title should be prefixed by a jira id " - "{0}, but found {1}".format(options, self.title)) - - return JiraIssue(self.con, jira_id, project, self.cmd) - - def merge(self): - """ - merge the requested PR and return the merge hash - """ - pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, self.number) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, - self.number, - self.target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (self.git_remote, - self.number, - pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (self.git_remote, self.target_ref, - target_branch_name)) - run_cmd("git checkout %s" % target_branch_name) - - had_conflicts = False - try: - run_cmd(['git', 'merge', pr_branch_name, '--ff', '--squash']) - except Exception as e: - msg = ("Error merging: %s\nWould you like to " - "manually fix-up this merge?" % e) - self.cmd.continue_maybe(msg) - msg = ("Okay, please fix any conflicts and 'git add' " - "conflicting files... 
Finished?") - self.cmd.continue_maybe(msg) - had_conflicts = True - - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), - reverse=True) - - for i, author in enumerate(distinct_authors): - print("Author {}: {}".format(i + 1, author)) - - if len(distinct_authors) > 1: - primary_author, distinct_authors = get_primary_author( - self.cmd, distinct_authors) - else: - # If there is only one author, do not prompt for a lead author - primary_author = distinct_authors[0] - - merge_message_flags = [] - - merge_message_flags += ["-m", self.title] - if self.body is not None: - merge_message_flags += ["-m", self.body] - - committer_name = run_cmd("git config --get user.name").strip() - committer_email = run_cmd("git config --get user.email").strip() - - authors = ("Authored-by:" if len(distinct_authors) == 1 - else "Lead-authored-by:") - authors += " %s" % (distinct_authors.pop(0)) - if len(distinct_authors) > 0: - authors += "\n" + "\n".join(["Co-authored-by: %s" % a - for a in distinct_authors]) - authors += "\n" + "Signed-off-by: %s <%s>" % (committer_name, - committer_email) - - if had_conflicts: - committer_name = run_cmd("git config --get user.name").strip() - committer_email = run_cmd("git config --get user.email").strip() - message = ("This patch had conflicts when merged, " - "resolved by\nCommitter: %s <%s>" % - (committer_name, committer_email)) - merge_message_flags += ["-m", message] - - # The string "Closes #%s" string is required for GitHub to correctly - # close the PR - merge_message_flags += [ - "-m", - "Closes #%s from %s" - % (self.number, self.description)] - merge_message_flags += ["-m", authors] - - if DEBUG: - print("\n".join(merge_message_flags)) - - run_cmd(['git', 'commit', - '--no-verify', # do not run commit hooks - '--author="%s"' % primary_author] + - merge_message_flags) - - self.cmd.continue_maybe("Merge complete (local ref %s). Push to %s?" - % (target_branch_name, self.git_remote)) - - try: - push_cmd = ('git push %s %s:%s' % (self.git_remote, - target_branch_name, - self.target_ref)) - if DEBUG: - print(push_cmd) - else: - run_cmd(push_cmd) - except Exception as e: - clean_up() - self.cmd.fail("Exception while pushing: %s" % e) - - merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] - clean_up() - print("Pull request #%s merged!" % self.number) - print("Merge hash: %s" % merge_hash) - return merge_hash - - -def get_primary_author(cmd, distinct_authors): - author_pat = re.compile(r'(.*) <(.*)>') - - while True: - primary_author = cmd.prompt( - "Enter primary author in the format of " - "\"name \" [%s]: " % distinct_authors[0]) - - if primary_author == "": - return distinct_authors[0], distinct_authors - - if author_pat.match(primary_author): - break - print('Bad author "{}", please try again'.format(primary_author)) - - # When primary author is specified manually, de-dup it from - # author list and put it at the head of author list. 
- distinct_authors = [x for x in distinct_authors - if x != primary_author] - distinct_authors = [primary_author] + distinct_authors - return primary_author, distinct_authors - - -def prompt_for_fix_version(cmd, jira_issue): - (all_versions, - default_fix_versions) = jira_issue.get_candidate_fix_versions() - - default_fix_versions = ",".join(default_fix_versions) - - issue_fix_versions = cmd.prompt("Enter comma-separated " - "fix version(s) [%s]: " - % default_fix_versions) - if issue_fix_versions == "": - issue_fix_versions = default_fix_versions - issue_fix_versions = issue_fix_versions.replace(" ", "").split(",") - - def get_version_json(version_str): - return [x for x in all_versions if x.name == version_str][0].raw - - return [get_version_json(v) for v in issue_fix_versions] - - -CONFIG_FILE = "~/.config/arrow/merge.conf" - - -def load_configuration(): - config = configparser.ConfigParser() - config.read(os.path.expanduser(CONFIG_FILE)) - return config - - -def get_credentials(cmd): - username, password = None, None - - config = load_configuration() - if "jira" in config.sections(): - username = config["jira"].get("username") - password = config["jira"].get("password") - - # Fallback to environment variables - if not username: - username = os.environ.get("APACHE_JIRA_USERNAME") - - if not password: - password = os.environ.get("APACHE_JIRA_PASSWORD") - - # Fallback to user tty prompt - if not username: - username = cmd.prompt("Env APACHE_JIRA_USERNAME not set, " - "please enter your JIRA username:") - - if not password: - password = cmd.getpass("Env APACHE_JIRA_PASSWORD not set, " - "please enter your JIRA password:") - - return (username, password) - - -def connect_jira(cmd): - try: - return jira.client.JIRA(options={'server': JIRA_API_BASE}, - basic_auth=get_credentials(cmd)) - except jira.exceptions.JIRAError as e: - if "CAPTCHA_CHALLENGE" in e.text: - print("") - print("It looks like you need to answer a captcha challenge for " - "this account (probably due to a login attempt with an " - "incorrect password). Please log in at " - "https://issues.apache.org/jira and complete the captcha " - "before running this tool again.") - print("Exiting.") - sys.exit(1) - raise e - - -def get_pr_num(): - if len(sys.argv) == 2: - return sys.argv[1] - - return input("Which pull request would you like to merge? (e.g. 34): ") - - -def cli(): - # Location of your Arrow git clone - ARROW_HOME = os.path.abspath(os.path.dirname(__file__)) - PROJECT_NAME = os.environ.get('ARROW_PROJECT_NAME') or 'arrow' - print("ARROW_HOME = " + ARROW_HOME) - print("PROJECT_NAME = " + PROJECT_NAME) - - cmd = CommandInput() - - pr_num = get_pr_num() - - os.chdir(ARROW_HOME) - - github_api = GitHubAPI(PROJECT_NAME) - - jira_con = connect_jira(cmd) - pr = PullRequest(cmd, github_api, PR_REMOTE_NAME, jira_con, pr_num) - - if pr.is_merged: - print("Pull request %s has already been merged") - sys.exit(0) - - if not pr.is_mergeable: - msg = ("Pull request %s is not mergeable in its current form.\n" - % pr_num + "Continue? (experts only!)") - cmd.continue_maybe(msg) - - pr.show() - - cmd.continue_maybe("Proceed with merging pull request #%s?" % pr_num) - - # merged hash not used - pr.merge() - - if pr.jira_issue is None: - print("Minor PR. 
No JIRA issue to update.\n") - return - - cmd.continue_maybe("Would you like to update the associated JIRA?") - jira_comment = ( - "Issue resolved by pull request %s\n[%s/%s]" - % (pr_num, - "https://github.com/apache/" + PROJECT_NAME + "/pull", - pr_num)) - - fix_versions_json = prompt_for_fix_version(cmd, pr.jira_issue) - pr.jira_issue.resolve(fix_versions_json, jira_comment) - - -if __name__ == '__main__': - try: - cli() - except Exception: - raise diff --git a/dev/merge_arrow_pr.sh b/dev/merge_arrow_pr.sh deleted file mode 100755 index 147f6c4bc0d11..0000000000000 --- a/dev/merge_arrow_pr.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Wrapper script that automatically creates a Python virtual environment -# and runs merge_arrow_pr.py inside it. - -set -e - -PYTHON=$(which python3) -PYVER=$($PYTHON -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") - -GIT_ROOT=$(git rev-parse --show-toplevel) -ENV_DIR=$GIT_ROOT/dev/.venv$PYVER - -ENV_PYTHON=$ENV_DIR/bin/python3 -ENV_PIP="$ENV_PYTHON -m pip --no-input" - -check_venv() { - [ -x $ENV_PYTHON ] || { - echo "Virtual environment broken: $ENV_PYTHON not an executable" - exit 1 - } -} - -create_venv() { - echo "" - echo "Creating Python virtual environment in $ENV_DIR ..." - echo "" - $PYTHON -m venv $ENV_DIR - $ENV_PIP install -q -r $GIT_ROOT/dev/requirements_merge_arrow_pr.txt || { - echo "Failed to setup virtual environment" - echo "Please delete directory '$ENV_DIR' and try again" - exit $? - } -} - -[ -d $ENV_DIR ] || create_venv -check_venv - -$ENV_PYTHON $GIT_ROOT/dev/merge_arrow_pr.py "$@" diff --git a/dev/release/.env.example b/dev/release/.env.example deleted file mode 100644 index 0126cdd3f2981..0000000000000 --- a/dev/release/.env.example +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# The GPG key ID to sign artifacts. 
The GPG key ID must be registered -# to both of the followings: -# -# * https://dist.apache.org/repos/dist/dev/arrow/KEYS -# * https://dist.apache.org/repos/dist/release/arrow/KEYS -# -# See these files how to import your GPG key ID to these files. -# -# You must set this. -#GPG_KEY_ID=08D3564B7C6A9CAFBFF6A66791D18FCF079F8007 - -# The Bintray repository where artifacts are uploaded. -# You can use your Bintray repository such as kou/arrow for test. -BINTRAY_REPOSITORY=apache/arrow - -# The Bintray repository where released artifacts exist. -# The released artifacts are used to build APT/Yum repository. -# The Bintray repository isn't changed. (Download only. No upload.) -# -# Normally, you don't need to change this. -SOURCE_BINTRAY_REPOSITORY=apache/arrow - -# The Bintray user name to upload artifacts to Bintray. -# -# You must set this. -#BINTRAY_USER=kou - -# The Bintray API key to upload artifacts to Bintray. -# -# You must set this. -#BINTRAY_API_KEY=secret diff --git a/dev/release/.gitignore b/dev/release/.gitignore deleted file mode 100644 index f3d708a6a2ccf..0000000000000 --- a/dev/release/.gitignore +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -/.env -/binary/id_rsa -/binary/id_rsa.pub -/binary/tmp/ diff --git a/dev/release/01-prepare-test.rb b/dev/release/01-prepare-test.rb deleted file mode 100644 index b316ad20a9d8d..0000000000000 --- a/dev/release/01-prepare-test.rb +++ /dev/null @@ -1,665 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -class PrepareTest < Test::Unit::TestCase - include GitRunnable - include VersionDetectable - - def setup - @current_commit = git_current_commit - detect_versions - - top_dir = Pathname(__dir__).parent.parent - @original_git_repository = top_dir + ".git" - Dir.mktmpdir do |dir| - @test_git_repository = Pathname(dir) + "arrow" - git("clone", @original_git_repository.to_s, @test_git_repository.to_s) - Dir.chdir(@test_git_repository) do - @tag_name = "apache-arrow-#{@release_version}" - @release_branch = "release-#{@release_version}-rc0" - @script = "dev/release/01-prepare.sh" - git("checkout", "-b", @release_branch, @current_commit) - yield - end - FileUtils.rm_rf(@test_git_repository) - end - end - - def omit_on_release_branch - omit("Not for release branch") if on_release_branch? - end - - def prepare(*targets) - if targets.last.is_a?(Hash) - additional_env = targets.pop - else - additional_env = {} - end - env = {"PREPARE_DEFAULT" => "0"} - targets.each do |target| - env["PREPARE_#{target}"] = "1" - end - env = env.merge(additional_env) - sh(env, @script, @release_version, @next_version, "0") - end - - def parse_patch(patch) - diffs = [] - in_hunk = false - patch.each_line do |line| - case line - when /\A--- a\// - path = $POSTMATCH.chomp - diffs << {path: path, hunks: []} - in_hunk = false - when /\A@@/ - in_hunk = true - diffs.last[:hunks] << [] - when /\A[-+]/ - next unless in_hunk - diffs.last[:hunks].last << line.chomp - end - end - diffs.sort_by do |diff| - diff[:path] - end - end - - def test_linux_packages - user = "Arrow Developers" - email = "dev@arrow.apache.org" - prepare("LINUX_PACKAGES", - "DEBFULLNAME" => user, - "DEBEMAIL" => email) - changes = parse_patch(git("log", "-n", "1", "-p")) - sampled_changes = changes.collect do |change| - { - path: change[:path], - sampled_hunks: change[:hunks].collect(&:first), - # sampled_hunks: change[:hunks], - } - end - base_dir = "dev/tasks/linux-packages" - today = Time.now.utc.strftime("%a %b %d %Y") - expected_changes = [ - { - path: "#{base_dir}/apache-arrow-apt-source/debian/changelog", - sampled_hunks: [ - "+apache-arrow-apt-source (#{@release_version}-1) " + - "unstable; urgency=low", - ], - }, - { - path: - "#{base_dir}/apache-arrow-release/yum/apache-arrow-release.spec.in", - sampled_hunks: [ - "+* #{today} #{user} <#{email}> - #{@release_version}-1", - ], - }, - { - path: "#{base_dir}/apache-arrow/debian/changelog", - sampled_hunks: [ - "+apache-arrow (#{@release_version}-1) unstable; urgency=low", - ], - }, - { - path: "#{base_dir}/apache-arrow/yum/arrow.spec.in", - sampled_hunks: [ - "+* #{today} #{user} <#{email}> - #{@release_version}-1", - ], - }, - ] - assert_equal(expected_changes, sampled_changes) - end - - def test_version_pre_tag - omit_on_release_branch - prepare("VERSION_PRE_TAG") - assert_equal([ - { - path: "c_glib/meson.build", - hunks: [ - ["-version = '#{@snapshot_version}'", - "+version = '#{@release_version}'"], - ], - }, - { - path: "ci/scripts/PKGBUILD", - hunks: [ - ["-pkgver=#{@previous_version}.9000", - "+pkgver=#{@release_version}"], - ], - }, - { - path: "cpp/CMakeLists.txt", - hunks: [ - ["-set(ARROW_VERSION \"#{@snapshot_version}\")", - "+set(ARROW_VERSION \"#{@release_version}\")"], - ], - }, - { - path: "cpp/vcpkg.json", - hunks: [ - ["- \"version-string\": \"#{@snapshot_version}\",", - "+ \"version-string\": \"#{@release_version}\","], - ], - }, - { - path: "csharp/Directory.Build.props", - hunks: [ - ["- #{@snapshot_version}", - "+ #{@release_version}"], - ], - }, - { - path: 
"dev/tasks/homebrew-formulae/apache-arrow.rb", - hunks: [ - ["- url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@snapshot_version}/apache-arrow-#{@snapshot_version}.tar.gz\"", - "+ url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@release_version}/apache-arrow-#{@release_version}.tar.gz\""], - ], - }, - { - path: "dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb", - hunks: [ - ["- url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@previous_version}.9000/apache-arrow-#{@previous_version}.9000.tar.gz\"", - "+ url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@release_version}/apache-arrow-#{@release_version}.tar.gz\""], - ], - }, - { - path: "js/package.json", - hunks: [ - ["- \"version\": \"#{@snapshot_version}\"", - "+ \"version\": \"#{@release_version}\""] - ], - }, - { - path: "matlab/CMakeLists.txt", - hunks: [ - ["-set(MLARROW_VERSION \"#{@snapshot_version}\")", - "+set(MLARROW_VERSION \"#{@release_version}\")"], - ], - }, - { - path: "python/setup.py", - hunks: [ - ["-default_version = '#{@snapshot_version}'", - "+default_version = '#{@release_version}'"], - ], - }, - { - path: "r/DESCRIPTION", - hunks: [ - ["-Version: #{@previous_version}.9000", - "+Version: #{@release_version}"], - ], - }, - { - path: "r/NEWS.md", - hunks: [ - ["-\# arrow #{@previous_version}.9000", - "+\# arrow #{@release_version}"], - ], - }, - { - path: "ruby/red-arrow-cuda/lib/arrow-cuda/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "ruby/red-arrow-dataset/lib/arrow-dataset/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "ruby/red-arrow/lib/arrow/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "ruby/red-gandiva/lib/gandiva/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "ruby/red-parquet/lib/parquet/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "ruby/red-plasma/lib/plasma/version.rb", - hunks: [ - ["- VERSION = \"#{@snapshot_version}\"", - "+ VERSION = \"#{@release_version}\""], - ], - }, - { - path: "rust/arrow-flight/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/arrow-pyarrow-integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/arrow/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/benchmarks/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/datafusion-examples/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/datafusion/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = 
\"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\", features = [\"prettyprint\"] }", - "-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\", features = [\"arrow\"] }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\", features = [\"prettyprint\"] }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\", features = [\"arrow\"] }"], - ], - }, - { - path: "rust/datafusion/README.md", - hunks: [ - ["-datafusion = \"#{@snapshot_version}\"", - "+datafusion = \"#{@release_version}\""], - ], - }, - { - path: "rust/integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/parquet/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\", optional = true }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\", optional = true }"], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"] - ], - }, - { - path: "rust/parquet/README.md", - hunks: [ - ["-parquet = \"#{@snapshot_version}\"", - "+parquet = \"#{@release_version}\""], - ["-See [crate documentation](https://docs.rs/crate/parquet/#{@snapshot_version}) on available API.", - "+See [crate documentation](https://docs.rs/crate/parquet/#{@release_version}) on available API."], - ], - }, - { - path: "rust/parquet_derive/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/parquet_derive/README.md", - hunks: [ - ["-parquet = \"#{@snapshot_version}\"", - "-parquet_derive = \"#{@snapshot_version}\"", - "+parquet = \"#{@release_version}\"", - "+parquet_derive = \"#{@release_version}\""], - ], - }, - { - path: "rust/parquet_derive_test/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", - "-parquet_derive = { path = \"../parquet_derive\", version = \"#{@snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "+parquet_derive = { path = \"../parquet_derive\", version = \"#{@release_version}\" }"], - ], - }, - ], - parse_patch(git("log", "-n", "1", "-p"))) - end - - def test_version_post_tag - if on_release_branch? 
- prepare("VERSION_POST_TAG") - else - prepare("VERSION_PRE_TAG", - "VERSION_POST_TAG") - end - assert_equal([ - { - path: "c_glib/meson.build", - hunks: [ - ["-version = '#{@release_version}'", - "+version = '#{@next_snapshot_version}'"], - ], - }, - { - path: "ci/scripts/PKGBUILD", - hunks: [ - ["-pkgver=#{@release_version}", - "+pkgver=#{@release_version}.9000"], - ], - }, - { - path: "cpp/CMakeLists.txt", - hunks: [ - ["-set(ARROW_VERSION \"#{@release_version}\")", - "+set(ARROW_VERSION \"#{@next_snapshot_version}\")"], - ], - }, - { - path: "cpp/vcpkg.json", - hunks: [ - ["- \"version-string\": \"#{@release_version}\",", - "+ \"version-string\": \"#{@next_snapshot_version}\","], - ], - }, - { - path: "csharp/Directory.Build.props", - hunks: [ - ["- #{@release_version}", - "+ #{@next_snapshot_version}"], - ], - }, - { - path: "dev/tasks/homebrew-formulae/apache-arrow.rb", - hunks: [ - ["- url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@release_version}/apache-arrow-#{@release_version}.tar.gz\"", - "+ url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@next_snapshot_version}/apache-arrow-#{@next_snapshot_version}.tar.gz\""], - ], - }, - { - path: "dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb", - hunks: [ - ["- url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@release_version}/apache-arrow-#{@release_version}.tar.gz\"", - "+ url \"https://www.apache.org/dyn/closer.lua?path=arrow/arrow-#{@release_version}.9000/apache-arrow-#{@release_version}.9000.tar.gz\""], - ], - }, - { - path: "js/package.json", - hunks: [ - ["- \"version\": \"#{@release_version}\"", - "+ \"version\": \"#{@next_snapshot_version}\""], - ], - }, - { - path: "matlab/CMakeLists.txt", - hunks: [ - ["-set(MLARROW_VERSION \"#{@release_version}\")", - "+set(MLARROW_VERSION \"#{@next_snapshot_version}\")"], - ], - }, - { - path: "python/setup.py", - hunks: [ - ["-default_version = '#{@release_version}'", - "+default_version = '#{@next_snapshot_version}'"], - ], - }, - { - path: "r/DESCRIPTION", - hunks: [ - ["-Version: #{@release_version}", - "+Version: #{@release_version}.9000"], - ], - }, - { - path: "r/NEWS.md", - # Note that these are additions only, no replacement - hunks: [ - ["+# arrow #{@release_version}.9000", - "+"], - ], - }, - { - path: "ruby/red-arrow-cuda/lib/arrow-cuda/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "ruby/red-arrow-dataset/lib/arrow-dataset/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "ruby/red-arrow/lib/arrow/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "ruby/red-gandiva/lib/gandiva/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "ruby/red-parquet/lib/parquet/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "ruby/red-plasma/lib/plasma/version.rb", - hunks: [ - ["- VERSION = \"#{@release_version}\"", - "+ VERSION = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/arrow-flight/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", 
version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/arrow-pyarrow-integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/arrow/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/benchmarks/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/datafusion-examples/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/datafusion/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\", features = [\"prettyprint\"] }", - "-parquet = { path = \"../parquet\", version = \"#{@release_version}\", features = [\"arrow\"] }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\", features = [\"prettyprint\"] }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\", features = [\"arrow\"] }"], - ], - }, - { - path: "rust/datafusion/README.md", - hunks: [ - ["-datafusion = \"#{@release_version}\"", - "+datafusion = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/parquet/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\", optional = true }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\", optional = true }"], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }"] - ], - }, - { - path: "rust/parquet/README.md", - hunks: [ - ["-parquet = \"#{@release_version}\"", - "+parquet = \"#{@next_snapshot_version}\""], - ["-See [crate documentation](https://docs.rs/crate/parquet/#{@release_version}) on available API.", - "+See [crate documentation](https://docs.rs/crate/parquet/#{@next_snapshot_version}) on available API."], - ], - }, - { - path: "rust/parquet_derive/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/parquet_derive/README.md", - hunks: [ - ["-parquet = \"#{@release_version}\"", - "-parquet_derive = \"#{@release_version}\"", - "+parquet = \"#{@next_snapshot_version}\"", - "+parquet_derive = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/parquet_derive_test/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "-parquet_derive = { path = \"../parquet_derive\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }", - 
"+parquet_derive = { path = \"../parquet_derive\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - ], - parse_patch(git("log", "-n", "1", "-p"))) - end - - def test_deb_package_names - prepare("DEB_PACKAGE_NAMES") - changes = parse_patch(git("log", "-n", "1", "-p")) - sampled_changes = changes.collect do |change| - first_hunk = change[:hunks][0] - first_removed_line = first_hunk.find {|line| line.start_with?("-")} - first_added_line = first_hunk.find {|line| line.start_with?("+")} - { - sampled_diff: [first_removed_line, first_added_line], - path: change[:path], - } - end - expected_changes = [ - { - sampled_diff: [ - "-dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib#{@so_version}.install", - "+dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib#{@next_so_version}.install", - ], - path: "dev/release/rat_exclude_files.txt" - }, - { - sampled_diff: [ - "-Package: libarrow#{@so_version}", - "+Package: libarrow#{@next_so_version}", - ], - path: "dev/tasks/linux-packages/apache-arrow/debian/control.in" - }, - { - sampled_diff: [ - "- - libarrow-glib#{@so_version}-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb", - "+ - libarrow-glib#{@next_so_version}-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb", - ], - path: "dev/tasks/tasks.yml", - }, - ] - assert_equal(expected_changes, sampled_changes) - end -end diff --git a/dev/release/01-prepare.sh b/dev/release/01-prepare.sh deleted file mode 100755 index 80703c2d87fc8..0000000000000 --- a/dev/release/01-prepare.sh +++ /dev/null @@ -1,291 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -set -ue - -SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ "$#" -ne 3 ]; then - echo "Usage: $0 " - exit 1 -fi - -update_versions() { - local base_version=$1 - local next_version=$2 - local type=$3 - - case ${type} in - release) - local version=${base_version} - local r_version=${base_version} - ;; - snapshot) - local version=${next_version}-SNAPSHOT - local r_version=${base_version}.9000 - ;; - esac - - cd "${SOURCE_DIR}/../../c_glib" - sed -i.bak -E -e \ - "s/^version = '.+'/version = '${version}'/" \ - meson.build - rm -f meson.build.bak - git add meson.build - cd - - - cd "${SOURCE_DIR}/../../ci/scripts" - sed -i.bak -E -e \ - "s/^pkgver=.+/pkgver=${r_version}/" \ - PKGBUILD - rm -f PKGBUILD.bak - git add PKGBUILD - cd - - - cd "${SOURCE_DIR}/../../cpp" - sed -i.bak -E -e \ - "s/^set\(ARROW_VERSION \".+\"\)/set(ARROW_VERSION \"${version}\")/" \ - CMakeLists.txt - rm -f CMakeLists.txt.bak - git add CMakeLists.txt - - sed -i.bak -E -e \ - "s/\"version-string\": \".+\"/\"version-string\": \"${version}\"/" \ - vcpkg.json - rm -f vcpkg.json.bak - git add vcpkg.json - cd - - - cd "${SOURCE_DIR}/../../csharp" - sed -i.bak -E -e \ - "s/^ .+<\/Version>/ ${version}<\/Version>/" \ - Directory.Build.props - rm -f Directory.Build.props.bak - git add Directory.Build.props - cd - - - cd "${SOURCE_DIR}/../../dev/tasks/homebrew-formulae" - sed -i.bak -E -e \ - "s/arrow-[0-9.]+[0-9]+/arrow-${r_version}/g" \ - autobrew/apache-arrow.rb - rm -f autobrew/apache-arrow.rb.bak - git add autobrew/apache-arrow.rb - sed -i.bak -E -e \ - "s/arrow-[0-9.\-]+[0-9SNAPHOT]+/arrow-${version}/g" \ - apache-arrow.rb - rm -f apache-arrow.rb.bak - git add apache-arrow.rb - cd - - - cd "${SOURCE_DIR}/../../js" - sed -i.bak -E -e \ - "s/^ \"version\": \".+\"/ \"version\": \"${version}\"/" \ - package.json - rm -f package.json.bak - git add package.json - cd - - - cd "${SOURCE_DIR}/../../matlab" - sed -i.bak -E -e \ - "s/^set\(MLARROW_VERSION \".+\"\)/set(MLARROW_VERSION \"${version}\")/" \ - CMakeLists.txt - rm -f CMakeLists.txt.bak - git add CMakeLists.txt - cd - - - cd "${SOURCE_DIR}/../../python" - sed -i.bak -E -e \ - "s/^default_version = '.+'/default_version = '${version}'/" \ - setup.py - rm -f setup.py.bak - git add setup.py - cd - - - cd "${SOURCE_DIR}/../../r" - sed -i.bak -E -e \ - "s/^Version: .+/Version: ${r_version}/" \ - DESCRIPTION - rm -f DESCRIPTION.bak - git add DESCRIPTION - if [ ${type} = "snapshot" ]; then - # Add a news entry for the new dev version - echo "dev" - sed -i.bak -E -e \ - "0,/^# arrow /s/^(# arrow .+)/# arrow ${r_version}\n\n\1/" \ - NEWS.md - else - # Replace dev version with release version - echo "release" - sed -i.bak -E -e \ - "0,/^# arrow /s/^# arrow .+/# arrow ${r_version}/" \ - NEWS.md - fi - rm -f NEWS.md.bak - git add NEWS.md - cd - - - cd "${SOURCE_DIR}/../../ruby" - sed -i.bak -E -e \ - "s/^ VERSION = \".+\"/ VERSION = \"${version}\"/g" \ - */*/*/version.rb - rm -f */*/*/version.rb.bak - git add */*/*/version.rb - cd - - - cd "${SOURCE_DIR}/../../rust" - sed -i.bak -E \ - -e "s/^version = \".+\"/version = \"${version}\"/g" \ - -e "s/^(arrow = .* version = )\".*\"(( .*)|(, features = .*)|(, optional = .*))$/\\1\"${version}\"\\2/g" \ - -e "s/^(arrow-flight = .* version = )\".+\"( .*)/\\1\"${version}\"\\2/g" \ - -e "s/^(parquet = .* version = )\".*\"(( .*)|(, features = .*))$/\\1\"${version}\"\\2/g" \ - -e "s/^(parquet_derive = .* version = )\".*\"(( .*)|(, features = .*))$/\\1\"${version}\"\\2/g" \ - */Cargo.toml - rm -f */Cargo.toml.bak - git add 
*/Cargo.toml - - sed -i.bak -E \ - -e "s/^([^ ]+) = \".+\"/\\1 = \"${version}\"/g" \ - -e "s,docs\.rs/crate/([^/]+)/[^)]+,docs.rs/crate/\\1/${version},g" \ - */README.md - rm -f */README.md.bak - git add */README.md - cd - -} - -############################## Pre-Tag Commits ############################## - -version=$1 -next_version=$2 -next_version_snapshot="${next_version}-SNAPSHOT" -rc_number=$3 - -release_tag="apache-arrow-${version}" -release_branch="release-${version}" -release_candidate_branch="release-${version}-rc${rc_number}" - -: ${PREPARE_DEFAULT:=1} -: ${PREPARE_CHANGELOG:=${PREPARE_DEFAULT}} -: ${PREPARE_LINUX_PACKAGES:=${PREPARE_DEFAULT}} -: ${PREPARE_VERSION_PRE_TAG:=${PREPARE_DEFAULT}} -: ${PREPARE_BRANCH:=${PREPARE_DEFAULT}} -: ${PREPARE_TAG:=${PREPARE_DEFAULT}} -: ${PREPARE_VERSION_POST_TAG:=${PREPARE_DEFAULT}} -: ${PREPARE_DEB_PACKAGE_NAMES:=${PREPARE_DEFAULT}} - -if [ ${PREPARE_TAG} -gt 0 ]; then - if [ $(git tag -l "${release_tag}") ]; then - echo "Delete existing git tag $release_tag" - git tag -d "${release_tag}" - fi -fi - -if [ ${PREPARE_BRANCH} -gt 0 ]; then - if [[ $(git branch -l "${release_candidate_branch}") ]]; then - next_rc_number=$(($rc_number+1)) - echo "Branch ${release_candidate_branch} already exists, so create a new release candidate:" - echo "1. Checkout the master branch for major releases and maint- for patch releases." - echo "2. Execute the script again with bumped RC number." - echo "Commands:" - echo " git checkout master" - echo " dev/release/01-prepare.sh ${version} ${next_version} ${next_rc_number}" - exit 1 - fi - - echo "Create local branch ${release_candidate_branch} for release candidate ${rc_number}" - git checkout -b ${release_candidate_branch} -fi - -############################## Pre-Tag Commits ############################## - -if [ ${PREPARE_CHANGELOG} -gt 0 ]; then - echo "Updating changelog for $version" - # Update changelog - archery release changelog add $version - git add ${SOURCE_DIR}/../../CHANGELOG.md - git commit -m "[Release] Update CHANGELOG.md for $version" -fi - -if [ ${PREPARE_LINUX_PACKAGES} -gt 0 ]; then - echo "Updating .deb/.rpm changelogs for $version" - cd $SOURCE_DIR/../tasks/linux-packages - rake \ - version:update \ - ARROW_RELEASE_TIME="$(date +%Y-%m-%dT%H:%M:%S%z)" \ - ARROW_VERSION=${version} - git add */debian*/changelog */yum/*.spec.in - git commit -m "[Release] Update .deb/.rpm changelogs for $version" - cd - -fi - -if [ ${PREPARE_VERSION_PRE_TAG} -gt 0 ]; then - echo "Prepare release ${version} on tag ${release_tag} then reset to version ${next_version_snapshot}" - - update_versions "${version}" "${next_version}" "release" - git commit -m "[Release] Update versions for ${version}" -fi - -############################## Tag the Release ############################## - -if [ ${PREPARE_TAG} -gt 0 ]; then - git tag -a "${release_tag}" -m "[Release] Apache Arrow Release ${version}" -fi - -############################## Post-Tag Commits ############################# - -if [ ${PREPARE_VERSION_POST_TAG} -gt 0 ]; then - echo "Updating versions for ${next_version_snapshot}" - update_versions "${version}" "${next_version}" "snapshot" - git commit -m "[Release] Update versions for ${next_version_snapshot}" -fi - -if [ ${PREPARE_DEB_PACKAGE_NAMES} -gt 0 ]; then - echo "Updating .deb package names for ${next_version}" - so_version() { - local version=$1 - local major_version=$(echo $version | sed -E -e 's/^([0-9]+)\.[0-9]+\.[0-9]+$/\1/') - local minor_version=$(echo $version | sed -E -e 
's/^[0-9]+\.([0-9]+)\.[0-9]+$/\1/') - expr ${major_version} \* 100 + ${minor_version} - } - deb_lib_suffix=$(so_version $version) - next_deb_lib_suffix=$(so_version $next_version) - if [ "${deb_lib_suffix}" != "${next_deb_lib_suffix}" ]; then - cd $SOURCE_DIR/../tasks/linux-packages/apache-arrow - for target in debian*/lib*${deb_lib_suffix}.install; do - git mv \ - ${target} \ - $(echo $target | sed -e "s/${deb_lib_suffix}/${next_deb_lib_suffix}/") - done - deb_lib_suffix_substitute_pattern="s/(lib(arrow|gandiva|parquet|plasma)[-a-z]*)${deb_lib_suffix}/\\1${next_deb_lib_suffix}/g" - sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" debian*/control* - rm -f debian*/control*.bak - git add debian*/control* - cd - - cd $SOURCE_DIR/../tasks/ - sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" tasks.yml - rm -f tasks.yml.bak - git add tasks.yml - cd - - cd $SOURCE_DIR - sed -i.bak -E -e "${deb_lib_suffix_substitute_pattern}" rat_exclude_files.txt - rm -f rat_exclude_files.txt.bak - git add rat_exclude_files.txt - git commit -m "[Release] Update .deb package names for $next_version" - cd - - fi -fi diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb deleted file mode 100644 index 7d92881f282ac..0000000000000 --- a/dev/release/02-source-test.rb +++ /dev/null @@ -1,146 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
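The `so_version` helper at the end of `01-prepare.sh` above encodes the Debian shared-library suffix as major * 100 + minor, which is why a 4.0.x release produces package names such as `libarrow400`. The same arithmetic in Python, as a reference sketch only:

```python
def so_version(version):
    # major * 100 + minor, as computed by so_version() in 01-prepare.sh above.
    major, minor, _patch = version.split(".")
    return int(major) * 100 + int(minor)


assert so_version("4.0.0") == 400   # -> libarrow400, libarrow-glib400, ...
assert so_version("4.1.0") == 401
```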
- -class SourceTest < Test::Unit::TestCase - include GitRunnable - include VersionDetectable - - def setup - @current_commit = git_current_commit - detect_versions - @tag_name = "apache-arrow-#{@release_version}" - @script = File.expand_path("dev/release/02-source.sh") - - Dir.mktmpdir do |dir| - Dir.chdir(dir) do - yield - end - end - end - - def source(*targets) - env = { - "SOURCE_DEFAULT" => "0", - "release_hash" => @current_commit, - } - targets.each do |target| - env["SOURCE_#{target}"] = "1" - end - output = sh(env, @script, @release_version, "0") - sh("tar", "xf", "#{@tag_name}.tar.gz") - output - end - - def test_symbolic_links - source - Dir.chdir(@tag_name) do - assert_equal([], - Find.find(".").find_all {|path| File.symlink?(path)}) - end - end - - def test_csharp_git_commit_information - source - Dir.chdir("#{@tag_name}/csharp") do - FileUtils.mv("dummy.git", "../.git") - sh("dotnet", "pack", "-c", "Release") - FileUtils.mv("../.git", "dummy.git") - Dir.chdir("artifacts/Apache.Arrow/Release") do - sh("unzip", "Apache.Arrow.#{@snapshot_version}.nupkg") - FileUtils.chmod(0400, "Apache.Arrow.nuspec") - nuspec = REXML::Document.new(File.read("Apache.Arrow.nuspec")) - nuspec_repository = nuspec.elements["package/metadata/repository"] - attributes = {} - nuspec_repository.attributes.each do |key, value| - attributes[key] = value - end - assert_equal({ - "type" => "git", - "url" => "https://github.com/apache/arrow", - "commit" => @current_commit, - }, - attributes) - end - end - end - - def test_python_version - source - Dir.chdir("#{@tag_name}/python") do - sh("python3", "setup.py", "sdist") - if on_release_branch? - pyarrow_source_archive = "dist/pyarrow-#{@release_version}.tar.gz" - else - pyarrow_source_archive = "dist/pyarrow-#{@release_version}a0.tar.gz" - end - assert_equal([pyarrow_source_archive], - Dir.glob("dist/pyarrow-*.tar.gz")) - end - end - - def test_vote - jira_url = "https://issues.apache.org/jira" - jql_conditions = [ - "project = ARROW", - "status in (Resolved, Closed)", - "fixVersion = #{@release_version}", - ] - jql = jql_conditions.join(" AND ") - n_resolved_issues = nil - search_url = URI("#{jira_url}/rest/api/2/search?jql=#{CGI.escape(jql)}") - search_url.open do |response| - n_resolved_issues = JSON.parse(response.read)["total"] - end - output = source("VOTE") - assert_equal(<<-VOTE.strip, output[/^-+$(.+?)^-+$/m, 1].strip) -To: dev@arrow.apache.org -Subject: [VOTE] Release Apache Arrow #{@release_version} - RC0 - -Hi, - -I would like to propose the following release candidate (RC0) of Apache -Arrow version #{@release_version}. This is a release consisting of #{n_resolved_issues} -resolved JIRA issues[1]. - -This release candidate is based on commit: -#{@current_commit} [2] - -The source release rc0 is hosted at [3]. -The binary artifacts are hosted at [4][5][6][7]. -The changelog is located at [8]. - -Please download, verify checksums and signatures, run the unit tests, -and vote on the release. See [9] for how to validate a release candidate. - -The vote will be open for at least 72 hours. - -[ ] +1 Release this as Apache Arrow #{@release_version} -[ ] +0 -[ ] -1 Do not release this as Apache Arrow #{@release_version} because... 
- -[1]: https://issues.apache.org/jira/issues/?jql=project%20%3D%20ARROW%20AND%20status%20in%20%28Resolved%2C%20Closed%29%20AND%20fixVersion%20%3D%20#{@release_version} -[2]: https://github.com/apache/arrow/tree/#{@current_commit} -[3]: https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-#{@release_version}-rc0 -[4]: https://bintray.com/apache/arrow/centos-rc/#{@release_version}-rc0 -[5]: https://bintray.com/apache/arrow/debian-rc/#{@release_version}-rc0 -[6]: https://bintray.com/apache/arrow/python-rc/#{@release_version}-rc0 -[7]: https://bintray.com/apache/arrow/ubuntu-rc/#{@release_version}-rc0 -[8]: https://github.com/apache/arrow/blob/#{@current_commit}/CHANGELOG.md -[9]: https://cwiki.apache.org/confluence/display/ARROW/How+to+Verify+Release+Candidates - VOTE - end -end diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh deleted file mode 100755 index 89850e7543dd4..0000000000000 --- a/dev/release/02-source.sh +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -e - -: ${SOURCE_DEFAULT:=1} -: ${SOURCE_RAT:=${SOURCE_DEFAULT}} -: ${SOURCE_UPLOAD:=${SOURCE_DEFAULT}} -: ${SOURCE_VOTE:=${SOURCE_DEFAULT}} - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc=$2 - -tag=apache-arrow-${version} -tagrc=${tag}-rc${rc} -rc_url="https://dist.apache.org/repos/dist/dev/arrow/${tagrc}" - -echo "Preparing source for tag ${tag}" - -: ${release_hash:=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag})} - -if [ ${SOURCE_UPLOAD} -gt 0 ]; then - if [ -z "$release_hash" ]; then - echo "Cannot continue: unknown git tag: $tag" - exit - fi -fi - -echo "Using commit $release_hash" - -tarball=${tag}.tar.gz - -rm -rf ${tag} -# be conservative and use the release hash, even though git produces the same -# archive (identical hashes) using the scm tag -(cd "${SOURCE_TOP_DIR}" && \ - git archive ${release_hash} --prefix ${tag}/) | \ - tar xf - - -# Resolve all hard and symbolic links -rm -rf ${tag}.tmp -mv ${tag} ${tag}.tmp -cp -R -L ${tag}.tmp ${tag} -rm -rf ${tag}.tmp - -# Create a dummy .git/ directory to download the source files from GitHub with Source Link in C#. 
-dummy_git=${tag}/csharp/dummy.git -mkdir ${dummy_git} -pushd ${dummy_git} -echo ${release_hash} > HEAD -echo '[remote "origin"] url = https://github.com/apache/arrow.git' >> config -mkdir objects refs -popd - -# Create new tarball from modified source directory -tar czf ${tarball} ${tag} -rm -rf ${tag} - -if [ ${SOURCE_RAT} -gt 0 ]; then - "${SOURCE_DIR}/run-rat.sh" ${tarball} -fi - -if [ ${SOURCE_UPLOAD} -gt 0 ]; then - # sign the archive - gpg --armor --output ${tarball}.asc --detach-sig ${tarball} - shasum -a 256 $tarball > ${tarball}.sha256 - shasum -a 512 $tarball > ${tarball}.sha512 - - # check out the arrow RC folder - svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow tmp - - # add the release candidate for the tag - mkdir -p tmp/${tagrc} - - # copy the rc tarball into the tmp dir - cp ${tarball}* tmp/${tagrc} - - # commit to svn - svn add tmp/${tagrc} - svn ci -m "Apache Arrow ${version} RC${rc}" tmp/${tagrc} - - # clean up - rm -rf tmp - - echo "Success! The release candidate is available here:" - echo " ${rc_url}" - echo "" - echo "Commit SHA1: ${release_hash}" - echo "" -fi - -if [ ${SOURCE_VOTE} -gt 0 ]; then - echo "The following draft email has been created to send to the" - echo "dev@arrow.apache.org mailing list" - echo "" - echo "---------------------------------------------------------" - jira_url="https://issues.apache.org/jira" - jql="project%20%3D%20ARROW%20AND%20status%20in%20%28Resolved%2C%20Closed%29%20AND%20fixVersion%20%3D%20${version}" - n_resolved_issues=$(curl "${jira_url}/rest/api/2/search/?jql=${jql}" | jq ".total") - cat < " - exit -fi - -version=$1 -rc_number=$2 -version_with_rc="${version}-rc${rc_number}" -crossbow_job_prefix="release-${version_with_rc}" - -release_tag="apache-arrow-${version}" -release_candidate_branch="release-${version}-rc${rc_number}" - -: ${GIT_REMOTE:="origin"} - -git checkout ${release_candidate_branch} -git push -u ${GIT_REMOTE} ${release_candidate_branch} - -# archery will submit a job with id: "${crossbow_job_prefix}-0" unless there -# are jobs submitted with the same prefix (the integer at the end is auto -# incremented) -archery crossbow submit \ - --job-prefix ${crossbow_job_prefix} \ - --arrow-version ${version_with_rc} \ - --group packaging diff --git a/dev/release/04-binary-download.sh b/dev/release/04-binary-download.sh deleted file mode 100755 index d0b61b05884a6..0000000000000 --- a/dev/release/04-binary-download.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -set -e - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc_number=$2 -version_with_rc="${version}-rc${rc_number}" -crossbow_job_prefix="release-${version_with_rc}" - -# archery will submit a job with id: "${crossbow_job_prefix}-0" unless there -# are jobs submitted with the same prefix (the integer at the end is auto -# incremented) -: ${CROSSBOW_JOB_ID:="${crossbow_job_prefix}-0"} - -archery crossbow download-artifacts ${CROSSBOW_JOB_ID} diff --git a/dev/release/05-binary-upload.sh b/dev/release/05-binary-upload.sh deleted file mode 100755 index 4a360c28b0470..0000000000000 --- a/dev/release/05-binary-upload.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -u -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc=$2 - -version_with_rc="${version}-rc${rc}" -crossbow_job_prefix="release-${version_with_rc}" -crossbow_package_dir="${SOURCE_DIR}/../../packages" - -: ${CROSSBOW_JOB_ID:="${crossbow_job_prefix}-0"} -artifact_dir="${crossbow_package_dir}/${CROSSBOW_JOB_ID}" - -if [ ! -e "$artifact_dir" ]; then - echo "$artifact_dir does not exist" - exit 1 -fi - -if [ ! -d "$artifact_dir" ]; then - echo "$artifact_dir is not a directory" - exit 1 -fi - -cd "${SOURCE_DIR}" - -: ${BINTRAY_REPOSITORY_CUSTOM:=${BINTRAY_REPOSITORY:-}} -: ${SOURCE_BINTRAY_REPOSITORY_CUSTOM:=${SOURCE_BINTRAY_REPOSITORY:-}}} - -if [ ! -f .env ]; then - echo "You must create $(pwd)/.env" - echo "You can use $(pwd)/.env.example as template" - exit 1 -fi -. .env - -if [ -n "${BINTRAY_REPOSITORY_CUSTOM}" ]; then - BINTRAY_REPOSITORY=${BINTRAY_REPOSITORY_CUSTOM} -fi - -if [ -n "${SOURCE_BINTRAY_REPOSITORY_CUSTOM}" ]; then - SOURCE_BINTRAY_REPOSITORY=${SOURCE_BINTRAY_REPOSITORY_CUSTOM} -fi - -. binary-common.sh - -# By default upload all artifacts. -# To deactivate one category, deactivate the category and all of its dependents. -# To explicitly select one category, set UPLOAD_DEFAULT=0 UPLOAD_X=1. 
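The selection mechanism described in the comment above is an environment-variable cascade: each `UPLOAD_<CATEGORY>` flag defaults to `UPLOAD_DEFAULT`, which itself defaults to 1. A minimal sketch of the same cascade (illustrative only; the script itself uses the shell `: ${VAR:=default}` expansions that follow):

```python
import os


def upload_enabled(category):
    # UPLOAD_<CATEGORY> falls back to UPLOAD_DEFAULT, which falls back to "1",
    # mirroring the `: ${UPLOAD_X:=${UPLOAD_DEFAULT}}` lines below.
    default = os.environ.get("UPLOAD_DEFAULT", "1")
    return int(os.environ.get("UPLOAD_%s" % category, default)) > 0


# Example: upload only the Python wheels.
#   UPLOAD_DEFAULT=0 UPLOAD_PYTHON=1 dev/release/05-binary-upload.sh 4.0.0 0
```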
-: ${UPLOAD_DEFAULT:=1} -: ${UPLOAD_CENTOS_RPM:=${UPLOAD_DEFAULT}} -: ${UPLOAD_CENTOS_YUM:=${UPLOAD_DEFAULT}} -: ${UPLOAD_DEBIAN_APT:=${UPLOAD_DEFAULT}} -: ${UPLOAD_DEBIAN_DEB:=${UPLOAD_DEFAULT}} -: ${UPLOAD_NUGET:=${UPLOAD_DEFAULT}} -: ${UPLOAD_PYTHON:=${UPLOAD_DEFAULT}} -: ${UPLOAD_UBUNTU_APT:=${UPLOAD_DEFAULT}} -: ${UPLOAD_UBUNTU_DEB:=${UPLOAD_DEFAULT}} - -rake_tasks=() -apt_targets=() -yum_targets=() -if [ ${UPLOAD_DEBIAN_DEB} -gt 0 ]; then - rake_tasks+=(deb) - apt_targets+=(debian) -fi -if [ ${UPLOAD_DEBIAN_APT} -gt 0 ]; then - rake_tasks+=(apt:rc) - apt_targets+=(debian) -fi -if [ ${UPLOAD_UBUNTU_DEB} -gt 0 ]; then - rake_tasks+=(deb) - apt_targets+=(ubuntu) -fi -if [ ${UPLOAD_UBUNTU_APT} -gt 0 ]; then - rake_tasks+=(apt:rc) - apt_targets+=(ubuntu) -fi -if [ ${UPLOAD_CENTOS_RPM} -gt 0 ]; then - rake_tasks+=(rpm) - yum_targets+=(centos) -fi -if [ ${UPLOAD_CENTOS_YUM} -gt 0 ]; then - rake_tasks+=(yum:rc) - yum_targets+=(centos) -fi -if [ ${UPLOAD_NUGET} -gt 0 ]; then - rake_tasks+=(nuget:rc) -fi -if [ ${UPLOAD_PYTHON} -gt 0 ]; then - rake_tasks+=(python:rc) -fi -rake_tasks+=(summary:rc) - -tmp_dir=binary/tmp -mkdir -p "${tmp_dir}" -source_artifacts_dir="${tmp_dir}/artifacts" -rm -rf "${source_artifacts_dir}" -cp -a "${artifact_dir}" "${source_artifacts_dir}" - -docker_run \ - ./runner.sh \ - rake \ - "${rake_tasks[@]}" \ - APT_TARGETS=$(IFS=,; echo "${apt_targets[*]}") \ - ARTIFACTS_DIR="${tmp_dir}/artifacts" \ - BINTRAY_REPOSITORY=${BINTRAY_REPOSITORY} \ - RC=${rc} \ - SOURCE_BINTRAY_REPOSITORY=${SOURCE_BINTRAY_REPOSITORY} \ - VERSION=${version} \ - YUM_TARGETS=$(IFS=,; echo "${yum_targets[*]}") diff --git a/dev/release/README.md b/dev/release/README.md deleted file mode 100644 index 0a9cc3e04b3a8..0000000000000 --- a/dev/release/README.md +++ /dev/null @@ -1,24 +0,0 @@ - - -## Release management scripts - -To learn more, see the project wiki: - -https://cwiki.apache.org/confluence/display/ARROW/Release+Management+Guide diff --git a/dev/release/Rakefile b/dev/release/Rakefile deleted file mode 100644 index ff57bad5e8fa2..0000000000000 --- a/dev/release/Rakefile +++ /dev/null @@ -1,37 +0,0 @@ -# -*- ruby -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-require_relative "binary-task"
-
-if File.exist?(".env")
-  File.open(".env") do |env|
-    env.each_line do |line|
-      case line.strip
-      when /\A#/
-      when /\A([^=]+)=(.*)\z/
-        key = $1
-        value = $2
-        ENV[key] ||= value
-      end
-    end
-  end
-end
-
-binary_task = BinaryTask.new
-binary_task.define
diff --git a/dev/release/VERIFY.md b/dev/release/VERIFY.md
deleted file mode 100644
index ec77bccaf5b41..0000000000000
--- a/dev/release/VERIFY.md
+++ /dev/null
@@ -1,76 +0,0 @@
-
-
-# Verifying Arrow releases
-
-## Windows
-
-We've provided a convenience script for verifying the C++ and Python builds on
-Windows. Read the comments in `verify-release-candidate.bat` for instructions.
-
-## Linux and macOS
-
-We've provided a convenience script for verifying the C++, Python, C
-GLib, Java and JavaScript builds on Linux and macOS. Read the comments in
-`verify-release-candidate.sh` for instructions.
-
-### C GLib
-
-You need the following to verify the C GLib build:
-
-- GLib
-- GObject Introspection
-- Ruby (a non-EOL version is required)
-- gobject-introspection gem
-- test-unit gem
-
-You can install them as follows on Debian GNU/Linux and Ubuntu:
-
-```console
-% sudo apt install -y -V libgirepository1.0-dev ruby-dev
-% sudo gem install gobject-introspection test-unit
-```
-
-You can install them as follows on CentOS:
-
-```console
-% sudo yum install -y gobject-introspection-devel
-% git clone https://github.com/sstephenson/rbenv.git ~/.rbenv
-% git clone https://github.com/sstephenson/ruby-build.git ~/.rbenv/plugins/ruby-build
-% echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bash_profile
-% echo 'eval "$(rbenv init -)"' >> ~/.bash_profile
-% exec ${SHELL} --login
-% sudo yum install -y gcc make patch openssl-devel readline-devel zlib-devel
-% rbenv install 2.4.2
-% rbenv global 2.4.2
-% gem install gobject-introspection test-unit
-```
-
-You can install them as follows on macOS:
-
-```console
-% brew install gobject-introspection
-% gem install gobject-introspection test-unit
-```
-
-You need to set `PKG_CONFIG_PATH` to find libffi on macOS:
-
-```console
-% export PKG_CONFIG_PATH=$(brew --prefix libffi)/lib/pkgconfig:$PKG_CONFIG_PATH
-```
diff --git a/dev/release/binary-common.sh b/dev/release/binary-common.sh
deleted file mode 100644
index 7c66e375f8d80..0000000000000
--- a/dev/release/binary-common.sh
+++ /dev/null
@@ -1,86 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-docker_image_name=apache-arrow/release-binary
-gpg_agent_extra_socket="$(gpgconf --list-dirs agent-extra-socket)"
-if [ $(uname) = "Darwin" ]; then
-  docker_uid=10000
-  docker_gid=10000
-else
-  docker_uid=$(id -u)
-  docker_gid=$(id -g)
-fi
-docker_ssh_key="${SOURCE_DIR}/binary/id_rsa"
-
-if [ !
-f "${docker_ssh_key}" ]; then - ssh-keygen -N "" -f "${docker_ssh_key}" -fi - -docker_gpg_ssh() { - local ssh_port=$1 - shift - local known_hosts_file=$(mktemp -t "arrow-binary-gpg-ssh-known-hosts.XXXXX") - local exit_code= - if ssh \ - -o StrictHostKeyChecking=no \ - -o UserKnownHostsFile=${known_hosts_file} \ - -i "${docker_ssh_key}" \ - -p ${ssh_port} \ - -R "/home/arrow/.gnupg/S.gpg-agent:${gpg_agent_extra_socket}" \ - arrow@127.0.0.1 \ - "$@"; then - exit_code=$?; - else - exit_code=$?; - fi - rm -f ${known_hosts_file} - return ${exit_code} -} - -docker_run() { - local container_id_dir=$(mktemp -d -t "arrow-binary-gpg-container.XXXXX") - local container_id_file=${container_id_dir}/id - docker \ - run \ - --cidfile ${container_id_file} \ - --detach \ - --publish-all \ - --rm \ - --volume "$PWD":/host \ - ${docker_image_name} \ - bash -c " -if [ \$(id -u) -ne ${docker_uid} ]; then - usermod --uid ${docker_uid} arrow - chown -R arrow: ~arrow -fi -/usr/sbin/sshd -D -" - local container_id=$(cat ${container_id_file}) - local ssh_port=$(docker port ${container_id} | grep -E -o '[0-9]+$') - # Wait for sshd available - while ! docker_gpg_ssh ${ssh_port} : > /dev/null 2>&1; do - sleep 0.1 - done - gpg --export ${GPG_KEY_ID} | docker_gpg_ssh ${ssh_port} gpg --import - docker_gpg_ssh ${ssh_port} "$@" - docker kill ${container_id} - rm -rf ${container_id_dir} -} - -docker build -t ${docker_image_name} "${SOURCE_DIR}/binary" - -chmod go-rwx "${docker_ssh_key}" diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb deleted file mode 100644 index 42bc1fe47665c..0000000000000 --- a/dev/release/binary-task.rb +++ /dev/null @@ -1,1909 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "digest/sha2" -require "io/console" -require "json" -require "net/http" -require "pathname" -require "tempfile" -require "thread" -require "time" - -class BinaryTask - include Rake::DSL - - class ThreadPool - def initialize(use_case, &worker) - @n_workers = choose_n_workers(use_case) - @worker = worker - @jobs = Thread::Queue.new - @workers = @n_workers.times.collect do - Thread.new do - loop do - job = @jobs.pop - break if job.nil? - @worker.call(job) - end - end - end - end - - def <<(job) - @jobs << job - end - - def join - @n_workers.times do - @jobs << nil - end - @workers.each(&:join) - end - - private - def choose_n_workers(use_case) - case use_case - when :bintray - # Too many workers cause Bintray error. - 6 - when :gpg - # Too many workers cause gpg-agent error. 
- 2 - else - raise "Unknown use case: #{use_case}" - end - end - end - - class ProgressReporter - def initialize(label, count_max=0) - @label = label - @count_max = count_max - - @mutex = Thread::Mutex.new - - @time_start = Time.now - @time_previous = Time.now - @count_current = 0 - @count_previous = 0 - end - - def advance - @mutex.synchronize do - @count_current += 1 - - return if @count_max.zero? - - time_current = Time.now - if time_current - @time_previous <= 1 - return - end - - show_progress(time_current) - end - end - - def increment_max - @mutex.synchronize do - @count_max += 1 - end - end - - def finish - @mutex.synchronize do - return if @count_max.zero? - show_progress(Time.now) - $stderr.puts - end - end - - private - def show_progress(time_current) - n_finishes = @count_current - @count_previous - throughput = n_finishes.to_f / (time_current - @time_previous) - @time_previous = time_current - @count_previous = @count_current - - message = build_message(time_current, throughput) - $stderr.print("\r#{message}") if message - end - - def build_message(time_current, throughput) - percent = (@count_current / @count_max.to_f) * 100 - formatted_count = "[%s/%s]" % [ - format_count(@count_current), - format_count(@count_max), - ] - elapsed_second = time_current - @time_start - if throughput.zero? - rest_second = 0 - else - rest_second = (@count_max - @count_current) / throughput - end - separator = " - " - progress = "%5.1f%% %s %s %s %s" % [ - percent, - formatted_count, - format_time_interval(elapsed_second), - format_time_interval(rest_second), - format_throughput(throughput), - ] - label = @label - - width = guess_terminal_width - return "#{label}#{separator}#{progress}" if width.nil? - - return nil if progress.size > width - - label_width = width - progress.size - separator.size - if label.size > label_width - ellipsis = "..." - shorten_label_width = label_width - ellipsis.size - if shorten_label_width < 1 - return progress - else - label = label[0, shorten_label_width] + ellipsis - end - end - "#{label}#{separator}#{progress}" - end - - def format_count(count) - "%d" % count - end - - def format_time_interval(interval) - if interval < 60 - "00:00:%02d" % interval - elsif interval < (60 * 60) - minute, second = interval.divmod(60) - "00:%02d:%02d" % [minute, second] - elsif interval < (60 * 60 * 24) - minute, second = interval.divmod(60) - hour, minute = minute.divmod(60) - "%02d:%02d:%02d" % [hour, minute, second] - else - minute, second = interval.divmod(60) - hour, minute = minute.divmod(60) - day, hour = hour.divmod(24) - "%dd %02d:%02d:%02d" % [day, hour, minute, second] - end - end - - def format_throughput(throughput) - "%2d/s" % throughput - end - - def guess_terminal_width - guess_terminal_width_from_io || - guess_terminal_width_from_command || - guess_terminal_width_from_env || - 80 - end - - def guess_terminal_width_from_io - if IO.respond_to?(:console) and IO.console - IO.console.winsize[1] - elsif $stderr.respond_to?(:winsize) - begin - $stderr.winsize[1] - rescue SystemCallError - nil - end - else - nil - end - end - - def guess_terminal_width_from_command - IO.pipe do |input, output| - begin - pid = spawn("tput", "cols", {:out => output, :err => output}) - rescue SystemCallError - return nil - end - - output.close - _, status = Process.waitpid2(pid) - return nil unless status.success? 
- - result = input.read.chomp - begin - Integer(result, 10) - rescue ArgumentError - nil - end - end - end - - def guess_terminal_width_from_env - env = ENV["COLUMNS"] || ENV["TERM_WIDTH"] - return nil if env.nil? - - begin - Integer(env, 10) - rescue ArgumentError - nil - end - end - end - - class BintrayClient - class Error < StandardError - attr_reader :request - attr_reader :response - def initialize(request, response, message) - @request = request - @response = response - super(message) - end - end - - def initialize(options={}) - @options = options - repository = @options[:repository] - @subject, @repository = repository.split("/", 2) if repository - @package = @options[:package] - @version = @options[:version] - @user = @options[:user] - @api_key = @options[:api_key] - end - - def request(method, headers, *components, &block) - url = build_request_url(*components) - http = Net::HTTP.new(url.host, url.port) - http.set_debug_output($stderr) if ENV["DEBUG"] - http.use_ssl = true - http.start do |http| - request = build_request(method, url, headers, &block) - http.request(request) do |response| - case response - when Net::HTTPSuccess - return JSON.parse(response.body) - else - message = "failed to request: " - message << "#{url}: #{request.method}: " - message << "#{response.message} #{response.code}:\n" - message << response.body - raise Error.new(request, response, message) - end - end - end - end - - def repository - request(:get, - {}, - "repos", - @subject, - @repository) - end - - def create_repository - request(:post, - {}, - "repos", - @subject, - @repository) do - request = { - "name" => @repository, - "desc" => "Apache Arrow", - } - JSON.generate(request) - end - end - - def ensure_repository - begin - repository - rescue Error => error - case error.response - when Net::HTTPNotFound - create_repository - else - raise - end - end - end - - def package - request(:get, - {}, - "packages", - @subject, - @repository, - @package) - end - - def package_versions - begin - package["versions"] - rescue Error => error - case error.response - when Net::HTTPNotFound - [] - else - raise - end - end - end - - def create_package(description) - request(:post, - {}, - "packages", - @subject, - @repository) do - request = { - "name" => @package, - "desc" => description, - "licenses" => ["Apache-2.0"], - "vcs_url" => "https://github.com/apache/arrow.git", - "website_url" => "https://arrow.apache.org/", - "issue_tracker_url" => "https://issues.apache.org/jira/browse/ARROW", - "github_repo" => "apache/arrow", - "public_download_numbers" => true, - "public_stats" => true, - } - JSON.generate(request) - end - end - - def ensure_package(description) - begin - package - rescue Error => error - case error.response - when Net::HTTPNotFound - create_package(description) - else - raise - end - end - end - - def create_version(description) - request(:post, - {}, - "packages", - @subject, - @repository, - @package, - "versions") do - request = { - "name" => @version, - "desc" => description, - } - JSON.generate(request) - end - end - - def ensure_version(version, description) - return if package["versions"].include?(version) - create_version(description) - end - - def files - request(:get, - {}, - "packages", - @subject, - @repository, - @package, - "versions", - @version, - "files") - end - - def upload(path, destination_path) - sha256 = Digest::SHA256.file(path).hexdigest - headers = { - "X-Bintray-Override" => "1", - "X-Bintray-Package" => @package, - "X-Bintray-Publish" => "1", - "X-Bintray-Version" => 
@version, - "X-Checksum-Sha2" => sha256, - "Content-Length" => File.size(path).to_s, - } - File.open(path, "rb") do |input| - request(:put, - headers, - "content", - @subject, - @repository, - destination_path) do - input - end - end - end - - def delete(path) - request(:delete, - {}, - "content", - @subject, - @repository, - path) - end - - private - def build_request_url(*components) - if components.last.is_a?(Hash) - parameters = components.pop - else - parameters = nil - end - path = components.join("/") - url = "https://bintray.com/api/v1/#{path}" - if parameters - separator = "?" - parameters.each do |key, value| - url << "#{separator}#{CGI.escape(key)}=#{CGI.escape(value)}" - separator = "&" - end - end - URI(url) - end - - def build_request(method, url, headers, &block) - case method - when :get - request = Net::HTTP::Get.new(url, headers) - when :post - request = Net::HTTP::Post.new(url, headers) - when :put - request = Net::HTTP::Put.new(url, headers) - when :delete - request = Net::HTTP::Delete.new(url, headers) - else - raise "unsupported HTTP method: #{method.inspect}" - end - request.basic_auth(@user, @api_key) if @user and @api_key - if block_given? - request["Content-Type"] = "application/json" - body = yield - if body.is_a?(String) - request.body = body - else - request.body_stream = body - end - end - request - end - end - - module HashChekable - def same_hash?(path, sha256) - return false unless File.exist?(path) - Digest::SHA256.file(path).hexdigest == sha256 - end - end - - class BintrayDownloader - include HashChekable - - def initialize(repository:, - distribution:, - version:, - rc: nil, - destination:, - user:, - api_key:) - @repository = repository - @distribution = distribution - @version = version - @rc = rc - @destination = destination - @user = user - @api_key = api_key - end - - def download - client.ensure_repository - - progress_label = "Downloading: #{package} #{full_version}" - progress_reporter = ProgressReporter.new(progress_label) - pool = ThreadPool.new(:bintray) do |path, output_path| - download_file(path, output_path) - progress_reporter.advance - end - target_files.each do |file| - path = file["path"] - path_without_package = path.split("/", 2)[1..-1].join("/") - output_path = "#{@destination}/#{path_without_package}" - yield(output_path) - sha256 = file["sha256"] - next if same_hash?(output_path, sha256) - output_dir = File.dirname(output_path) - FileUtils.mkdir_p(output_dir) - progress_reporter.increment_max - pool << [path, output_path] - end - pool.join - progress_reporter.finish - end - - private - def package - if @rc - "#{@distribution}-rc" - else - @distribution - end - end - - def full_version - if @rc - "#{@version}-rc#{@rc}" - else - @version - end - end - - def client(options={}) - default_options = { - repository: @repository, - package: package, - version: full_version, - user: @user, - api_key: @api_key, - } - BintrayClient.new(default_options.merge(options)) - end - - def target_files - begin - client.files - rescue BintrayClient::Error - [] - end - end - - def download_file(path, output_path) - max_n_retries = 5 - n_retries = 0 - url = URI("https://dl.bintray.com/#{@repository}/#{path}") - begin - download_url(url, output_path) - rescue OpenSSL::OpenSSLError, - SocketError, - SystemCallError, - Timeout::Error => error - n_retries += 1 - if n_retries <= max_n_retries - $stderr.puts - $stderr.puts("Retry #{n_retries}: #{url}: " + - "#{error.class}: #{error.message}") - retry - else - raise - end - end - end - - def download_url(url, 
output_path) - loop do - http = Net::HTTP.new(url.host, url.port) - http.set_debug_output($stderr) if ENV["DEBUG"] - http.use_ssl = true - http.start do |http| - request = Net::HTTP::Get.new(url) - http.request(request) do |response| - case response - when Net::HTTPSuccess - save_response(response, output_path) - return - when Net::HTTPRedirection - url = URI(response["Location"]) - when Net::HTTPNotFound - $stderr.puts(build_download_error_message(url, response)) - return - else - raise build_download_error_message(url, response) - end - end - end - end - end - - def save_response(response, output_path) - File.open(output_path, "wb") do |output| - response.read_body do |chunk| - output.print(chunk) - end - end - last_modified = response["Last-Modified"] - if last_modified - FileUtils.touch(output_path, mtime: Time.rfc2822(last_modified)) - end - end - - def build_download_error_message(url, response) - message = "failed to download: " - message << "#{url}: #{response.message} #{response.code}:\n" - message << response.body - message - end - end - - class BintrayUploader - include HashChekable - - def initialize(repository:, - distribution:, - distribution_label:, - version:, - rc: nil, - source:, - destination_prefix: "", - user:, - api_key:) - @repository = repository - @distribution = distribution - @distribution_label = distribution_label - @version = version - @rc = rc - @source = source - @destination_prefix = destination_prefix - @user = user - @api_key = api_key - end - - def upload - client.ensure_repository - client.ensure_package(package_description) - client.ensure_version(full_version, version_description) - - progress_label = "Uploading: #{package} #{full_version}" - progress_reporter = ProgressReporter.new(progress_label) - pool = ThreadPool.new(:bintray) do |path, relative_path| - upload_file(path, relative_path) - progress_reporter.advance - end - - files = existing_files - source = Pathname(@source) - source.glob("**/*") do |path| - next if path.directory? 
- destination_path = - "#{package}/#{@destination_prefix}#{path.relative_path_from(source)}" - file = files[destination_path] - next if file and same_hash?(path.to_s, file["sha256"]) - progress_reporter.increment_max - pool << [path, destination_path] - end - pool.join - progress_reporter.finish - end - - private - def package - if @rc - "#{@distribution}-rc" - else - @distribution - end - end - - def full_version - if @rc - "#{@version}-rc#{@rc}" - else - @version - end - end - - def package_description - if @rc - release_type = "RC" - else - release_type = "Release" - end - case @distribution - when "debian", "ubuntu" - "#{release_type} deb packages for #{@distribution_label}" - when "centos" - "#{release_type} RPM packages for #{@distribution_label}" - else - "#{release_type} binaries for #{@distribution_label}" - end - end - - def version_description - if @rc - "Apache Arrow #{@version} RC#{@rc} for #{@distribution_label}" - else - "Apache Arrow #{@version} for #{@distribution_label}" - end - end - - def client - BintrayClient.new(repository: @repository, - package: package, - version: full_version, - user: @user, - api_key: @api_key) - end - - def existing_files - files = {} - client.files.each do |file| - files[file["path"]] = file - end - files - end - - def upload_file(path, destination_path) - max_n_retries = 3 - n_retries = 0 - begin - begin - client.upload(path, destination_path) - rescue BintrayClient::Error => error - case error.response - when Net::HTTPConflict - n_retries += 1 - if n_retries <= max_n_retries - client.delete(destination_path) - retry - else - $stderr.puts(error) - end - else - $stderr.puts(error) - end - end - rescue OpenSSL::OpenSSLError, - SocketError, - SystemCallError, - Timeout::Error => error - n_retries += 1 - if n_retries <= max_n_retries - $stderr.puts - $stderr.puts("Retry #{n_retries}: #{path}: " + - "#{error.class}: #{error.message}") - retry - else - raise - end - end - end - end - - def define - define_apt_tasks - define_yum_tasks - define_python_tasks - define_nuget_tasks - define_summary_tasks - end - - private - def env_value(name) - value = ENV[name] - value = yield(name) if value.nil? and block_given? - raise "Specify #{name} environment variable" if value.nil? - value - end - - def verbose? - ENV["VERBOSE"] == "yes" - end - - def default_output - if verbose? 
- nil - else - IO::NULL - end - end - - def gpg_key_id - env_value("GPG_KEY_ID") - end - - def shorten_gpg_key_id(id) - id[-8..-1] - end - - def rpm_gpg_key_package_name(id) - "gpg-pubkey-#{shorten_gpg_key_id(id).downcase}" - end - - def bintray_user - env_value("BINTRAY_USER") - end - - def bintray_api_key - env_value("BINTRAY_API_KEY") - end - - def bintray_repository - env_value("BINTRAY_REPOSITORY") - end - - def source_bintray_repository - env_value("SOURCE_BINTRAY_REPOSITORY") do - bintray_repository - end - end - - def artifacts_dir - env_value("ARTIFACTS_DIR") - end - - def version - env_value("VERSION") - end - - def rc - env_value("RC") - end - - def full_version - "#{version}-rc#{rc}" - end - - def valid_sign?(path, sign_path) - IO.pipe do |input, output| - begin - sh({"LANG" => "C"}, - "gpg", - "--verify", - sign_path, - path, - out: default_output, - err: output, - verbose: false) - rescue - return false - end - output.close - /Good signature/ === input.read - end - end - - def sign(source_path, destination_path) - if File.exist?(destination_path) - return if valid_sign?(source_path, destination_path) - rm(destination_path, verbose: false) - end - sh("gpg", - "--detach-sig", - "--local-user", gpg_key_id, - "--output", destination_path, - source_path, - out: default_output, - verbose: verbose?) - end - - def sha512(source_path, destination_path) - if File.exist?(destination_path) - sha512 = File.read(destination_path).split[0] - return if Digest::SHA512.file(source_path).hexdigest == sha512 - end - absolute_destination_path = File.expand_path(destination_path) - Dir.chdir(File.dirname(source_path)) do - sh("shasum", - "--algorithm", "512", - File.basename(source_path), - out: absolute_destination_path, - verbose: verbose?) - end - end - - def sign_dir(label, dir) - progress_label = "Signing: #{label}" - progress_reporter = ProgressReporter.new(progress_label) - - target_paths = [] - Pathname(dir).glob("**/*") do |path| - next if path.directory? - case path.extname - when ".asc", ".sha512" - next - end - progress_reporter.increment_max - target_paths << path.to_s - end - target_paths.each do |path| - sign(path, "#{path}.asc") - sha512(path, "#{path}.sha512") - progress_reporter.advance - end - progress_reporter.finish - end - - def download_distribution(distribution, - destination, - with_source_repository: false) - existing_paths = {} - Pathname(destination).glob("**/*") do |path| - next if path.directory? - existing_paths[path.to_s] = true - end - if with_source_repository - source_client = BintrayClient.new(repository: source_bintray_repository, - package: distribution, - user: bintray_user, - api_key: bintray_api_key) - source_client.package_versions[0, 10].each do |source_version| - downloader = BintrayDownloader.new(repository: source_bintray_repository, - distribution: distribution, - version: source_version, - destination: destination, - user: bintray_user, - api_key: bintray_api_key) - downloader.download do |output_path| - existing_paths.delete(output_path) - end - end - end - downloader = BintrayDownloader.new(repository: bintray_repository, - distribution: distribution, - version: version, - rc: rc, - destination: destination, - user: bintray_user, - api_key: bintray_api_key) - downloader.download do |output_path| - existing_paths.delete(output_path) - end - existing_paths.each_key do |path| - rm_f(path, verbose: verbose?) 
- end - end - - def same_content?(path1, path2) - File.exist?(path1) and - File.exist?(path2) and - Digest::SHA256.file(path1) == Digest::SHA256.file(path2) - end - - def copy_artifact(source_path, - destination_path, - progress_reporter) - return if same_content?(source_path, destination_path) - progress_reporter.increment_max - destination_dir = File.dirname(destination_path) - unless File.exist?(destination_dir) - mkdir_p(destination_dir, verbose: verbose?) - end - cp(source_path, destination_path, verbose: verbose?) - progress_reporter.advance - end - - def tmp_dir - "binary/tmp" - end - - def rc_dir - "#{tmp_dir}/rc" - end - - def release_dir - "#{tmp_dir}/release" - end - - def deb_dir - "#{rc_dir}/deb/#{full_version}" - end - - def apt_repository_label - "Apache Arrow" - end - - def apt_repository_description - "Apache Arrow packages" - end - - def apt_rc_repositories_dir - "#{rc_dir}/apt/repositories" - end - - def apt_release_repositories_dir - "#{release_dir}/apt/repositories" - end - - def available_apt_targets - [ - ["debian", "buster", "main"], - ["debian", "bullseye", "main"], - ["ubuntu", "bionic", "main"], - ["ubuntu", "focal", "main"], - ["ubuntu", "groovy", "main"], - ] - end - - def apt_distribution_label(distribution) - case distribution - when "debian" - "Debian" - when "ubuntu" - "Ubuntu" - else - distribution - end - end - - def apt_targets - env_apt_targets = (ENV["APT_TARGETS"] || "").split(",") - if env_apt_targets.empty? - available_apt_targets - else - available_apt_targets.select do |distribution, code_name, component| - env_apt_targets.any? do |env_apt_target| - env_apt_target.start_with?("#{distribution}-#{code_name}") - end - end - end - end - - def apt_distributions - apt_targets.collect(&:first).uniq - end - - def apt_architectures - [ - "amd64", - "arm64", - ] - end - - def define_deb_tasks - directory deb_dir - - namespace :deb do - desc "Copy deb packages" - task :copy => deb_dir do - apt_targets.each do |distribution, code_name, component| - progress_label = "Copying: #{distribution} #{code_name}" - progress_reporter = ProgressReporter.new(progress_label) - - source_dir_prefix = "#{artifacts_dir}/#{distribution}-#{code_name}" - Dir.glob("#{source_dir_prefix}*/**/*") do |path| - next if File.directory?(path) - base_name = File.basename(path) - if base_name.start_with?("apache-arrow-archive-keyring") - package_name = "apache-arrow-archive-keyring" - else - package_name = "apache-arrow" - end - distribution_dir = [ - deb_dir, - distribution, - ].join("/") - destination_path = [ - distribution_dir, - "pool", - code_name, - component, - package_name[0], - package_name, - base_name, - ].join("/") - copy_artifact(path, - destination_path, - progress_reporter) - case base_name - when /\A[^_]+-archive-keyring_.*\.deb\z/ - latest_archive_keyring_package_path = [ - distribution_dir, - "#{package_name}-latest-#{code_name}.deb" - ].join("/") - copy_artifact(path, - latest_archive_keyring_package_path, - progress_reporter) - end - end - progress_reporter.finish - end - end - - desc "Sign deb packages" - task :sign => deb_dir do - apt_distributions.each do |distribution| - distribution_dir = "#{deb_dir}/#{distribution}" - Dir.glob("#{distribution_dir}/**/*.dsc") do |path| - begin - sh({"LANG" => "C"}, - "gpg", - "--verify", - path, - out: IO::NULL, - err: IO::NULL, - verbose: false) - rescue - sh("debsign", - "--no-re-sign", - "-k#{gpg_key_id}", - path, - out: default_output, - verbose: verbose?) 
- end - end - sign_dir(distribution, distribution_dir) - end - end - - desc "Upload deb packages" - task :upload do - apt_distributions.each do |distribution| - distribution_dir = "#{deb_dir}/#{distribution}" - distribution_label = apt_distribution_label(distribution) - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - rc: rc, - source: distribution_dir, - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - - desc "Release deb packages" - deb_tasks = [ - "deb:copy", - "deb:sign", - "deb:upload", - ] - task :deb => deb_tasks - end - - def generate_apt_release(dists_dir, code_name, component, architecture) - dir = "#{dists_dir}/#{component}/" - if architecture == "source" - dir << architecture - else - dir << "binary-#{architecture}" - end - - mkdir_p(dir, verbose: verbose?) - File.open("#{dir}/Release", "w") do |release| - release.puts(<<-RELEASE) -Archive: #{code_name} -Component: #{component} -Origin: #{apt_repository_label} -Label: #{apt_repository_label} -Architecture: #{architecture} - RELEASE - end - end - - def generate_apt_ftp_archive_generate_conf(code_name, component) - conf = <<-CONF -Dir::ArchiveDir "."; -Dir::CacheDir "."; -TreeDefault::Directory "pool/#{code_name}/#{component}"; -TreeDefault::SrcDirectory "pool/#{code_name}/#{component}"; -Default::Packages::Extensions ".deb"; -Default::Packages::Compress ". gzip xz"; -Default::Sources::Compress ". gzip xz"; -Default::Contents::Compress "gzip"; - CONF - - apt_architectures.each do |architecture| - conf << <<-CONF - -BinDirectory "dists/#{code_name}/#{component}/binary-#{architecture}" { - Packages "dists/#{code_name}/#{component}/binary-#{architecture}/Packages"; - Contents "dists/#{code_name}/#{component}/Contents-#{architecture}"; - SrcPackages "dists/#{code_name}/#{component}/source/Sources"; -}; - CONF - end - - conf << <<-CONF - -Tree "dists/#{code_name}" { - Sections "#{component}"; - Architectures "#{apt_architectures.join(" ")} source"; -}; - CONF - - conf - end - - def generate_apt_ftp_archive_release_conf(code_name, component) - <<-CONF -APT::FTPArchive::Release::Origin "#{apt_repository_label}"; -APT::FTPArchive::Release::Label "#{apt_repository_label}"; -APT::FTPArchive::Release::Architectures "#{apt_architectures.join(" ")}"; -APT::FTPArchive::Release::Codename "#{code_name}"; -APT::FTPArchive::Release::Suite "#{code_name}"; -APT::FTPArchive::Release::Components "#{component}"; -APT::FTPArchive::Release::Description "#{apt_repository_description}"; - CONF - end - - def apt_update(repositories_dir) - apt_targets.each do |distribution, code_name, component| - base_dir = "#{repositories_dir}/#{distribution}" - pool_dir = "#{base_dir}/pool/#{code_name}" - next unless File.exist?(pool_dir) - dists_dir = "#{base_dir}/dists/#{code_name}" - rm_rf(dists_dir, verbose: verbose?) - generate_apt_release(dists_dir, code_name, component, "source") - apt_architectures.each do |architecture| - generate_apt_release(dists_dir, code_name, component, architecture) - end - - generate_conf_file = Tempfile.new("apt-ftparchive-generate.conf") - File.open(generate_conf_file.path, "w") do |conf| - conf.puts(generate_apt_ftp_archive_generate_conf(code_name, - component)) - end - cd(base_dir, verbose: verbose?) do - sh("apt-ftparchive", - "generate", - generate_conf_file.path, - out: default_output, - verbose: verbose?) 
- end - - Dir.glob("#{dists_dir}/Release*") do |release| - rm_f(release, verbose: verbose?) - end - Dir.glob("#{base_dir}/*.db") do |db| - rm_f(db, verbose: verbose?) - end - release_conf_file = Tempfile.new("apt-ftparchive-release.conf") - File.open(release_conf_file.path, "w") do |conf| - conf.puts(generate_apt_ftp_archive_release_conf(code_name, - component)) - end - release_file = Tempfile.new("apt-ftparchive-release") - sh("apt-ftparchive", - "-c", release_conf_file.path, - "release", - dists_dir, - out: release_file.path, - verbose: verbose?) - release_path = "#{dists_dir}/Release" - signed_release_path = "#{release_path}.gpg" - in_release_path = "#{dists_dir}/InRelease" - mv(release_file.path, release_path, verbose: verbose?) - chmod(0644, release_path, verbose: verbose?) - sh("gpg", - "--sign", - "--detach-sign", - "--armor", - "--local-user", gpg_key_id, - "--output", signed_release_path, - release_path, - out: default_output, - verbose: verbose?) - sh("gpg", - "--clear-sign", - "--local-user", gpg_key_id, - "--output", in_release_path, - release_path, - out: default_output, - verbose: verbose?) - end - end - - def define_apt_rc_tasks - directory apt_rc_repositories_dir - - namespace :apt do - namespace :rc do - desc "Download deb files for RC APT repositories" - task :download => apt_rc_repositories_dir do - apt_distributions.each do |distribution| - download_distribution(distribution, - "#{apt_rc_repositories_dir}/#{distribution}", - with_source_repository: true) - end - end - - desc "Update RC APT repositories" - task :update do - apt_update(apt_rc_repositories_dir) - apt_targets.each do |distribution, code_name, component| - base_dir = "#{apt_rc_repositories_dir}/#{distribution}" - dists_dir = "#{base_dir}/dists/#{code_name}" - next unless File.exist?(dists_dir) - sign_dir("#{distribution} #{code_name}", - dists_dir) - end - end - - desc "Upload RC APT repositories" - task :upload => apt_rc_repositories_dir do - apt_distributions.each do |distribution| - dists_dir = "#{apt_rc_repositories_dir}/#{distribution}/dists" - distribution_label = apt_distribution_label(distribution) - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - rc: rc, - source: dists_dir, - destination_prefix: "dists/", - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - - desc "Release RC APT repositories" - apt_rc_tasks = [ - "apt:rc:download", - "apt:rc:update", - "apt:rc:upload", - ] - task :rc => apt_rc_tasks - end - end - - def define_apt_release_tasks - directory apt_release_repositories_dir - - namespace :apt do - namespace :release do - desc "Download RC APT repositories" - task :download => apt_release_repositories_dir do - apt_distributions.each do |distribution| - distribution_dir = "#{apt_release_repositories_dir}/#{distribution}" - download_distribution(distribution, distribution_dir) - end - end - - desc "Upload release APT repositories" - task :upload => apt_release_repositories_dir do - apt_distributions.each do |distribution| - distribution_dir = "#{apt_release_repositories_dir}/#{distribution}" - distribution_label = apt_distribution_label(distribution) - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - source: distribution_dir, - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - - desc "Release APT 
repositories" - apt_release_tasks = [ - "apt:release:download", - "apt:release:upload", - ] - task :release => apt_release_tasks - end - end - - def define_apt_tasks - define_deb_tasks - define_apt_rc_tasks - define_apt_release_tasks - end - - def rpm_dir - "#{rc_dir}/rpm/#{full_version}" - end - - def yum_rc_repositories_dir - "#{rc_dir}/yum/repositories" - end - - def yum_release_repositories_dir - "#{release_dir}/yum/repositories" - end - - def available_yum_targets - [ - ["centos", "7"], - ["centos", "8"], - ] - end - - def yum_distribution_label(distribution) - case distribution - when "centos" - "CentOS" - else - distribution - end - end - - def yum_targets - env_yum_targets = (ENV["YUM_TARGETS"] || "").split(",") - if env_yum_targets.empty? - available_yum_targets - else - available_yum_targets.select do |distribution, distribution_version| - env_yum_targets.any? do |env_yum_target| - env_yum_target.start_with?("#{distribution}-#{distribution_version}") - end - end - end - end - - def yum_distributions - yum_targets.collect(&:first).uniq - end - - def yum_architectures - [ - "aarch64", - "x86_64", - ] - end - - def signed_rpm?(rpm) - IO.pipe do |input, output| - system("rpm", "--checksig", rpm, out: output) - output.close - signature = input.gets.sub(/\A#{Regexp.escape(rpm)}: /, "") - signature.split.include?("signatures") - end - end - - def sign_rpms(directory) - thread_pool = ThreadPool.new(:gpg) do |rpm| - unless signed_rpm?(rpm) - sh("rpm", - "-D", "_gpg_name #{gpg_key_id}", - "-D", "__gpg_check_password_cmd /bin/true true", - "--resign", - rpm, - out: default_output, - verbose: verbose?) - end - end - Dir.glob("#{directory}/**/*.rpm") do |rpm| - thread_pool << rpm - end - thread_pool.join - end - - def rpm_sign(directory) - unless system("rpm", "-q", - rpm_gpg_key_package_name(gpg_key_id), - out: IO::NULL) - gpg_key = Tempfile.new(["apache-arrow-binary", ".asc"]) - sh("gpg", - "--armor", - "--export", gpg_key_id, - out: gpg_key.path, - verbose: verbose?) - sh("rpm", - "--import", gpg_key.path, - out: default_output, - verbose: verbose?) - gpg_key.close! 
- end - - yum_targets.each do |distribution, distribution_version| - source_dir = [ - directory, - distribution, - distribution_version, - ].join("/") - sign_rpms(source_dir) - end - end - - def define_rpm_tasks - directory rpm_dir - - namespace :rpm do - desc "Copy RPM packages" - task :copy => rpm_dir do - yum_targets.each do |distribution, distribution_version| - progress_label = "Copying: #{distribution} #{distribution_version}" - progress_reporter = ProgressReporter.new(progress_label) - - destination_prefix = [ - rpm_dir, - distribution, - distribution_version, - ].join("/") - source_dir_prefix = - "#{artifacts_dir}/#{distribution}-#{distribution_version}" - Dir.glob("#{source_dir_prefix}*/**/*") do |path| - next if File.directory?(path) - base_name = File.basename(path) - type = base_name.split(".")[-2] - destination_paths = [] - case type - when "src" - destination_paths << [ - destination_prefix, - "Source", - "SPackages", - base_name, - ].join("/") - when "noarch" - yum_architectures.each do |architecture| - destination_paths << [ - destination_prefix, - architecture, - "Packages", - base_name, - ].join("/") - end - else - destination_paths << [ - destination_prefix, - type, - "Packages", - base_name, - ].join("/") - end - destination_paths.each do |destination_path| - copy_artifact(path, - destination_path, - progress_reporter) - end - case base_name - when /\A(apache-arrow-release)-.*\.noarch\.rpm\z/ - package_name = $1 - latest_release_package_path = [ - destination_prefix, - "#{package_name}-latest.rpm" - ].join("/") - copy_artifact(path, - latest_release_package_path, - progress_reporter) - end - end - - progress_reporter.finish - end - end - - desc "Sign RPM packages" - task :sign do - rpm_sign(rpm_dir) - yum_targets.each do |distribution, distribution_version| - source_dir = [ - rpm_dir, - distribution, - distribution_version, - ].join("/") - sign_dir("#{distribution}-#{distribution_version}", - source_dir) - end - end - - desc "Upload RPM packages" - task :upload do - yum_distributions.each do |distribution| - distribution_dir = "#{rpm_dir}/#{distribution}" - distribution_label = yum_distribution_label(distribution) - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - rc: rc, - source: distribution_dir, - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - - desc "Release RPM packages" - rpm_tasks = [ - "rpm:copy", - "rpm:sign", - "rpm:upload", - ] - task :rpm => rpm_tasks - end - - def yum_update(repositories_dir) - yum_distributions.each do |distribution| - distribution_dir = "#{repositories_dir}/#{distribution}" - Dir.glob("#{distribution_dir}/**/repodata") do |repodata| - rm_rf(repodata, verbose: verbose?) - end - end - - yum_targets.each do |distribution, distribution_version| - base_dir = [ - repositories_dir, - distribution, - distribution_version, - ].join("/") - base_dir = Pathname(base_dir) - next unless base_dir.directory? - base_dir.glob("*") do |arch_dir| - next unless arch_dir.directory? - sh(ENV["CREATEREPO"] || "createrepo", - "--update", - arch_dir.to_s, - out: default_output, - verbose: verbose?) 
- end - end - end - - def define_yum_rc_tasks - directory yum_rc_repositories_dir - - namespace :yum do - namespace :rc do - desc "Download RPM files for RC Yum repositories" - task :download => yum_rc_repositories_dir do - yum_distributions.each do |distribution| - distribution_dir = "#{yum_rc_repositories_dir}/#{distribution}" - download_distribution(distribution, - distribution_dir, - with_source_repository: true) - end - end - - desc "Update RC Yum repositories" - task :update => yum_rc_repositories_dir do - yum_update(yum_rc_repositories_dir) - yum_targets.each do |distribution, distribution_version| - base_dir = [ - yum_rc_repositories_dir, - distribution, - distribution_version, - ].join("/") - base_dir = Pathname(base_dir) - next unless base_dir.directory? - base_dir.glob("*") do |arch_dir| - next unless arch_dir.directory? - sign_label = - "#{distribution}-#{distribution_version} #{arch_dir.basename}" - sign_dir(sign_label, - arch_dir.to_s) - end - end - end - - desc "Upload RC Yum repositories" - task :upload => yum_rc_repositories_dir do - yum_targets.each do |distribution, distribution_version| - distribution_label = yum_distribution_label(distribution) - base_dir = [ - yum_rc_repositories_dir, - distribution, - distribution_version, - ].join("/") - base_dir = Pathname(base_dir) - base_dir.glob("**/repodata") do |repodata_dir| - relative_dir = [ - distribution_version, - repodata_dir.relative_path_from(base_dir).to_s - ].join("/") - uploader = - BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - rc: rc, - source: repodata_dir.to_s, - destination_prefix: "#{relative_dir}/", - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - end - - desc "Release RC Yum packages" - yum_rc_tasks = [ - "yum:rc:download", - "yum:rc:update", - "yum:rc:upload", - ] - task :rc => yum_rc_tasks - end - end - - def define_yum_release_tasks - directory yum_release_repositories_dir - - namespace :yum do - namespace :release do - desc "Download RC Yum repositories" - task :download => yum_release_repositories_dir do - yum_distributions.each do |distribution| - distribution_dir = "#{yum_release_repositories_dir}/#{distribution}" - download_distribution(distribution, distribution_dir) - end - end - - desc "Upload release Yum repositories" - task :upload => yum_release_repositories_dir do - yum_distributions.each do |distribution| - distribution_dir = "#{yum_release_repositories_dir}/#{distribution}" - distribution_label = yum_distribution_label(distribution) - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: distribution, - distribution_label: distribution_label, - version: version, - source: distribution_dir, - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - end - - desc "Release Yum packages" - yum_release_tasks = [ - "yum:release:download", - "yum:release:upload", - ] - task :release => yum_release_tasks - end - end - - def define_yum_tasks - define_rpm_tasks - define_yum_rc_tasks - define_yum_release_tasks - end - - def define_generic_data_rc_tasks(label, - id, - rc_dir, - target_files_glob) - directory rc_dir - - namespace id do - namespace :rc do - desc "Copy #{label} packages" - task :copy => rc_dir do - progress_label = "Copying: #{label}" - progress_reporter = ProgressReporter.new(progress_label) - - Pathname(artifacts_dir).glob(target_files_glob) do |path| - next if path.directory? 
- destination_path = [ - rc_dir, - path.basename.to_s, - ].join("/") - copy_artifact(path, destination_path, progress_reporter) - end - - progress_reporter.finish - end - - desc "Sign #{label} packages" - task :sign => rc_dir do - sign_dir(label, rc_dir) - end - - desc "Upload #{label} packages" - task :upload do - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: id.to_s, - distribution_label: label, - version: version, - rc: rc, - source: rc_dir, - destination_prefix: "#{full_version}/", - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - - desc "Release RC #{label} packages" - rc_tasks = [ - "#{id}:rc:copy", - "#{id}:rc:sign", - "#{id}:rc:upload", - ] - task :rc => rc_tasks - end - end - - def define_generic_data_release_tasks(label, id, release_dir) - directory release_dir - - namespace id do - namespace :release do - desc "Download RC #{label} packages" - task :download => release_dir do - download_distribution(id.to_s, release_dir) - end - - desc "Upload release #{label} packages" - task :upload => release_dir do - packages_dir = "#{release_dir}/#{full_version}" - uploader = BintrayUploader.new(repository: bintray_repository, - distribution: id.to_s, - distribution_label: label, - version: version, - source: packages_dir, - destination_prefix: "#{version}/", - user: bintray_user, - api_key: bintray_api_key) - uploader.upload - end - end - - desc "Release #{label} packages" - release_tasks = [ - "#{id}:release:download", - "#{id}:release:upload", - ] - task :release => release_tasks - end - end - - def define_generic_data_tasks(label, - id, - rc_dir, - release_dir, - target_files_glob) - define_generic_data_rc_tasks(label, id, rc_dir, target_files_glob) - define_generic_data_release_tasks(label, id, release_dir) - end - - def define_python_tasks - define_generic_data_tasks("Python", - :python, - "#{rc_dir}/python/#{full_version}", - "#{release_dir}/python/#{full_version}", - "{conda-*,wheel-*,python-sdist}/**/*") - end - - def define_nuget_tasks - define_generic_data_tasks("NuGet", - :nuget, - "#{rc_dir}/nuget/#{full_version}", - "#{release_dir}/nuget/#{full_version}", - "nuget/**/*") - end - - def define_summary_tasks - namespace :summary do - desc "Show RC summary" - task :rc do - puts(<<-SUMMARY) -Success! The release candidate binaries are available here: - https://bintray.com/#{bintray_repository}/debian-rc/#{full_version} - https://bintray.com/#{bintray_repository}/ubuntu-rc/#{full_version} - https://bintray.com/#{bintray_repository}/centos-rc/#{full_version} - https://bintray.com/#{bintray_repository}/python-rc/#{full_version} - https://bintray.com/#{bintray_repository}/nuget-rc/#{full_version} - SUMMARY - end - - desc "Show release summary" - task :release do - puts(<<-SUMMARY) -Success! The release binaries are available here: - https://bintray.com/#{bintray_repository}/debian/#{version} - https://bintray.com/#{bintray_repository}/ubuntu/#{version} - https://bintray.com/#{bintray_repository}/centos/#{version} - https://bintray.com/#{bintray_repository}/python/#{version} - https://bintray.com/#{bintray_repository}/nuget/#{version} - SUMMARY - end - end - end -end diff --git a/dev/release/binary/.dockerignore b/dev/release/binary/.dockerignore deleted file mode 100644 index f2c46d8ce2450..0000000000000 --- a/dev/release/binary/.dockerignore +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -/tmp/ diff --git a/dev/release/binary/Dockerfile b/dev/release/binary/Dockerfile deleted file mode 100644 index 26ef3f9e8c058..0000000000000 --- a/dev/release/binary/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:18.04 - -ENV DEBIAN_FRONTEND noninteractive - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - apt-utils \ - createrepo \ - devscripts \ - gpg \ - locales \ - openssh-server \ - rake \ - rpm \ - ruby \ - sudo && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* - -RUN locale-gen en_US.UTF-8 - -RUN mkdir -p /run/sshd -RUN echo "StreamLocalBindUnlink yes" >> /etc/ssh/sshd_config - -ENV ARROW_USER arrow -ENV ARROW_UID 10000 - -RUN \ - groupadd --gid ${ARROW_UID} ${ARROW_USER} && \ - useradd --uid ${ARROW_UID} --gid ${ARROW_UID} --create-home ${ARROW_USER} && \ - mkdir -p /home/arrow/.gnupg /home/arrow/.ssh && \ - chown -R arrow: /home/arrow/.gnupg /home/arrow/.ssh && \ - chmod -R og-rwx /home/arrow/.gnupg /home/arrow/.ssh && \ - echo "${ARROW_USER} ALL=(ALL:ALL) NOPASSWD:ALL" | \ - EDITOR=tee visudo -f /etc/sudoers.d/arrow - -COPY id_rsa.pub /home/arrow/.ssh/authorized_keys -RUN \ - chown -R arrow: /home/arrow/.ssh && \ - chmod -R og-rwx /home/arrow/.ssh - -COPY runner.sh /home/arrow/runner.sh -RUN \ - chown -R arrow: /home/arrow/runner.sh && \ - chmod +x /home/arrow/runner.sh - -EXPOSE 22 diff --git a/dev/release/binary/runner.sh b/dev/release/binary/runner.sh deleted file mode 100755 index 5cf5033d68b55..0000000000000 --- a/dev/release/binary/runner.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -u - -export LANG=C - -target_dir=/host/binary/tmp -original_owner=$(stat --format=%u ${target_dir}) -original_group=$(stat --format=%g ${target_dir}) - -sudo -H chown -R ${USER}: ${target_dir} -restore_owner() { - sudo -H chown -R ${original_owner}:${original_group} ${target_dir} -} -trap restore_owner EXIT - -cd /host - -"$@" diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py deleted file mode 100644 index e30d72bddd7f8..0000000000000 --- a/dev/release/check-rat-report.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/python -############################################################################## -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -############################################################################## -import fnmatch -import re -import sys -import xml.etree.ElementTree as ET - -if len(sys.argv) != 3: - sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % - sys.argv[0]) - sys.exit(1) - -exclude_globs_filename = sys.argv[1] -xml_filename = sys.argv[2] - -globs = [line.strip() for line in open(exclude_globs_filename, "r")] - -tree = ET.parse(xml_filename) -root = tree.getroot() -resources = root.findall('resource') - -all_ok = True -for r in resources: - approvals = r.findall('license-approval') - if not approvals or approvals[0].attrib['name'] == 'true': - continue - clean_name = re.sub('^[^/]+/', '', r.attrib['name']) - excluded = False - for g in globs: - if fnmatch.fnmatch(clean_name, g): - excluded = True - break - if not excluded: - sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % ( - clean_name, r.attrib['name'], approvals[0].attrib['name'])) - all_ok = False - -if not all_ok: - sys.exit(1) - -print('OK') -sys.exit(0) diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py deleted file mode 100755 index 5ed8ece778303..0000000000000 --- a/dev/release/download_rc_binaries.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re - -import argparse -import concurrent.futures as cf -import functools -import hashlib -import json -import os -import subprocess -import urllib.request - - -BINTRAY_API_ROOT = "https://bintray.com/api/v1" -BINTRAY_DL_ROOT = "https://dl.bintray.com" -BINTRAY_REPO = os.getenv('BINTRAY_REPOSITORY', 'apache/arrow') -DEFAULT_PARALLEL_DOWNLOADS = 8 - - -class Bintray: - - def __init__(self, repo=BINTRAY_REPO): - self.repo = repo - - def get_file_list(self, package, version): - url = os.path.join(BINTRAY_API_ROOT, 'packages', self.repo, package, - 'versions', version, 'files') - request = urllib.request.urlopen(url).read() - return json.loads(request) - - def download_files(self, files, dest=None, num_parallel=None, - re_match=None): - """ - Download files from Bintray in parallel. If file already exists, will - overwrite if the checksum does not match what Bintray says it should be - - Parameters - ---------- - files : List[Dict] - File listing from Bintray - dest : str, default None - Defaults to current working directory - num_parallel : int, default 8 - Number of files to download in parallel. If set to None, uses - default - """ - if dest is None: - dest = os.getcwd() - if num_parallel is None: - num_parallel = DEFAULT_PARALLEL_DOWNLOADS - - if re_match is not None: - regex = re.compile(re_match) - files = [x for x in files if regex.match(x['path'])] - - if num_parallel == 1: - for path in files: - self._download_file(dest, path) - else: - parallel_map_terminate_early( - functools.partial(self._download_file, dest), - files, - num_parallel - ) - - def _download_file(self, dest, info): - relpath = info['path'] - - base, filename = os.path.split(relpath) - - dest_dir = os.path.join(dest, base) - os.makedirs(dest_dir, exist_ok=True) - - dest_path = os.path.join(dest_dir, filename) - - if os.path.exists(dest_path): - with open(dest_path, 'rb') as f: - sha256sum = hashlib.sha256(f.read()).hexdigest() - if sha256sum == info['sha256']: - print('Local file {} sha256 matches, skipping' - .format(dest_path)) - return - else: - print('Local file sha256 does not match, overwriting') - - print("Downloading {} to {}".format(relpath, dest_path)) - - bintray_abspath = os.path.join(BINTRAY_DL_ROOT, self.repo, relpath) - - cmd = [ - 'curl', '--fail', '--location', '--retry', '5', - '--output', dest_path, bintray_abspath - ] - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = proc.communicate() - if proc.returncode != 0: - raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}" - .format(relpath, stdout, stderr)) - - -def parallel_map_terminate_early(f, iterable, num_parallel): - tasks = [] - with cf.ProcessPoolExecutor(num_parallel) as pool: - for v in iterable: - tasks.append(pool.submit(functools.partial(f, v))) - - for task in cf.as_completed(tasks): - if task.exception() is not None: - e = task.exception() - for task in tasks: - task.cancel() - raise e - - -ARROW_PACKAGE_TYPES = ['centos', 'debian', 'nuget', 'python', 'ubuntu'] - - -def download_rc_binaries(version, rc_number, re_match=None, dest=None, - num_parallel=None, 
target_package_type=None): - bintray = Bintray() - - version_string = '{}-rc{}'.format(version, rc_number) - if target_package_type: - package_types = [target_package_type] - else: - package_types = ARROW_PACKAGE_TYPES - for package_type in package_types: - files = bintray.get_file_list('{}-rc'.format(package_type), - version_string) - bintray.download_files(files, re_match=re_match, dest=dest, - num_parallel=num_parallel) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Download release candidate binaries' - ) - parser.add_argument('version', type=str, help='The version number') - parser.add_argument('rc_number', type=int, - help='The release candidate number, e.g. 0, 1, etc') - parser.add_argument('-e', '--regexp', type=str, default=None, - help=('Regular expression to match on file names ' - 'to only download certain files')) - parser.add_argument('--dest', type=str, default=os.getcwd(), - help='The output folder for the downloaded files') - parser.add_argument('--num_parallel', type=int, default=8, - help='The number of concurrent downloads to do') - parser.add_argument('--package_type', type=str, default=None, - help='The package type to be downloaded') - args = parser.parse_args() - - download_rc_binaries(args.version, args.rc_number, dest=args.dest, - re_match=args.regexp, num_parallel=args.num_parallel, - target_package_type=args.package_type) diff --git a/dev/release/post-01-upload.sh b/dev/release/post-01-upload.sh deleted file mode 100755 index 4f8053de8c948..0000000000000 --- a/dev/release/post-01-upload.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -set -e -set -u - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc=$2 - -tmp_dir=tmp-apache-arrow-dist - -echo "Recreate temporary directory: ${tmp_dir}" -rm -rf ${tmp_dir} -mkdir -p ${tmp_dir} - -echo "Clone dev dist repository" -svn \ - co \ - https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-${version}-rc${rc} \ - ${tmp_dir}/dev - -echo "Clone release dist repository" -svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release - -echo "Copy ${version}-rc${rc} to release working copy" -release_version=arrow-${version} -mkdir -p ${tmp_dir}/release/${release_version} -cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ -svn add ${tmp_dir}/release/${release_version} - -echo "Keep only the three most recent versions" -old_releases=$( - svn ls ${tmp_dir}/release/ | \ - grep '^arrow-' | \ - sort --version-sort --reverse | \ - tail -n +4 -) -for old_release_version in $old_releases; do - echo "Remove old release ${old_release_version}" - svn delete ${tmp_dir}/release/${old_release_version} -done - -echo "Commit release" -svn ci -m "Apache Arrow ${version}" ${tmp_dir}/release - -echo "Clean up" -rm -rf ${tmp_dir} - -echo "Success! The release is available here:" -echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" diff --git a/dev/release/post-02-binary.sh b/dev/release/post-02-binary.sh deleted file mode 100755 index 9f531afad9fda..0000000000000 --- a/dev/release/post-02-binary.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc=$2 - -cd "${SOURCE_DIR}" - -: ${BINTRAY_REPOSITORY_CUSTOM:=${BINTRAY_REPOSITORY:-}} - -if [ ! -f .env ]; then - echo "You must create $(pwd)/.env" - echo "You can use $(pwd)/.env.example as template" - exit 1 -fi -. .env - -if [ -n "${BINTRAY_REPOSITORY_CUSTOM}" ]; then - BINTRAY_REPOSITORY=${BINTRAY_REPOSITORY_CUSTOM} -fi - -. binary-common.sh - -# By default deploy all artifacts. -# To deactivate one category, deactivate the category and all of its dependents. -# To explicitly select one category, set DEPLOY_DEFAULT=0 DEPLOY_X=1. 
-: ${DEPLOY_DEFAULT:=1} -: ${DEPLOY_CENTOS:=${DEPLOY_DEFAULT}} -: ${DEPLOY_DEBIAN:=${DEPLOY_DEFAULT}} -: ${DEPLOY_NUGET:=${DEPLOY_DEFAULT}} -: ${DEPLOY_PYTHON:=${DEPLOY_DEFAULT}} -: ${DEPLOY_UBUNTU:=${DEPLOY_DEFAULT}} - -rake_tasks=() -apt_targets=() -yum_targets=() -if [ ${DEPLOY_DEBIAN} -gt 0 ]; then - rake_tasks+=(apt:release) - apt_targets+=(debian) -fi -if [ ${DEPLOY_UBUNTU} -gt 0 ]; then - rake_tasks+=(apt:release) - apt_targets+=(ubuntu) -fi -if [ ${DEPLOY_CENTOS} -gt 0 ]; then - rake_tasks+=(yum:release) - yum_targets+=(centos) -fi -if [ ${DEPLOY_NUGET} -gt 0 ]; then - rake_tasks+=(nuget:release) -fi -if [ ${DEPLOY_PYTHON} -gt 0 ]; then - rake_tasks+=(python:release) -fi -rake_tasks+=(summary:release) - -tmp_dir=binary/tmp -mkdir -p "${tmp_dir}" - -docker_run \ - ./runner.sh \ - rake \ - "${rake_tasks[@]}" \ - APT_TARGETS=$(IFS=,; echo "${apt_targets[*]}") \ - ARTIFACTS_DIR="${tmp_dir}/artifacts" \ - BINTRAY_REPOSITORY=${BINTRAY_REPOSITORY} \ - RC=${rc} \ - VERSION=${version} \ - YUM_TARGETS=$(IFS=,; echo "${yum_targets[*]}") diff --git a/dev/release/post-03-website.sh b/dev/release/post-03-website.sh deleted file mode 100755 index b427142ea989e..0000000000000 --- a/dev/release/post-03-website.sh +++ /dev/null @@ -1,266 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -u - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARROW_DIR="${SOURCE_DIR}/../.." 
-ARROW_SITE_DIR="${ARROW_DIR}/../arrow-site" - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit 1 -fi - -previous_version=$1 -version=$2 - -branch_name=release-note-${version} -release_dir="${ARROW_SITE_DIR}/_release" -announce_file="${release_dir}/${version}.md" -versions_yml="${ARROW_SITE_DIR}/_data/versions.yml" - -pushd "${ARROW_SITE_DIR}" -git checkout master -git checkout -b ${branch_name} -popd - -pushd "${ARROW_DIR}" - -release_date=$(LANG=C date "+%-d %B %Y") -previous_tag_date=$(git log -n 1 --pretty=%aI apache-arrow-${previous_version}) -rough_previous_release_date=$(date --date "${previous_tag_date}" +%s) -rough_release_date=$(date +%s) -rough_n_development_months=$(( - (${rough_release_date} - ${rough_previous_release_date}) / (60 * 60 * 24 * 30) -)) - -git_tag=apache-arrow-${version} -git_range=apache-arrow-${previous_version}..${git_tag} - -committers_command_line="git shortlog -csn ${git_range}" -contributors_command_line="git shortlog -sn ${git_range}" - -committers=$(${committers_command_line}) -contributors=$(${contributors_command_line}) - -n_commits=$(git log --pretty=oneline ${git_range} | wc -l) -n_contributors=$(${contributors_command_line} | wc -l) - -git_tag_hash=$(git log -n 1 --pretty=%H ${git_tag}) - -popd - -pushd "${ARROW_SITE_DIR}" - -# Add announce for the current version -cat <<ANNOUNCE > "${announce_file}" ---- -layout: default -title: Apache Arrow ${version} Release -permalink: /release/${version}.html ---- - - -# Apache Arrow ${version} (${release_date}) - -This is a major release covering more than ${rough_n_development_months} months of development. - -## Download - -* [**Source Artifacts**][1] -* **Binary Artifacts** - * [For CentOS][2] - * [For Debian][3] - * [For Python][4] - * [For Ubuntu][5] -* [Git tag][6] - -## Contributors - -This release includes ${n_commits} commits from ${n_contributors} distinct contributors. - -\`\`\`console -$ ${contributors_command_line} -ANNOUNCE - -echo "${contributors}" >> "${announce_file}" - -cat <<ANNOUNCE >> "${announce_file}" -\`\`\` - -## Patch Committers - -The following Apache committers merged contributed patches to the repository. - -\`\`\`console -$ ${committers_command_line} -ANNOUNCE - -echo "${committers}" >> "${announce_file}" - -cat <<ANNOUNCE >> "${announce_file}" -\`\`\` - -## Changelog - -ANNOUNCE - -archery release changelog generate ${version} | \ - sed -e 's/^#/##/g' >> "${announce_file}" - -cat <<ANNOUNCE >> "${announce_file}" -[1]: https://www.apache.org/dyn/closer.lua/arrow/arrow-${version}/ -[2]: https://bintray.com/apache/arrow/centos/${version}/ -[3]: https://bintray.com/apache/arrow/debian/${version}/ -[4]: https://bintray.com/apache/arrow/python/${version}/ -[5]: https://bintray.com/apache/arrow/ubuntu/${version}/ -[6]: https://github.com/apache/arrow/releases/tag/apache-arrow-${version} -ANNOUNCE -git add "${announce_file}" - - -# Update index -pushd "${release_dir}" - -index_file=index.md -rm -f ${index_file} -announce_files="$(ls | sort --version-sort --reverse)" -cat <<INDEX > ${index_file} ---- -layout: default -title: Releases -permalink: /release/index.html ---- - - -# Apache Arrow Releases - -Navigate to the release page for downloads and the changelog. 
- -INDEX - -i=0 -for md_file in ${announce_files}; do - i=$((i + 1)) - title=$(grep '^# Apache Arrow' ${md_file} | sed -e 's/^# Apache Arrow //') - echo "* [${title}][${i}]" >> ${index_file} -done -echo >> ${index_file} - -i=0 -for md_file in ${announce_files}; do - i=$((i + 1)) - html_file=$(echo ${md_file} | sed -e 's/md$/html/') - echo "[${i}]: {{ site.baseurl }}/release/${html_file}" >> ${index_file} -done - -git add ${index_file} - -popd - - -# Update versions.yml -pinned_version=$(echo ${version} | sed -e 's/\.[^.]*$/.*/') - -apache_download_url=https://downloads.apache.org - -cat <<YAML > "${versions_yml}" -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements.  See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to you under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Database of the current version -# -current: - number: '${version}' - pinned_number: '${pinned_version}' - date: '${release_date}' - git-tag: '${git_tag_hash}' - github-tag-link: 'https://github.com/apache/arrow/releases/tag/${git_tag}' - release-notes: 'https://arrow.apache.org/release/${version}.html' - mirrors: 'https://www.apache.org/dyn/closer.lua/arrow/arrow-${version}/' - tarball-name: 'apache-arrow-${version}.tar.gz' - tarball-url: 'https://www.apache.org/dyn/closer.lua?action=download&filename=arrow/arrow-${version}/apache-arrow-${version}.tar.gz' - java-artifacts: 'http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.arrow%22%20AND%20v%3A%22${version}%22' - asc: '${apache_download_url}/arrow/arrow-${version}/apache-arrow-${version}.tar.gz.asc' - sha256: '${apache_download_url}/arrow/arrow-${version}/apache-arrow-${version}.tar.gz.sha256' - sha512: '${apache_download_url}/arrow/arrow-${version}/apache-arrow-${version}.tar.gz.sha512' -YAML -git add "${versions_yml}" - -git commit -m "[Website] Add release note for ${version}" -git push -u origin ${branch_name} - -github_url=$(git remote get-url origin | \ - sed \ - -e 's,^git@github.com:,https://github.com/,' \ - -e 's,\.git$,,') - -echo "Success!" -echo "Create a pull request:" -echo " ${github_url}/pull/new/${branch_name}" - -popd diff --git a/dev/release/post-04-ruby.sh b/dev/release/post-04-ruby.sh deleted file mode 100755 index 7bc42aaeeb1f7..0000000000000 --- a/dev/release/post-04-ruby.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# -*- indent-tabs-mode: nil; sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -set -e -set -o pipefail - -SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -archive_name=apache-arrow-${version} -tar_gz=${archive_name}.tar.gz - -rm -f ${tar_gz} -curl \ - --remote-name \ - --fail \ - https://downloads.apache.org/arrow/arrow-${version}/${tar_gz} -rm -rf ${archive_name} -tar xf ${tar_gz} -modules=() -for module in ${archive_name}/ruby/red-*; do - pushd ${module} - rake release - modules+=($(basename ${module})) - popd -done -rm -rf ${archive_name} -rm -f ${tar_gz} - -echo "Success! The released RubyGems are available here:" -for module in ${modules[@]}; do - echo " https://rubygems.org/gems/${module}/versions/${version}" -done diff --git a/dev/release/post-05-js.sh b/dev/release/post-05-js.sh deleted file mode 100755 index 3df07e43ea1e2..0000000000000 --- a/dev/release/post-05-js.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# -*- indent-tabs-mode: nil; sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -set -e - -SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -archive_name=apache-arrow-${version} -tar_gz=${archive_name}.tar.gz - -rm -f ${tar_gz} -curl \ - --remote-name \ - --fail \ - https://downloads.apache.org/arrow/arrow-${version}/${tar_gz} -rm -rf ${archive_name} -tar xf ${tar_gz} -pushd ${archive_name}/js -./npm-release.sh -popd -rm -rf ${archive_name} -rm -f ${tar_gz} - -echo "Success! The released npm packages are available here:" -echo " https://www.npmjs.com/package/apache-arrow/v/${version}" diff --git a/dev/release/post-06-csharp.sh b/dev/release/post-06-csharp.sh deleted file mode 100755 index e9572025ab50c..0000000000000 --- a/dev/release/post-06-csharp.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# -*- indent-tabs-mode: nil; sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -eux - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 - -if [ -z "${NUGET_API_KEY}" ]; then - echo "NUGET_API_KEY is empty" - exit 1 -fi - -base_names=() -base_names+=(Apache.Arrow.${version}) -base_names+=(Apache.Arrow.Flight.${version}) -base_names+=(Apache.Arrow.Flight.AspNetCore.${version}) -for base_name in ${base_names[@]}; do - for extension in nupkg snupkg; do - path=${base_name}.${extension} - rm -f ${path} - curl \ - --fail \ - --location \ - --remote-name \ - https://apache.bintray.com/arrow/nuget/${version}/${path} - done - dotnet nuget push \ - ${base_name}.nupkg \ - -k ${NUGET_API_KEY} \ - -s https://api.nuget.org/v3/index.json -done - -echo "Success! The released NuGet package is available here:" -echo " https://www.nuget.org/packages/Apache.Arrow/${version}" diff --git a/dev/release/post-07-rust.sh b/dev/release/post-07-rust.sh deleted file mode 100755 index 3c94607565f94..0000000000000 --- a/dev/release/post-07-rust.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# -*- indent-tabs-mode: nil; sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -set -e -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 - -: ${INSTALL_RUST:=no} - -if [ "${INSTALL_RUST}" == "yes" ]; then - export RUSTUP_HOME="$(pwd)/release-rustup" - export CARGO_HOME="${RUSTUP_HOME}" - rm -rf "${RUSTUP_HOME}" - curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path - export PATH="${RUSTUP_HOME}/bin:$PATH" - source "${RUSTUP_HOME}/env" - rustup default stable - cargo login -fi - -archive_name=apache-arrow-${version} -tar_gz=${archive_name}.tar.gz -rm -f ${tar_gz} -curl \ - --remote-name \ - --fail \ - https://downloads.apache.org/arrow/arrow-${version}/${tar_gz} -rm -rf ${archive_name} -tar xf ${tar_gz} -modules=() -for cargo_toml in ${archive_name}/rust/*/Cargo.toml; do - module_dir=$(dirname ${cargo_toml}) - pushd ${module_dir} - cargo publish --allow-dirty - modules+=($(basename ${module_dir})) - popd -done -popd -rm -rf ${archive_name} -rm -f ${tar_gz} - -if [ "${INSTALL_RUST}" == "yes" ]; then - rm -rf "${RUSTUP_HOME}" -fi - -echo "Success! 
The released packages are available here:" -for module in ${modules[@]}; do - echo " https://crates.io/crates/${module}/${version}" -done diff --git a/dev/release/post-08-remove-rc.sh b/dev/release/post-08-remove-rc.sh deleted file mode 100755 index a02861364645a..0000000000000 --- a/dev/release/post-08-remove-rc.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -e -set -u -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 - -base_url=https://dist.apache.org/repos/dist/dev/arrow -pattern="^apache-arrow-${version}-rc" -paths=$() -if svn ls ${base_url} | grep "${pattern}" > /dev/null 2>&1; then - rc_paths=$(svn ls ${base_url} | grep "${pattern}") - rc_urls=() - for rc_path in ${rc_paths}; do - rc_urls+=(${base_url}/${rc_path}) - done - svn rm --message "Remove RC for ${version}" ${rc_urls[@]} - echo "Removed RC artifacts:" - for rc_url in ${rc_urls[@]}; do - echo " ${rc_url}" - done -else - echo "No RC artifacts at ${base_url}" -fi diff --git a/dev/release/post-09-docs.sh b/dev/release/post-09-docs.sh deleted file mode 100755 index c9f75b48b2c80..0000000000000 --- a/dev/release/post-09-docs.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -u - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARROW_DIR="${SOURCE_DIR}/../.." 
-ARROW_SITE_DIR="${ARROW_DIR}/../arrow-site" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -version=$1 -release_tag="apache-arrow-${version}" -branch_name=release-docs-${version} - -pushd "${ARROW_SITE_DIR}" -git checkout asf-site -git checkout -b ${branch_name} -rm -rf docs/* -git checkout docs/c_glib/index.html -popd - -pushd "${ARROW_DIR}" -git checkout "${release_tag}" - -archery docker run \ - -v "${ARROW_SITE_DIR}/docs:/build/docs" \ - -e ARROW_DOCS_VERSION="${version}" \ - -e UBUNTU=20.10 \ - ubuntu-docs - -: ${PUSH:=1} - -if [ ${PUSH} -gt 0 ]; then - pushd "${ARROW_SITE_DIR}" - git add docs - git commit -m "[Website] Update documentations for ${version}" - git push -u origin ${branch_name} - github_url=$(git remote get-url origin | \ - sed \ - -e 's,^git@github.com:,https://github.com/,' \ - -e 's,\.git$,,') - popd - - echo "Success!" - echo "Create a pull request:" - echo " ${github_url}/pull/new/${branch_name}" -fi diff --git a/dev/release/post-10-python.sh b/dev/release/post-10-python.sh deleted file mode 100755 index 0f7a480cde610..0000000000000 --- a/dev/release/post-10-python.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -rc=$2 - -tmp=$(mktemp -d -t "arrow-post-python.XXXXX") -${PYTHON:-python} \ - "${SOURCE_DIR}/download_rc_binaries.py" \ - ${version} \ - ${rc} \ - --dest="${tmp}" \ - --package_type=python -twine upload ${tmp}/python-rc/${version}-rc${rc}/*.{whl,tar.gz} -rm -rf "${tmp}" - -echo "Success! The released PyPI packages are available here:" -echo " https://pypi.org/project/pyarrow/${version}" diff --git a/dev/release/post-11-java.sh b/dev/release/post-11-java.sh deleted file mode 100755 index d9dc32a7f557c..0000000000000 --- a/dev/release/post-11-java.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e -set -o pipefail - -SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 -archive_name=apache-arrow-${version} -tar_gz=${archive_name}.tar.gz - -rm -f ${tar_gz} -curl \ - --remote-name \ - --fail \ - https://downloads.apache.org/arrow/arrow-${version}/${tar_gz} -rm -rf ${archive_name} -tar xf ${tar_gz} - -# build the jni bindings similarly like the 01-perform.sh does -mkdir -p ${archive_name}/cpp/java-build -pushd ${archive_name}/cpp/java-build -cmake \ - -DARROW_GANDIVA=ON \ - -DARROW_GANDIVA_JAVA=ON \ - -DARROW_JNI=ON \ - -DARROW_ORC=ON \ - -DCMAKE_BUILD_TYPE=release \ - -G Ninja \ - .. -ninja -popd - -# go in the java subfolder -pushd ${archive_name}/java -# stage the artifacts using both the apache-release and arrow-jni profiles -mvn -Papache-release,arrow-jni -Darrow.cpp.build.dir=$(realpath ../cpp/java-build) deploy -popd - -echo "Success! The maven artifacts have been staged. Proceed with the following steps:" -echo "1. Login to the apache repository: https://repository.apache.org/#stagingRepositories" -echo "2. Select the arrow staging repository you just created: orgapachearrow-100x" -echo "3. Click the \"close\" button" -echo "4. Once validation has passed, click the \"release\" button" -echo "" -echo "Note, that you must set up Maven to be able to publish to Apache's repositories." -echo "Read more at https://www.apache.org/dev/publishing-maven-artifacts.html." diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt deleted file mode 100644 index 96beccd0af81e..0000000000000 --- a/dev/release/rat_exclude_files.txt +++ /dev/null @@ -1,108 +0,0 @@ -*.npmrc -*.gitignore -*.dockerignore -.gitmodules -*_generated.h -*_generated.js -*_generated.ts -*.csv -*.json -*.snap -.github/ISSUE_TEMPLATE/*.md -.github/pull_request_template.md -ci/etc/rprofile -ci/etc/*.patch -ci/vcpkg/*.patch -CHANGELOG.md -dev/requirements*.txt -dev/archery/MANIFEST.in -dev/archery/requirements*.txt -dev/archery/archery/tests/fixtures/* -dev/archery/archery/crossbow/tests/fixtures/* -dev/release/rat_exclude_files.txt -dev/tasks/homebrew-formulae/apache-arrow.rb -dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install -dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat -dev/tasks/linux-packages/apache-arrow-apt-source/debian/control -dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules -dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format -dev/tasks/linux-packages/apache-arrow/debian/compat -dev/tasks/linux-packages/apache-arrow/debian/control.in -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base -dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links -dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install 
-dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install -dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links -dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install -dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install -dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base -dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install -dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links -dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install -dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install -dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install -dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base -dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install -dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links -dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install -dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install -dev/tasks/linux-packages/apache-arrow/debian/patches/series -dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install -dev/tasks/linux-packages/apache-arrow/debian/rules -dev/tasks/linux-packages/apache-arrow/debian/source/format -dev/tasks/linux-packages/apache-arrow/debian/watch -dev/tasks/requirements*.txt -dev/tasks/conda-recipes/* -pax_global_header -MANIFEST.in -__init__.pxd -__init__.py -requirements.txt -*.html -*.sgml -*.css -*.png -*.ico -*.svg -*.devhelp2 -*.scss -.gitattributes -rust-toolchain -benchmarks/queries/q*.sql -ballista/rust/scheduler/testdata/* -ballista/ui/scheduler/yarn.lock -python/rust-toolchain -python/requirements*.txt diff 
--git a/dev/release/run-rat.sh b/dev/release/run-rat.sh deleted file mode 100755 index 94fa55fbe0974..0000000000000 --- a/dev/release/run-rat.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -RAT_VERSION=0.13 - -# download apache rat -if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then - curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar > apache-rat-${RAT_VERSION}.jar -fi - -RAT="java -jar apache-rat-${RAT_VERSION}.jar -x " - -RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) - -# generate the rat report -$RAT $1 > rat.txt -python $RELEASE_DIR/check-rat-report.py $RELEASE_DIR/rat_exclude_files.txt rat.txt > filtered_rat.txt -cat filtered_rat.txt -UNAPPROVED=`cat filtered_rat.txt | grep "NOT APPROVED" | wc -l` - -if [ "0" -eq "${UNAPPROVED}" ]; then - echo "No unapproved licenses" -else - echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt" - exit 1 -fi diff --git a/dev/release/run-test.rb b/dev/release/run-test.rb deleted file mode 100755 index 90df39b13fa29..0000000000000 --- a/dev/release/run-test.rb +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env ruby -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -$VERBOSE = true - -require "pathname" - -test_dir = Pathname.new(__dir__) - -require "test-unit" -require_relative "test-helper" - -ENV["TEST_UNIT_MAX_DIFF_TARGET_STRING_SIZE"] = "10000" - -exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) diff --git a/dev/release/setup-gpg-agent.sh b/dev/release/setup-gpg-agent.sh deleted file mode 100644 index 3e31d0e4e3c55..0000000000000 --- a/dev/release/setup-gpg-agent.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# source me -eval $(gpg-agent --daemon --allow-preset-passphrase) -gpg --use-agent -s LICENSE.txt -rm -rf LICENSE.txt.gpg diff --git a/dev/release/test-helper.rb b/dev/release/test-helper.rb deleted file mode 100644 index 8a272ddfe04a2..0000000000000 --- a/dev/release/test-helper.rb +++ /dev/null @@ -1,96 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "English" -require "cgi/util" -require "fileutils" -require "find" -require "json" -require "open-uri" -require "rexml/document" -require "tempfile" -require "tmpdir" - -module CommandRunnable - class Error < StandardError - end - - def sh(*command_line, check_result: true) - if command_line[0].is_a?(Hash) - env = command_line.shift - else - env = {} - end - stdout = Tempfile.new("command-stdout.log") - stderr = Tempfile.new("command-stderr.log") - success = system(env, *command_line, out: stdout.path, err: stderr.path) - if check_result - unless success - message = "Failed to run: #{command_line.join(" ")}\n" - message << "stdout:\n #{stdout.read}\n" - message << "stderr:\n #{stderr.read}" - raise Error, message - end - end - stdout.read - end -end - -module GitRunnable - include CommandRunnable - - def git(*args) - if args[0].is_a?(Hash) - env = args.shift - else - env = {} - end - sh(env, "git", *args) - end - - def git_current_commit - git("rev-parse", "HEAD").chomp - end - - def git_tags - git("tags").lines(chomp: true) - end -end - -module VersionDetectable - def detect_versions - top_dir = Pathname(__dir__).parent.parent - cpp_cmake_lists = top_dir + "cpp" + "CMakeLists.txt" - @snapshot_version = cpp_cmake_lists.read[/ARROW_VERSION "(.+?)"/, 1] - @release_version = @snapshot_version.gsub(/-SNAPSHOT\z/, "") - @so_version = compute_so_version(@release_version) - @next_version = @release_version.gsub(/\A\d+/) {|major| major.succ} - @next_snapshot_version = "#{@next_version}-SNAPSHOT" - @next_so_version = compute_so_version(@next_version) - r_description = top_dir + "r" + "DESCRIPTION" - @previous_version = r_description.read[/^Version: (.+?)\.9000$/, 1] - end - - def compute_so_version(version) - major, minor, _patch = version.split(".") - Integer(major, 10) * 100 + Integer(minor, 10) - end - - def 
on_release_branch? - @snapshot_version == @release_version - end -end diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh deleted file mode 100755 index e7b87a3a4da6e..0000000000000 --- a/dev/release/verify-apt.sh +++ /dev/null @@ -1,150 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -exu - -if [ $# -lt 2 ]; then - echo "Usage: $0 VERSION rc" - echo " $0 VERSION release" - echo " $0 VERSION local" - echo " e.g.: $0 0.13.0 rc # Verify 0.13.0 RC" - echo " e.g.: $0 0.13.0 release # Verify 0.13.0" - echo " e.g.: $0 0.13.0-dev20210203 local # Verify 0.13.0-dev20210203 on local" - exit 1 -fi - -VERSION="$1" -TYPE="$2" - -local_prefix="/arrow/dev/tasks/linux-packages" - -export DEBIAN_FRONTEND=noninteractive - -apt update -apt install -y -V \ - curl \ - lsb-release - -code_name="$(lsb_release --codename --short)" -distribution="$(lsb_release --id --short | tr 'A-Z' 'a-z')" -artifactory_base_url="https://apache.jfrog.io/artifactory/arrow/${distribution}" -if [ "${TYPE}" = "rc" ]; then - artifactory_base_url+="-rc" -fi - -have_flight=yes -have_plasma=yes -workaround_missing_packages=() -case "${distribution}-${code_name}" in - debian-*) - sed \ - -i"" \ - -e "s/ main$/ main contrib non-free/g" \ - /etc/apt/sources.list - ;; -esac -if [ "$(arch)" = "aarch64" ]; then - have_plasma=no -fi - -if [ "${TYPE}" = "local" ]; then - case "${VERSION}" in - *-dev*) - package_version="$(echo "${VERSION}" | sed -e 's/-dev\(.*\)$/~dev\1/g')" - ;; - *-rc*) - package_version="$(echo "${VERSION}" | sed -e 's/-rc.*$//g')" - ;; - *) - package_version="${VERSION}" - ;; - esac - package_version+="-1" - apt_source_path="${local_prefix}/apt/repositories" - apt_source_path+="/${distribution}/pool/${code_name}/main" - apt_source_path+="/a/apache-arrow-apt-source" - apt_source_path+="/apache-arrow-apt-source_${package_version}_all.deb" - apt install -y -V "${apt_source_path}" -else - package_version="${VERSION}-1" - apt_source_base_name="apache-arrow-apt-source-latest-${code_name}.deb" - curl \ - --output "${apt_source_base_name}" \ - "${artifactory_base_url}/${apt_source_base_name}" - apt install -y -V "./${apt_source_base_name}" -fi - -if [ "${TYPE}" = "local" ]; then - sed \ - -i"" \ - -e "s,^URIs: .*$,URIs: file://${local_prefix}/apt/repositories/${distribution},g" \ - /etc/apt/sources.list.d/apache-arrow.sources - keys="${local_prefix}/KEYS" - if [ -f "${keys}" ]; then - gpg \ - --no-default-keyring \ - --keyring /usr/share/keyrings/apache-arrow-apt-source.gpg \ - --import "${keys}" - fi -else - if [ "${TYPE}" = "rc" ]; then - sed \ - -i"" \ - -e "s,^URIs: \\(.*\\)/,URIs: \\1-rc/,g" \ - /etc/apt/sources.list.d/apache-arrow.sources - fi -fi - -apt update - -apt install -y -V libarrow-glib-dev=${package_version} -required_packages=() 
-required_packages+=(cmake) -required_packages+=(g++) -required_packages+=(git) -required_packages+=(${workaround_missing_packages[@]}) -apt install -y -V ${required_packages[@]} -mkdir -p build -cp -a /arrow/cpp/examples/minimal_build build -pushd build/minimal_build -cmake . -make -j$(nproc) -./arrow_example -popd - -apt install -y -V libarrow-glib-dev=${package_version} -apt install -y -V libarrow-glib-doc=${package_version} - -if [ "${have_flight}" = "yes" ]; then - apt install -y -V libarrow-flight-dev=${package_version} -fi - -apt install -y -V libarrow-python-dev=${package_version} - -if [ "${have_plasma}" = "yes" ]; then - apt install -y -V libplasma-glib-dev=${package_version} - apt install -y -V libplasma-glib-doc=${package_version} - apt install -y -V plasma-store-server=${package_version} -fi - -apt install -y -V libgandiva-glib-dev=${package_version} -apt install -y -V libgandiva-glib-doc=${package_version} - -apt install -y -V libparquet-glib-dev=${package_version} -apt install -y -V libparquet-glib-doc=${package_version} diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat deleted file mode 100644 index 2b57113a1bf78..0000000000000 --- a/dev/release/verify-release-candidate-wheels.bat +++ /dev/null @@ -1,107 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. 
- -@rem This script downloads and installs all Windows wheels for a release -@rem candidate into temporary conda environments and makes sure that imports -@rem work - -@rem To run the script: -@rem verify-release-candidate-wheels.bat VERSION RC_NUM - -@echo on - -set _CURRENT_DIR=%CD% -set _VERIFICATION_DIR=C:\tmp\arrow-verify-release-wheels - -if not exist "C:\tmp\" mkdir C:\tmp -if exist %_VERIFICATION_DIR% rd %_VERIFICATION_DIR% /s /q -if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR% - -cd %_VERIFICATION_DIR% - -@rem clone Arrow repository to obtain test requirements -set GIT_ENV_PATH=%_VERIFICATION_DIR%\_git -call conda create -p %GIT_ENV_PATH% ^ - --no-shortcuts -f -q -y git ^ - || EXIT /B 1 -call activate %GIT_ENV_PATH% - -git clone https://github.com/apache/arrow.git || EXIT /B 1 -pushd arrow -git submodule update --init -popd - -call deactivate - -set ARROW_TEST_DATA=%cd%\arrow\testing\data - -CALL :verify_wheel 3.6 %1 %2 m -if errorlevel 1 GOTO error - -CALL :verify_wheel 3.7 %1 %2 m -if errorlevel 1 GOTO error - -CALL :verify_wheel 3.8 %1 %2 -if errorlevel 1 GOTO error - -:done -cd %_CURRENT_DIR% - -EXIT /B %ERRORLEVEL% - -:error -call deactivate -cd %_CURRENT_DIR% - -EXIT /B 1 - -@rem a batch function to verify a single wheel -:verify_wheel - -set PY_VERSION=%1 -set ARROW_VERSION=%2 -set RC_NUMBER=%3 -set ABI_TAG=%4 -set PY_VERSION_NO_PERIOD=%PY_VERSION:.=% - -set CONDA_ENV_PATH=%_VERIFICATION_DIR%\_verify-wheel-%PY_VERSION% -call conda create -p %CONDA_ENV_PATH% ^ - --no-shortcuts -f -q -y python=%PY_VERSION% ^ - || EXIT /B 1 -call activate %CONDA_ENV_PATH% - -set WHEEL_FILENAME=pyarrow-%ARROW_VERSION%-cp%PY_VERSION_NO_PERIOD%-cp%PY_VERSION_NO_PERIOD%%ABI_TAG%-win_amd64.whl - -@rem Requires GNU Wget for Windows -wget --no-check-certificate -O %WHEEL_FILENAME% https://bintray.com/apache/arrow/download_file?file_path=python-rc%%2F%ARROW_VERSION%-rc%RC_NUMBER%%%2F%WHEEL_FILENAME% || EXIT /B 1 - -pip install %WHEEL_FILENAME% || EXIT /B 1 - -pip install -r arrow/python/requirements-test.txt || EXIT /B 1 - -py.test %CONDA_ENV_PATH%\Lib\site-packages\pyarrow --pdb -v || EXIT /B 1 - -python -c "import pyarrow" || EXIT /B 1 -python -c "import pyarrow.parquet" || EXIT /B 1 -python -c "import pyarrow.flight" || EXIT /B 1 -python -c "import pyarrow.dataset" || EXIT /B 1 - -:done - -call deactivate - -EXIT /B 0 diff --git a/dev/release/verify-release-candidate.bat b/dev/release/verify-release-candidate.bat deleted file mode 100644 index bef78fc920cca..0000000000000 --- a/dev/release/verify-release-candidate.bat +++ /dev/null @@ -1,130 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. 
- -@rem To run the script: -@rem verify-release-candidate.bat VERSION RC_NUM - -@echo on - -if not exist "C:\tmp\" mkdir C:\tmp -if exist "C:\tmp\arrow-verify-release" rd C:\tmp\arrow-verify-release /s /q -if not exist "C:\tmp\arrow-verify-release" mkdir C:\tmp\arrow-verify-release - -set _VERIFICATION_DIR=C:\tmp\arrow-verify-release -set _VERIFICATION_DIR_UNIX=C:/tmp/arrow-verify-release -set _VERIFICATION_CONDA_ENV=%_VERIFICATION_DIR%\conda-env -set _DIST_URL=https://dist.apache.org/repos/dist/dev/arrow -set _TARBALL=apache-arrow-%1.tar.gz -set ARROW_SOURCE=%_VERIFICATION_DIR%\apache-arrow-%1 -set INSTALL_DIR=%_VERIFICATION_DIR%\install - -@rem Requires GNU Wget for Windows -wget --no-check-certificate -O %_TARBALL% %_DIST_URL%/apache-arrow-%1-rc%2/%_TARBALL% || exit /B 1 - -tar xf %_TARBALL% -C %_VERIFICATION_DIR_UNIX% - -set PYTHON=3.6 - -@rem Using call with conda.bat seems necessary to avoid terminating the batch -@rem script execution -call conda create --no-shortcuts -c conda-forge -f -q -y -p %_VERIFICATION_CONDA_ENV% ^ - --file=ci\conda_env_cpp.yml ^ - --file=ci\conda_env_python.yml ^ - git ^ - python=%PYTHON% ^ - || exit /B 1 - -call activate %_VERIFICATION_CONDA_ENV% || exit /B 1 - -set GENERATOR=Visual Studio 15 2017 Win64 -set CONFIGURATION=release - -pushd %ARROW_SOURCE% - -set ARROW_HOME=%INSTALL_DIR% -set PARQUET_HOME=%INSTALL_DIR% -set PATH=%INSTALL_DIR%\bin;%PATH% - -@rem Build and test Arrow C++ libraries -mkdir %ARROW_SOURCE%\cpp\build -pushd %ARROW_SOURCE%\cpp\build - -@rem This is the path for Visual Studio Community 2017 -call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsDevCmd.bat" -arch=amd64 - -@rem NOTE(wesm): not using Ninja for now to be able to more easily control the -@rem generator used - -cmake -G "%GENERATOR%" ^ - -DARROW_BOOST_USE_SHARED=ON ^ - -DARROW_BUILD_STATIC=OFF ^ - -DARROW_BUILD_TESTS=ON ^ - -DARROW_CXXFLAGS="/MP" ^ - -DARROW_DATASET=ON ^ - -DARROW_FLIGHT=ON ^ - -DARROW_MIMALLOC=ON ^ - -DARROW_PARQUET=ON ^ - -DARROW_PYTHON=ON ^ - -DARROW_WITH_BROTLI=ON ^ - -DARROW_WITH_BZ2=ON ^ - -DARROW_WITH_LZ4=ON ^ - -DARROW_WITH_SNAPPY=ON ^ - -DARROW_WITH_ZLIB=ON ^ - -DARROW_WITH_ZSTD=ON ^ - -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ - -DCMAKE_INSTALL_PREFIX=%ARROW_HOME% ^ - -DCMAKE_UNITY_BUILD=ON ^ - -DGTest_SOURCE=BUNDLED ^ - .. || exit /B - -cmake --build . --target INSTALL --config Release || exit /B 1 - -@rem NOTE(wesm): Building googletest is flaky for me with ninja. 
Building it -@rem first fixes the problem - -@rem ninja googletest_ep || exit /B 1 -@rem ninja install || exit /B 1 - -@rem Get testing datasets for Parquet unit tests -git clone https://github.com/apache/parquet-testing.git %_VERIFICATION_DIR%\parquet-testing -set PARQUET_TEST_DATA=%_VERIFICATION_DIR%\parquet-testing\data - -git clone https://github.com/apache/arrow-testing.git %_VERIFICATION_DIR%\arrow-testing -set ARROW_TEST_DATA=%_VERIFICATION_DIR%\arrow-testing\data - -@rem Needed so python-test.exe works -set PYTHONPATH_ORIGINAL=%PYTHONPATH% -set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH% -ctest -VV || exit /B 1 -set PYTHONPATH=%PYTHONPATH_ORIGINAL% -popd - -@rem Build and import pyarrow -pushd %ARROW_SOURCE%\python - -pip install -r requirements-test.txt || exit /B 1 - -set PYARROW_CMAKE_GENERATOR=%GENERATOR% -set PYARROW_WITH_FLIGHT=1 -set PYARROW_WITH_PARQUET=1 -set PYARROW_WITH_DATASET=1 -python setup.py build_ext --inplace --bundle-arrow-cpp bdist_wheel || exit /B 1 -pytest pyarrow -v -s --enable-parquet || exit /B 1 - -popd - -call deactivate diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh deleted file mode 100755 index 3fc926f4e82ea..0000000000000 --- a/dev/release/verify-release-candidate.sh +++ /dev/null @@ -1,808 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# Requirements -# - Ruby >= 2.3 -# - Maven >= 3.3.9 -# - JDK >=7 -# - gcc >= 4.8 -# - Node.js >= 11.12 (best way is to use nvm) -# - Go >= 1.11 -# -# If using a non-system Boost, set BOOST_ROOT and add Boost libraries to -# LD_LIBRARY_PATH. -# -# To reuse build artifacts between runs set ARROW_TMPDIR environment variable to -# a directory where the temporary files should be placed to, note that this -# directory is not cleaned up automatically. - -case $# in - 3) ARTIFACT="$1" - VERSION="$2" - RC_NUMBER="$3" - case $ARTIFACT in - source|binaries|wheels) ;; - *) echo "Invalid argument: '${ARTIFACT}', valid options are \ -'source', 'binaries', or 'wheels'" - exit 1 - ;; - esac - ;; - *) echo "Usage: $0 source|binaries X.Y.Z RC_NUMBER" - exit 1 - ;; -esac - -set -e -set -x -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" -ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" - -detect_cuda() { - if ! 
(which nvcc && which nvidia-smi) > /dev/null; then - return 1 - fi - - local n_gpus=$(nvidia-smi --list-gpus | wc -l) - return $((${n_gpus} < 1)) -} - -# Build options for the C++ library - -if [ -z "${ARROW_CUDA:-}" ] && detect_cuda; then - ARROW_CUDA=ON -fi -: ${ARROW_CUDA:=OFF} -: ${ARROW_FLIGHT:=ON} -: ${ARROW_GANDIVA:=ON} - -ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' - -download_dist_file() { - curl \ - --silent \ - --show-error \ - --fail \ - --location \ - --remote-name $ARROW_DIST_URL/$1 -} - -download_rc_file() { - download_dist_file apache-arrow-${VERSION}-rc${RC_NUMBER}/$1 -} - -import_gpg_keys() { - download_dist_file KEYS - gpg --import KEYS -} - -fetch_archive() { - local dist_name=$1 - download_rc_file ${dist_name}.tar.gz - download_rc_file ${dist_name}.tar.gz.asc - download_rc_file ${dist_name}.tar.gz.sha256 - download_rc_file ${dist_name}.tar.gz.sha512 - gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz - shasum -a 256 -c ${dist_name}.tar.gz.sha256 - shasum -a 512 -c ${dist_name}.tar.gz.sha512 -} - -verify_dir_artifact_signatures() { - # verify the signature and the checksums of each artifact - find $1 -name '*.asc' | while read sigfile; do - artifact=${sigfile/.asc/} - gpg --verify $sigfile $artifact || exit 1 - - # go into the directory because the checksum files contain only the - # basename of the artifact - pushd $(dirname $artifact) - base_artifact=$(basename $artifact) - if [ -f $base_artifact.sha256 ]; then - shasum -a 256 -c $base_artifact.sha256 || exit 1 - fi - shasum -a 512 -c $base_artifact.sha512 || exit 1 - popd - done -} - -test_binary() { - local download_dir=binaries - mkdir -p ${download_dir} - - python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ - --dest=${download_dir} - - verify_dir_artifact_signatures ${download_dir} -} - -test_apt() { - for target in "debian:buster" \ - "arm64v8/debian:buster" \ - "ubuntu:bionic" \ - "arm64v8/ubuntu:bionic" \ - "ubuntu:focal" \ - "arm64v8/ubuntu:focal" \ - "ubuntu:groovy" \ - "arm64v8/ubuntu:groovy"; do \ - case "${target}" in - arm64v8/*) - if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then - : # OK - else - continue - fi - ;; - esac - if ! docker run --rm -v "${SOURCE_DIR}"/../..:/arrow:delegated \ - "${target}" \ - /arrow/dev/release/verify-apt.sh \ - "${VERSION}" \ - "rc" \ - "${BINTRAY_REPOSITORY}"; then - echo "Failed to verify the APT repository for ${target}" - exit 1 - fi - done -} - -test_yum() { - for target in "centos:7" \ - "centos:8" \ - "arm64v8/centos:8"; do - case "${target}" in - arm64v8/*) - if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then - : # OK - else - continue - fi - ;; - esac - if ! docker run --rm -v "${SOURCE_DIR}"/../..:/arrow:delegated \ - "${target}" \ - /arrow/dev/release/verify-yum.sh \ - "${VERSION}" \ - "rc" \ - "${BINTRAY_REPOSITORY}"; then - echo "Failed to verify the Yum repository for ${target}" - exit 1 - fi - done -} - - -setup_tempdir() { - cleanup() { - if [ "${TEST_SUCCESS}" = "yes" ]; then - rm -fr "${ARROW_TMPDIR}" - else - echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." 
- fi - } - - if [ -z "${ARROW_TMPDIR}" ]; then - # clean up automatically if ARROW_TMPDIR is not defined - ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") - trap cleanup EXIT - else - # don't clean up automatically - mkdir -p "${ARROW_TMPDIR}" - fi -} - -setup_miniconda() { - # Setup short-lived miniconda for Python and integration tests - if [ "$(uname)" == "Darwin" ]; then - if [ "$(uname -m)" == "arm64" ]; then - MINICONDA_URL=https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh - else - MINICONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - fi - else - MINICONDA_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - fi - - MINICONDA=$PWD/test-miniconda - - if [ ! -d "${MINICONDA}" ]; then - # Setup miniconda only if the directory doesn't exist yet - wget -O miniconda.sh $MINICONDA_URL - bash miniconda.sh -b -p $MINICONDA - rm -f miniconda.sh - fi - echo "Installed miniconda at ${MINICONDA}" - - . $MINICONDA/etc/profile.d/conda.sh - - conda create -n arrow-test -y -q -c conda-forge \ - python=3.8 \ - nomkl \ - numpy \ - pandas \ - cython - conda activate arrow-test - echo "Using conda environment ${CONDA_PREFIX}" -} - -# Build and test Java (Requires newer Maven -- I used 3.3.9) - -test_package_java() { - pushd java - - mvn test - mvn package - - popd -} - -# Build and test C++ - -test_and_install_cpp() { - mkdir -p cpp/build - pushd cpp/build - - ARROW_CMAKE_OPTIONS=" -${ARROW_CMAKE_OPTIONS:-} --DCMAKE_INSTALL_PREFIX=$ARROW_HOME --DCMAKE_INSTALL_LIBDIR=lib --DARROW_FLIGHT=${ARROW_FLIGHT} --DARROW_PLASMA=ON --DARROW_ORC=ON --DARROW_PYTHON=ON --DARROW_GANDIVA=${ARROW_GANDIVA} --DARROW_PARQUET=ON --DARROW_DATASET=ON --DPARQUET_REQUIRE_ENCRYPTION=ON --DARROW_VERBOSE_THIRDPARTY_BUILD=ON --DARROW_WITH_BZ2=ON --DARROW_WITH_ZLIB=ON --DARROW_WITH_ZSTD=ON --DARROW_WITH_LZ4=ON --DARROW_WITH_SNAPPY=ON --DARROW_WITH_BROTLI=ON --DARROW_BOOST_USE_SHARED=ON --DCMAKE_BUILD_TYPE=release --DARROW_BUILD_TESTS=ON --DARROW_BUILD_INTEGRATION=ON --DARROW_CUDA=${ARROW_CUDA} --DARROW_DEPENDENCY_SOURCE=AUTO -" - cmake $ARROW_CMAKE_OPTIONS .. - - make -j$NPROC install - - # TODO: ARROW-5036: plasma-serialization_tests broken - # TODO: ARROW-5054: libgtest.so link failure in flight-server-test - LD_LIBRARY_PATH=$PWD/release:$LD_LIBRARY_PATH ctest \ - --exclude-regex "plasma-serialization_tests" \ - -j$NPROC \ - --output-on-failure \ - -L unittest - popd -} - -test_csharp() { - pushd csharp - - local csharp_bin=${PWD}/bin - mkdir -p ${csharp_bin} - - if which dotnet > /dev/null 2>&1; then - if ! which sourcelink > /dev/null 2>&1; then - local dotnet_tools_dir=$HOME/.dotnet/tools - if [ -d "${dotnet_tools_dir}" ]; then - PATH="${dotnet_tools_dir}:$PATH" - fi - fi - else - local dotnet_version=3.1.405 - local dotnet_platform= - case "$(uname)" in - Linux) - dotnet_platform=linux - ;; - Darwin) - dotnet_platform=macos - ;; - esac - local dotnet_download_thank_you_url=https://dotnet.microsoft.com/download/thank-you/dotnet-sdk-${dotnet_version}-${dotnet_platform}-x64-binaries - local dotnet_download_url=$( \ - curl --location ${dotnet_download_thank_you_url} | \ - grep 'window\.open' | \ - grep -E -o '[^"]+' | \ - sed -n 2p) - curl ${dotnet_download_url} | \ - tar xzf - -C ${csharp_bin} - PATH=${csharp_bin}:${PATH} - fi - - dotnet test - mv dummy.git ../.git - dotnet pack -c Release - mv ../.git dummy.git - - if ! 
which sourcelink > /dev/null 2>&1; then - dotnet tool install --tool-path ${csharp_bin} sourcelink - PATH=${csharp_bin}:${PATH} - if ! sourcelink --help > /dev/null 2>&1; then - export DOTNET_ROOT=${csharp_bin} - fi - fi - - sourcelink test artifacts/Apache.Arrow/Release/netstandard1.3/Apache.Arrow.pdb - sourcelink test artifacts/Apache.Arrow/Release/netcoreapp2.1/Apache.Arrow.pdb - - popd -} - -# Build and test Python - -test_python() { - pushd python - - pip install -r requirements-build.txt -r requirements-test.txt - - export PYARROW_WITH_DATASET=1 - export PYARROW_WITH_PARQUET=1 - export PYARROW_WITH_PLASMA=1 - if [ "${ARROW_CUDA}" = "ON" ]; then - export PYARROW_WITH_CUDA=1 - fi - if [ "${ARROW_FLIGHT}" = "ON" ]; then - export PYARROW_WITH_FLIGHT=1 - fi - if [ "${ARROW_GANDIVA}" = "ON" ]; then - export PYARROW_WITH_GANDIVA=1 - fi - - python setup.py build_ext --inplace - pytest pyarrow -v --pdb - - popd -} - -test_glib() { - pushd c_glib - - pip install meson - - meson build --prefix=$ARROW_HOME --libdir=lib - ninja -C build - ninja -C build install - - export GI_TYPELIB_PATH=$ARROW_HOME/lib/girepository-1.0:$GI_TYPELIB_PATH - - if ! bundle --version; then - gem install --no-document bundler - fi - - bundle install --path vendor/bundle - bundle exec ruby test/run-test.rb - - popd -} - -test_js() { - pushd js - - if [ "${INSTALL_NODE}" -gt 0 ]; then - export NVM_DIR="`pwd`/.nvm" - mkdir -p $NVM_DIR - curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.35.3/install.sh | \ - PROFILE=/dev/null bash - [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh" - - nvm install --lts - npm install -g yarn - fi - - yarn --frozen-lockfile - yarn run-s clean:all lint build - yarn test - popd -} - -test_ruby() { - pushd ruby - - local modules="red-arrow red-plasma red-gandiva red-parquet" - if [ "${ARROW_CUDA}" = "ON" ]; then - modules="${modules} red-arrow-cuda" - fi - - for module in ${modules}; do - pushd ${module} - bundle install --path vendor/bundle - bundle exec ruby test/run-test.rb - popd - done - - popd -} - -test_go() { - local VERSION=1.14.1 - local ARCH=amd64 - - if [ "$(uname)" == "Darwin" ]; then - local OS=darwin - else - local OS=linux - fi - - local GO_ARCHIVE=go$VERSION.$OS-$ARCH.tar.gz - wget https://dl.google.com/go/$GO_ARCHIVE - - mkdir -p local-go - tar -xzf $GO_ARCHIVE -C local-go - rm -f $GO_ARCHIVE - - export GOROOT=`pwd`/local-go/go - export GOPATH=`pwd`/local-go/gopath - export PATH=$GOROOT/bin:$GOPATH/bin:$PATH - - pushd go/arrow - - go get -v ./... - go test ./... 
- go clean -modcache - - popd -} - -test_rust() { - # install rust toolchain in a similar fashion like test-miniconda - export RUSTUP_HOME=$PWD/test-rustup - export CARGO_HOME=$PWD/test-rustup - - curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path - - export PATH=$RUSTUP_HOME/bin:$PATH - source $RUSTUP_HOME/env - - # build and test rust - pushd rust - - # raises on any formatting errors - rustup component add rustfmt --toolchain stable - cargo +stable fmt --all -- --check - - # we are targeting Rust nightly for releases - rustup default nightly - - # use local modules because we don't publish modules to crates.io yet - sed \ - -i.bak \ - -E \ - -e 's/^arrow = "([^"]*)"/arrow = { version = "\1", path = "..\/arrow" }/g' \ - -e 's/^parquet = "([^"]*)"/parquet = { version = "\1", path = "..\/parquet" }/g' \ - */Cargo.toml - - # raises on any warnings - RUSTFLAGS="-D warnings" cargo build - cargo test - - popd -} - -# Run integration tests -test_integration() { - JAVA_DIR=$PWD/java - CPP_BUILD_DIR=$PWD/cpp/build - - export ARROW_JAVA_INTEGRATION_JAR=$JAVA_DIR/tools/target/arrow-tools-$VERSION-jar-with-dependencies.jar - export ARROW_CPP_EXE_PATH=$CPP_BUILD_DIR/release - - pip install -e dev/archery - - INTEGRATION_TEST_ARGS="" - - if [ "${ARROW_FLIGHT}" = "ON" ]; then - INTEGRATION_TEST_ARGS="${INTEGRATION_TEST_ARGS} --run-flight" - fi - - # Flight integration test executable have runtime dependency on - # release/libgtest.so - LD_LIBRARY_PATH=$ARROW_CPP_EXE_PATH:$LD_LIBRARY_PATH \ - archery integration \ - --with-cpp=${TEST_INTEGRATION_CPP} \ - --with-java=${TEST_INTEGRATION_JAVA} \ - --with-js=${TEST_INTEGRATION_JS} \ - --with-go=${TEST_INTEGRATION_GO} \ - $INTEGRATION_TEST_ARGS -} - -clone_testing_repositories() { - # Clone testing repositories if not cloned already - if [ ! -d "arrow-testing" ]; then - git clone https://github.com/apache/arrow-testing.git - fi - if [ ! 
-d "parquet-testing" ]; then - git clone https://github.com/apache/parquet-testing.git - fi - export ARROW_TEST_DATA=$PWD/arrow-testing/data - export PARQUET_TEST_DATA=$PWD/parquet-testing/data -} - -test_source_distribution() { - export ARROW_HOME=$ARROW_TMPDIR/install - export PARQUET_HOME=$ARROW_TMPDIR/install - export LD_LIBRARY_PATH=$ARROW_HOME/lib:${LD_LIBRARY_PATH:-} - export PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig:${PKG_CONFIG_PATH:-} - - if [ "$(uname)" == "Darwin" ]; then - NPROC=$(sysctl -n hw.ncpu) - else - NPROC=$(nproc) - fi - - clone_testing_repositories - - if [ ${TEST_JAVA} -gt 0 ]; then - test_package_java - fi - if [ ${TEST_CPP} -gt 0 ]; then - test_and_install_cpp - fi - if [ ${TEST_CSHARP} -gt 0 ]; then - test_csharp - fi - if [ ${TEST_PYTHON} -gt 0 ]; then - test_python - fi - if [ ${TEST_GLIB} -gt 0 ]; then - test_glib - fi - if [ ${TEST_RUBY} -gt 0 ]; then - test_ruby - fi - if [ ${TEST_JS} -gt 0 ]; then - test_js - fi - if [ ${TEST_GO} -gt 0 ]; then - test_go - fi - if [ ${TEST_RUST} -gt 0 ]; then - test_rust - fi - if [ ${TEST_INTEGRATION} -gt 0 ]; then - test_integration - fi -} - -test_binary_distribution() { - : ${BINTRAY_REPOSITORY:=apache/arrow} - - if [ ${TEST_BINARY} -gt 0 ]; then - test_binary - fi - if [ ${TEST_APT} -gt 0 ]; then - test_apt - fi - if [ ${TEST_YUM} -gt 0 ]; then - test_yum - fi -} - -check_python_imports() { - python << IMPORT_TESTS -import platform - -import pyarrow -import pyarrow.parquet -import pyarrow.plasma -import pyarrow.fs -import pyarrow._hdfs -import pyarrow.dataset -import pyarrow.flight - -if platform.system() == "Darwin": - macos_version = tuple(map(int, platform.mac_ver()[0].split('.'))) - check_s3fs = macos_version >= (10, 13) -else: - check_s3fs = True - -if check_s3fs: - import pyarrow._s3fs -IMPORT_TESTS -} - -test_linux_wheels() { - local py_arches="3.6m 3.7m 3.8 3.9" - local manylinuxes="2010 2014" - - for py_arch in ${py_arches}; do - local env=_verify_wheel-${py_arch} - conda create -yq -n ${env} python=${py_arch//[mu]/} - conda activate ${env} - pip install -U pip - - for ml_spec in ${manylinuxes}; do - # check the mandatory and optional imports - pip install python-rc/${VERSION}-rc${RC_NUMBER}/pyarrow-${VERSION}-cp${py_arch//[mu.]/}-cp${py_arch//./}-manylinux${ml_spec}_x86_64.whl - check_python_imports - - # install test requirements and execute the tests - pip install -r ${ARROW_DIR}/python/requirements-test.txt - python -c 'import pyarrow; pyarrow.create_library_symlinks()' - pytest --pyargs pyarrow - done - - conda deactivate - done -} - -test_macos_wheels() { - local py_arches="3.6m 3.7m 3.8 3.9" - - for py_arch in ${py_arches}; do - local env=_verify_wheel-${py_arch} - conda create -yq -n ${env} python=${py_arch//m/} - conda activate ${env} - pip install -U pip - - # check the mandatory and optional imports - pip install --find-links python-rc/${VERSION}-rc${RC_NUMBER} pyarrow==${VERSION} - check_python_imports - - # install test requirements and execute the tests - pip install -r ${ARROW_DIR}/python/requirements-test.txt - python -c 'import pyarrow; pyarrow.create_library_symlinks()' - pytest --pyargs pyarrow - - conda deactivate - done -} - -test_wheels() { - clone_testing_repositories - - local download_dir=binaries - mkdir -p ${download_dir} - - if [ "$(uname)" == "Darwin" ]; then - local filter_regex=.*macosx.* - else - local filter_regex=.*manylinux.* - fi - - python $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ - --regex=${filter_regex} \ - --dest=${download_dir} - - 
verify_dir_artifact_signatures ${download_dir} - - pushd ${download_dir} - - if [ "$(uname)" == "Darwin" ]; then - test_macos_wheels - else - test_linux_wheels - fi - - popd -} - -# By default test all functionalities. -# To deactivate one test, deactivate the test and all of its dependents -# To explicitly select one test, set TEST_DEFAULT=0 TEST_X=1 - -# Install NodeJS locally for running the JavaScript tests rather than using the -# system Node installation, which may be too old. -: ${INSTALL_NODE:=1} - -if [ "${ARTIFACT}" == "source" ]; then - : ${TEST_SOURCE:=1} -elif [ "${ARTIFACT}" == "wheels" ]; then - TEST_WHEELS=1 -else - TEST_BINARY_DISTRIBUTIONS=1 -fi -: ${TEST_SOURCE:=0} -: ${TEST_WHEELS:=0} -: ${TEST_BINARY_DISTRIBUTIONS:=0} - -: ${TEST_DEFAULT:=1} -: ${TEST_JAVA:=${TEST_DEFAULT}} -: ${TEST_CPP:=${TEST_DEFAULT}} -: ${TEST_CSHARP:=${TEST_DEFAULT}} -: ${TEST_GLIB:=${TEST_DEFAULT}} -: ${TEST_RUBY:=${TEST_DEFAULT}} -: ${TEST_PYTHON:=${TEST_DEFAULT}} -: ${TEST_JS:=${TEST_DEFAULT}} -: ${TEST_GO:=${TEST_DEFAULT}} -: ${TEST_RUST:=${TEST_DEFAULT}} -: ${TEST_INTEGRATION:=${TEST_DEFAULT}} -if [ ${TEST_BINARY_DISTRIBUTIONS} -gt 0 ]; then - TEST_BINARY_DISTRIBUTIONS_DEFAULT=${TEST_DEFAULT} -else - TEST_BINARY_DISTRIBUTIONS_DEFAULT=0 -fi -: ${TEST_BINARY:=${TEST_BINARY_DISTRIBUTIONS_DEFAULT}} -: ${TEST_APT:=${TEST_BINARY_DISTRIBUTIONS_DEFAULT}} -: ${TEST_YUM:=${TEST_BINARY_DISTRIBUTIONS_DEFAULT}} - -# For selective Integration testing, set TEST_DEFAULT=0 TEST_INTEGRATION_X=1 TEST_INTEGRATION_Y=1 -: ${TEST_INTEGRATION_CPP:=${TEST_INTEGRATION}} -: ${TEST_INTEGRATION_JAVA:=${TEST_INTEGRATION}} -: ${TEST_INTEGRATION_JS:=${TEST_INTEGRATION}} -: ${TEST_INTEGRATION_GO:=${TEST_INTEGRATION}} - -# Automatically test if its activated by a dependent -TEST_GLIB=$((${TEST_GLIB} + ${TEST_RUBY})) -TEST_CPP=$((${TEST_CPP} + ${TEST_GLIB} + ${TEST_PYTHON} + ${TEST_INTEGRATION_CPP})) -TEST_JAVA=$((${TEST_JAVA} + ${TEST_INTEGRATION_JAVA})) -TEST_JS=$((${TEST_JS} + ${TEST_INTEGRATION_JS})) -TEST_GO=$((${TEST_GO} + ${TEST_INTEGRATION_GO})) -TEST_INTEGRATION=$((${TEST_INTEGRATION} + ${TEST_INTEGRATION_CPP} + ${TEST_INTEGRATION_JAVA} + ${TEST_INTEGRATION_JS} + ${TEST_INTEGRATION_GO})) - -NEED_MINICONDA=$((${TEST_CPP} + ${TEST_WHEELS} + ${TEST_BINARY} + ${TEST_INTEGRATION})) - -: ${TEST_ARCHIVE:=apache-arrow-${VERSION}.tar.gz} -case "${TEST_ARCHIVE}" in - /*) - ;; - *) - TEST_ARCHIVE=${PWD}/${TEST_ARCHIVE} - ;; -esac - -TEST_SUCCESS=no - -setup_tempdir "arrow-${VERSION}" -echo "Working in sandbox ${ARROW_TMPDIR}" -cd ${ARROW_TMPDIR} - -if [ ${NEED_MINICONDA} -gt 0 ]; then - setup_miniconda -fi - -if [ "${ARTIFACT}" == "source" ]; then - dist_name="apache-arrow-${VERSION}" - if [ ${TEST_SOURCE} -gt 0 ]; then - import_gpg_keys - if [ ! -d "${dist_name}" ]; then - fetch_archive ${dist_name} - tar xf ${dist_name}.tar.gz - fi - else - mkdir -p ${dist_name} - if [ ! -f ${TEST_ARCHIVE} ]; then - echo "${TEST_ARCHIVE} not found" - exit 1 - fi - tar xf ${TEST_ARCHIVE} -C ${dist_name} --strip-components=1 - fi - pushd ${dist_name} - test_source_distribution - popd -elif [ "${ARTIFACT}" == "wheels" ]; then - import_gpg_keys - test_wheels -else - import_gpg_keys - test_binary_distribution -fi - -TEST_SUCCESS=yes -echo 'Release candidate looks good!' 
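# A minimal usage sketch (illustrative; the version 4.0.0 and RC number 1 are
# placeholders): the selective-test switches documented above can be combined
# on a single command line when invoking this script.
#
#   # verify only the Rust crates from a source RC
#   TEST_DEFAULT=0 TEST_RUST=1 dev/release/verify-release-candidate.sh source 4.0.0 1
#
#   # verify only the C++ and Java integration tests
#   TEST_DEFAULT=0 TEST_INTEGRATION_CPP=1 TEST_INTEGRATION_JAVA=1 \
#     dev/release/verify-release-candidate.sh source 4.0.0 1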
-exit 0 diff --git a/dev/release/verify-yum.sh b/dev/release/verify-yum.sh deleted file mode 100755 index b9c46c43898cc..0000000000000 --- a/dev/release/verify-yum.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -exu - -if [ $# -lt 2 ]; then - echo "Usage: $0 VERSION rc" - echo " $0 VERSION release" - echo " $0 VERSION local" - echo " e.g.: $0 0.13.0 rc # Verify 0.13.0 RC" - echo " e.g.: $0 0.13.0 release # Verify 0.13.0" - echo " e.g.: $0 0.13.0-dev20210203 local # Verify 0.13.0-dev20210203 on local" - exit 1 -fi - -VERSION="$1" -TYPE="$2" - -local_prefix="/arrow/dev/tasks/linux-packages" - -artifactory_base_url="https://apache.jfrog.io/artifactory/arrow/centos" -if [ "${TYPE}" = "rc" ]; then - artifactory_base_url+="-rc" -fi - -distribution=$(. /etc/os-release && echo "${ID}") -distribution_version=$(. /etc/os-release && echo "${VERSION_ID}") - -cmake_package=cmake -cmake_command=cmake -have_flight=yes -have_gandiva=yes -have_glib=yes -have_parquet=yes -install_command="dnf install -y --enablerepo=powertools" -case "${distribution}-${distribution_version}" in - centos-7) - cmake_package=cmake3 - cmake_command=cmake3 - have_flight=no - have_gandiva=no - install_command="yum install -y" - ;; -esac -if [ "$(arch)" = "aarch64" ]; then - have_gandiva=no -fi - -if [ "${TYPE}" = "local" ]; then - case "${VERSION}" in - *-dev*) - package_version="$(echo "${VERSION}" | sed -e 's/-dev\(.*\)$/-0.dev\1/g')" - ;; - *-rc*) - package_version="$(echo "${VERSION}" | sed -e 's/-rc.*$//g')" - package_version+="-1" - ;; - *) - package_version="${VERSION}-1" - ;; - esac - package_version+=".el${distribution_version}" - release_path="${local_prefix}/yum/repositories" - release_path+="/centos/${distribution_version}/$(arch)/Packages" - release_path+="/apache-arrow-release-${package_version}.noarch.rpm" - ${install_command} "${release_path}" -else - package_version="${VERSION}" - ${install_command} \ - ${artifactory_base_url}/${distribution_version}/apache-arrow-release-latest.rpm -fi - -if [ "${TYPE}" = "local" ]; then - sed \ - -i"" \ - -e "s,baseurl=https://apache\.jfrog\.io/artifactory/arrow/,baseurl=file://${local_prefix}/yum/repositories/,g" \ - /etc/yum.repos.d/Apache-Arrow.repo - keys="${local_prefix}/KEYS" - if [ -f "${keys}" ]; then - cp "${keys}" /etc/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow - fi -else - if [ "${TYPE}" = "rc" ]; then - sed \ - -i"" \ - -e "s,/centos/,/centos-rc/,g" \ - /etc/yum.repos.d/Apache-Arrow.repo - fi -fi - -${install_command} --enablerepo=epel arrow-devel-${package_version} -${install_command} \ - ${cmake_package} \ - gcc-c++ \ - git \ - make -mkdir -p build -cp -a /arrow/cpp/examples/minimal_build build -pushd build/minimal_build -${cmake_command} . 
-make -j$(nproc) -./arrow_example -popd - -if [ "${have_glib}" = "yes" ]; then - ${install_command} --enablerepo=epel arrow-glib-devel-${package_version} - ${install_command} --enablerepo=epel arrow-glib-doc-${package_version} -fi -${install_command} --enablerepo=epel arrow-python-devel-${package_version} - -if [ "${have_glib}" = "yes" ]; then - ${install_command} --enablerepo=epel plasma-glib-devel-${package_version} - ${install_command} --enablerepo=epel plasma-glib-doc-${package_version} -else - ${install_command} --enablerepo=epel plasma-devel-${package_version} -fi - -if [ "${have_flight}" = "yes" ]; then - ${install_command} --enablerepo=epel arrow-flight-devel-${package_version} -fi - -if [ "${have_gandiva}" = "yes" ]; then - if [ "${have_glib}" = "yes" ]; then - ${install_command} --enablerepo=epel gandiva-glib-devel-${package_version} - ${install_command} --enablerepo=epel gandiva-glib-doc-${package_version} - else - ${install_command} --enablerepo=epel gandiva-devel-${package_version} - fi -fi - -if [ "${have_parquet}" = "yes" ]; then - if [ "${have_glib}" = "yes" ]; then - ${install_command} --enablerepo=epel parquet-glib-devel-${package_version} - ${install_command} --enablerepo=epel parquet-glib-doc-${package_version} - else - ${install_command} --enablerepo=epel parquet-devel-${package_version} - fi -fi diff --git a/dev/requirements_merge_arrow_pr.txt b/dev/requirements_merge_arrow_pr.txt deleted file mode 100644 index 7ac17dc1b1933..0000000000000 --- a/dev/requirements_merge_arrow_pr.txt +++ /dev/null @@ -1,3 +0,0 @@ -jira -requests -six diff --git a/dev/tasks/README.md b/dev/tasks/README.md deleted file mode 100644 index 1af9739db29e2..0000000000000 --- a/dev/tasks/README.md +++ /dev/null @@ -1,19 +0,0 @@ - - -See the usage guide under the [documentation page](../../docs/source/developers/crossbow.rst) diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.6.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.6.____cpython.yaml deleted file mode 100644 index dd4c04197c914..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.6.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- '10.2' -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-cuda:10.2 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.6.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.7.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.7.____cpython.yaml deleted file mode 100644 index f0c43929b56c3..0000000000000 --- 
a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.7.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- '10.2' -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-cuda:10.2 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.7.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.8.____cpython.yaml deleted file mode 100644 index 149e70f438b15..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.17python3.8.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- '10.2' -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-cuda:10.2 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.19python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.19python3.9.____cpython.yaml deleted file mode 100644 index fb15d4e715670..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_version10.2numpy1.19python3.9.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- '10.2' -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-cuda:10.2 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.19' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - 
c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml deleted file mode 100644 index d977f9e5779ea..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-comp7 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.6.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml deleted file mode 100644 index 6ffa87a5eb9e3..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-comp7 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.7.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml deleted file mode 100644 index 7105f634953a3..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- 
quay.io/condaforge/linux-anvil-comp7 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml deleted file mode 100644 index efe0148cc810b..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml +++ /dev/null @@ -1,70 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '9' -cdt_name: -- cos6 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '9' -docker_image: -- quay.io/condaforge/linux-anvil-comp7 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.19' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- linux-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - cuda_compiler_version - - cdt_name - - docker_image -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.6.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.6.____cpython.yaml deleted file mode 100644 index f2d3ceaac6891..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.6.____cpython.yaml +++ /dev/null @@ -1,71 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -aws_sdk_cpp: -- 1.8.63 -boost_cpp: -- 1.74.0 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '7' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-aarch64 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.32' -libprotobuf: -- '3.13' -lz4_c: -- 1.9.2 -numpy: -- '1.16' -orc: -- 1.6.5 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.6.* *_cpython -re2: -- 2020.10.01 -snappy: -- '1' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.7.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.7.____cpython.yaml deleted file mode 100644 index 611c39c907cf3..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.7.____cpython.yaml +++ /dev/null @@ -1,71 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -aws_sdk_cpp: -- 
1.8.63 -boost_cpp: -- 1.74.0 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '7' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-aarch64 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.32' -libprotobuf: -- '3.13' -lz4_c: -- 1.9.2 -numpy: -- '1.16' -orc: -- 1.6.5 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.7.* *_cpython -re2: -- 2020.10.01 -snappy: -- '1' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.8.____cpython.yaml deleted file mode 100644 index 2f0fc0e230670..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.8.____cpython.yaml +++ /dev/null @@ -1,71 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -aws_sdk_cpp: -- 1.8.63 -boost_cpp: -- 1.74.0 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '7' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-aarch64 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.32' -libprotobuf: -- '3.13' -lz4_c: -- 1.9.2 -numpy: -- '1.16' -orc: -- 1.6.5 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2020.10.01 -snappy: -- '1' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.9.____cpython.yaml deleted file mode 100644 index 2ec87205a0f1a..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/linux_aarch64_python3.9.____cpython.yaml +++ /dev/null @@ -1,71 +0,0 @@ -BUILD: -- aarch64-conda_cos7-linux-gnu -aws_sdk_cpp: -- 1.8.63 -boost_cpp: -- 1.74.0 -bzip2: -- '1' -c_compiler: -- gcc -c_compiler_version: -- '7' -cdt_arch: -- aarch64 -cdt_name: -- cos7 -channel_sources: -- conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-aarch64 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.32' -libprotobuf: -- '3.13' -lz4_c: -- 1.9.2 -numpy: -- '1.19' -orc: -- 1.6.5 -pin_run_as_build: - boost-cpp: - max_pin: x.x.x - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2020.10.01 -snappy: -- '1' -target_platform: -- linux-aarch64 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.6.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.6.____cpython.yaml deleted file mode 100644 index 7b2dbb34d76db..0000000000000 --- 
a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.6.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- x86_64-apple-darwin13.4.0 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.6.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.7.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.7.____cpython.yaml deleted file mode 100644 index 8e3e828ab8a87..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.7.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- x86_64-apple-darwin13.4.0 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.7.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.8.____cpython.yaml deleted file mode 100644 index cdd53c6006e62..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.17python3.8.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- x86_64-apple-darwin13.4.0 -numpy: -- '1.17' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.19python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.19python3.9.____cpython.yaml deleted file mode 100644 index 37df6a9ec536d..0000000000000 --- 
a/dev/tasks/conda-recipes/.ci_support/osx_64_numpy1.19python3.9.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- x86_64-apple-darwin13.4.0 -numpy: -- '1.19' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.8.____cpython.yaml deleted file mode 100644 index 5894b8ee70b50..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.8.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '11.0' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge/label/rust_dev,conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- arm64-apple-darwin20.0.0 -numpy: -- '1.19' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-arm64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.9.____cpython.yaml deleted file mode 100644 index 4e6014c5db8ac..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/osx_arm64_python3.9.____cpython.yaml +++ /dev/null @@ -1,65 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '11.0' -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- clang -c_compiler_version: -- '11' -channel_sources: -- conda-forge/label/rust_dev,conda-forge -channel_targets: -- conda-forge main -cuda_compiler_version: -- None -cxx_compiler: -- clangxx -cxx_compiler_version: -- '11' -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -macos_machine: -- arm64-apple-darwin20.0.0 -numpy: -- '1.19' -orc: -- 1.6.7 -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- osx-arm64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - c_compiler_version - - cxx_compiler_version -- - python - - numpy -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base3.6.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base3.6.yaml deleted file mode 100644 index ac945ce72d382..0000000000000 --- 
a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base3.6.yaml +++ /dev/null @@ -1,22 +0,0 @@ -c_compiler: -- gcc -c_compiler_version: -- '7' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-comp7 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '3.6' -target_platform: -- linux-64 diff --git a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.0.yaml b/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.0.yaml deleted file mode 100644 index 51d26f834cc16..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/linux_64_r_base4.0.yaml +++ /dev/null @@ -1,22 +0,0 @@ -c_compiler: -- gcc -c_compiler_version: -- '7' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cxx_compiler: -- gxx -cxx_compiler_version: -- '7' -docker_image: -- condaforge/linux-anvil-comp7 -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.0' -target_platform: -- linux-64 diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base3.6.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base3.6.yaml deleted file mode 100644 index e3c5b898be6f4..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base3.6.yaml +++ /dev/null @@ -1,26 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -c_compiler: -- clang -c_compiler_version: -- '10' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '10' -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '3.6' -target_platform: -- osx-64 diff --git a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.0.yaml b/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.0.yaml deleted file mode 100644 index 8343a284b9754..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/osx_64_r_base4.0.yaml +++ /dev/null @@ -1,26 +0,0 @@ -MACOSX_DEPLOYMENT_TARGET: -- '10.9' -c_compiler: -- clang -c_compiler_version: -- '10' -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cxx_compiler: -- clangxx -cxx_compiler_version: -- '10' -macos_machine: -- x86_64-apple-darwin13.4.0 -macos_min_version: -- '10.9' -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.0' -target_platform: -- osx-64 diff --git a/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base3.6.yaml b/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base3.6.yaml deleted file mode 100644 index 3fb7f88499af3..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base3.6.yaml +++ /dev/null @@ -1,12 +0,0 @@ -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '3.6' -target_platform: -- win-64 diff --git a/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base4.0.yaml b/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base4.0.yaml deleted file mode 100644 index 02c2a70756d0e..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/r/win_64_r_base4.0.yaml +++ /dev/null @@ -1,12 +0,0 @@ -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -pin_run_as_build: - r-base: - min_pin: x.x - max_pin: x.x -r_base: -- '4.0' -target_platform: -- win-64 diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml 
b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml deleted file mode 100644 index 8fbbb64af9c51..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython.yaml +++ /dev/null @@ -1,55 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- vs2017 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- vs2017 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.6.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- win-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml deleted file mode 100644 index 4b702a3898088..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython.yaml +++ /dev/null @@ -1,55 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- vs2017 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- vs2017 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.7.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- win-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml deleted file mode 100644 index 6ae6c2fde4f6e..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython.yaml +++ /dev/null @@ -1,55 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 -bzip2: -- '1' -c_compiler: -- vs2017 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- vs2017 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.17' -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.8.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- win-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml b/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml deleted file mode 100644 index 73a8b5099bbfd..0000000000000 --- a/dev/tasks/conda-recipes/.ci_support/win_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython.yaml +++ /dev/null @@ -1,55 +0,0 @@ -aws_sdk_cpp: -- 1.8.151 
-bzip2: -- '1' -c_compiler: -- vs2017 -channel_sources: -- conda-forge,defaults -channel_targets: -- conda-forge main -cuda_compiler: -- nvcc -cuda_compiler_version: -- None -cxx_compiler: -- vs2017 -gflags: -- '2.2' -glog: -- 0.4.0 -grpc_cpp: -- '1.36' -libprotobuf: -- '3.15' -lz4_c: -- 1.9.3 -numpy: -- '1.19' -pin_run_as_build: - bzip2: - max_pin: x - lz4-c: - max_pin: x.x.x - python: - min_pin: x.x - max_pin: x.x - zlib: - max_pin: x.x -python: -- 3.9.* *_cpython -re2: -- 2021.04.01 -snappy: -- '1' -target_platform: -- win-64 -thrift_cpp: -- 0.14.1 -zip_keys: -- - numpy - - python -zlib: -- '1.2' -zstd: -- '1.4' diff --git a/dev/tasks/conda-recipes/.scripts/logging_utils.sh b/dev/tasks/conda-recipes/.scripts/logging_utils.sh deleted file mode 100644 index a53ef3f2c7ada..0000000000000 --- a/dev/tasks/conda-recipes/.scripts/logging_utils.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Provide a unified interface for the different logging -# utilities CI providers offer. If unavailable, provide -# a compatible fallback (e.g. bare `echo xxxxxx`). - -function startgroup { - # Start a foldable group of log lines - # Pass a single argument, quoted - case ${CI:-} in - azure ) - echo "##[group]$1";; - travis ) - echo "$1" - echo -en 'travis_fold:start:'"${1// /}"'\\r';; - * ) - echo "$1";; - esac -} - -function endgroup { - # End a foldable group of log lines - # Pass a single argument, quoted - case ${CI:-} in - azure ) - echo "##[endgroup]";; - travis ) - echo -en 'travis_fold:end:'"${1// /}"'\\r';; - esac -} diff --git a/dev/tasks/conda-recipes/README.md b/dev/tasks/conda-recipes/README.md deleted file mode 100644 index 074cefe52de01..0000000000000 --- a/dev/tasks/conda-recipes/README.md +++ /dev/null @@ -1,65 +0,0 @@ - - -# Conda Forge recipes - -This directory must be migrated periodically with the upstrem updates of -[arrow-cpp-feedstock][arrow-cpp-feedstock], -[parquet-cpp-feedstock][parquet-cpp-feedstock]. -conda-forge repositories because of multiple vendored files. - -## Keeping the recipes synchronized - -The recipes here are tested on nightly basis, so they follow the development -versions of arrow instead of the upstream recipes, which are suitable for the -latest releases. - -### Backporting from the upstream feedstocks - -In most of the cases these recipes are more accurate, then the upstream -feedstocks. Although the upstream feedstocks regularly receive automatic updates -by the conda-forge team so we need to backport those changes to the crossbow -recipes. Most of these updates are touching the version pinning files -(under `.ci_support`) and other CI related configuration files. - -Because all three recipes must be built in the same continuous integration -job prefer porting from the [arrpw-cpp feedstock][arrow-cpp-feedstock]. - -#### Updating the variants: - -Copy the configuration files from `arrow-cpp-feedstock/.ci_support` to the -`.ci_support` folder. - -#### Updating the CI configurations: - -The `.azure-pipelines/azure-pipelines-[linux|osx|win].yml` should be ported -to the local counterparts under `.azure-pipelines` with keeping the crossbow -related parts (the cloning of arrow and the jinja templated variables) and -moving the matrix definitions like [this][matrix-definition] to the crossbow -[tasks.yml][../tasks.yml] config file. 
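A minimal sketch of the variant backport described above, assuming the upstream feedstock is cloned to a temporary location (the clone path is illustrative):

    git clone https://github.com/conda-forge/arrow-cpp-feedstock /tmp/arrow-cpp-feedstock
    cp /tmp/arrow-cpp-feedstock/.ci_support/*.yaml dev/tasks/conda-recipes/.ci_support/
    git status dev/tasks/conda-recipes/.ci_support  # review which variants changed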
- -### Porting recipes from crossbow to the upstream feedstocks - -Theoretically these recipes should be up to date with the actual version of -Arrow, so during the release procedure the content of these recipes should be -copied to the upstream feedstocks. - -[arrow-cpp-feedstock]: https://github.com/conda-forge/arrow-cpp-feedstock -[parquet-cpp-feedstock]: https://github.com/conda-forge/parquet-cpp-feedstock -[matrix-definition]: https://github.com/conda-forge/arrow-cpp-feedstock/blob/master/.azure-pipelines/azure-pipelines-linux.yml#L12 diff --git a/dev/tasks/conda-recipes/arrow-cpp/LLVM_LICENSE.txt b/dev/tasks/conda-recipes/arrow-cpp/LLVM_LICENSE.txt deleted file mode 100644 index 461398bab7a7c..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/LLVM_LICENSE.txt +++ /dev/null @@ -1,68 +0,0 @@ -============================================================================== -LLVM Release License -============================================================================== -University of Illinois/NCSA -Open Source License - -Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign. -All rights reserved. - -Developed by: - - LLVM Team - - University of Illinois at Urbana-Champaign - - http://llvm.org - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal with -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - - * Neither the names of the LLVM Team, University of Illinois at - Urbana-Champaign, nor the names of its contributors may be used to - endorse or promote products derived from this Software without specific - prior written permission. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE -SOFTWARE. - -============================================================================== -Copyrights and Licenses for Third Party Software Distributed with LLVM: -============================================================================== -The LLVM software contains code written by third parties. Such software will -have its own individual LICENSE.TXT file in the directory in which it appears. -This file will describe the copyrights, license, and restrictions which apply -to that code. - -The disclaimer of warranty in the University of Illinois Open Source License -applies to all code in the LLVM Distribution, and nothing in any of the -other licenses gives permission to use the names of the LLVM Team or the -University of Illinois to endorse or promote products derived from this -Software. 
- -The following pieces of software have additional or alternate copyrights, -licenses, and/or restrictions: - -Program Directory -------- --------- -Google Test llvm/utils/unittest/googletest -OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} -pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} -ARM contributions llvm/lib/Target/ARM/LICENSE.TXT -md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat b/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat deleted file mode 100644 index 2cc6ed1ba3ec5..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-arrow.bat +++ /dev/null @@ -1,54 +0,0 @@ -@echo on - -mkdir "%SRC_DIR%"\cpp\build -pushd "%SRC_DIR%"\cpp\build - -:: Enable CUDA support -if "%cuda_compiler_version%"=="None" ( - set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=OFF" -) else ( - REM this should move to nvcc-feedstock - set "CUDA_PATH=%CUDA_PATH:\=/%" - set "CUDA_HOME=%CUDA_HOME:\=/%" - - set "EXTRA_CMAKE_ARGS=-DARROW_CUDA=ON" -) - -cmake -G "Ninja" ^ - -DBUILD_SHARED_LIBS=ON ^ - -DCMAKE_INSTALL_PREFIX="%LIBRARY_PREFIX%" ^ - -DARROW_DEPENDENCY_SOURCE=SYSTEM ^ - -DARROW_PACKAGE_PREFIX="%LIBRARY_PREFIX%" ^ - -DLLVM_TOOLS_BINARY_DIR="%LIBRARY_BIN%" ^ - -DPython3_EXECUTABLE="%PYTHON%" ^ - -DARROW_WITH_BZ2:BOOL=ON ^ - -DARROW_WITH_ZLIB:BOOL=ON ^ - -DARROW_WITH_ZSTD:BOOL=ON ^ - -DARROW_WITH_LZ4:BOOL=ON ^ - -DARROW_WITH_SNAPPY:BOOL=ON ^ - -DARROW_WITH_BROTLI:BOOL=ON ^ - -DARROW_BOOST_USE_SHARED:BOOL=ON ^ - -DARROW_BUILD_TESTS:BOOL=OFF ^ - -DARROW_BUILD_UTILITIES:BOOL=OFF ^ - -DARROW_BUILD_STATIC:BOOL=OFF ^ - -DCMAKE_BUILD_TYPE=release ^ - -DARROW_SSE42:BOOL=OFF ^ - -DARROW_PYTHON:BOOL=ON ^ - -DARROW_MIMALLOC:BOOL=ON ^ - -DARROW_DATASET:BOOL=ON ^ - -DARROW_FLIGHT:BOOL=ON ^ - -DARROW_HDFS:BOOL=ON ^ - -DARROW_PARQUET:BOOL=ON ^ - -DARROW_GANDIVA:BOOL=ON ^ - -DARROW_ORC:BOOL=ON ^ - -DARROW_S3:BOOL=ON ^ - -DBoost_NO_BOOST_CMAKE=ON ^ - -DCMAKE_UNITY_BUILD=ON ^ - %EXTRA_CMAKE_ARGS% ^ - .. -if errorlevel 1 exit 1 - -cmake --build . --target install --config Release -if errorlevel 1 exit 1 - -popd diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat b/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat deleted file mode 100644 index 89cec3710c3e4..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat +++ /dev/null @@ -1,44 +0,0 @@ -@echo on -pushd "%SRC_DIR%"\python - -@rem the symlinks for cmake modules don't work here -@rem NOTE: In contrast to conda-forge, they work here as we clone from git. 
-@rem del cmake_modules\BuildUtils.cmake -@rem del cmake_modules\SetupCxxFlags.cmake -@rem del cmake_modules\CompilerInfo.cmake -@rem del cmake_modules\FindNumPy.cmake -@rem del cmake_modules\FindPythonLibsNew.cmake -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\BuildUtils.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\SetupCxxFlags.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\CompilerInfo.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindNumPy.cmake" cmake_modules\ -@rem copy /Y "%SRC_DIR%\cpp\cmake_modules\FindPythonLibsNew.cmake" cmake_modules\ - -SET ARROW_HOME=%LIBRARY_PREFIX% -SET SETUPTOOLS_SCM_PRETEND_VERSION=%PKG_VERSION% -SET PYARROW_BUILD_TYPE=release -SET PYARROW_WITH_S3=1 -SET PYARROW_WITH_HDFS=1 -SET PYARROW_WITH_DATASET=1 -SET PYARROW_WITH_FLIGHT=1 -SET PYARROW_WITH_GANDIVA=1 -SET PYARROW_WITH_PARQUET=1 -SET PYARROW_CMAKE_GENERATOR=Ninja - -:: Enable CUDA support -if "%cuda_compiler_version%"=="None" ( - set "PYARROW_WITH_CUDA=0" -) else ( - set "PYARROW_WITH_CUDA=1" -) - -%PYTHON% setup.py ^ - build_ext ^ - install --single-version-externally-managed ^ - --record=record.txt -if errorlevel 1 exit 1 -popd - -if [%PKG_NAME%] == [pyarrow] ( - rd /s /q %SP_DIR%\pyarrow\tests -) diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh deleted file mode 100644 index f9c1d975ec346..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/build-arrow.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -set -e -set -x - -mkdir cpp/build -pushd cpp/build - -EXTRA_CMAKE_ARGS="" - -# Include g++'s system headers -if [ "$(uname)" == "Linux" ]; then - SYSTEM_INCLUDES=$(echo | ${CXX} -E -Wp,-v -xc++ - 2>&1 | grep '^ ' | awk '{print "-isystem;" substr($1, 1)}' | tr '\n' ';') - EXTRA_CMAKE_ARGS=" -DARROW_GANDIVA_PC_CXX_FLAGS=${SYSTEM_INCLUDES}" -fi - -# Enable CUDA support -if [[ ! 
-z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]] -then - if [[ -z "${CUDA_HOME+x}" ]] - then - echo "cuda_compiler_version=${cuda_compiler_version} CUDA_HOME=$CUDA_HOME" - CUDA_GDB_EXECUTABLE=$(which cuda-gdb || exit 0) - if [[ -n "$CUDA_GDB_EXECUTABLE" ]] - then - CUDA_HOME=$(dirname $(dirname $CUDA_GDB_EXECUTABLE)) - else - echo "Cannot determine CUDA_HOME: cuda-gdb not in PATH" - return 1 - fi - fi - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} -DCMAKE_LIBRARY_PATH=${CUDA_HOME}/lib64/stubs" -else - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_CUDA=OFF" -fi - -if [[ "${target_platform}" == "osx-arm64" ]]; then - # We need llvm 11+ support in Arrow for this - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=OFF" - sed -ie "s;protoc-gen-grpc.*$;protoc-gen-grpc=${BUILD_PREFIX}/bin/grpc_cpp_plugin\";g" ../src/arrow/flight/CMakeLists.txt - sed -ie 's;"--with-jemalloc-prefix\=je_arrow_";"--with-jemalloc-prefix\=je_arrow_" "--with-lg-page\=14";g' ../cmake_modules/ThirdpartyToolchain.cmake -else - EXTRA_CMAKE_ARGS=" ${EXTRA_CMAKE_ARGS} -DARROW_GANDIVA=ON" -fi - -cmake \ - -DARROW_BOOST_USE_SHARED=ON \ - -DARROW_BUILD_BENCHMARKS=OFF \ - -DARROW_BUILD_STATIC=OFF \ - -DARROW_BUILD_TESTS=OFF \ - -DARROW_BUILD_UTILITIES=OFF \ - -DBUILD_SHARED_LIBS=ON \ - -DARROW_DATASET=ON \ - -DARROW_DEPENDENCY_SOURCE=SYSTEM \ - -DARROW_FLIGHT=ON \ - -DARROW_FLIGHT_REQUIRE_TLSCREDENTIALSOPTIONS=ON \ - -DARROW_HDFS=ON \ - -DARROW_JEMALLOC=ON \ - -DARROW_MIMALLOC=ON \ - -DARROW_ORC=ON \ - -DARROW_PACKAGE_PREFIX=$PREFIX \ - -DARROW_PARQUET=ON \ - -DARROW_PLASMA=ON \ - -DARROW_PYTHON=ON \ - -DARROW_S3=ON \ - -DARROW_SIMD_LEVEL=NONE \ - -DARROW_USE_LD_GOLD=ON \ - -DARROW_WITH_BROTLI=ON \ - -DARROW_WITH_BZ2=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE=release \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DCMAKE_INSTALL_PREFIX=$PREFIX \ - -DLLVM_TOOLS_BINARY_DIR=$PREFIX/bin \ - -DPython3_EXECUTABLE=${PYTHON} \ - -DProtobuf_PROTOC_EXECUTABLE=$BUILD_PREFIX/bin/protoc \ - -GNinja \ - ${EXTRA_CMAKE_ARGS} \ - .. 
- -# Commented out until jemalloc and mimalloc are fixed upstream -if [[ "${target_platform}" == "osx-arm64" ]]; then - ninja jemalloc_ep-prefix/src/jemalloc_ep-stamp/jemalloc_ep-patch mimalloc_ep-prefix/src/mimalloc_ep-stamp/mimalloc_ep-patch - cp $BUILD_PREFIX/share/gnuconfig/config.* jemalloc_ep-prefix/src/jemalloc_ep/build-aux/ - sed -ie 's/list(APPEND mi_cflags -march=native)//g' mimalloc_ep-prefix/src/mimalloc_ep/CMakeLists.txt - # Use the correct register for thread-local storage - sed -ie 's/tpidr_el0/tpidrro_el0/g' mimalloc_ep-prefix/src/mimalloc_ep/include/mimalloc-internal.h -fi - -ninja install - -popd diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh deleted file mode 100644 index a394e999f7b24..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/sh - -set -e -set -x - -# Build dependencies -export ARROW_HOME=$PREFIX -export PARQUET_HOME=$PREFIX -export SETUPTOOLS_SCM_PRETEND_VERSION=$PKG_VERSION -export PYARROW_BUILD_TYPE=release -export PYARROW_BUNDLE_ARROW_CPP_HEADERS=0 -export PYARROW_WITH_DATASET=1 -export PYARROW_WITH_FLIGHT=1 -if [[ "${target_platform}" == "osx-arm64" ]]; then - # We need llvm 11+ support in Arrow for this - export PYARROW_WITH_GANDIVA=0 -else - export PYARROW_WITH_GANDIVA=1 -fi -export PYARROW_WITH_HDFS=1 -export PYARROW_WITH_ORC=1 -export PYARROW_WITH_PARQUET=1 -export PYARROW_WITH_PLASMA=1 -export PYARROW_WITH_S3=1 -export PYARROW_CMAKE_GENERATOR=Ninja -BUILD_EXT_FLAGS="" - -# Enable CUDA support -if [[ ! -z "${cuda_compiler_version+x}" && "${cuda_compiler_version}" != "None" ]]; then - export PYARROW_WITH_CUDA=1 -else - export PYARROW_WITH_CUDA=0 -fi - -# Resolve: Make Error at cmake_modules/SetupCxxFlags.cmake:338 (message): Unsupported arch flag: -march=. -if [[ "${target_platform}" == "linux-aarch64" ]]; then - export PYARROW_CMAKE_OPTIONS="-DARROW_ARMV8_ARCH=armv8-a" -fi - -cd python - -$PYTHON setup.py \ - build_ext \ - install --single-version-externally-managed \ - --record=record.txt - -if [[ "$PKG_NAME" == "pyarrow" ]]; then - rm -r ${SP_DIR}/pyarrow/tests -fi diff --git a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml b/dev/tasks/conda-recipes/arrow-cpp/meta.yaml deleted file mode 100644 index 48a8629866d9f..0000000000000 --- a/dev/tasks/conda-recipes/arrow-cpp/meta.yaml +++ /dev/null @@ -1,302 +0,0 @@ -# NOTE: In constrast to the conda-forge recipe, ARROW_VERSION is a templated variable here. -{% set version = ARROW_VERSION %} -{% set cuda_enabled = cuda_compiler_version != "None" %} -{% set build_ext_version = ARROW_VERSION %} -{% set build_ext = "cuda" if cuda_enabled else "cpu" %} -{% set proc_build_number = "0" %} - -package: - name: arrow-cpp-ext - version: {{ version }} - -source: - path: ../../../../ - -build: - number: 0 - # for cuda on win/linux, building with 9.2 is enough to be compatible with all later versions, - # since arrow is only using libcuda, and not libcudart. 
- skip: true # [(win or linux) and cuda_compiler_version not in ("None", "10.2")] - skip: true # [osx and cuda_compiler_version != "None"] - run_exports: - - {{ pin_subpackage("arrow-cpp", max_pin="x.x.x") }} - -outputs: - - name: arrow-cpp-proc - version: {{ build_ext_version }} - build: - number: {{ proc_build_number }} - string: "{{ build_ext }}" - test: - commands: - - exit 0 - about: - home: http://github.com/apache/arrow - license: Apache-2.0 - license_file: - - LICENSE.txt - summary: 'A meta-package to select Arrow build variant' - - - name: arrow-cpp - script: build-arrow.sh # [not win] - script: bld-arrow.bat # [win] - version: {{ version }} - build: - string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - run_exports: - - {{ pin_subpackage("arrow-cpp", max_pin="x.x.x") }} - ignore_run_exports: - - cudatoolkit - track_features: - {{ "- arrow-cuda" if cuda_enabled else "" }} - requirements: - build: - - python # [build_platform != target_platform] - - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] - - numpy # [build_platform != target_platform] - - gnuconfig # [osx and arm64] - - libprotobuf - - grpc-cpp - - cmake - - autoconf # [unix] - - ninja - - make # [unix] - - {{ compiler('c') }} - - {{ compiler('cxx') }} - - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - host: - - aws-sdk-cpp - - boost-cpp >=1.70 - - brotli - - bzip2 - - c-ares - - gflags - - glog - - grpc-cpp - - libprotobuf - - clangdev 10 # [not (osx and arm64)] - - llvmdev 10 # [not (osx and arm64)] - - libutf8proc - - lz4-c - - numpy - - orc # [unix] - - python - - rapidjson - - re2 - - snappy - - thrift-cpp - - zlib - - zstd - run: - - {{ pin_compatible('numpy', lower_bound='1.16') }} - - python - run_constrained: - - arrow-cpp-proc * {{ build_ext }} - - cudatoolkit >=9.2 # [cuda_compiler_version != "None"] - - about: - home: http://github.com/apache/arrow - license: Apache-2.0 - license_file: - - LICENSE.txt - summary: C++ libraries for Apache Arrow - - test: - commands: - # headers - - test -f $PREFIX/include/arrow/api.h # [unix] - - test -f $PREFIX/include/arrow/flight/types.h # [unix] - - test -f $PREFIX/include/plasma/client.h # [unix] - - test -f $PREFIX/include/gandiva/engine.h # [unix and not (osx and arm64)] - - test -f $PREFIX/include/parquet/api/reader.h # [unix] - - if not exist %LIBRARY_INC%\\arrow\\api.h exit 1 # [win] - - if not exist %LIBRARY_INC%\\gandiva\\engine.h exit 1 # [win] - - if not exist %LIBRARY_INC%\\parquet\\api\\reader.h exit 1 # [win] - - # shared - - test -f $PREFIX/lib/libarrow.so # [linux] - - test -f $PREFIX/lib/libarrow_dataset.so # [linux] - - test -f $PREFIX/lib/libarrow_flight.so # [linux] - - test -f $PREFIX/lib/libarrow_python.so # [linux] - - test -f $PREFIX/lib/libparquet.so # [linux] - - test -f $PREFIX/lib/libgandiva.so # [linux] - - test -f $PREFIX/lib/libplasma.so # [linux] - - test -f $PREFIX/lib/libarrow_cuda${SHLIB_EXT} # [(cuda_compiler_version != "None") and unix] - - test ! 
-f $PREFIX/lib/libarrow_cuda${SHLIB_EXT} # [(cuda_compiler_version == "None") and unix] - - if not exist %PREFIX%\\Library\\bin\\arrow_cuda.dll exit 1 # [(cuda_compiler_version != "None") and win] - - if exist %PREFIX%\\Library\\bin\\arrow_cuda.dll exit 1 # [(cuda_compiler_version == "None") and win] - - test -f $PREFIX/lib/libarrow.dylib # [osx] - - test -f $PREFIX/lib/libarrow_dataset.dylib # [osx] - - test -f $PREFIX/lib/libarrow_python.dylib # [osx] - - test -f $PREFIX/lib/libgandiva.dylib # [osx and not arm64] - - test -f $PREFIX/lib/libparquet.dylib # [osx] - - test -f $PREFIX/lib/libplasma.dylib # [osx] - - if not exist %PREFIX%\\Library\\bin\\arrow.dll exit 1 # [win] - - if not exist %PREFIX%\\Library\\bin\\arrow_dataset.dll exit 1 # [win] - - if not exist %PREFIX%\\Library\\bin\\arrow_flight.dll exit 1 # [win] - - if not exist %PREFIX%\\Library\\bin\\arrow_python.dll exit 1 # [win] - - if not exist %PREFIX%\\Library\\bin\\parquet.dll exit 1 # [win] - - if not exist %PREFIX%\\Library\\bin\\gandiva.dll exit 1 # [win] - - # absence of static libraries - - test ! -f $PREFIX/lib/libarrow.a # [unix] - - test ! -f $PREFIX/lib/libarrow_dataset.a # [unix] - - test ! -f $PREFIX/lib/libarrow_flight.a # [unix] - - test ! -f $PREFIX/lib/libarrow_python.a # [unix] - - test ! -f $PREFIX/lib/libplasma.a # [unix] - - test ! -f $PREFIX/lib/libparquet.a # [unix] - - test ! -f $PREFIX/lib/libgandiva.a # [unix] - - if exist %PREFIX%\\Library\\lib\\arrow_static.lib exit 1 # [win] - - if exist %PREFIX%\\Library\\lib\\arrow_dataset_static.lib exit 1 # [win] - - if exist %PREFIX%\\Library\\lib\\arrow_flight_static.lib exit 1 # [win] - - if exist %PREFIX%\\Library\\lib\\arrow_python_static.lib exit 1 # [win] - - if exist %PREFIX%\\Library\\lib\\parquet_static.lib exit 1 # [win] - - if exist %PREFIX%\\Library\\lib\\gandiva_static.lib exit 1 # [win] - - - name: pyarrow - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] - version: {{ version }} - build: - string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit - track_features: - {{ "- arrow-cuda" if cuda_enabled else "" }} - requirements: - build: - - python # [build_platform != target_platform] - - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] - - numpy # [build_platform != target_platform] - - cmake - - ninja - - make # [unix] - - {{ compiler('c') }} - - {{ compiler('cxx') }} - # pyarrow does not require nvcc but it needs to link against libraries in arrow-cpp=*=*cuda - - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - host: - - {{ pin_subpackage('arrow-cpp', exact=True) }} - - cython - - numpy - - python - - setuptools - - setuptools_scm - - six - run: - - {{ pin_subpackage('arrow-cpp', exact=True) }} - - {{ pin_compatible('numpy', lower_bound='1.16') }} - # empty parquet-cpp metapackage, force old versions to be uninstalled - - parquet-cpp 1.5.1.* - - python - run_constrained: - - arrow-cpp-proc * {{ build_ext }} - - cudatoolkit >=9.2 # [cuda_compiler_version != "None"] - - about: - home: http://github.com/apache/arrow - license: Apache-2.0 - license_file: - - LICENSE.txt - summary: Python libraries for Apache Arrow - - test: - imports: - - pyarrow - - pyarrow.dataset - - pyarrow.flight - - pyarrow.gandiva # [not (osx and arm64)] - - pyarrow.orc # [unix] - - pyarrow.parquet - - pyarrow.plasma # [unix] - - pyarrow.fs - - pyarrow._s3fs - - pyarrow._hdfs - # We can only test importing cuda 
package but cannot run when a - # CUDA device is not available, for instance, when building from CI. - # On Windows, we cannot even do that due to `nvcuda.dll` not being found, see - # https://conda-forge.org/docs/maintainer/knowledge_base.html#nvcuda-dll-cannot-be-found-on-windows - # However, we check below for (at least) the presence of a correctly-compiled module - - pyarrow.cuda # [cuda_compiler_version != "None" and not win] - commands: - - test ! -f ${SP_DIR}/pyarrow/tests/test_array.py # [unix] - - if exist %SP_DIR%/pyarrow/tests/test_array.py exit 1 # [win] - # Need to remove dot from PY_VER; %MYVAR:x=y% replaces "x" in %MYVAR% with "y" - - if not exist %SP_DIR%/pyarrow/_cuda.cp%PY_VER:.=%-win_amd64.pyd exit 1 # [win and cuda_compiler_version != "None"] - - - name: pyarrow-tests - script: build-pyarrow.sh # [not win] - script: bld-pyarrow.bat # [win] - version: {{ version }} - build: - string: py{{ CONDA_PY }}h{{ PKG_HASH }}_{{ PKG_BUILDNUM }}_{{ build_ext }} - ignore_run_exports: - - cudatoolkit - track_features: - {{ "- arrow-cuda" if cuda_enabled else "" }} - requirements: - build: - - python # [build_platform != target_platform] - - cross-python_{{ target_platform }} # [build_platform != target_platform] - - cython # [build_platform != target_platform] - - numpy # [build_platform != target_platform] - - cmake - - ninja - - make # [unix] - - {{ compiler('c') }} - - {{ compiler('cxx') }} - # pyarrow does not require nvcc but it needs to link against libraries in arrow-cpp=*=*cuda - - {{ compiler("cuda") }} # [cuda_compiler_version != "None"] - host: - - {{ pin_subpackage('arrow-cpp', exact=True) }} - - {{ pin_subpackage('pyarrow', exact=True) }} - - cython - - numpy - - python - - setuptools - - setuptools_scm - - six - run: - - {{ pin_subpackage('pyarrow', exact=True) }} - - python - run_constrained: - - arrow-cpp-proc * {{ build_ext }} - - cudatoolkit >=9.2 # [cuda_compiler_version != "None"] - - about: - home: http://github.com/apache/arrow - license: Apache-2.0 - license_file: - - LICENSE.txt - summary: Python test files for Apache Arrow - - test: - commands: - - test -f ${SP_DIR}/pyarrow/tests/test_array.py # [unix] - - if not exist %SP_DIR%/pyarrow/tests/test_array.py exit 1 # [win] - -about: - home: http://github.com/apache/arrow - license: Apache-2.0 - license_file: - - LICENSE.txt - summary: C++ and Python libraries for Apache Arrow - -extra: - recipe-maintainers: - - wesm - - xhochy - - leifwalsh - - jreback - - cpcloud - - pcmoritz - - robertnishihara - - siddharthteotia - - kou - - kszucs - - pitrou - - pearu - - nealrichardson - - jakirkham diff --git a/dev/tasks/conda-recipes/azure.clean.yml b/dev/tasks/conda-recipes/azure.clean.yml deleted file mode 100644 index 55ac36528addf..0000000000000 --- a/dev/tasks/conda-recipes/azure.clean.yml +++ /dev/null @@ -1,28 +0,0 @@ -jobs: -- job: linux - pool: - vmImage: ubuntu-18.04 - timeoutInMinutes: 360 - - steps: - - script: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - displayName: Clone arrow - - - script: | - conda install -y -c conda-forge pandas anaconda-client packaging - displayName: Install requirements - - - script: | - {% if arrow.branch == 'master' %} - mkdir -p $HOME/.continuum/anaconda-client/tokens/ - echo $(CROSSBOW_ANACONDA_TOKEN) > $HOME/.continuum/anaconda-client/tokens/https%3A%2F%2Fapi.anaconda.org.token - {% endif %} - eval "$(conda shell.bash hook)" 
- conda activate base - python3 arrow/dev/tasks/conda-recipes/clean.py {% if arrow.branch == 'master' %}FORCE{% endif %} - displayName: Delete outdated packages - diff --git a/dev/tasks/conda-recipes/azure.linux.yml b/dev/tasks/conda-recipes/azure.linux.yml deleted file mode 100755 index 161fd14e90cbf..0000000000000 --- a/dev/tasks/conda-recipes/azure.linux.yml +++ /dev/null @@ -1,38 +0,0 @@ -{% import 'macros.jinja' as macros with context %} - -jobs: -- job: linux - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 360 - - variables: - CONFIG: {{ config }} - R_CONFIG: {{ r_config|default("") }} - ARROW_VERSION: {{ arrow.no_rc_version }} - UPLOAD_PACKAGES: False - - steps: - # configure qemu binfmt-misc running. This allows us to run docker containers - # embedded qemu-static - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset --credential yes - ls /proc/sys/fs/binfmt_misc/ - displayName: Configure binfmt_misc - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - - {{ macros.azure_checkout_arrow() }} - - - task: CondaEnvironment@1 - inputs: - packageSpecs: 'anaconda-client shyaml' - installOptions: '-c conda-forge' - updateConda: false - - - script: | - mkdir build_artifacts - CI=azure arrow/dev/tasks/conda-recipes/run_docker_build.sh $(pwd)/build_artifacts - displayName: Run docker build - - {{ macros.azure_upload_releases("build_artifacts/linux-64/*.tar.bz2") }} - {{ macros.azure_upload_anaconda("build_artifacts/linux-64/*.tar.bz2") }} diff --git a/dev/tasks/conda-recipes/azure.osx.yml b/dev/tasks/conda-recipes/azure.osx.yml deleted file mode 100755 index dbb1a68aca6f4..0000000000000 --- a/dev/tasks/conda-recipes/azure.osx.yml +++ /dev/null @@ -1,80 +0,0 @@ -{% import 'macros.jinja' as macros with context %} - -jobs: -- job: osx - pool: - vmImage: macOS-10.14 - timeoutInMinutes: 360 - variables: - CONFIG: {{ config }} - R_CONFIG: {{ r_config|default("") }} - ARROW_VERSION: {{ arrow.no_rc_version }} - UPLOAD_PACKAGES: False - steps: - - script: | - echo "Removing homebrew from Azure to avoid conflicts." - curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/uninstall > ~/uninstall_homebrew - chmod +x ~/uninstall_homebrew - ~/uninstall_homebrew -fq - rm ~/uninstall_homebrew - displayName: Remove homebrew - - - bash: | - echo "##vso[task.prependpath]$CONDA/bin" - sudo chown -R $USER $CONDA - displayName: Add conda to PATH - - - script: | - source activate base - conda install -n base -c conda-forge --quiet --yes conda-forge-ci-setup=3 conda-build - displayName: 'Add conda-forge-ci-setup=3' - - {{ macros.azure_checkout_arrow() }} - - - script: | - source activate base - echo "Configuring conda." 
- - setup_conda_rc ./ ./ ./.ci_support/${CONFIG}.yaml - export CI=azure - source run_conda_forge_build_setup - conda update --yes --quiet --override-channels -c conda-forge -c defaults --all - displayName: Configure conda and conda-build - workingDirectory: arrow/dev/tasks/conda-recipes - env: - OSX_FORCE_SDK_DOWNLOAD: "1" - - - script: | - source activate base - mangle_compiler ./ ./ ./.ci_support/${CONFIG}.yaml - workingDirectory: arrow/dev/tasks/conda-recipes - displayName: Mangle compiler - - - script: | - source activate base - make_build_number ./ ./ ./.ci_support/${CONFIG}.yaml - workingDirectory: arrow/dev/tasks/conda-recipes - displayName: Generate build number clobber file - - - script: | - source activate base - set +x - if [[ "${CONFIG}" == osx_arm* ]]; then - EXTRA_CB_OPTIONS="${EXTRA_CB_OPTIONS:-} --no-test" - fi - conda build arrow-cpp \ - -m ./.ci_support/${CONFIG}.yaml \ - --clobber-file ./.ci_support/clobber_${CONFIG}.yaml \ - ${EXTRA_CB_OPTIONS:-} \ - --output-folder ./build_artifacts - - if [ ! -z "${R_CONFIG}" ]; then - conda build r-arrow \ - -m ./.ci_support/r/${R_CONFIG}.yaml \ - --output-folder ./build_artifacts - fi - workingDirectory: arrow/dev/tasks/conda-recipes - displayName: Build recipes - - {{ macros.azure_upload_releases("arrow/dev/tasks/conda-recipes/build_artifacts/osx-*/*.tar.bz2") }} - {{ macros.azure_upload_anaconda("arrow/dev/tasks/conda-recipes/build_artifacts/osx-*/*.tar.bz2") }} diff --git a/dev/tasks/conda-recipes/azure.win.yml b/dev/tasks/conda-recipes/azure.win.yml deleted file mode 100755 index a3ec6931caf24..0000000000000 --- a/dev/tasks/conda-recipes/azure.win.yml +++ /dev/null @@ -1,77 +0,0 @@ -{% import 'macros.jinja' as macros with context %} - -jobs: -- job: win - pool: - vmImage: vs2017-win2016 - timeoutInMinutes: 360 - variables: - CONFIG: {{ config }} - R_CONFIG: {{ r_config|default("") }} - ARROW_VERSION: {{ arrow.no_rc_version }} - CONDA_BLD_PATH: D:\\bld\\ - UPLOAD_PACKAGES: False - - steps: - - script: | - choco install vcpython27 -fdv -y --debug - condition: contains(variables['CONFIG'], 'vs2008') - displayName: Install vcpython27.msi (if needed) - - - powershell: | - Set-PSDebug -Trace 1 - $batchcontent = @" - ECHO ON - SET vcpython=C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0 - DIR "%vcpython%" - CALL "%vcpython%\vcvarsall.bat" %* - "@ - $batchDir = "C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python\9.0\VC" - $batchPath = "$batchDir" + "\vcvarsall.bat" - New-Item -Path $batchPath -ItemType "file" -Force - Set-Content -Value $batchcontent -Path $batchPath - Get-ChildItem -Path $batchDir - Get-ChildItem -Path ($batchDir + '\..') - condition: contains(variables['CONFIG'], 'vs2008') - displayName: Patch vs2008 (if needed) - - - task: CondaEnvironment@1 - inputs: - packageSpecs: 'python=3.6 conda-build conda conda-forge::conda-forge-ci-setup=3 pip' # Optional - installOptions: "-c conda-forge" - updateConda: true - displayName: Install conda-build and activate environment - - script: set PYTHONUNBUFFERED=1 - - {{ macros.azure_checkout_arrow()|indent(2) }} - - # Configure the VM - - script: setup_conda_rc .\ .\ .\.ci_support\%CONFIG%.yaml - workingDirectory: arrow\dev\tasks\conda-recipes - - # Configure the VM. 
- - script: | - set "CI=azure" - call activate base - run_conda_forge_build_setup - displayName: conda-forge build setup - workingDirectory: arrow\dev\tasks\conda-recipes - - - script: | - conda.exe build arrow-cpp parquet-cpp -m .ci_support\%CONFIG%.yaml - displayName: Build recipe - workingDirectory: arrow\dev\tasks\conda-recipes - env: - PYTHONUNBUFFERED: 1 - condition: not(contains(variables['CONFIG'], 'vs2008')) - - - script: | - conda.exe build r-arrow -m .ci_support\r\%R_CONFIG%.yaml - displayName: Build recipe - workingDirectory: arrow\dev\tasks\conda-recipes - env: - PYTHONUNBUFFERED: 1 - condition: contains(variables['R_CONFIG'], 'win') - - {{ macros.azure_upload_releases("D:\bld\win-64\*.tar.bz2")|indent(2) }} - {{ macros.azure_upload_anaconda("D:\bld\win-64\*.tar.bz2")|indent(2) }} diff --git a/dev/tasks/conda-recipes/build_steps.sh b/dev/tasks/conda-recipes/build_steps.sh deleted file mode 100755 index 25864c08a7080..0000000000000 --- a/dev/tasks/conda-recipes/build_steps.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# NOTE: This script has been slightly adopted to suite the Apache Arrow / crossbow CI -# setup. The next time this is updated to the current version on conda-forge, -# you will also make this additions afterwards. - -# PLEASE NOTE: This script has been automatically generated by conda-smithy. Any changes here -# will be lost next time ``conda smithy rerender`` is run. If you would like to make permanent -# changes to this script, consider a proposal to conda-smithy so that other feedstocks can also -# benefit from the improvement. - -set -xeuo pipefail - -output_dir=${1} - -export PYTHONUNBUFFERED=1 -export FEEDSTOCK_ROOT="${FEEDSTOCK_ROOT:-/home/conda/feedstock_root}" -export CI_SUPPORT="${FEEDSTOCK_ROOT}/.ci_support" -export CONFIG_FILE="${CI_SUPPORT}/${CONFIG}.yaml" - -cat >~/.condarc < List[str]: - env = os.environ.copy() - env["CONDA_SUBDIR"] = platform - pkgs_json = check_output( - [ - "conda", - "search", - "--json", - "-c", - "arrow-nightlies", - "--override-channels", - package_name, - ], - env=env, - ) - pkgs = pd.DataFrame(json.loads(pkgs_json)[package_name]) - pkgs["version"] = pkgs["version"].map(Version) - pkgs["py_version"] = pkgs["build"].str.slice(0, 4) - - to_delete = [] - - for (subdir, python), group in pkgs.groupby(["subdir", "py_version"]): - group = group.sort_values(by="version", ascending=False) - - if len(group) > VERSIONS_TO_KEEP: - del_candidates = group[VERSIONS_TO_KEEP:] - to_delete += ( - f"arrow-nightlies/{package_name}/" - + del_candidates["version"].astype(str) - + del_candidates["url"].str.replace( - "https://conda.anaconda.org/arrow-nightlies", "" - ) - ).to_list() - - return to_delete - - -if __name__ == "__main__": - to_delete = [] - for package in PACKAGES: - for platform in PLATFORMS: - if [package, platform] in EXCLUDED_PATTERNS: - continue - to_delete += packages_to_delete(package, platform) - - for name in to_delete: - print(f"Deleting {name} …") - if "FORCE" in sys.argv: - check_call(["anaconda", "remove", "-f", name]) diff --git a/dev/tasks/conda-recipes/conda-forge.yml b/dev/tasks/conda-recipes/conda-forge.yml deleted file mode 100644 index 4c07b5dd3e0bb..0000000000000 --- a/dev/tasks/conda-recipes/conda-forge.yml +++ /dev/null @@ -1 +0,0 @@ -channel_priority: strict diff --git a/dev/tasks/conda-recipes/drone-steps.sh b/dev/tasks/conda-recipes/drone-steps.sh deleted file mode 100755 index dffdb41b0887a..0000000000000 --- a/dev/tasks/conda-recipes/drone-steps.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - 
-set -ex - -OUTPUT_DIR=$1 -QUEUE_REMOTE_URL=$2 -TASK_BRANCH=$3 -TASK_TAG=$4 -UPLOAD_TO_ANACONDA=$5 - -conda install -y mamba -$FEEDSTOCK_ROOT/build_steps.sh ${OUTPUT_DIR} - -# Upload as Github release -mamba install -y anaconda-client shyaml -c conda-forge - -pushd $DRONE_WORKSPACE -pip install -e arrow/dev/archery[crossbow] -archery crossbow \ - --queue-path . \ - --queue-remote ${QUEUE_REMOTE_URL} \ - upload-artifacts \ - --sha ${TASK_BRANCH} \ - --tag ${TASK_TAG} \ - --pattern "${OUTPUT_DIR}/linux-aarch64/*.tar.bz2" - -if [[ "${UPLOAD_TO_ANACONDA}" == "1" ]]; then - anaconda -t ${CROSSBOW_ANACONDA_TOKEN} upload --force build_artifacts/linux-aarch64/*.tar.bz2 -fi diff --git a/dev/tasks/conda-recipes/drone.yml b/dev/tasks/conda-recipes/drone.yml deleted file mode 100644 index a461c79b9d4ce..0000000000000 --- a/dev/tasks/conda-recipes/drone.yml +++ /dev/null @@ -1,43 +0,0 @@ ---- -kind: pipeline -name: {{ config }} - -platform: - os: linux - arch: arm64 - -# Omit double builds with crossbow -trigger: - event: - - push - -steps: -- name: Install and build - image: condaforge/linux-anvil-aarch64 - environment: - CONFIG: {{ config }} - UPLOAD_PACKAGES: False - ARROW_VERSION: {{ arrow.no_rc_version }} - PLATFORM: linux-aarch64 - BINSTAR_TOKEN: - from_secret: BINSTAR_TOKEN - FEEDSTOCK_TOKEN: - from_secret: FEEDSTOCK_TOKEN - STAGING_BINSTAR_TOKEN: - from_secret: STAGING_BINSTAR_TOKEN - CROSSBOW_GITHUB_TOKEN: - from_secret: CROSSBOW_GITHUB_TOKEN - CROSSBOW_ANACONDA_TOKEN: - from_secret: CROSSBOW_ANACONDA_TOKEN - commands: - - export RECIPE_ROOT="$FEEDSTOCK_ROOT/arrow-cpp" - - export CI=drone - - export GIT_BRANCH="{{ arrow.branch }}" - - export FEEDSTOCK_NAME=arrow-cpp - - export FEEDSTOCK_ROOT="$DRONE_WORKSPACE/arrow/dev/tasks/conda-recipes" - - sed -i '$ichown -R conda:conda "$FEEDSTOCK_ROOT"' /opt/docker/bin/entrypoint - - yum install -y git - - git clone --no-checkout {{ arrow.remote }} arrow - - pushd arrow && git fetch -t {{ arrow.remote }} {{ arrow.branch }} && git checkout FETCH_HEAD && git submodule update --init --recursive && popd - - mkdir -p $(pwd)/build_artifacts && chmod a+rwx $(pwd)/build_artifacts - - /opt/docker/bin/entrypoint $FEEDSTOCK_ROOT/drone-steps.sh $(pwd)/build_artifacts {{ queue_remote_url }} {{ task.branch }} {{ task.tag }} {% if arrow.branch == 'master' %}1{% else %}0{% endif %} diff --git a/dev/tasks/conda-recipes/parquet-cpp/meta.yaml b/dev/tasks/conda-recipes/parquet-cpp/meta.yaml deleted file mode 100644 index 5de06c32b1df8..0000000000000 --- a/dev/tasks/conda-recipes/parquet-cpp/meta.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# ARROW-3229: this is a meta-package to prevent conflicts in the future - -{% set parquet_version = "1.5.1" %} - -package: - name: parquet-cpp - version: {{ parquet_version }} - -build: - number: 0 - skip: true # [win32] - skip: true # [win and py<35] - -requirements: - host: - # NOTE: in the upstream feedstock use >= instead of = - - arrow-cpp ={{ ARROW_VERSION }} - run: - - arrow-cpp ={{ ARROW_VERSION }} - -test: - commands: - # headers - - test -f $PREFIX/include/parquet/api/reader.h # [unix] - - if not exist %LIBRARY_INC%\\parquet\\api\\reader.h exit 1 # [win] - - # shared - - test -f $PREFIX/lib/libparquet.so # [linux] - - test -f $PREFIX/lib/libparquet.dylib # [osx] - - if not exist %PREFIX%\\Library\\bin\\parquet.dll exit 1 # [win] - - # absence of static libraries - - test ! 
-f $PREFIX/lib/libparquet.a # [unix] - - if exist %PREFIX%\\Library\\lib\\parquet_static.lib exit 1 # [win] - -about: - home: http://github.com/apache/arrow - license: Apache 2.0 - summary: 'C++ libraries for the Apache Parquet file format' - -extra: - recipe-maintainers: - - wesm - - xhochy - - leifwalsh - - jreback - - cpcloud - - siddharthteotia - - kou - - kszucs - - pitrou diff --git a/dev/tasks/conda-recipes/r-arrow/bld.bat b/dev/tasks/conda-recipes/r-arrow/bld.bat deleted file mode 100644 index a193ddc0a7748..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/bld.bat +++ /dev/null @@ -1,9 +0,0 @@ -bash %RECIPE_DIR%/build_win.sh -IF %ERRORLEVEL% NEQ 0 exit 1 -cp %RECIPE_DIR%/configure.win r -IF %ERRORLEVEL% NEQ 0 exit 1 -cp %RECIPE_DIR%/install.libs.R r/src -IF %ERRORLEVEL% NEQ 0 exit 1 -set "MAKEFLAGS=-j%CPU_COUNT%" -"%R%" CMD INSTALL --build r -IF %ERRORLEVEL% NEQ 0 exit 1 diff --git a/dev/tasks/conda-recipes/r-arrow/build.sh b/dev/tasks/conda-recipes/r-arrow/build.sh deleted file mode 100644 index 0a6c7961fc606..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -export DISABLE_AUTOBREW=1 -$R CMD INSTALL --build r/. diff --git a/dev/tasks/conda-recipes/r-arrow/build_win.sh b/dev/tasks/conda-recipes/r-arrow/build_win.sh deleted file mode 100755 index 88e0462f66388..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/build_win.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -set -exuo pipefail - - -# Rename arrow.dll to lib_arrow.dll to avoid conflicts with the arrow-cpp arrow.dll -sed -i -e 's/void R_init_arrow/__declspec(dllexport) void R_init_lib_arrow/g' r/src/arrowExports.cpp -sed -i -e 's/useDynLib(arrow/useDynLib(lib_arrow/g' r/NAMESPACE diff --git a/dev/tasks/conda-recipes/r-arrow/configure.win b/dev/tasks/conda-recipes/r-arrow/configure.win deleted file mode 100755 index 82fa1795699d6..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/configure.win +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -echo "PKG_CPPFLAGS=-DNDEBUG -I\"${LIBRARY_PREFIX}/include\" -I\"${PREFIX}/include\" -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET -DARROW_R_WITH_DATASET -DARROW_R_WITH_S3" > src/Makevars.win -echo "PKG_CXXFLAGS=\$(CXX_VISIBILITY)" >> src/Makevars.win -echo 'CXX_STD=CXX11' >> src/Makevars.win -echo "PKG_LIBS=-L\"${LIBRARY_PREFIX}/lib\" -larrow_dataset -lparquet -larrow" >> src/Makevars.win diff --git a/dev/tasks/conda-recipes/r-arrow/install.libs.R b/dev/tasks/conda-recipes/r-arrow/install.libs.R deleted file mode 100644 index 005bbe16b9984..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/install.libs.R +++ /dev/null @@ -1,5 +0,0 @@ -src_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/") -dest_dir <- file.path(R_PACKAGE_DIR, paste0("libs", R_ARCH), fsep="/") - -dir.create(file.path(R_PACKAGE_DIR, paste0("libs", R_ARCH), fsep="/"), recursive = TRUE, showWarnings = FALSE) -file.copy(file.path(src_dir, "arrow.dll", fsep = "/"), file.path(dest_dir, "lib_arrow.dll", fsep = "/")) diff --git a/dev/tasks/conda-recipes/r-arrow/meta.yaml b/dev/tasks/conda-recipes/r-arrow/meta.yaml deleted file mode 100644 index 5f0643bef3719..0000000000000 --- a/dev/tasks/conda-recipes/r-arrow/meta.yaml +++ /dev/null @@ -1,66 +0,0 @@ -{% set version = ARROW_VERSION %} -{% set posix = 'm2-' if win else '' %} -{% set native = 'm2w64-' if win else '' %} - -package: - name: r-arrow - version: {{ version|replace("-", "_") }} - -source: - path: ../../../../ - -build: - merge_build_host: true # [win] - number: 0 - rpaths: - - lib/R/lib/ 
- - lib/ - -requirements: - build: - - {{ compiler('c') }} # [not win] - - {{ compiler('cxx') }} # [not win] - - {{ compiler('r_clang') }} # [win] - - pkg-config # [not win] - - {{ posix }}make - - {{ posix }}sed # [win] - - {{ posix }}coreutils # [win] - - {{ posix }}filesystem # [win] - - {{ posix }}zip # [win] - host: - # Needs to be here, otherwise merge_build_host runs into issues - - pkg-config # [win] - - r-base - - arrow-cpp {{ version }} - - r-cpp11 - - r-r6 - - r-assertthat - - r-bit64 - - r-purrr - - r-rlang - - r-tidyselect - run: - - r-base - - r-r6 - - r-assertthat - - r-bit64 - - r-purrr - - r-rlang - - r-tidyselect - -test: - commands: - - $R -e "library('arrow')" # [not win] - - "\"%R%\" -e \"library('arrow'); data(mtcars); write_parquet(mtcars, 'test.parquet')\"" # [win] - -about: - home: https://github.com/apache/arrow - license: Apache-2.0 - license_file: LICENSE.txt - summary: R Integration to 'Apache' 'Arrow'. - license_family: APACHE - -extra: - recipe-maintainers: - - conda-forge/r - - conda-forge/arrow-cpp diff --git a/dev/tasks/conda-recipes/run_docker_build.sh b/dev/tasks/conda-recipes/run_docker_build.sh deleted file mode 100755 index 7645c43e2fa71..0000000000000 --- a/dev/tasks/conda-recipes/run_docker_build.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash - -# NOTE: This script has been slightly adopted to suite the Apache Arrow / crossbow CI -# setup. The next time this is updated to the current version on conda-forge, -# you will also make this additions afterwards. - -# PLEASE NOTE: This script has been automatically generated by conda-smithy. Any changes here -# will be lost next time ``conda smithy rerender`` is run. If you would like to make permanent -# changes to this script, consider a proposal to conda-smithy so that other feedstocks can also -# benefit from the improvement. - -set -xeo pipefail - -build_dir=${1} - -THISDIR="$( cd "$( dirname "$0" )" >/dev/null && pwd )" -ARROW_ROOT=$(cd "$THISDIR/../../.."; pwd;) -FEEDSTOCK_ROOT=$THISDIR - -docker info - -# In order for the conda-build process in the container to write to the mounted -# volumes, we need to run with the same id as the host machine, which is -# normally the owner of the mounted volumes, or at least has write permission -export HOST_USER_ID=$(id -u) -# Check if docker-machine is being used (normally on OSX) and get the uid from -# the VM -if hash docker-machine 2> /dev/null && docker-machine active > /dev/null; then - export HOST_USER_ID=$(docker-machine ssh $(docker-machine active) id -u) -fi - -if [ -z "$CONFIG" ]; then - set +x - FILES=`ls .ci_support/linux_*` - CONFIGS="" - for file in $FILES; do - CONFIGS="${CONFIGS}'${file:12:-5}' or "; - done - echo "Need to set CONFIG env variable. Value can be one of ${CONFIGS:0:-4}" - exit 1 -fi - -if [ -z "${DOCKER_IMAGE}" ]; then - SHYAML_INSTALLED="$(shyaml -h || echo NO)" - if [ "${SHYAML_INSTALLED}" == "NO" ]; then - echo "WARNING: DOCKER_IMAGE variable not set and shyaml not installed. 
Falling back to condaforge/linux-anvil-comp7" - DOCKER_IMAGE="condaforge/linux-anvil-comp7" - else - DOCKER_IMAGE="$(cat "${FEEDSTOCK_ROOT}/.ci_support/${CONFIG}.yaml" | shyaml get-value docker_image.0 condaforge/linux-anvil-comp7 )" - fi -fi - -mkdir -p "${build_dir}" -DONE_CANARY="${build_dir}/conda-forge-build-done-${CONFIG}" -rm -f "$DONE_CANARY" - -if [ -z "${CI}" ]; then - DOCKER_RUN_ARGS="-it " -fi - -export UPLOAD_PACKAGES="${UPLOAD_PACKAGES:-True}" -docker run ${DOCKER_RUN_ARGS} \ - --shm-size=2G \ - -v "${ARROW_ROOT}":/arrow:rw,z \ - -v "${build_dir}":/build:rw \ - -e FEEDSTOCK_ROOT="/arrow/dev/tasks/conda-recipes" \ - -e CONFIG \ - -e R_CONFIG \ - -e HOST_USER_ID \ - -e UPLOAD_PACKAGES \ - -e ARROW_VERSION \ - -e CI \ - $DOCKER_IMAGE \ - bash /arrow/dev/tasks/conda-recipes/build_steps.sh /build - -# verify that the end of the script was reached -test -f "$DONE_CANARY" diff --git a/dev/tasks/cpp-examples/github.linux.yml b/dev/tasks/cpp-examples/github.linux.yml deleted file mode 100644 index 717d3c44302a5..0000000000000 --- a/dev/tasks/cpp-examples/github.linux.yml +++ /dev/null @@ -1,46 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - test: - name: C++ Example - runs-on: ubuntu-latest - steps: - - name: Checkout Arrow - shell: bash - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Free Up Disk Space - shell: bash - run: arrow/ci/scripts/util_cleanup.sh - - name: Run - shell: bash - run: | - cd arrow/cpp/examples/{{ type }} - docker-compose run --rm {{ run }} diff --git a/dev/tasks/docker-tests/azure.linux.yml b/dev/tasks/docker-tests/azure.linux.yml deleted file mode 100644 index c3706be443463..0000000000000 --- a/dev/tasks/docker-tests/azure.linux.yml +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -jobs: -- job: linux - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 360 - {% if env is defined %} - variables: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - - task: DockerInstaller@0 - displayName: Docker Installer - inputs: - dockerVersion: 17.09.0-ce - releaseType: stable - - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.6' - - - script: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - displayName: Clone arrow - - - script: pip install -e arrow/dev/archery[docker] - displayName: Setup Archery - - - script: archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" {{ run }} - displayName: Execute Docker Build diff --git a/dev/tasks/docker-tests/circle.linux.yml b/dev/tasks/docker-tests/circle.linux.yml deleted file mode 100644 index 3ddb93dc95efe..0000000000000 --- a/dev/tasks/docker-tests/circle.linux.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -version: 2 -jobs: - build: - machine: - image: ubuntu-1604:202004-01 - {%- if env is defined %} - environment: - {%- for key, value in env.items() %} - {{ key }}: {{ value }} - {%- endfor %} - {%- endif %} - steps: - - run: | - docker -v - docker-compose -v - - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - run: - name: Execute Docker Build - command: | - pyenv versions - pyenv global 3.6.10 - pip install -e arrow/dev/archery[docker] - archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" {{ run }} - no_output_timeout: "1h" - -workflows: - version: 2 - build: - jobs: - - build diff --git a/dev/tasks/docker-tests/github.linux.yml b/dev/tasks/docker-tests/github.linux.yml deleted file mode 100644 index 255c9ac14c4c8..0000000000000 --- a/dev/tasks/docker-tests/github.linux.yml +++ /dev/null @@ -1,42 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - test: - name: Docker Test - runs-on: ubuntu-latest - steps: - {{ macros.github_checkout_arrow()|indent }} - {{ macros.github_install_archery()|indent }} - - - name: Free Up Disk Space - shell: bash - run: arrow/ci/scripts/util_cleanup.sh - - - name: Execute Docker Build - shell: bash - {% if env is defined %} - env: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - run: archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION="{{ arrow.no_rc_version }}" {{ run }} diff --git a/dev/tasks/gandiva-jars/README.md b/dev/tasks/gandiva-jars/README.md deleted file mode 100644 index 5de59a08debb7..0000000000000 --- a/dev/tasks/gandiva-jars/README.md +++ /dev/null @@ -1,29 +0,0 @@ - - -# Updating manylinux for Gandiva Jar Build. - -Do the following to update arrow manylinux docker image for building Gandiva Jars - -- Install java in the manylinux image. -- To do above, update Dockerfile-x86_64_base under python/manylinux1 to install java. -- Please note only upto java7 is available in CentOS5, so install java7 in the base. -- Export JAVA_HOME environment variable. -- Then update build_boost.sh under python/manylinux1/scripts to build boost statically. - -Please look at https://github.com/praveenbingo/arrow/tree/buildGandivaDocker that already has these changes. diff --git a/dev/tasks/gandiva-jars/build-cpp-linux.sh b/dev/tasks/gandiva-jars/build-cpp-linux.sh deleted file mode 100755 index 42651739f840e..0000000000000 --- a/dev/tasks/gandiva-jars/build-cpp-linux.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -source /multibuild/manylinux_utils.sh - -# Quit on failure -set -e - -PYTHON_VERSION=3.6 -CPYTHON_PATH="$(cpython_path ${PYTHON_VERSION})" -PYTHON_INTERPRETER="${CPYTHON_PATH}/bin/python" -PIP="${CPYTHON_PATH}/bin/pip" - -ARROW_BUILD_DIR=/tmp/arrow-build -mkdir -p "${ARROW_BUILD_DIR}" -pushd "${ARROW_BUILD_DIR}" - -PATH="${CPYTHON_PATH}/bin:${PATH}" -export ARROW_TEST_DATA="/arrow/testing/data" - -cmake -DCMAKE_BUILD_TYPE=Release \ - -DARROW_DEPENDENCY_SOURCE="SYSTEM" \ - -DZLIB_ROOT=/usr/local \ - -DCMAKE_INSTALL_PREFIX=/arrow-dist \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_BUILD_SHARED=ON \ - -DARROW_BOOST_USE_SHARED=OFF \ - -DARROW_PROTOBUF_USE_SHARED=OFF \ - -DARROW_OPENSSL_USE_SHARED=OFF \ - -DARROW_GANDIVA_PC_CXX_FLAGS="-isystem;/opt/rh/devtoolset-2/root/usr/include/c++/4.8.2;-isystem;/opt/rh/devtoolset-2/root/usr/include/c++/4.8.2/x86_64-CentOS-linux/" \ - -DARROW_JEMALLOC=ON \ - -DARROW_RPATH_ORIGIN=ON \ - -DARROW_PYTHON=OFF \ - -DARROW_PARQUET=OFF \ - -DARROW_DATASET=OFF \ - -DARROW_FILESYSTEM=OFF \ - -DPARQUET_BUILD_ENCRYPTION=OFF \ - -DPythonInterp_FIND_VERSION=${PYTHON_VERSION} \ - -DARROW_GANDIVA=ON \ - -DARROW_GANDIVA_JAVA=ON \ - -DARROW_GANDIVA_JAVA7=ON \ - -DBoost_NAMESPACE=arrow_boost \ - -Dgflags_SOURCE=BUNDLED \ - -DRapidJSON_SOURCE=BUNDLED \ - -DRE2_SOURCE=BUNDLED \ - -DARROW_BUILD_UTILITIES=OFF \ - -DBoost_NAMESPACE=arrow_boost \ - -DBOOST_ROOT=/arrow_boost_dist \ - -GNinja /arrow/cpp -ninja install -CTEST_OUTPUT_ON_FAILURE=1 ninja test -popd - - -# copy the library to distribution -cp -L /arrow-dist/lib/libgandiva_jni.so /arrow/dist diff --git a/dev/tasks/gandiva-jars/build-cpp-osx.sh b/dev/tasks/gandiva-jars/build-cpp-osx.sh deleted file mode 100755 index cc6ab246d963a..0000000000000 --- a/dev/tasks/gandiva-jars/build-cpp-osx.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -set -x - -# Builds arrow + gandiva and tests the same. -pushd cpp - mkdir build - pushd build - CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=Release \ - -DARROW_GANDIVA=ON \ - -DARROW_GANDIVA_JAVA=ON \ - -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ - -DARROW_BUILD_TESTS=ON \ - -DARROW_BUILD_UTILITIES=OFF \ - -DPARQUET_BUILD_ENCRYPTION=OFF \ - -DARROW_PARQUET=OFF \ - -DARROW_FILESYSTEM=OFF \ - -DARROW_DATASET=OFF \ - -DARROW_BOOST_USE_SHARED=OFF \ - -DARROW_PROTOBUF_USE_SHARED=OFF \ - -DARROW_GFLAGS_USE_SHARED=OFF \ - -DARROW_OPENSSL_USE_SHARED=OFF" - - cmake $CMAKE_FLAGS .. 
- make -j4 - ctest - - cp -L release/libgandiva_jni.dylib $GITHUB_WORKSPACE/arrow/dist - popd -popd diff --git a/dev/tasks/gandiva-jars/build-java.sh b/dev/tasks/gandiva-jars/build-java.sh deleted file mode 100755 index 7dec07115a3ef..0000000000000 --- a/dev/tasks/gandiva-jars/build-java.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -CPP_BUILD_DIR=$GITHUB_WORKSPACE/arrow/dist/ - -pushd java - if [[ $OS_NAME == "linux" ]]; then - SO_DEP=ldd - GANDIVA_LIB="$CPP_BUILD_DIR"libgandiva_jni.so - WHITELIST=(linux-vdso libz librt libdl libpthread libstdc++ libm libgcc_s libc ld-linux-x86-64) - else - SO_DEP="otool -L" - GANDIVA_LIB="$CPP_BUILD_DIR"libgandiva_jni.dylib - WHITELIST=(libgandiva_jni libz libncurses libSystem libc++) - fi - - # print the shared library dependencies - eval "$SO_DEP" "$GANDIVA_LIB" - - if [[ $CHECK_SHARED_DEPENDENCIES ]] ; then - # exit if any shared library not in whitelisted set is found - echo "Checking shared dependencies" - while read -r line - do - found=false - for item in "${WHITELIST[@]}" - do - if [[ "$line" == *"$item"* ]] ; then - found=true - fi - done - if [[ "$found" == false ]] ; then - echo "Unexpected shared dependency found" - exit 1 - fi - done < <(eval "$SO_DEP" "$GANDIVA_LIB" | awk '{print $1}') - fi - - # build the entire project - mvn clean install -q -DskipTests -P arrow-jni -Darrow.cpp.build.dir=$CPP_BUILD_DIR - # test only gandiva - mvn test -q -P arrow-jni -pl gandiva -Dgandiva.cpp.build.dir=$CPP_BUILD_DIR - - # copy the jars to distribution folder - find gandiva/target/ -name "*.jar" -not -name "*tests*" -exec cp {} $CPP_BUILD_DIR \; -popd diff --git a/dev/tasks/gandiva-jars/github.linux.yml b/dev/tasks/gandiva-jars/github.linux.yml deleted file mode 100644 index aabcdbee0efc0..0000000000000 --- a/dev/tasks/gandiva-jars/github.linux.yml +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - package: - name: Package Gandiva - runs-on: ubuntu-18.04 - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - if [ $CROSSBOW_USE_COMMIT_ID = true ]; then git -C arrow checkout {{ arrow.head }}; else git -C arrow checkout FETCH_HEAD; fi - git -C arrow submodule update --init --recursive - - name: Build Gandiva - run: | - python3 -VV - cd arrow - mkdir -p dist - export CC="gcc-4.9" CXX="g++-4.9" - ulimit -c unlimited -S - set -e - docker run -v $PWD:/arrow quay.io/anthonylouisbsb/arrow:gandivadocker /arrow/dev/tasks/gandiva-jars/build-cpp-linux.sh - dev/tasks/gandiva-jars/build-java.sh - env: - OS_NAME: "linux" - CHECK_SHARED_DEPENDENCIES: true - - {{ macros.github_upload_releases("arrow/dist/*.jar")|indent }} diff --git a/dev/tasks/gandiva-jars/github.osx.yml b/dev/tasks/gandiva-jars/github.osx.yml deleted file mode 100644 index 3dd6fe46bb658..0000000000000 --- a/dev/tasks/gandiva-jars/github.osx.yml +++ /dev/null @@ -1,46 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - package: - name: Package Gandiva - runs-on: macos-latest - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - if [ $CROSSBOW_USE_COMMIT_ID = true ]; then git -C arrow checkout {{ arrow.head }}; else git -C arrow checkout FETCH_HEAD; fi - git -C arrow submodule update --init --recursive - - name: Build Gandiva - run: | - cd arrow - mkdir -p dist - export ARROW_TEST_DATA=$PWD/testing/data - set -e - dev/tasks/gandiva-jars/build-cpp-osx.sh - dev/tasks/gandiva-jars/build-java.sh - env: - OS_NAME: "osx" - CHECK_SHARED_DEPENDENCIES: true - MACOSX_DEPLOYMENT_TARGET: "10.11" - - {{ macros.github_upload_releases("arrow/dist/*.jar")|indent }} diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb deleted file mode 100644 index 953f1eea1c4b5..0000000000000 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ /dev/null @@ -1,69 +0,0 @@ -class ApacheArrow < Formula - desc "Columnar in-memory analytics layer designed to accelerate big data" - homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-4.0.0-SNAPSHOT/apache-arrow-4.0.0-SNAPSHOT.tar.gz" - sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" - license "Apache-2.0" - head "https://github.com/apache/arrow.git" - - depends_on "boost" => :build - depends_on "cmake" => :build - depends_on "llvm" => :build - depends_on "brotli" - depends_on "glog" - depends_on "grpc" - depends_on "lz4" - depends_on "numpy" - depends_on "openssl@1.1" - depends_on "protobuf" - depends_on "python@3.9" - depends_on "rapidjson" - depends_on "snappy" - depends_on "thrift" - depends_on "zstd" - - def install - ENV.cxx11 - # link against system libc++ instead of llvm provided libc++ - ENV.remove "HOMEBREW_LIBRARY_PATHS", Formula["llvm"].opt_lib - args = %W[ - -DARROW_FLIGHT=ON - -DARROW_GANDIVA=ON - -DARROW_JEMALLOC=ON - -DARROW_MIMALLOC=ON - -DARROW_ORC=ON - -DARROW_PARQUET=ON - -DARROW_PLASMA=ON - -DARROW_PROTOBUF_USE_SHARED=ON - -DARROW_PYTHON=ON - -DARROW_WITH_BZ2=ON - -DARROW_WITH_ZLIB=ON - -DARROW_WITH_ZSTD=ON - -DARROW_WITH_LZ4=ON - -DARROW_WITH_SNAPPY=ON - -DARROW_WITH_BROTLI=ON - -DARROW_INSTALL_NAME_RPATH=OFF - -DPython3_EXECUTABLE=#{Formula["python@3.9"].bin/"python3"} - ] - # Re-enable -DARROW_S3=ON and add back aws-sdk-cpp to depends_on in ARROW-6437 - - mkdir "build" - cd "build" do - system "cmake", "../cpp", *std_cmake_args, *args - system "make" - system "make", "install" - end - end - - test do - (testpath/"test.cpp").write <<~EOS - #include "arrow/api.h" - int main(void) { - arrow::int64(); - return 0; - } - EOS - system ENV.cxx, "test.cpp", "-std=c++11", "-I#{include}", "-L#{lib}", "-larrow", "-o", "test" - system "./test" - end -end diff --git a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb b/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb deleted file mode 100644 index 351d7764603e5..0000000000000 --- a/dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# https://github.com/autobrew/homebrew-core/blob/master/Formula/apache-arrow.rb -class ApacheArrow < Formula - desc "Columnar in-memory analytics layer designed to accelerate big data" - homepage "https://arrow.apache.org/" - url "https://www.apache.org/dyn/closer.lua?path=arrow/arrow-3.0.0.9000/apache-arrow-3.0.0.9000.tar.gz" - sha256 "9948ddb6d4798b51552d0dca3252dd6e3a7d0f9702714fc6f5a1b59397ce1d28" - head "https://github.com/apache/arrow.git" - - bottle do - cellar :any - sha256 "a55211ba6f464681b7ca1b48defdad9cfbe1cf6fad8ff9ec875dc5a3c8f3c5ed" => :el_capitan_or_later - root_url "https://autobrew.github.io/bottles" - end - - # NOTE: if you add something here, be sure to add to PKG_LIBS in r/tools/autobrew - depends_on "boost" => :build - depends_on "cmake" => :build - depends_on "aws-sdk-cpp" - depends_on "lz4" - depends_on "snappy" - depends_on "thrift" - depends_on "zstd" - - def install - ENV.cxx11 - args = %W[ - -DARROW_BUILD_SHARED=OFF - -DARROW_BUILD_UTILITIES=ON - -DARROW_COMPUTE=ON - -DARROW_CSV=ON - -DARROW_DATASET=ON - -DARROW_FILESYSTEM=ON - -DARROW_HDFS=OFF - -DARROW_JEMALLOC=ON - -DARROW_JSON=ON - -DARROW_MIMALLOC=ON - -DARROW_PARQUET=ON - -DARROW_PYTHON=OFF - -DARROW_S3=ON - -DARROW_USE_GLOG=OFF - -DARROW_VERBOSE_THIRDPARTY_BUILD=ON - -DARROW_WITH_LZ4=ON - -DARROW_WITH_SNAPPY=ON - -DARROW_WITH_ZLIB=ON - -DARROW_WITH_ZSTD=ON - -DCMAKE_UNITY_BUILD=OFF - -DPARQUET_BUILD_EXECUTABLES=ON - -DLZ4_HOME=#{Formula["lz4"].prefix} - -DTHRIFT_HOME=#{Formula["thrift"].prefix} - ] - - mkdir "build" - cd "build" do - system "cmake", "../cpp", *std_cmake_args, *args - system "make" - system "make", "install" - end - end - - test do - (testpath/"test.cpp").write <<~EOS - #include "arrow/api.h" - int main(void) { - arrow::int64(); - return 0; - } - EOS - system ENV.cxx, "test.cpp", "-std=c++11", "-I#{include}", "-L#{lib}", "-larrow", "-o", "test" - system "./test" - end -end diff --git a/dev/tasks/homebrew-formulae/github.macos.yml b/dev/tasks/homebrew-formulae/github.macos.yml deleted file mode 100644 index 232cc38a91f3e..0000000000000 --- a/dev/tasks/homebrew-formulae/github.macos.yml +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - autobrew: - name: "Autobrew" - runs-on: macOS-latest - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Configure homebrew formula for testing - env: - ARROW_FORMULA: ./arrow/dev/tasks/homebrew-formulae/{{ formula }} - run: | - # Pin the current commit in the formula to test so that we're not always pulling from master - sed -i.bak -E -e 's@https://github.com/apache/arrow.git"$@{{ arrow.remote }}.git", revision: "{{ arrow.head }}"@' $ARROW_FORMULA && rm -f $ARROW_FORMULA.bak - # Sometimes crossbow gives a remote URL with .git and sometimes not. Make sure there's only one - sed -i.bak -E -e 's@.git.git@.git@' $ARROW_FORMULA && rm -f $ARROW_FORMULA.bak - brew update - brew --version - brew unlink python@2 || true - brew config - brew doctor || true - cp $ARROW_FORMULA $(brew --repository homebrew/core)/Formula/apache-arrow.rb - - name: Test formula - run: | - brew install -v --HEAD apache-arrow - brew test apache-arrow - brew audit --strict apache-arrow diff --git a/dev/tasks/linux-packages/.gitignore b/dev/tasks/linux-packages/.gitignore deleted file mode 100644 index 0e49a90c1eb9d..0000000000000 --- a/dev/tasks/linux-packages/.gitignore +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -/*/*.tar.gz -/*/apt/repositories/ -/*/apt/tmp/ -/*/apt/build.sh -/*/apt/env.sh -/*/yum/repositories/ -/*/yum/tmp/ -/*/yum/build.sh -/*/yum/env.sh -/apt/repositories/ -/yum/repositories/ diff --git a/dev/tasks/linux-packages/README.md b/dev/tasks/linux-packages/README.md deleted file mode 100644 index a1a14d1531aad..0000000000000 --- a/dev/tasks/linux-packages/README.md +++ /dev/null @@ -1,40 +0,0 @@ - - -# Linux packages for Apache Arrow C++ and GLib - -## Requirements - -- Ruby -- Docker -- Tools to build tar.gz for Apache Arrow C++ and GLib - -## How to build .deb packages - -```console -% rake version:update -% rake apt -``` - -## How to build .rpm packages - -```console -% rake version:update -% rake yum -``` diff --git a/dev/tasks/linux-packages/Rakefile b/dev/tasks/linux-packages/Rakefile deleted file mode 100644 index a84a43ae517ba..0000000000000 --- a/dev/tasks/linux-packages/Rakefile +++ /dev/null @@ -1,234 +0,0 @@ -# -*- ruby -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require "English" - -require_relative "../../release/binary-task" -require_relative "helper" - -packages = [ - "apache-arrow", - "apache-arrow-apt-source", - "apache-arrow-release", -] - - -namespace :apt do - desc "Build deb packages" - task :build do - packages.each do |package| - cd(package) do - ruby("-S", "rake", "apt:build") - end - end - end -end - -namespace :yum do - desc "Build RPM packages" - task :build do - packages.each do |package| - cd(package) do - ruby("-S", "rake", "yum:build") - end - end - end -end - -namespace :version do - desc "Update versions" - task :update do - packages.each do |package| - cd(package) do - ruby("-S", "rake", "version:update") - end - end - end -end - -namespace :docker do - desc "Pull built images" - task :pull do - packages.each do |package| - cd(package) do - ruby("-S", "rake", "docker:pull") - end - end - end - - desc "Push built images" - task :push do - packages.each do |package| - cd(package) do - ruby("-S", "rake", "docker:push") - end - end - end -end - - -class LocalBinaryTask < BinaryTask - include Helper::ApacheArrow - - def initialize(packages) - @packages = packages - super() - end - - def define - define_apt_test_task - define_yum_test_task - end - - private - def latest_commit_time(git_directory) - cd(git_directory) do - return Time.iso8601(`git log -n 1 --format=%aI`.chomp).utc - end - end - - def version - @version ||= detect_version(detect_release_time) - end - - def resolve_docker_image(target) - image = "" - case target - when /-(?:arm64|aarch64)\z/ - target = $PREMATCH - image << "arm64v8/" - end - image << target.gsub(/-/, ":") - end - - def verify(target) - verify_command_line = [ - "docker", - "run", - "--rm", - "--log-driver", "none", - "--volume", "#{File.expand_path(arrow_source_dir)}:/arrow:delegated", - ] - if $stdin.tty? - verify_command_line << "--interactive" - verify_command_line << "--tty" - else - verify_command_line.concat(["--attach", "STDOUT"]) - verify_command_line.concat(["--attach", "STDERR"]) - end - verify_command_line << resolve_docker_image(target) - case target - when /\Adebian-/, /\Aubuntu-/ - verify_command_line << "/arrow/dev/release/verify-apt.sh" - else - verify_command_line << "/arrow/dev/release/verify-yum.sh" - end - verify_command_line << version - verify_command_line << "local" - sh(*verify_command_line) - end - - def apt_test_targets - targets = (ENV["APT_TARGETS"] || "").split(",") - targets = apt_test_targets_default if targets.empty? - targets - end - - def apt_test_targets_default - # Disable arm64 targets by default for now - # because they require some setups on host. 
- [ - "debian-buster", - # "debian-buster-arm64", - "debian-bullseye", - # "debian-bullseye-arm64", - "ubuntu-xenial", - # "ubuntu-xenial-arm64", - "ubuntu-bionic", - # "ubuntu-bionic-arm64", - "ubuntu-focal", - # "ubuntu-focal-arm64", - "ubuntu-groovy", - # "ubuntu-groovy-arm64", - ] - end - - def define_apt_test_task - namespace :apt do - desc "Test deb packages" - task :test do - repositories_dir = "apt/repositories" - rm_rf(repositories_dir) - @packages.each do |package| - package_repositories = "#{package}/apt/repositories" - next unless File.exist?(package_repositories) - sh("rsync", "-a", "#{package_repositories}/", repositories_dir) - end - Dir.glob("#{repositories_dir}/ubuntu/pool/*") do |code_name_dir| - universe_dir = "#{code_name_dir}/universe" - next unless File.exist?(universe_dir) - mv(universe_dir, "#{code_name_dir}/main") - end - apt_update(repositories_dir) - apt_test_targets.each do |target| - verify(target) - end - end - end - end - - def yum_test_targets - targets = (ENV["YUM_TARGETS"] || "").split(",") - targets = yum_test_targets_default if targets.empty? - targets - end - - def yum_test_targets_default - # Disable aarch64 targets by default for now - # because they require some setups on host. - [ - "centos-7", - "centos-8", - # "centos-8-aarch64", - ] - end - - def define_yum_test_task - namespace :yum do - desc "Test RPM packages" - task :test do - repositories_dir = "yum/repositories" - rm_rf(repositories_dir) - @packages.each do |package| - package_repositories = "#{package}/yum/repositories" - next unless File.exist?(package_repositories) - sh("rsync", "-a", "#{package_repositories}/", repositories_dir) - end - rpm_sign(repositories_dir) - yum_update(repositories_dir) - yum_test_targets.each do |target| - verify(target) - end - end - end - end -end - -local_binary_task = LocalBinaryTask.new(packages) -local_binary_task.define diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/Rakefile b/dev/tasks/linux-packages/apache-arrow-apt-source/Rakefile deleted file mode 100644 index 210fa951ee40a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/Rakefile +++ /dev/null @@ -1,64 +0,0 @@ -# -*- ruby -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -require_relative "../helper" -require_relative "../package-task" - -class ApacheArrowAptSourcePackageTask < PackageTask - include Helper::ApacheArrow - - def initialize - release_time = detect_release_time - super("apache-arrow-apt-source", - detect_version(release_time), - release_time, - :rc_build_type => :release) - end - - private - def define_archive_task - file @archive_name do - rm_rf(@archive_base_name) - mkdir(@archive_base_name) - download("https://downloads.apache.org/arrow/KEYS", - "#{@archive_base_name}/KEYS") - sh("tar", "czf", @archive_name, @archive_base_name) - rm_rf(@archive_base_name) - end - - if deb_archive_name != @archive_name - file deb_archive_name => @archive_name do - if @archive_base_name == deb_archive_base_name - cp(@archive_name, deb_archive_name) - else - sh("tar", "xf", @archive_name) - mv(@archive_base_name, deb_archive_base_name) - sh("tar", "czf", deb_archive_name, deb_archive_base_name) - end - end - end - end - - def enable_yum? - false - end -end - -task = ApacheArrowAptSourcePackageTask.new -task.define diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile deleted file mode 100644 index 3193daaef081b..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,40 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM debian:bullseye - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-buster/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-buster/Dockerfile deleted file mode 100644 index 0d37f5dee71ba..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-buster/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM debian:buster - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-bionic/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-bionic/Dockerfile deleted file mode 100644 index 53e11fb7eb7bc..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-bionic/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:bionic - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-focal/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-focal/Dockerfile deleted file mode 100644 index dc902d14d3dfd..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-focal/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:focal - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-groovy/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-groovy/Dockerfile deleted file mode 100644 index 7efd5d1df3231..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-groovy/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:groovy - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-xenial/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-xenial/Dockerfile deleted file mode 100644 index e05843081eef7..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/ubuntu-xenial/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM ubuntu:xenial - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install deleted file mode 100644 index 7bcb2ecc9f05d..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install +++ /dev/null @@ -1,2 +0,0 @@ -etc/apt/sources.list.d/* -usr/share/keyrings/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat deleted file mode 100644 index ec635144f6004..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/control b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/control deleted file mode 100644 index f54d52f98a2ee..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/control +++ /dev/null @@ -1,23 +0,0 @@ -Source: apache-arrow-apt-source -Section: misc -Priority: important -Maintainer: Apache Arrow Developers -Build-Depends: - debhelper (>= 9), - gnupg, - lsb-release -Standards-Version: 3.9.7 -Homepage: https://arrow.apache.org/ - -Package: apache-arrow-apt-source -Section: misc -Architecture: all -Replaces: apache-arrow-archive-keyring -Breaks: apache-arrow-archive-keyring -Depends: - ${misc:Depends}, - apt-transport-https, - gnupg -Description: GnuPG archive key of the Apache Arrow archive - The Apache Arrow project digitally signs its Release files. This - package contains the archive key used for that. diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/copyright b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/copyright deleted file mode 100644 index 274d64ca06b97..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/copyright +++ /dev/null @@ -1,26 +0,0 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Upstream-Name: Apache Arrow -Upstream-Contact: Apache Arrow Developers - -Files: * -Copyright: 2016 The Apache Software Foundation -License: Apache-2.0 - -License: Apache-2.0 - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. 
- The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - . - http://www.apache.org/licenses/LICENSE-2.0 - . - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - . - On Debian systems, the full text of the Apache Software License version 2 can - be found in the file `/usr/share/common-licenses/Apache-2.0'. diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules deleted file mode 100755 index bf7a85c8c8bcc..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/make -f -# -*- makefile-gmake -*- -# -# Uncomment this to turn on verbose mode. -#export DH_VERBOSE=1 -# This has to be exported to make some magic below work. -export DH_OPTIONS - -%: - dh $@ - -override_dh_auto_build: - gpg \ - --no-default-keyring \ - --keyring ./apache-arrow-apt-source.gpg \ - --import KEYS - - ( \ - distribution=$$(lsb_release --id --short | tr 'A-Z' 'a-z'); \ - code_name=$$(lsb_release --codename --short); \ - echo "Types: deb deb-src"; \ - echo "URIs: https://apache.jfrog.io/artifactory/arrow/$${distribution}/"; \ - echo "Suites: $${code_name}"; \ - echo "Components: main"; \ - echo "Signed-By: /usr/share/keyrings/apache-arrow-apt-source.gpg"; \ - ) > apache-arrow.sources - -override_dh_install: - install -d debian/tmp/usr/share/keyrings/ - install -m 0644 apache-arrow-apt-source.gpg \ - debian/tmp/usr/share/keyrings/ - - install -d debian/tmp/etc/apt/sources.list.d/ - install -m 0644 apache-arrow.sources \ - debian/tmp/etc/apt/sources.list.d/ - - dh_install diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format deleted file mode 100644 index 163aaf8d82b6c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/dev/tasks/linux-packages/apache-arrow-release/Rakefile b/dev/tasks/linux-packages/apache-arrow-release/Rakefile deleted file mode 100644 index 4a341c6f10b68..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-release/Rakefile +++ /dev/null @@ -1,66 +0,0 @@ -# -*- ruby -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -require_relative "../helper" -require_relative "../package-task" - -class ApacheArrowReleasePackageTask < PackageTask - include Helper::ApacheArrow - - def initialize - release_time = detect_release_time - super("apache-arrow-release", - detect_version(release_time), - release_time, - :rc_build_type => :release) - end - - private - def repo_path - "#{yum_dir}/Apache-Arrow.repo" - end - - def define_archive_task - file @archive_name => [repo_path] do - rm_rf(@archive_base_name) - mkdir(@archive_base_name) - download("https://downloads.apache.org/arrow/KEYS", - "#{@archive_base_name}/KEYS") - cp(repo_path, @archive_base_name) - sh("tar", "czf", @archive_name, @archive_base_name) - rm_rf(@archive_base_name) - end - - if rpm_archive_name != @archive_name - file rpm_archive_name => @archive_name do - sh("tar", "xf", @archive_name) - rpm_archive_base_name = File.basename(rpm_archive_name, ".tar.gz") - mv(@archive_base_name, rpm_archive_base_name) - sh("tar", "czf", rpm_archive_name, rpm_archive_base_name) - end - end - end - - def enable_apt? - false - end -end - -task = ApacheArrowReleasePackageTask.new -task.define diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/Apache-Arrow.repo b/dev/tasks/linux-packages/apache-arrow-release/yum/Apache-Arrow.repo deleted file mode 100644 index fd77306e6f538..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/Apache-Arrow.repo +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[apache-arrow-amazon-linux] -name=Apache Arrow for Amazon Linux 2 - $basearch -baseurl=https://apache.jfrog.io/artifactory/arrow/centos/7/$basearch/ -gpgcheck=1 -enabled=1 -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow - -[apache-arrow-centos] -name=Apache Arrow for CentOS $releasever - $basearch -baseurl=https://apache.jfrog.io/artifactory/arrow/centos/$releasever/$basearch/ -gpgcheck=1 -enabled=1 -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow - -[apache-arrow-rhel] -name=Apache Arrow for RHEL $releasever - $basearch -baseurl=https://apache.jfrog.io/artifactory/arrow/centos/$releasever/$basearch/ -gpgcheck=1 -enabled=1 -gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in deleted file mode 100644 index 9f546569e8684..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ /dev/null @@ -1,110 +0,0 @@ -# -*- sh-shell: rpm -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -%define _centos_ver %{?centos_ver:%{centos_ver}}%{!?centos_ver:8} - -%define use_dnf (%{_centos_ver} >= 8) -%if %{use_dnf} -%define yum_repository_enable() (dnf config-manager --set-enabled %1) -%define yum_repository_disable() (dnf config-manager --set-disabled %1) -%else -%define yum_repository_enable() (yum-config-manager --enable %1) -%define yum_repository_disable() (yum-config-manager --disable %1) -%endif - -Name: @PACKAGE@ -Version: @VERSION@ -Release: @RELEASE@%{?dist} -Summary: Apache Arrow release files - -License: Apache-2.0 -URL: https://arrow.apache.org/ -Source0: @PACKAGE@-%{version}.tar.gz - -BuildArch: noarch - -Requires: epel-release -%if %{use_dnf} -Requires: dnf-command(config-manager) -%else -Requires: yum-utils -%endif - -%description -Apache Arrow release files. - -%prep -%setup -q - -%build -# We use distribution version explicitly because we can't use symbolic link -# on Bintray. CentOS uses 7 and 8 but RHEL uses 7Server and 8Server -# for $releasever. If we can use symbolic link on Bintray, we can use -# $releasever directly. -distribution_version=$(cut -d: -f5 /etc/system-release-cpe) -sed -i'' -e "s/\\\$releasever/${distribution_version}/g" Apache-Arrow.repo - -%install -rm -rf $RPM_BUILD_ROOT - -%{__install} -Dp -m0644 KEYS \ - $RPM_BUILD_ROOT%{_sysconfdir}/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow - -%{__install} -d $RPM_BUILD_ROOT%{_sysconfdir}/yum.repos.d/ -%{__install} -Dp -m0644 Apache-Arrow.repo \ - $RPM_BUILD_ROOT%{_sysconfdir}/yum.repos.d/Apache-Arrow.repo - -%files -%defattr(-, root, root, 0755) -%doc -%dir %{_sysconfdir}/yum.repos.d/ -%dir %{_sysconfdir}/pki/rpm-gpg/ -%{_sysconfdir}/pki/rpm-gpg/RPM-GPG-KEY-Apache-Arrow -%config(noreplace) %{_sysconfdir}/yum.repos.d/Apache-Arrow.repo - -%post -if grep -q 'Amazon Linux release 2' /etc/system-release 2>/dev/null; then - %{yum_repository_enable apache-arrow-amazon-linux} - %{yum_repository_disable apache-arrow-centos} - %{yum_repository_disable apache-arrow-rhel} -elif grep -q 'Red Hat Enterprise Linux' /etc/system-release 2>/dev/null; then - %{yum_repository_disable apache-arrow-amazon-linux} - %{yum_repository_disable apache-arrow-centos} - %{yum_repository_enable apache-arrow-rhel} -else - %{yum_repository_disable apache-arrow-amazon-linux} - %{yum_repository_enable apache-arrow-centos} - %{yum_repository_disable apache-arrow-rhel} -fi - -%changelog -* Mon Jan 18 2021 Krisztián Szűcs - 3.0.0-1 -- New upstream release. - -* Mon Oct 12 2020 Krisztián Szűcs - 2.0.0-1 -- New upstream release. - -* Mon Jul 20 2020 Krisztián Szűcs - 1.0.0-1 -- New upstream release. - -* Thu Apr 16 2020 Krisztián Szűcs - 0.17.0-1 -- New upstream release. - -* Thu Jan 30 2020 Krisztián Szűcs - 0.16.0-1 -- New upstream release. 
diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/centos-7/Dockerfile b/dev/tasks/linux-packages/apache-arrow-release/yum/centos-7/Dockerfile deleted file mode 100644 index 0396593d7d526..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/centos-7/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM centos:7 - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ - yum install -y ${quiet} epel-release && \ - yum install -y ${quiet} \ - rpm-build \ - rpmdevtools && \ - yum clean ${quiet} all diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/centos-8/Dockerfile b/dev/tasks/linux-packages/apache-arrow-release/yum/centos-8/Dockerfile deleted file mode 100644 index c2131bf84126a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/centos-8/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM centos:8 - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ - dnf install -y ${quiet} epel-release && \ - dnf install --enablerepo=powertools -y ${quiet} \ - rpm-build \ - rpmdevtools && \ - dnf clean ${quiet} all diff --git a/dev/tasks/linux-packages/apache-arrow/Rakefile b/dev/tasks/linux-packages/apache-arrow/Rakefile deleted file mode 100644 index d4848e417be2c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/Rakefile +++ /dev/null @@ -1,120 +0,0 @@ -# -*- ruby -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require_relative "../helper" -require_relative "../package-task" - -class ApacheArrowPackageTask < PackageTask - include Helper::ApacheArrow - - def initialize - release_time = detect_release_time - super("apache-arrow", - detect_version(release_time), - release_time, - :rc_build_type => :release) - @rpm_package = "arrow" - end - - private - def define_archive_task - file @archive_name do - case @version - when /\A\d+\.\d+\.\d+-rc\d+\z/ - download_rc_archive - when /\A\d+\.\d+\.\d+\z/ - download_released_archive - else - build_archive - end - end - - if deb_archive_name != @archive_name - file deb_archive_name => @archive_name do - cp(@archive_name, deb_archive_name) - end - end - - if rpm_archive_name != @archive_name - file rpm_archive_name => @archive_name do - cp(@archive_name, rpm_archive_name) - end - end - end - - def download_rc_archive - base_url = "https://dist.apache.org/repos/dist/dev/arrow" - archive_name_no_rc = @archive_name.gsub(/-rc\d+(\.tar\.gz)\z/, "\\1") - url = "#{base_url}/#{@package}-#{@version}/#{archive_name_no_rc}" - download(url, @archive_name) - end - - def download_released_archive - mirror_base_url = "https://www.apache.org/dyn/closer.lua/arrow" - mirror_list_url = "#{mirror_base_url}/arrow-#{@version}/#{@archive_name}" - open(mirror_list_url) do |response| - if /href="(.+?\/#{Regexp.escape(@archive_name)})"/ =~ response.read - download($1, ".") - end - end - end - - def build_archive - cd(arrow_source_dir) do - sh("git", "archive", "HEAD", - "--prefix", "#{@archive_base_name}/", - "--output", @full_archive_name) - end - end - - def apt_arm64_cuda_available_target?(target) - # ubuntu-20.10 has navidia-cuda-toolkit but not libcuda1. - # ubuntu-21.04 may support this. - false - end - - def apt_prepare_debian_control_cuda_architecture(control, target) - if apt_arm64_cuda_available_target?(target) - cuda_architecture = "any" - else - cuda_architecture = "i386 amd64" - end - control.gsub(/@CUDA_ARCHITECTURE@/, cuda_architecture) - end - - def apt_prepare_debian_control_grpc(control, target) - case target - when /\Adebian-buster/, /\Aubuntu-(?:bionic|focal)/ - use_system_grpc = "#" - else - use_system_grpc = "" - end - control.gsub(/@USE_SYSTEM_GRPC@/, use_system_grpc) - end - - def apt_prepare_debian_control(control_in, target) - control = control_in.dup - control = apt_prepare_debian_control_cuda_architecture(control, target) - control = apt_prepare_debian_control_grpc(control, target) - control - end -end - -task = ApacheArrowPackageTask.new -task.define diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from deleted file mode 100644 index 34187b2af5a74..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/debian:bullseye diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile deleted file mode 100644 index fa4961bc97e0a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=debian:bullseye -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - ccache \ - clang-11 \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-11-dev \ - lsb-release \ - ninja-build \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-numpy \ - python3-pip \ - rapidjson-dev \ - tzdata \ - zlib1g-dev && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - pip3 install --upgrade meson && \ - ln -s /usr/local/bin/meson /usr/bin/ && \ - apt clean diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/debian-buster-arm64/from deleted file mode 100644 index 8da222b86182f..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/debian:buster diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile deleted file mode 100644 index 5dcc1b46b2ddd..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/Dockerfile +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=debian:buster -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list - -RUN \ - echo "deb http://deb.debian.org/debian buster-backports main" > \ - /etc/apt/sources.list.d/backports.list - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - ccache \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgtest-dev \ - liblz4-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - lsb-release \ - ninja-build \ - pkg-config \ - python3-dev \ - python3-numpy \ - python3-pip \ - rapidjson-dev \ - tzdata \ - zlib1g-dev && \ - apt install -y -V -t buster-backports ${quiet} \ - clang-8 \ - llvm-8-dev && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - pip3 install --upgrade meson && \ - ln -s /usr/local/bin/meson /usr/bin/ && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-buster/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. 
It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic-arm64/from deleted file mode 100644 index c3ba00cf01c9b..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/ubuntu:bionic diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile deleted file mode 100644 index 60be929519448..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/Dockerfile +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG FROM=ubuntu:bionic -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - ccache \ - clang-10 \ - cmake \ - devscripts \ - fakeroot \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgoogle-glog-dev \ - libgtest-dev \ - liblz4-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-10-dev \ - lsb-release \ - ninja-build \ - pkg-config \ - python3-dev \ - python3-numpy \ - python3-pip \ - python3-setuptools \ - python3-wheel \ - rapidjson-dev \ - tzdata \ - zlib1g-dev && \ - (echo "includedir=/usr/include" && \ - echo "libdir=/usr/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" && \ - echo "" && \ - echo "Name: re2" && \ - echo "Description: RE2 is a fast, safe, thread-friendly regular expression engine." && \ - echo "Version: 0.0.0" && \ - echo "Cflags: -std=c++11 -pthread -I\${includedir}" && \ - echo "Libs: -pthread -L\${libdir} -lre2") | \ - tee "/usr/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)/pkgconfig/re2.pc" && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - apt install -y -V -t bionic-backports ${quiet} \ - debhelper && \ - pip3 install --upgrade meson && \ - ln -s /usr/local/bin/meson /usr/bin/ && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. 
-# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from deleted file mode 100644 index 52ab48b66f223..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/ubuntu:focal diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile deleted file mode 100644 index ad83bfa90025a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/Dockerfile +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG FROM=ubuntu:focal -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - ccache \ - clang-10 \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgtest-dev \ - liblz4-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-10-dev \ - lsb-release \ - ninja-build \ - pkg-config \ - python3-dev \ - python3-numpy \ - python3-pip \ - python3-setuptools \ - rapidjson-dev \ - tzdata \ - zlib1g-dev && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - apt clean && \ - python3 -m pip install --no-use-pep517 meson && \ - ln -s /usr/local/bin/meson /usr/bin/ && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-focal/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. 
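The qemu-dummy-static files removed above all document the same pattern: commit a do-nothing qemu-* file so that the unconditional "COPY qemu-* /usr/bin/" in each Dockerfile always has something to match, and let CI drop a real qemu-aarch64-static into the directory only on hosts that still need emulation (the deleted comments point at dev/tasks/linux-packages/azure.linux.arm64.yml and the old Ubuntu 18.04 Azure agents). A minimal shell sketch of that flow follows; only the qemu-dummy-static file, the COPY line, and the arm64v8/ubuntu base image come from the deleted sources, while the docker build invocation and the copy of the host emulator are assumptions for illustration.

    # Sketch of the dummy-qemu pattern described in the deleted comments.
    # Assumed: docker is available, and the host provides qemu-aarch64-static
    # only when emulation is actually required (e.g. old amd64 CI agents).
    dir=dev/tasks/linux-packages/apache-arrow/apt/ubuntu-bionic

    # The committed dummy guarantees that "COPY qemu-* /usr/bin/" always
    # matches at least one file, so the build never fails on that step.
    head -1 "$dir/qemu-dummy-static"   # prints "#!/bin/sh"; the script does nothing

    # Old amd64 hosts copy the real emulator next to the dummy before building;
    # recent Debian/Ubuntu hosts and arm64 hosts simply skip this step.
    cp /usr/bin/qemu-aarch64-static "$dir/" || true

    # The same Dockerfile then works in both cases, because the COPY picks up
    # whichever qemu-* files are present. Passing the arm64 base image via
    # --build-arg is an assumed wiring of the deleted "from" files.
    docker build --build-arg FROM=arm64v8/ubuntu:bionic \
      -t arrow-apt-bionic-arm64 "$dir"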
diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy-arm64/from deleted file mode 100644 index d1f6aa9a854c6..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/ubuntu:groovy diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile deleted file mode 100644 index d60e6320e3651..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/Dockerfile +++ /dev/null @@ -1,79 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=ubuntu:groovy -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - ccache \ - clang-11 \ - cmake \ - debhelper \ - devscripts \ - git \ - gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-11-dev \ - lsb-release \ - ninja-build \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-numpy \ - python3-pip \ - python3-setuptools \ - rapidjson-dev \ - tzdata \ - zlib1g-dev && \ - ! 
apt list | grep -q '^libcuda1' || \ - apt install -y -V ${quiet} nvidia-cuda-toolkit && \ - apt clean && \ - python3 -m pip install --no-use-pep517 meson && \ - ln -s /usr/local/bin/meson /usr/bin/ && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/ubuntu-groovy/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog deleted file mode 100644 index 2adfc442de828..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ /dev/null @@ -1,111 +0,0 @@ -apache-arrow (3.0.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Mon, 18 Jan 2021 21:33:18 -0000 - -apache-arrow (2.0.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Mon, 12 Oct 2020 23:38:01 -0000 - -apache-arrow (1.0.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Mon, 20 Jul 2020 20:41:07 -0000 - -apache-arrow (0.17.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Thu, 16 Apr 2020 12:05:43 -0000 - -apache-arrow (0.16.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Thu, 30 Jan 2020 20:21:44 -0000 - -apache-arrow (0.15.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Mon, 30 Sep 2019 17:19:02 -0000 - -apache-arrow (0.14.0-1) unstable; urgency=low - - * New upstream release. - - -- Sutou Kouhei Fri, 28 Jun 2019 22:22:35 -0000 - -apache-arrow (0.13.0-1) unstable; urgency=low - - * New upstream release. 
- - -- Kouhei Sutou Thu, 28 Mar 2019 02:24:58 -0000 - -apache-arrow (0.12.0-1) unstable; urgency=low - - * New upstream release. - - -- Krisztián Szűcs Wed, 16 Jan 2019 03:29:25 -0000 - -apache-arrow (0.11.0-1) unstable; urgency=low - - * New upstream release. - - -- Kouhei Sutou Thu, 04 Oct 2018 00:33:42 -0000 - -apache-arrow (0.10.0-1) unstable; urgency=low - - * New upstream release. - - -- Phillip Cloud Thu, 02 Aug 2018 23:58:23 -0000 - -apache-arrow (0.9.0-1) unstable; urgency=low - - * New upstream release. - - -- Kouhei Sutou Fri, 16 Mar 2018 16:56:31 -0000 - -apache-arrow (0.8.0-1) unstable; urgency=low - - * New upstream release. - - * Add libarrow-gpu-glib0, libarrow-gpu-glib-dev and gir1.2-arrow-gpu-1.0. - - -- Uwe L. Korn Sun, 17 Dec 2017 20:24:44 -0000 - -apache-arrow (0.7.1-2) unstable; urgency=low - - * Add libarrow-gpu0 and libarrow-gpu-dev. - - * Add libarrow-python-dev. - - -- Kouhei Sutou Sun, 29 Oct 2017 21:59:13 +0900 - -apache-arrow (0.7.1-1) unstable; urgency=low - - * New upstream release. - - -- Kouhei Sutou Wed, 27 Sep 2017 13:19:05 -0000 - -apache-arrow (0.7.0-1) unstable; urgency=low - - * New upstream release. - - -- Wes McKinney Tue, 12 Sep 2017 22:01:14 -0000 - -apache-arrow (0.6.0-1) unstable; urgency=low - - * New upstream release. - - -- Kouhei Sutou Fri, 11 Aug 2017 21:27:51 -0000 - -apache-arrow (0.6.0.20170802-1) unstable; urgency=low - - * New upstream release. - - -- Kouhei Sutou Wed, 02 Aug 2017 22:28:18 -0000 diff --git a/dev/tasks/linux-packages/apache-arrow/debian/compat b/dev/tasks/linux-packages/apache-arrow/debian/compat deleted file mode 100644 index 48082f72f087c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/compat +++ /dev/null @@ -1 +0,0 @@ -12 diff --git a/dev/tasks/linux-packages/apache-arrow/debian/control.in b/dev/tasks/linux-packages/apache-arrow/debian/control.in deleted file mode 100644 index f50b09e60438d..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/control.in +++ /dev/null @@ -1,583 +0,0 @@ -Source: apache-arrow -Section: devel -Priority: optional -Maintainer: Apache Arrow Developers -Build-Depends: - cmake, - debhelper (>= 12), - git, - gobject-introspection, - gtk-doc-tools, - libboost-filesystem-dev, - libboost-system-dev, - libbrotli-dev, - libbz2-dev, - libcurl4-openssl-dev, - libgirepository1.0-dev, - libgoogle-glog-dev, -@USE_SYSTEM_GRPC@ libgrpc++-dev, - libgtest-dev, - liblz4-dev, - libre2-dev, - libsnappy-dev, - libssl-dev, - libutf8proc-dev, - libzstd-dev, - ninja-build, - nvidia-cuda-toolkit [!arm64], - pkg-config, -@USE_SYSTEM_GRPC@ protobuf-compiler-grpc, - python3-dev, - python3-numpy, - tzdata, - zlib1g-dev -Build-Depends-Indep: libglib2.0-doc -Standards-Version: 3.9.8 -Homepage: https://arrow.apache.org/ - -Package: libarrow400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files. - -Package: libarrow-cuda400 -Section: libs -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files for CUDA support. 
- -Package: libarrow-dataset400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}), - libparquet400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files for Dataset module. - -Package: libarrow-flight400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files for Flight RPC system. - -Package: libarrow-python400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}), - python3, - python3-numpy -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files for Python support. - -Package: libarrow-python-flight400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-flight400 (= ${binary:Version}), - libarrow-python400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ library files for Flight and Python support. - -Package: libarrow-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow400 (= ${binary:Version}), - libbrotli-dev, - libbz2-dev, -@USE_SYSTEM_GRPC@ libgrpc++-dev, - liblz4-dev, - libre2-dev, - libsnappy-dev, - libssl-dev, - libutf8proc-dev, - libzstd-dev, -@USE_SYSTEM_GRPC@ protobuf-compiler-grpc, - zlib1g-dev -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files. - -Package: libarrow-cuda-dev -Section: libdevel -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libarrow-cuda400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files for CUDA support. - -Package: libarrow-dataset-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libarrow-dataset400 (= ${binary:Version}), - libparquet-dev (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files for dataset module. - -Package: libarrow-flight-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libarrow-flight400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files for Flight RPC system. - -Package: libarrow-python-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libarrow-python400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files for Python support. 
- -Package: libarrow-python-flight-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-flight-dev (= ${binary:Version}), - libarrow-python-dev (= ${binary:Version}), - libarrow-python-flight400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides C++ header files for Flight and Python support. - -Package: libgandiva400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}) -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides C++ library files. - -Package: libgandiva-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libgandiva400 (= ${binary:Version}) -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides C++ header files. - -Package: libplasma400 -Section: libs -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-cuda400 (= ${binary:Version}) -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides C++ library files to connect plasma-store-server. - -Package: plasma-store-server -Section: utils -Architecture: @CUDA_ARCHITECTURE@ -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libplasma400 (= ${binary:Version}) -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides plasma-store-server. - -Package: libplasma-dev -Section: libdevel -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-cuda-dev (= ${binary:Version}), - libplasma400 (= ${binary:Version}) -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides C++ header files. - -Package: libparquet400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends} -Description: Apache Parquet is a columnar storage format - . - This package provides C++ library files to process Apache Parquet format. - -Package: libparquet-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dev (= ${binary:Version}), - libparquet400 (= ${binary:Version}) -Description: Apache Parquet is a columnar storage format - . - This package provides C++ header files. - -Package: libarrow-glib400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based library files. - -Package: gir1.2-arrow-1.0 -Section: introspection -Architecture: any -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Apache Arrow is a data processing library for analysis - . - This package provides GObject Introspection typelib files. 
- -Package: libarrow-glib-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libglib2.0-dev, - libarrow-dev (= ${binary:Version}), - libarrow-glib400 (= ${binary:Version}), - gir1.2-arrow-1.0 (= ${binary:Version}) -Suggests: libarrow-glib-doc -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based header files. - -Package: libarrow-glib-doc -Section: doc -Architecture: all -Multi-Arch: foreign -Depends: - ${misc:Depends} -Recommends: libglib2.0-doc -Description: Apache Arrow is a data processing library for analysis - . - This package provides documentations. - -Package: libarrow-cuda-glib400 -Section: libs -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-glib400 (= ${binary:Version}), - libarrow-cuda400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based library files for CUDA support. - -Package: gir1.2-arrow-cuda-1.0 -Section: introspection -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Apache Arrow is a data processing library for analysis - . - This package provides GObject Introspection typelib files for CUDA support. - -Package: libarrow-cuda-glib-dev -Section: libdevel -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-cuda-dev (= ${binary:Version}), - libarrow-glib-dev (= ${binary:Version}), - libarrow-cuda-glib400 (= ${binary:Version}), - gir1.2-arrow-cuda-1.0 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based header files for CUDA support. - -Package: libarrow-dataset-glib400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-glib400 (= ${binary:Version}), - libarrow-dataset400 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based library files for dataset module. - -Package: gir1.2-arrow-dataset-1.0 -Section: introspection -Architecture: any -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Apache Arrow is a data processing library for analysis - . - This package provides GObject Introspection typelib files for dataset module. - -Package: libarrow-dataset-glib-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-dataset-dev (= ${binary:Version}), - libarrow-glib-dev (= ${binary:Version}), - libarrow-dataset-glib400 (= ${binary:Version}), - gir1.2-arrow-dataset-1.0 (= ${binary:Version}) -Description: Apache Arrow is a data processing library for analysis - . - This package provides GLib based header files for dataset module. - -Package: libarrow-dataset-glib-doc -Section: doc -Architecture: any -Multi-Arch: foreign -Depends: - ${misc:Depends} -Recommends: libarrow-glib-doc -Description: Apache Arrow is a data processing library for analysis - . - This package provides documentations for dataset module. 
- -Package: libgandiva-glib400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-glib400 (= ${binary:Version}), - libgandiva400 (= ${binary:Version}) -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides GLib based library files. - -Package: gir1.2-gandiva-1.0 -Section: introspection -Architecture: any -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides GObject Introspection typelib files. - -Package: libgandiva-glib-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libgandiva-dev (= ${binary:Version}), - libarrow-glib-dev (= ${binary:Version}), - libgandiva-glib400 (= ${binary:Version}), - gir1.2-gandiva-1.0 (= ${binary:Version}) -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides GLib based header files. - -Package: libgandiva-glib-doc -Section: doc -Architecture: any -Multi-Arch: foreign -Depends: - ${misc:Depends} -Recommends: libglib2.0-doc -Description: Gandiva is a toolset for compiling and evaluating expressions - on Arrow Data. - . - This package provides documentations. - -Package: libplasma-glib400 -Section: libs -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-cuda-glib400 (= ${binary:Version}), - libplasma400 (= ${binary:Version}) -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides GLib based library files to connect plasma-store-server. - -Package: gir1.2-plasma-1.0 -Section: introspection -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides GObject Introspection typelib files. - -Package: libplasma-glib-dev -Section: libdevel -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: same -Depends: - ${misc:Depends}, - libplasma-dev (= ${binary:Version}), - libarrow-cuda-glib-dev (= ${binary:Version}), - libplasma-glib400 (= ${binary:Version}), - gir1.2-plasma-1.0 (= ${binary:Version}) -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides GLib based header files. - -Package: libplasma-glib-doc -Section: doc -Architecture: @CUDA_ARCHITECTURE@ -Multi-Arch: foreign -Depends: - ${misc:Depends} -Recommends: libglib2.0-doc -Description: Plasma is an in-memory object store and cache for big data. - . - This package provides documentations. - -Package: libparquet-glib400 -Section: libs -Architecture: any -Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} -Depends: - ${misc:Depends}, - ${shlibs:Depends}, - libarrow-glib400 (= ${binary:Version}), - libparquet400 (= ${binary:Version}) -Description: Apache Parquet is a columnar storage format - . - This package provides GLib based library files. - -Package: gir1.2-parquet-1.0 -Section: introspection -Architecture: any -Multi-Arch: same -Depends: - ${gir:Depends}, - ${misc:Depends} -Description: Apache Parquet is a columnar storage format - . - This package provides GObject Introspection typelib files. 
- -Package: libparquet-glib-dev -Section: libdevel -Architecture: any -Multi-Arch: same -Depends: - ${misc:Depends}, - libarrow-glib-dev (= ${binary:Version}), - libparquet-dev (= ${binary:Version}), - libparquet-glib400 (= ${binary:Version}), - gir1.2-parquet-1.0 (= ${binary:Version}) -Suggests: libparquet-glib-doc -Description: Apache Parquet is a columnar storage format - . - This package provides GLib based header files. - -Package: libparquet-glib-doc -Section: doc -Architecture: all -Multi-Arch: foreign -Depends: - ${misc:Depends} -Recommends: libglib2.0-doc -Description: Apache Parquet is a columnar storage format - . - This package provides documentations. diff --git a/dev/tasks/linux-packages/apache-arrow/debian/copyright b/dev/tasks/linux-packages/apache-arrow/debian/copyright deleted file mode 100644 index 9db0ea76f7219..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/copyright +++ /dev/null @@ -1,193 +0,0 @@ -Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ -Upstream-Name: Apache Arrow -Upstream-Contact: -Source: https://dist.apache.org/repos/dist/release/arrow/ - -Files: * -Copyright: 2016 The Apache Software Foundation -License: Apache-2.0 - -Files: TODO for "This product includes software from the SFrame project" -Copyright: 2015 Dato, Inc. - 2009 Carnegie Mellon University. -License: BSD-3-clause - -Files: TODO for "This product includes software from the Numpy project" -Copyright: 1995, 1996, 1997 Jim Hugunin, hugunin@mit.edu - 2005 Travis E. Oliphant oliphant@ee.byu.edu Brigham Young University -License: BSD-3-clause - -Files: TODO for "This product includes software from the Feather project" -Copyright: TODO -License: Apache-2.0 - -Files: TODO for "This product includes software from the DyND project" -Copyright: TODO -License: BSD-2-clause - -Files: TODO for "This product includes software from the LLVM project" -Copyright: 2003-2007 University of Illinois at Urbana-Champaign. -License: U-OF-I-BSD-LIKE - -Files: TODO for "This product includes software from the google-lint project" -Copyright: 2009 Google Inc. All rights reserved. -License: BSD-3-clause - -Files: TODO for "This product includes software from the mman-win32 project" -Copyright: 2010 kutuzov.viktor.84 -License: MIT - -Files: TODO for "This product includes software from the LevelDB project" -Copyright: 2011 The LevelDB Authors. All rights reserved. -License: BSD-3-clause - -Files: TODO for "This product includes software from the CMake project" -Copyright: 2001-2009 Kitware, Inc. - 2012-2014 Continuum Analytics, Inc. -License: BSD-3-clause - -Files: TODO for "This product includes software from https://github.com/matthew-brett/multibuild" -Copyright: 2013-2016, Matt Terry and Matthew Brett; all rights reserved. -License: BSD-2-clause - -Files: TODO for "This product includes software from the Ibis project" -Copyright: 2015 Cloudera, Inc. -License: Apache-2.0 - -Files: TODO for "This product includes code from Apache Kudu" -Copyright: 2016 The Apache Software Foundation -License: Apache-2.0 - -License: Apache-2.0 - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - . 
- http://www.apache.org/licenses/LICENSE-2.0 - . - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - . - On Debian systems, the full text of the Apache Software License version 2 can - be found in the file `/usr/share/common-licenses/Apache-2.0'. - -License: BSD-3-clause - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - . - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - 3. Neither the name of the University nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - . - THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - SUCH DAMAGE. - -License: BSD-2-clause - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - . - 1) Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - . - 2) Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - . - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY - WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - POSSIBILITY OF SUCH DAMAGE. 
- -License: U-OF-I-BSD-LIKE - ============================================================================== - LLVM Release License - ============================================================================== - University of Illinois/NCSA - Open Source License - . - Copyright (c) 2003-2013 University of Illinois at Urbana-Champaign. - All rights reserved. - . - Developed by: - . - LLVM Team - . - University of Illinois at Urbana-Champaign - . - http://llvm.org - . - Permission is hereby granted, free of charge, to any person obtaining a copy of - this software and associated documentation files (the "Software"), to deal with - the Software without restriction, including without limitation the rights to - use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies - of the Software, and to permit persons to whom the Software is furnished to do - so, subject to the following conditions: - . - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - . - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimers in the - documentation and/or other materials provided with the distribution. - . - * Neither the names of the LLVM Team, University of Illinois at - Urbana-Champaign, nor the names of its contributors may be used to - endorse or promote products derived from this Software without specific - prior written permission. - . - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE - SOFTWARE. - -License: MIT - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - . - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - . - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
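The debian/control.in removed above declares the runtime/-dev package split (libarrow400 with libarrow-dev, libparquet400 with libparquet-dev, and so on), and the *.install files that follow place a pkg-config file for each component (arrow.pc, parquet.pc, ...). A hedged sketch of how a consumer would have built against those packages follows; the package and .pc names come from the deleted files, while the example.cc source file and the exact compiler invocation are assumptions for illustration.

    # Sketch only: consuming the removed Debian packages via pkg-config.
    apt install -y -V libarrow-dev libparquet-dev

    # arrow.pc and parquet.pc are shipped by libarrow-dev/libparquet-dev
    # (see the libarrow-dev.install / libparquet-dev.install entries below).
    pkg-config --cflags --libs arrow parquet

    # example.cc stands in for any C++ source using Arrow/Parquet.
    g++ -std=c++11 example.cc $(pkg-config --cflags --libs arrow parquet) -o example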
diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install deleted file mode 100644 index e0197fcd327c8..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/Arrow-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install deleted file mode 100644 index ef0d9f56f9dbc..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/ArrowCUDA-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install deleted file mode 100644 index 27091dab36e35..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/ArrowDataset-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install deleted file mode 100644 index 0433b367a24c8..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/Gandiva-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install deleted file mode 100644 index 13fde668132e3..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/Parquet-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install b/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install deleted file mode 100644 index 7b7ce21581dfd..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/Plasma-1.0.typelib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install deleted file mode 100644 index 77e0b70f672fe..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/lib/*/cmake/arrow/ArrowCUDAConfig*.cmake -usr/lib/*/cmake/arrow/ArrowCUDATargets*.cmake -usr/lib/*/cmake/arrow/FindArrowCUDA.cmake -usr/lib/*/libarrow_cuda.a -usr/lib/*/libarrow_cuda.so -usr/lib/*/pkgconfig/arrow-cuda.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install deleted file mode 100644 index 778ae5fd74162..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/include/arrow-cuda-glib/ -usr/lib/*/libarrow-cuda-glib.so -usr/lib/*/pkgconfig/arrow-cuda-glib.pc -usr/share/gir-1.0/ArrowCUDA-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install deleted file mode 100644 index a6d6375268d34..0000000000000 --- 
a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow-cuda-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install deleted file mode 100644 index 5ae46468764f2..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_cuda.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install deleted file mode 100644 index 53e727ae0526a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/lib/*/cmake/arrow/ArrowDatasetConfig*.cmake -usr/lib/*/cmake/arrow/ArrowDatasetTargets*.cmake -usr/lib/*/cmake/arrow/FindArrowDataset.cmake -usr/lib/*/libarrow_dataset.a -usr/lib/*/libarrow_dataset.so -usr/lib/*/pkgconfig/arrow-dataset.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install deleted file mode 100644 index 4c50bde975adc..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/include/arrow-dataset-glib/ -usr/lib/*/libarrow-dataset-glib.so -usr/lib/*/pkgconfig/arrow-dataset-glib.pc -usr/share/gir-1.0/ArrowDataset-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base deleted file mode 100644 index 5ec8156b05d6c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base +++ /dev/null @@ -1,9 +0,0 @@ -Document: arrow-dataset-glib -Title: Apache Arrow Dataset GLib Reference Manual -Author: The Apache Software Foundation -Abstract: Apache Arrow Dataset GLib provides an API to read and write semantic datasets stored in different locations and formats that uses GLib. 
-Section: Programming - -Format: HTML -Index: /usr/share/gtk-doc/html/arrow-dataset-glib/index.html -Files: /usr/share/gtk-doc/html/arrow-dataset-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install deleted file mode 100644 index 523bc206e1e03..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install +++ /dev/null @@ -1 +0,0 @@ -usr/share/gtk-doc/html/arrow-dataset-glib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links deleted file mode 100644 index 3d880362b5e9f..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links +++ /dev/null @@ -1,3 +0,0 @@ -usr/share/gtk-doc/html/arrow-dataset-glib usr/share/doc/libarrow-dataset-glib-doc/arrow-dataset-glib -usr/share/doc/libglib2.0-doc/glib usr/share/doc/libarrow-dataset-glib-doc/glib -usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libarrow-dataset-glib-doc/gobject diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install deleted file mode 100644 index 10085f3a0a0ab..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow-dataset-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install deleted file mode 100644 index 0146341652e1e..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_dataset.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install deleted file mode 100644 index 52fbbb32d816f..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install +++ /dev/null @@ -1,21 +0,0 @@ -usr/include/arrow/ -usr/lib/*/cmake/arrow/ArrowConfig*.cmake -usr/lib/*/cmake/arrow/ArrowOptions.cmake -usr/lib/*/cmake/arrow/ArrowTargets*.cmake -usr/lib/*/cmake/arrow/Find*Alt.cmake -usr/lib/*/cmake/arrow/FindArrow.cmake -usr/lib/*/cmake/arrow/FindBrotli.cmake -usr/lib/*/cmake/arrow/FindLz4.cmake -usr/lib/*/cmake/arrow/FindSnappy.cmake -usr/lib/*/cmake/arrow/Findutf8proc.cmake -usr/lib/*/cmake/arrow/Findzstd.cmake -usr/lib/*/cmake/arrow/arrow-config.cmake -usr/lib/*/libarrow.a -usr/lib/*/libarrow.so -usr/lib/*/libarrow_bundled_dependencies.a -usr/lib/*/pkgconfig/arrow-compute.pc -usr/lib/*/pkgconfig/arrow-csv.pc -usr/lib/*/pkgconfig/arrow-filesystem.pc -usr/lib/*/pkgconfig/arrow-json.pc -usr/lib/*/pkgconfig/arrow-orc.pc -usr/lib/*/pkgconfig/arrow.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install deleted file mode 100644 index 20ca33d843625..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/lib/*/cmake/arrow/ArrowFlightConfig*.cmake -usr/lib/*/cmake/arrow/ArrowFlightTargets*.cmake -usr/lib/*/cmake/arrow/FindArrowFlight.cmake -usr/lib/*/libarrow_flight.a -usr/lib/*/libarrow_flight.so -usr/lib/*/pkgconfig/arrow-flight.pc diff --git 
a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install deleted file mode 100644 index abdb96d4ca83b..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_flight.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install deleted file mode 100644 index f6de7eedb6f11..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/include/arrow-glib/ -usr/lib/*/libarrow-glib.so -usr/lib/*/pkgconfig/arrow-glib.pc -usr/lib/*/pkgconfig/arrow-orc-glib.pc -usr/share/arrow-glib/example/ -usr/share/gir-1.0/Arrow-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base deleted file mode 100644 index 8ae4ffb6ddaa5..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base +++ /dev/null @@ -1,9 +0,0 @@ -Document: arrow-glib -Title: Apache Arrow GLib Reference Manual -Author: The Apache Software Foundation -Abstract: Apache Arrow GLib is a data processing library for analysis that uses GLib. -Section: Programming - -Format: HTML -Index: /usr/share/gtk-doc/html/arrow-glib/index.html -Files: /usr/share/gtk-doc/html/arrow-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install deleted file mode 100644 index 912a29c585084..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install +++ /dev/null @@ -1,2 +0,0 @@ -usr/share/doc/arrow-glib/ -usr/share/gtk-doc/html/arrow-glib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links deleted file mode 100644 index 556987d0a8065..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links +++ /dev/null @@ -1,3 +0,0 @@ -usr/share/gtk-doc/html/arrow-glib usr/share/doc/libarrow-glib-doc/arrow-glib -usr/share/doc/libglib2.0-doc/glib usr/share/doc/libarrow-glib-doc/glib -usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libarrow-glib-doc/gobject diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install deleted file mode 100644 index ec369d1536b5b..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install deleted file mode 100644 index 807583f9845e6..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/lib/*/cmake/arrow/ArrowPythonConfig*.cmake -usr/lib/*/cmake/arrow/ArrowPythonTargets*.cmake -usr/lib/*/cmake/arrow/FindArrowPython.cmake -usr/lib/*/libarrow_python.a -usr/lib/*/libarrow_python.so -usr/lib/*/pkgconfig/arrow-python.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install deleted 
file mode 100644 index 6cf96e227e946..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install +++ /dev/null @@ -1,6 +0,0 @@ -usr/lib/*/cmake/arrow/ArrowPythonFlightConfig*.cmake -usr/lib/*/cmake/arrow/ArrowPythonFlightTargets*.cmake -usr/lib/*/cmake/arrow/FindArrowPythonFlight.cmake -usr/lib/*/libarrow_python_flight.a -usr/lib/*/libarrow_python_flight.so -usr/lib/*/pkgconfig/arrow-python-flight.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install deleted file mode 100644 index b7cbfec1f0501..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_python_flight.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install deleted file mode 100644 index eef3e66483739..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_python.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install b/dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install deleted file mode 100644 index 98ef2139cb163..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install deleted file mode 100644 index 26e7e76fb3806..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install +++ /dev/null @@ -1,7 +0,0 @@ -usr/include/gandiva/ -usr/lib/*/cmake/arrow/GandivaConfig*.cmake -usr/lib/*/cmake/arrow/GandivaTargets*.cmake -usr/lib/*/cmake/arrow/FindGandiva.cmake -usr/lib/*/libgandiva.a -usr/lib/*/libgandiva.so -usr/lib/*/pkgconfig/gandiva.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install deleted file mode 100644 index fe7d8bb793f88..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/include/gandiva-glib/ -usr/lib/*/libgandiva-glib.so -usr/lib/*/pkgconfig/gandiva-glib.pc -usr/share/gir-1.0/Gandiva-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base deleted file mode 100644 index 2bf913062fb8c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base +++ /dev/null @@ -1,9 +0,0 @@ -Document: gandiva-glib -Title: Gandiva GLib Reference Manual -Author: The Apache Software Foundation -Abstract: Gandiva GLib is a toolset for compiling and evaluating expressions on Arrow Data that uses GLib. 
-Section: Programming - -Format: HTML -Index: /usr/share/gtk-doc/html/gandiva-glib/index.html -Files: /usr/share/gtk-doc/html/gandiva-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install deleted file mode 100644 index 358e4e5c768be..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install +++ /dev/null @@ -1 +0,0 @@ -usr/share/gtk-doc/html/gandiva-glib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links deleted file mode 100644 index 234794e232efb..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links +++ /dev/null @@ -1,3 +0,0 @@ -usr/share/gtk-doc/html/gandiva-glib usr/share/doc/libgandiva-glib-doc/gandiva-glib -usr/share/doc/libglib2.0-doc/glib usr/share/doc/libgandiva-glib-doc/glib -usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libgandiva-glib-doc/gobject diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install deleted file mode 100644 index 6257fd43823c0..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libgandiva-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install b/dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install deleted file mode 100644 index 1475f49cf4c5c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libgandiva.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install deleted file mode 100644 index e163115f0351e..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install +++ /dev/null @@ -1,7 +0,0 @@ -usr/include/parquet/ -usr/lib/*/cmake/arrow/ParquetConfig*.cmake -usr/lib/*/cmake/arrow/ParquetTargets*.cmake -usr/lib/*/cmake/arrow/FindParquet.cmake -usr/lib/*/libparquet.a -usr/lib/*/libparquet.so -usr/lib/*/pkgconfig/parquet.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install deleted file mode 100644 index 9cce737a7106c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/include/parquet-glib/ -usr/lib/*/libparquet-glib.so -usr/lib/*/pkgconfig/parquet-glib.pc -usr/share/gir-1.0/Parquet-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base deleted file mode 100644 index cc68e2df6c11f..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base +++ /dev/null @@ -1,9 +0,0 @@ -Document: parquet-glib -Title: Apache Parquet GLib Reference Manual -Author: The Apache Software Foundation -Abstract: Apache Parquet GLib is a columnar storage format processing library that uses GLib. 
-Section: Programming - -Format: HTML -Index: /usr/share/gtk-doc/html/parquet-glib/index.html -Files: /usr/share/gtk-doc/html/parquet-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install deleted file mode 100644 index 5843ea3dab8b3..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install +++ /dev/null @@ -1 +0,0 @@ -usr/share/gtk-doc/html/parquet-glib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links deleted file mode 100644 index c31f346b174d6..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links +++ /dev/null @@ -1,3 +0,0 @@ -usr/share/gtk-doc/html/parquet-glib usr/share/doc/libparquet-glib-doc/parquet-glib -usr/share/doc/libglib2.0-doc/glib usr/share/doc/libparquet-glib-doc/glib -usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libparquet-glib-doc/gobject diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install deleted file mode 100644 index 1c0e4419966b2..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libparquet-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install b/dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install deleted file mode 100644 index 540a91d5ef7a4..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libparquet.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install deleted file mode 100644 index c315d4dfc7706..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install +++ /dev/null @@ -1,7 +0,0 @@ -usr/include/plasma/ -usr/lib/*/cmake/arrow/PlasmaConfig*.cmake -usr/lib/*/cmake/arrow/PlasmaTargets*.cmake -usr/lib/*/cmake/arrow/FindPlasma.cmake -usr/lib/*/libplasma.a -usr/lib/*/libplasma.so -usr/lib/*/pkgconfig/plasma.pc diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install deleted file mode 100644 index 7800681d20a2a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install +++ /dev/null @@ -1,4 +0,0 @@ -usr/include/plasma-glib/ -usr/lib/*/libplasma-glib.so -usr/lib/*/pkgconfig/plasma-glib.pc -usr/share/gir-1.0/Plasma-1.0.gir diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base deleted file mode 100644 index a9d306d8bf58e..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base +++ /dev/null @@ -1,9 +0,0 @@ -Document: plasma-glib -Title: Plasma GLib Reference Manual -Author: The Apache Software Foundation -Abstract: Plasma GLib is an in-memory object store and cache for big data that uses GLib. 
-Section: Programming - -Format: HTML -Index: /usr/share/gtk-doc/html/plasma-glib/index.html -Files: /usr/share/gtk-doc/html/plasma-glib/*.html diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install deleted file mode 100644 index ad13b94cd72bb..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install +++ /dev/null @@ -1 +0,0 @@ -usr/share/gtk-doc/html/plasma-glib diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links deleted file mode 100644 index 193262f9b1737..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links +++ /dev/null @@ -1,3 +0,0 @@ -usr/share/gtk-doc/html/plasma-glib usr/share/doc/libplasma-glib-doc/plasma-glib -usr/share/doc/libglib2.0-doc/glib usr/share/doc/libplasma-glib-doc/glib -usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libplasma-glib-doc/gobject diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install b/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install deleted file mode 100644 index 339bcca3e7278..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libplasma-glib.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install b/dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install deleted file mode 100644 index f8a744b65975d..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libplasma.so.* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/patches/series b/dev/tasks/linux-packages/apache-arrow/debian/patches/series deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install b/dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install deleted file mode 100644 index bd13b0e8175f7..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install +++ /dev/null @@ -1 +0,0 @@ -usr/bin/plasma-store-server diff --git a/dev/tasks/linux-packages/apache-arrow/debian/rules b/dev/tasks/linux-packages/apache-arrow/debian/rules deleted file mode 100755 index 7b8dff26a0866..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/rules +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/make -f -# -*- makefile-gmake -*- -# -# Uncomment this to turn on verbose mode. -#export DH_VERBOSE=1 -# This has to be exported to make some magic below work. 
-export DH_OPTIONS - -export DEB_BUILD_MAINT_OPTIONS=reproducible=-timeless - -BUILD_TYPE=release - -%: - dh $@ --with gir - -override_dh_auto_configure: - if dpkg -l nvidia-cuda-toolkit > /dev/null 2>&1; then \ - ARROW_CUDA=ON; \ - ARROW_PLASMA=ON; \ - else \ - ARROW_CUDA=OFF; \ - ARROW_PLASMA=OFF; \ - fi; \ - dh_auto_configure \ - --sourcedirectory=cpp \ - --builddirectory=cpp_build \ - --buildsystem=cmake+ninja \ - -- \ - -DARROW_CUDA=$${ARROW_CUDA} \ - -DARROW_FLIGHT=ON \ - -DARROW_GANDIVA=ON \ - -DARROW_GANDIVA_JAVA=OFF \ - -DARROW_MIMALLOC=ON \ - -DARROW_ORC=ON \ - -DARROW_PARQUET=ON \ - -DARROW_PLASMA=$${ARROW_PLASMA} \ - -DARROW_PYTHON=ON \ - -DARROW_S3=ON \ - -DARROW_USE_CCACHE=OFF \ - -DARROW_WITH_BROTLI=ON \ - -DARROW_WITH_BZ2=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DCMAKE_UNITY_BUILD=ON \ - -DPARQUET_REQUIRE_ENCRYPTION=ON \ - -DPythonInterp_FIND_VERSION=ON \ - -DPythonInterp_FIND_VERSION_MAJOR=3 - -override_dh_auto_build: - dh_auto_build \ - --sourcedirectory=cpp \ - --builddirectory=cpp_build - dh_auto_configure \ - --sourcedirectory=c_glib \ - --builddirectory=c_glib_build \ - --buildsystem=meson+ninja \ - -- \ - -Darrow_cpp_build_type=$(BUILD_TYPE) \ - -Darrow_cpp_build_dir=../cpp_build \ - -Dgtk_doc=true - env \ - LD_LIBRARY_PATH=$(CURDIR)/cpp_build/$(BUILD_TYPE) \ - dh_auto_build \ - --sourcedirectory=c_glib \ - --builddirectory=c_glib_build \ - --buildsystem=meson+ninja - -override_dh_auto_install: - dh_auto_install \ - --sourcedirectory=c_glib \ - --builddirectory=c_glib_build \ - --buildsystem=meson+ninja - # Remove built files to reduce disk usage - dh_auto_clean \ - --sourcedirectory=c_glib \ - --builddirectory=c_glib_build \ - --buildsystem=meson+ninja - - dh_auto_install \ - --sourcedirectory=cpp \ - --builddirectory=cpp_build - # Remove built files to reduce disk usage - dh_auto_clean \ - --sourcedirectory=cpp \ - --builddirectory=cpp_build - -override_dh_auto_test: - # TODO: We need Boost 1.64 or later to build tests for - # Apache Arrow Flight. 
- # git clone --depth 1 https://github.com/apache/arrow-testing.git - # git clone --depth 1 https://github.com/apache/parquet-testing.git - # cd cpp_build && \ - # env \ - # ARROW_TEST_DATA=$(CURDIR)/arrow-testing/data \ - # PARQUET_TEST_DATA=$(CURDIR)/parquet-testing/data \ - # ctest --exclude-regex 'arrow-cuda-test|plasma-client_tests' - -# skip file failing with "Unknown DWARF DW_OP_255" (see bug#949296) -override_dh_dwz: - dh_dwz --exclude=libgandiva.so diff --git a/dev/tasks/linux-packages/apache-arrow/debian/source/format b/dev/tasks/linux-packages/apache-arrow/debian/source/format deleted file mode 100644 index 163aaf8d82b6c..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/dev/tasks/linux-packages/apache-arrow/debian/watch b/dev/tasks/linux-packages/apache-arrow/debian/watch deleted file mode 100644 index 5cb3f00915f8a..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/debian/watch +++ /dev/null @@ -1,2 +0,0 @@ -version=3 -https://dist.apache.org/repos/dist/release/arrow/arrow-(.+)/apache-arrow-(.+).tar.gz diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in deleted file mode 100644 index 842b3b0f01497..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ /dev/null @@ -1,802 +0,0 @@ -# -*- sh-shell: rpm -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -%define _centos_ver %{?centos_ver:%{centos_ver}}%{!?centos_ver:8} - -%define boost_version %( \ - if [ "%{_centos_ver}" = 7 ]; then \ - echo 169; \ - fi) -%define cmake_version %( \ - if [ "%{_centos_ver}" -lt 8 ]; then \ - echo 3; \ - fi) -%define python_version %( \ - if [ "%{_centos_ver}" = 7 ]; then \ - echo 36; \ - else \ - echo 3; \ - fi) - -%define use_flight (%{_centos_ver} >= 8) -%define use_gandiva (%{_centos_ver} >= 8 && %{_arch} != "aarch64") -%define use_mimalloc (%{_centos_ver} >= 8) -%define use_ninja (%{_centos_ver} >= 8) -# TODO: Enable this. This works on local but is fragile on GitHub Actions and -# Travis CI. 
-# %define use_s3 (%{_centos_ver} >= 8) -%define use_s3 0 - -%define have_rapidjson (%{_centos_ver} == 7) -%define have_re2 (%{_centos_ver} >= 8) -%define have_utf8proc (%{_centos_ver} == 7) - -Name: @PACKAGE@ -Version: @VERSION@ -Release: @RELEASE@%{?dist} -Summary: Apache Arrow is a data processing library for analysis - -License: Apache-2.0 -URL: https://arrow.apache.org/ -Source0: https://dist.apache.org/repos/dist/release/@PACKAGE@/@PACKAGE@-%{version}/apache-@PACKAGE@-%{version}.tar.gz - -BuildRequires: bison -BuildRequires: boost%{boost_version}-devel -BuildRequires: brotli-devel -BuildRequires: bzip2-devel -BuildRequires: cmake%{cmake_version} -%if %{use_s3} -BuildRequires: curl-devel -%endif -BuildRequires: flex -BuildRequires: gcc-c++ -BuildRequires: gflags-devel -BuildRequires: git -BuildRequires: glog-devel -BuildRequires: libzstd-devel -BuildRequires: lz4-devel -BuildRequires: openssl-devel -BuildRequires: pkgconfig -BuildRequires: python%{python_version}-devel -BuildRequires: python%{python_version}-numpy -%if %{have_rapidjson} -BuildRequires: rapidjson-devel -%endif -%if %{have_re2} -BuildRequires: re2-devel -%endif -BuildRequires: snappy-devel -%if %{have_utf8proc} -BuildRequires: utf8proc-devel -%endif -BuildRequires: zlib-devel - -%if %{use_gandiva} -BuildRequires: llvm-devel -BuildRequires: ncurses-devel -%endif - -BuildRequires: gobject-introspection-devel -BuildRequires: gtk-doc - -%description -Apache Arrow is a data processing library for analysis. - -%prep -%setup -q -n apache-@PACKAGE@-%{version} - -%build -cpp_build_type=release -mkdir cpp/build -cd cpp/build -%cmake3 .. \ -%if %{use_flight} - -DARROW_FLIGHT=ON \ -%endif -%if %{use_gandiva} - -DARROW_GANDIVA=ON \ -%endif -%if %{use_mimalloc} - -DARROW_MIMALLOC=ON \ -%endif - -DARROW_ORC=ON \ - -DARROW_PARQUET=ON \ - -DARROW_PLASMA=ON \ - -DARROW_PYTHON=ON \ -%if %{use_s3} - -DARROW_S3=ON \ -%endif - -DARROW_WITH_BROTLI=ON \ - -DARROW_WITH_BZ2=ON \ - -DARROW_WITH_LZ4=ON \ - -DARROW_WITH_SNAPPY=ON \ - -DARROW_WITH_ZLIB=ON \ - -DARROW_WITH_ZSTD=ON \ - -DCMAKE_BUILD_TYPE=$cpp_build_type \ - -DARROW_USE_CCACHE=OFF \ - -DCMAKE_UNITY_BUILD=ON \ - -DPARQUET_REQUIRE_ENCRYPTION=ON \ - -DPythonInterp_FIND_VERSION=ON \ - -DPythonInterp_FIND_VERSION_MAJOR=3 \ -%if %{use_ninja} - -GNinja -%endif - -%if %{use_ninja} -ninja %{?_smp_mflags} -%else -make %{?_smp_mflags} -%endif -cd - - -cd c_glib -pip3 install meson -meson setup build \ - --default-library=both \ - --libdir=%{_libdir} \ - --prefix=%{_prefix} \ - -Darrow_cpp_build_dir=../cpp/build \ - -Darrow_cpp_build_type=$cpp_build_type \ - -Dgtk_doc=true -LD_LIBRARY_PATH=$PWD/../cpp/build/$cpp_build_type ninja -C build %{?_smp_mflags} -cd - - -%install -cpp_build_type=release - -cd c_glib -DESTDIR=$RPM_BUILD_ROOT ninja -C build install -ninja -C build clean -cd - - -cd cpp/build -%if %{use_ninja} -DESTDIR=$RPM_BUILD_ROOT ninja install -ninja clean -%else -make install DESTDIR=$RPM_BUILD_ROOT -make clean -%endif -cd - - -%package libs -Summary: Runtime libraries for Apache Arrow C++ -License: Apache-2.0 -Requires: boost%{boost_version}-system -Requires: boost%{boost_version}-filesystem -Requires: brotli -Requires: gflags -Requires: glog -Requires: libzstd -Requires: lz4 -%if %{have_re2} -Requires: re2 -%endif -Requires: snappy -%if %{have_utf8proc} -Requires: utf8proc -%endif -Requires: zlib - -%description libs -This package contains the libraries for Apache Arrow C++. 
- -%files libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_docdir}/arrow/ -%{_libdir}/libarrow.so.* - -%package devel -Summary: Libraries and header files for Apache Arrow C++ -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} -Requires: brotli-devel -Requires: bzip2-devel -Requires: libzstd-devel -Requires: lz4-devel -Requires: openssl-devel -%if %{have_rapidjson} -Requires: rapidjson-devel -%endif -%if %{have_re2} -Requires: re2-devel -%endif -Requires: snappy-devel -%if %{have_utf8proc} -Requires: utf8proc-devel -%endif -Requires: zlib-devel - -%description devel -Libraries and header files for Apache Arrow C++. - -%files devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow/ -%exclude %{_includedir}/arrow/dataset/ -%if %{use_flight} -%exclude %{_includedir}/arrow/flight/ -%endif -%exclude %{_includedir}/arrow/python/ -%{_libdir}/cmake/arrow/ArrowConfig*.cmake -%{_libdir}/cmake/arrow/ArrowOptions.cmake -%{_libdir}/cmake/arrow/ArrowTargets*.cmake -%{_libdir}/cmake/arrow/FindArrow.cmake -%{_libdir}/cmake/arrow/FindBrotli.cmake -%{_libdir}/cmake/arrow/FindLz4.cmake -%{_libdir}/cmake/arrow/FindSnappy.cmake -%if %{have_re2} -%{_libdir}/cmake/arrow/Findre2Alt.cmake -%endif -%if %{have_utf8proc} -%{_libdir}/cmake/arrow/Findutf8proc.cmake -%endif -%{_libdir}/cmake/arrow/Findzstd.cmake -%{_libdir}/cmake/arrow/arrow-config.cmake -%{_libdir}/libarrow.a -%{_libdir}/libarrow.so -%{_libdir}/libarrow_bundled_dependencies.a -%{_libdir}/pkgconfig/arrow-compute.pc -%{_libdir}/pkgconfig/arrow-csv.pc -%{_libdir}/pkgconfig/arrow-filesystem.pc -%{_libdir}/pkgconfig/arrow-json.pc -%{_libdir}/pkgconfig/arrow-orc.pc -%{_libdir}/pkgconfig/arrow.pc - -%package dataset-libs -Summary: C++ library to read and write semantic datasets stored in different locations and formats -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} - -%description dataset-libs -This package contains the libraries for Apache Arrow dataset. - -%files dataset-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow_dataset.so.* - -%package dataset-devel -Summary: Libraries and header files for Apache Arrow dataset. -License: Apache-2.0 -Requires: %{name}-dataset-libs = %{version}-%{release} - -%description dataset-devel -Libraries and header files for Apache Arrow dataset. - -%files dataset-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow/dataset/ -%{_libdir}/cmake/arrow/ArrowDatasetConfig*.cmake -%{_libdir}/cmake/arrow/ArrowDatasetTargets*.cmake -%{_libdir}/cmake/arrow/FindArrowDataset.cmake -%{_libdir}/libarrow_dataset.a -%{_libdir}/libarrow_dataset.so -%{_libdir}/pkgconfig/arrow-dataset.pc - -%if %{use_flight} -%package flight-libs -Summary: C++ library for fast data transport. -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} -Requires: openssl - -%description flight-libs -This package contains the libraries for Apache Arrow Flight. - -%files flight-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow_flight.so.* - -%package flight-devel -Summary: Libraries and header files for Apache Arrow Flight. -License: Apache-2.0 -Requires: %{name}-flight-libs = %{version}-%{release} - -%description flight-devel -Libraries and header files for Apache Arrow Flight. 
- -%files flight-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow/flight/ -%{_libdir}/cmake/arrow/ArrowFlightConfig*.cmake -%{_libdir}/cmake/arrow/ArrowFlightTargets*.cmake -%{_libdir}/cmake/arrow/FindArrowFlight.cmake -%{_libdir}/libarrow_flight.a -%{_libdir}/libarrow_flight.so -%{_libdir}/pkgconfig/arrow-flight.pc -%endif - -%if %{use_gandiva} -%package -n gandiva-libs -Summary: C++ library for compiling and evaluating expressions on Apache Arrow data. -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} -Requires: ncurses-libs - -%description -n gandiva-libs -This package contains the libraries for Gandiva. - -%files -n gandiva-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libgandiva.so.* - -%package -n gandiva-devel -Summary: Libraries and header files for Gandiva. -License: Apache-2.0 -Requires: gandiva-libs = %{version}-%{release} -Requires: llvm-devel - -%description -n gandiva-devel -Libraries and header files for Gandiva. - -%files -n gandiva-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/gandiva/ -%{_libdir}/cmake/arrow/GandivaConfig*.cmake -%{_libdir}/cmake/arrow/GandivaTargets*.cmake -%{_libdir}/cmake/arrow/FindGandiva.cmake -%{_libdir}/libgandiva.a -%{_libdir}/libgandiva.so -%{_libdir}/pkgconfig/gandiva.pc -%endif - -%package python-libs -Summary: Python integration library for Apache Arrow -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} -Requires: python%{python_version}-numpy - -%description python-libs -This package contains the Python integration library for Apache Arrow. - -%files python-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow_python.so.* - -%package python-devel -Summary: Libraries and header files for Python integration library for Apache Arrow -License: Apache-2.0 -Requires: %{name}-devel = %{version}-%{release} -Requires: %{name}-libs = %{version}-%{release} -Requires: python%{python_version}-devel - -%description python-devel -Libraries and header files for Python integration library for Apache Arrow. - -%files python-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow/python/ -%exclude %{_includedir}/arrow/python/flight.h -%{_libdir}/cmake/arrow/ArrowPythonConfig*.cmake -%{_libdir}/cmake/arrow/ArrowPythonTargets*.cmake -%{_libdir}/cmake/arrow/FindArrowPython.cmake -%{_libdir}/libarrow_python.a -%{_libdir}/libarrow_python.so -%{_libdir}/pkgconfig/arrow-python.pc - -%if %{use_flight} -%package python-flight-libs -Summary: Python integration library for Apache Arrow Flight -License: Apache-2.0 -Requires: %{name}-flight-libs = %{version}-%{release} -Requires: %{name}-python-libs = %{version}-%{release} - -%description python-flight-libs -This package contains the Python integration library for Apache Arrow Flight. - -%files python-flight-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow_python_flight.so.* - -%package python-flight-devel -Summary: Libraries and header files for Python integration library for Apache Arrow Flight. -License: Apache-2.0 -Requires: %{name}-flight-devel = %{version}-%{release} -Requires: %{name}-python-devel = %{version}-%{release} -Requires: %{name}-python-flight-libs = %{version}-%{release} - -%description python-flight-devel -Libraries and header files for Python integration library for -Apache Arrow Flight. 
- -%files python-flight-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow/python/flight.h -%{_libdir}/cmake/arrow/ArrowPythonFlightConfig*.cmake -%{_libdir}/cmake/arrow/ArrowPythonFlightTargets*.cmake -%{_libdir}/cmake/arrow/FindArrowPythonFlight.cmake -%{_libdir}/libarrow_python_flight.a -%{_libdir}/libarrow_python_flight.so -%{_libdir}/pkgconfig/arrow-python-flight.pc -%endif - -%package -n plasma-libs -Summary: Runtime libraries for Plasma in-memory object store -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} - -%description -n plasma-libs -This package contains the libraries for Plasma in-memory object store. - -%files -n plasma-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libplasma.so.* - -%package -n plasma-store-server -Summary: Server for Plasma in-memory object store -License: Apache-2.0 -Requires: plasma-libs = %{version}-%{release} - -%description -n plasma-store-server -This package contains the server for Plasma in-memory object store. - -%files -n plasma-store-server -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_bindir}/plasma-store-server - -%package -n plasma-devel -Summary: Libraries and header files for Plasma in-memory object store -License: Apache-2.0 -Requires: plasma-libs = %{version}-%{release} - -%description -n plasma-devel -Libraries and header files for Plasma in-memory object store. - -%files -n plasma-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/plasma/ -%{_libdir}/cmake/arrow/PlasmaConfig*.cmake -%{_libdir}/cmake/arrow/PlasmaTargets*.cmake -%{_libdir}/cmake/arrow/FindPlasma.cmake -%{_libdir}/libplasma.a -%{_libdir}/libplasma.so -%{_libdir}/pkgconfig/plasma*.pc - -%package -n parquet-libs -Summary: Runtime libraries for Apache Parquet C++ -License: Apache-2.0 -Requires: boost%{boost_version}-program-options -Requires: %{name}-libs = %{version}-%{release} -Requires: openssl - -%description -n parquet-libs -This package contains the libraries for Apache Parquet C++. - -%files -n parquet-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libparquet.so.* - -%package -n parquet-devel -Summary: Libraries and header files for Apache Parquet C++ -License: Apache-2.0 -Requires: parquet-libs = %{version}-%{release} -Requires: zlib-devel - -%description -n parquet-devel -Libraries and header files for Apache Parquet C++. - -%files -n parquet-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/parquet/ -%{_libdir}/cmake/arrow/ParquetConfig*.cmake -%{_libdir}/cmake/arrow/ParquetTargets*.cmake -%{_libdir}/cmake/arrow/FindParquet.cmake -%{_libdir}/libparquet.a -%{_libdir}/libparquet.so -%{_libdir}/pkgconfig/parquet*.pc - -%package glib-libs -Summary: Runtime libraries for Apache Arrow GLib -License: Apache-2.0 -Requires: %{name}-libs = %{version}-%{release} -Requires: glib2 - -%description glib-libs -This package contains the libraries for Apache Arrow GLib. - -%files glib-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow-glib.so.* -%{_datadir}/gir-1.0/Arrow-1.0.gir - -%package glib-devel -Summary: Libraries and header files for Apache Arrow GLib -License: Apache-2.0 -Requires: %{name}-devel = %{version}-%{release} -Requires: glib2-devel -Requires: gobject-introspection-devel - -%description glib-devel -Libraries and header files for Apache Arrow GLib. 
- -%files glib-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow-glib/ -%{_libdir}/libarrow-glib.a -%{_libdir}/libarrow-glib.so -%{_libdir}/pkgconfig/arrow-glib.pc -%{_libdir}/pkgconfig/arrow-orc-glib.pc -%{_libdir}/girepository-1.0/Arrow-1.0.typelib -%{_datadir}/arrow-glib/example/ - -%package glib-doc -Summary: Documentation for Apache Arrow GLib -License: Apache-2.0 - -%description glib-doc -Documentation for Apache Arrow GLib. - -%files glib-doc -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_docdir}/arrow-glib/ -%{_datadir}/gtk-doc/html/arrow-glib/ - -%package dataset-glib-libs -Summary: Runtime libraries for Apache Arrow dataset GLib -License: Apache-2.0 -Requires: %{name}-dataset-libs = %{version}-%{release} -Requires: %{name}-glib-libs = %{version}-%{release} - -%description dataset-glib-libs -This package contains the libraries for Apache Arrow dataset GLib. - -%files dataset-glib-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libarrow-dataset-glib.so.* -%{_datadir}/gir-1.0/ArrowDataset-1.0.gir - -%package dataset-glib-devel -Summary: Libraries and header files for Apache Arrow dataset GLib -License: Apache-2.0 -Requires: %{name}-dataset-devel = %{version}-%{release} -Requires: %{name}-glib-devel = %{version}-%{release} - -%description dataset-glib-devel -Libraries and header files for Apache Arrow dataset GLib. - -%files dataset-glib-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/arrow-dataset-glib/ -%{_libdir}/libarrow-dataset-glib.a -%{_libdir}/libarrow-dataset-glib.so -%{_libdir}/pkgconfig/arrow-dataset-glib.pc -%{_libdir}/girepository-1.0/ArrowDataset-1.0.typelib - -%package dataset-glib-doc -Summary: Documentation for Apache Arrow dataset GLib -License: Apache-2.0 - -%description dataset-glib-doc -Documentation for Apache Arrow dataset GLib. - -%files dataset-glib-doc -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_datadir}/gtk-doc/html/arrow-dataset-glib/ - -%if %{use_gandiva} -%package -n gandiva-glib-libs -Summary: Runtime libraries for Gandiva GLib -License: Apache-2.0 -Requires: gandiva-libs = %{version}-%{release} -Requires: %{name}-glib-libs = %{version}-%{release} - -%description -n gandiva-glib-libs -This package contains the libraries for Gandiva GLib. - -%files -n gandiva-glib-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libgandiva-glib.so.* -%{_datadir}/gir-1.0/Gandiva-1.0.gir - -%package -n gandiva-glib-devel -Summary: Libraries and header files for Gandiva GLib -License: Apache-2.0 -Requires: gandiva-devel = %{version}-%{release} -Requires: %{name}-glib-devel = %{version}-%{release} - -%description -n gandiva-glib-devel -Libraries and header files for Gandiva GLib. - -%files -n gandiva-glib-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/gandiva-glib/ -%{_libdir}/libgandiva-glib.a -%{_libdir}/libgandiva-glib.so -%{_libdir}/pkgconfig/gandiva-glib.pc -%{_libdir}/girepository-1.0/Gandiva-1.0.typelib - -%package -n gandiva-glib-doc -Summary: Documentation for Gandiva GLib -License: Apache-2.0 - -%description -n gandiva-glib-doc -Documentation for Gandiva GLib. 
- -%files -n gandiva-glib-doc -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_datadir}/gtk-doc/html/gandiva-glib/ -%endif - -%package -n plasma-glib-libs -Summary: Runtime libraries for Plasma GLib -License: Apache-2.0 -Requires: plasma-libs = %{version}-%{release} -Requires: %{name}-glib-libs = %{version}-%{release} - -%description -n plasma-glib-libs -This package contains the libraries for Plasma GLib. - -%files -n plasma-glib-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libplasma-glib.so.* -%{_datadir}/gir-1.0/Plasma-1.0.gir - -%package -n plasma-glib-devel -Summary: Libraries and header files for Plasma GLib -License: Apache-2.0 -Requires: plasma-devel = %{version}-%{release} -Requires: %{name}-glib-devel = %{version}-%{release} - -%description -n plasma-glib-devel -Libraries and header files for Plasma GLib. - -%files -n plasma-glib-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/plasma-glib/ -%{_libdir}/libplasma-glib.a -%{_libdir}/libplasma-glib.so -%{_libdir}/pkgconfig/plasma-glib.pc -%{_libdir}/girepository-1.0/Plasma-1.0.typelib - -%package -n plasma-glib-doc -Summary: Documentation for Plasma GLib -License: Apache-2.0 - -%description -n plasma-glib-doc -Documentation for Plasma GLib. - -%files -n plasma-glib-doc -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_datadir}/gtk-doc/html/plasma-glib/ - -%package -n parquet-glib-libs -Summary: Runtime libraries for Apache Parquet GLib -License: Apache-2.0 -Requires: parquet-libs = %{version}-%{release} -Requires: %{name}-glib-libs = %{version}-%{release} - -%description -n parquet-glib-libs -This package contains the libraries for Apache Parquet GLib. - -%files -n parquet-glib-libs -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_libdir}/libparquet-glib.so.* -%{_datadir}/gir-1.0/Parquet-1.0.gir - -%package -n parquet-glib-devel -Summary: Libraries and header files for Apache Parquet GLib -License: Apache-2.0 -Requires: parquet-devel = %{version}-%{release} -Requires: %{name}-glib-devel = %{version}-%{release} - -%description -n parquet-glib-devel -Libraries and header files for Apache Parquet GLib. - -%files -n parquet-glib-devel -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_includedir}/parquet-glib/ -%{_libdir}/libparquet-glib.a -%{_libdir}/libparquet-glib.so -%{_libdir}/pkgconfig/parquet-glib.pc -%{_libdir}/girepository-1.0/Parquet-1.0.typelib - -%package -n parquet-glib-doc -Summary: Documentation for Apache Parquet GLib -License: Apache-2.0 - -%description -n parquet-glib-doc -Documentation for Apache Parquet GLib. - -%files -n parquet-glib-doc -%defattr(-,root,root,-) -%doc README.md LICENSE.txt NOTICE.txt -%{_datadir}/gtk-doc/html/parquet-glib/ - -%changelog -* Mon Jan 18 2021 Krisztián Szűcs - 3.0.0-1 -- New upstream release. - -* Mon Oct 12 2020 Krisztián Szűcs - 2.0.0-1 -- New upstream release. - -* Mon Jul 20 2020 Krisztián Szűcs - 1.0.0-1 -- New upstream release. - -* Thu Apr 16 2020 Krisztián Szűcs - 0.17.0-1 -- New upstream release. - -* Thu Jan 30 2020 Krisztián Szűcs - 0.16.0-1 -- New upstream release. - -* Mon Sep 30 2019 Krisztián Szűcs - 0.15.0-1 -- New upstream release. - -* Fri Jun 28 2019 Sutou Kouhei - 0.14.0-1 -- New upstream release. - -* Thu Mar 28 2019 Kouhei Sutou - 0.13.0-1 -- New upstream release. - -* Wed Jan 16 2019 Krisztián Szűcs - 0.12.0-1 -- New upstream release. - -* Thu Oct 04 2018 Kouhei Sutou - 0.11.0-1 -- New upstream release. 
- -* Thu Aug 02 2018 Phillip Cloud - 0.10.0-1 -- New upstream release. - -* Fri Mar 16 2018 Kouhei Sutou - 0.9.0-1 -- New upstream release. - -* Sun Dec 17 2017 Uwe Korn - 0.8.0-1 -- New upstream release. - -* Wed Sep 27 2017 Kouhei Sutou - 0.7.1-1 -- New upstream release. - -* Tue Sep 12 2017 Wes McKinney - 0.7.0-1 -- New upstream release. - -* Fri Aug 11 2017 Kouhei Sutou - 0.6.0-1 -- New upstream release. - -* Wed Aug 02 2017 Kouhei Sutou - 0.6.0.20170802-1 -- New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-7/Dockerfile b/dev/tasks/linux-packages/apache-arrow/yum/centos-7/Dockerfile deleted file mode 100644 index 8c6c9d66d2554..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/centos-7/Dockerfile +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=centos:7 -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ - yum update -y ${quiet} && \ - yum install -y ${quiet} epel-release && \ - yum groupinstall -y ${quiet} "Development Tools" && \ - yum install -y ${quiet} \ - bison \ - boost169-devel \ - brotli-devel \ - bzip2-devel \ - ccache \ - cmake3 \ - flex \ - gflags-devel \ - git \ - glog-devel \ - gobject-introspection-devel \ - gtk-doc \ - libzstd-devel \ - lz4-devel \ - ninja-build \ - openssl-devel \ - pkg-config \ - python36 \ - python36-devel \ - python36-numpy \ - rapidjson-devel \ - rpm-build \ - rpmdevtools \ - snappy-devel \ - tar \ - utf8proc-devel \ - zlib-devel && \ - yum clean ${quiet} all - -ENV \ - BOOST_INCLUDEDIR=/usr/include/boost169 \ - BOOST_LIBRARYDIR=/usr/lib64/boost169 diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-7/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/yum/centos-7/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/centos-7/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-8-aarch64/from b/dev/tasks/linux-packages/apache-arrow/yum/centos-8-aarch64/from deleted file mode 100644 index 587ce9d4a32d0..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/centos-8-aarch64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/centos:8 diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-8/Dockerfile b/dev/tasks/linux-packages/apache-arrow/yum/centos-8/Dockerfile deleted file mode 100644 index 66c435c333d43..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/centos-8/Dockerfile +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -ARG FROM=centos:8 -FROM ${FROM} - -COPY qemu-* /usr/bin/ - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "--quiet") && \ - dnf install -y ${quiet} epel-release && \ - dnf install --enablerepo=powertools -y ${quiet} \ - bison \ - boost-devel \ - brotli-devel \ - bzip2-devel \ - ccache \ - clang \ - cmake \ - curl-devel \ - flex \ - gcc-c++ \ - gflags-devel \ - git \ - glog-devel \ - gobject-introspection-devel \ - gtk-doc \ - libzstd-devel \ - llvm-devel \ - llvm-static \ - lz4-devel \ - make \ - ncurses-devel \ - ninja-build \ - openssl-devel \ - pkg-config \ - python3 \ - python3-devel \ - python3-numpy \ - python3-pip \ - re2-devel \ - # rapidjson-devel \ - rpm-build \ - rpmdevtools \ - snappy-devel \ - tar \ - # utf8proc-devel \ - zlib-devel && \ - dnf clean ${quiet} all diff --git a/dev/tasks/linux-packages/apache-arrow/yum/centos-8/qemu-dummy-static b/dev/tasks/linux-packages/apache-arrow/yum/centos-8/qemu-dummy-static deleted file mode 100755 index c42e0962def31..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/yum/centos-8/qemu-dummy-static +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Do nothing. This exists only for not requiring qemu-aarch64-static copy. -# Recent Debian (buster or later) and Ubuntu (18.10 or later) on amd64 hosts or -# arm64 host don't require qemu-aarch64-static in Docker image. But old Debian -# and Ubuntu hosts on amd64 require qemu-aarch64-static in Docker image. -# -# We use "COPY qemu* /usr/bin/" in Dockerfile. If we don't put any "qemnu*", -# the "COPY" is failed. It means that we always require "qemu*" even if we -# use recent Debian/Ubuntu or arm64 host. If we have this dummy "qemu*" file, -# the "COPY" isn't failed. It means that we can copy "qemu*" only when we -# need. -# -# See also "script" in dev/tasks/linux-packages/azure.linux.arm64.yml. -# Azure Pipelines uses old Ubuntu (18.04). -# So we need to put "qemu-aarch64-static" into this directory. diff --git a/dev/tasks/linux-packages/apt/build.sh b/dev/tasks/linux-packages/apt/build.sh deleted file mode 100755 index 73538e7e8d2df..0000000000000 --- a/dev/tasks/linux-packages/apt/build.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -# -*- sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -LANG=C - -set -u - -run() -{ - "$@" - if test $? -ne 0; then - echo "Failed $@" - exit 1 - fi -} - -. /host/env.sh - -distribution=$(lsb_release --id --short | tr 'A-Z' 'a-z') -code_name=$(lsb_release --codename --short) -case "${distribution}" in - debian) - component=main - ;; - ubuntu) - component=universe - ;; -esac -architecture=$(dpkg-architecture -q DEB_BUILD_ARCH) - -debuild_options=() -dpkg_buildpackage_options=(-us -uc) - -run mkdir -p /build -run cd /build -find . -not -path ./ccache -a -not -path "./ccache/*" -delete -if which ccache > /dev/null 2>&1; then - export CCACHE_COMPILERCHECK=content - export CCACHE_COMPRESS=1 - export CCACHE_COMPRESSLEVEL=6 - export CCACHE_DIR="${PWD}/ccache" - export CCACHE_MAXSIZE=500M - ccache --show-stats - debuild_options+=(-eCCACHE_COMPILERCHECK) - debuild_options+=(-eCCACHE_COMPRESS) - debuild_options+=(-eCCACHE_COMPRESSLEVEL) - debuild_options+=(-eCCACHE_DIR) - debuild_options+=(-eCCACHE_MAXSIZE) - if [ -d /usr/lib/ccache ] ;then - debuild_options+=(--prepend-path=/usr/lib/ccache) - fi -fi -run cp /host/tmp/${PACKAGE}-${VERSION}.tar.gz \ - ${PACKAGE}_${VERSION}.orig.tar.gz -run tar xfz ${PACKAGE}_${VERSION}.orig.tar.gz -case "${VERSION}" in - *~dev*) - run mv ${PACKAGE}-$(echo $VERSION | sed -e 's/~dev/-dev/') \ - ${PACKAGE}-${VERSION} - ;; - *~rc*) - run mv ${PACKAGE}-$(echo $VERSION | sed -r -e 's/~rc[0-9]+//') \ - ${PACKAGE}-${VERSION} - ;; -esac -run cd ${PACKAGE}-${VERSION}/ -platform="${distribution}-${code_name}" -if [ -d "/host/tmp/debian.${platform}-${architecture}" ]; then - run cp -rp "/host/tmp/debian.${platform}-${architecture}" debian -elif [ -d "/host/tmp/debian.${platform}" ]; then - run cp -rp "/host/tmp/debian.${platform}" debian -else - run cp -rp "/host/tmp/debian" debian -fi -: ${DEB_BUILD_OPTIONS:="parallel=$(nproc)"} -# DEB_BUILD_OPTIONS="${DEB_BUILD_OPTIONS} noopt" -export DEB_BUILD_OPTIONS -if [ "${DEBUG:-no}" = "yes" ]; then - run debuild "${debuild_options[@]}" "${dpkg_buildpackage_options[@]}" -else - run debuild "${debuild_options[@]}" "${dpkg_buildpackage_options[@]}" > /dev/null -fi -if which ccache > /dev/null 2>&1; then - ccache --show-stats -fi -run cd - - -repositories="/host/repositories" -package_initial=$(echo "${PACKAGE}" | sed -e 's/\(.\).*/\1/') -pool_dir="${repositories}/${distribution}/pool/${code_name}/${component}/${package_initial}/${PACKAGE}" -run mkdir -p "${pool_dir}/" -run cp \ - *.*deb \ - *.dsc \ - *.tar.* \ - "${pool_dir}/" - -run chown -R "$(stat --format "%u:%g" "${repositories}")" "${repositories}" diff --git a/dev/tasks/linux-packages/github.linux.amd64.yml b/dev/tasks/linux-packages/github.linux.amd64.yml deleted file mode 100644 index 4fa056c18c70c..0000000000000 --- a/dev/tasks/linux-packages/github.linux.amd64.yml +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - package: - name: Package - runs-on: ubuntu-18.04 - steps: - {{ macros.github_checkout_arrow()|indent }} - {{ macros.github_login_dockerhub()|indent }} - - - name: Set up Ruby - uses: ruby/setup-ruby@v1 - with: - ruby-version: '2.6' - - name: Free Up Disk Space - shell: bash - run: arrow/ci/scripts/util_cleanup.sh - - name: Cache ccache - uses: actions/cache@v2 - with: - path: arrow/dev/tasks/linux-packages/apache-arrow/{{ task_namespace }}/build/{{ target }}/ccache - key: linux-{{ task_namespace }}-ccache-{{ target }}-{{ "${{ hashFiles('arrow/cpp/**') }}" }} - restore-keys: linux-{{ task_namespace }}-ccache-{{ target }}- - - name: Build - run: | - set -e - pushd arrow/dev/tasks/linux-packages - rake version:update - rake docker:pull || : - rake --trace {{ task_namespace }}:build BUILD_DIR=build - sudo rm -rf */*/build - popd - env: - APT_TARGETS: {{ target }} - ARROW_VERSION: {{ arrow.version }} - REPO: {{ '${{ secrets.REPO }}' }} - YUM_TARGETS: {{ target }} - - name: Docker Push - continue-on-error: true - shell: bash - run: | - pushd arrow/dev/tasks/linux-packages - rake docker:push - popd - env: - APT_TARGETS: {{ target }} - REPO: {{ '${{ secrets.REPO }}' }} - YUM_TARGETS: {{ target }} - - name: Set up test - run: | - set -e - sudo apt update - sudo apt install -y \ - apt-utils \ - createrepo \ - devscripts \ - gpg \ - rpm - (echo "Key-Type: RSA"; \ - echo "Key-Length: 4096"; \ - echo "Name-Real: Test"; \ - echo "Name-Email: test@example.com"; \ - echo "%no-protection") | \ - gpg --full-generate-key --batch - GPG_KEY_ID=$(gpg --list-keys --with-colon test@example.com | grep fpr | cut -d: -f10) - echo "GPG_KEY_ID=${GPG_KEY_ID}" >> ${GITHUB_ENV} - gpg --export --armor test@example.com > arrow/dev/tasks/linux-packages/KEYS - - name: Test - run: | - set -e - pushd arrow/dev/tasks/linux-packages - rake --trace {{ task_namespace }}:test - rm -rf {{ task_namespace }}/repositories - popd - env: - APT_TARGETS: {{ target }} - ARROW_VERSION: {{ arrow.version }} - YUM_TARGETS: {{ target }} - - {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/**/*{}") %} - {{ macros.github_upload_releases(patterns)|indent }} diff --git a/dev/tasks/linux-packages/helper.rb b/dev/tasks/linux-packages/helper.rb deleted file mode 100644 index 30ac3b8982fd0..0000000000000 --- a/dev/tasks/linux-packages/helper.rb +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Helper - module ApacheArrow - private - def detect_release_time - release_time_env = ENV["ARROW_RELEASE_TIME"] - if release_time_env - Time.parse(release_time_env).utc - else - latest_commit_time(arrow_source_dir) || Time.now.utc - end - end - - def arrow_source_dir - File.join(__dir__, "..", "..", "..") - end - - def detect_version(release_time) - version_env = ENV["ARROW_VERSION"] - return version_env if version_env - - pom_xml_path = File.join(arrow_source_dir, "java", "pom.xml") - pom_xml_content = File.read(pom_xml_path) - version = pom_xml_content[/^ (.+?)<\/version>/, 1] - formatted_release_time = release_time.strftime("%Y%m%d") - version.gsub(/-SNAPSHOT\z/) {"-dev#{formatted_release_time}"} - end - - def detect_env(name) - value = ENV[name] - return value if value and not value.empty? - - dot_env_path = File.join(arrow_source_dir, ".env") - File.open(dot_env_path) do |dot_env| - dot_env.each_line do |line| - case line.chomp - when /\A#{Regexp.escape(name)}=(.*)/ - return $1 - end - end - end - raise "Failed to detect #{name} environment variable" - end - - def detect_repo - detect_env("REPO") - end - - def docker_image(os, architecture) - architecture ||= "amd64" - "#{detect_repo}:#{architecture}-#{os}-package-#{@package}" - end - end -end diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb deleted file mode 100644 index 59f345935019a..0000000000000 --- a/dev/tasks/linux-packages/package-task.rb +++ /dev/null @@ -1,622 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -require "English" -require "open-uri" -require "time" - -class PackageTask - include Rake::DSL - - def initialize(package, version, release_time, options={}) - @package = package - @version = version - @release_time = release_time - - @archive_base_name = "#{@package}-#{@version}" - @archive_name = "#{@archive_base_name}.tar.gz" - @full_archive_name = File.expand_path(@archive_name) - - @rpm_package = @package - case @version - when /-((dev|rc)\d+)\z/ - base_version = $PREMATCH - sub_version = $1 - type = $2 - if type == "rc" and options[:rc_build_type] == :release - @deb_upstream_version = base_version - @deb_archive_base_name_version = base_version - @rpm_version = base_version - @rpm_release = "1" - else - @deb_upstream_version = "#{base_version}~#{sub_version}" - @deb_archive_base_name_version = @version - @rpm_version = base_version - @rpm_release = "0.#{sub_version}" - end - else - @deb_upstream_version = @version - @deb_archive_base_name_version = @version - @rpm_version = @version - @rpm_release = "1" - end - @deb_release = ENV["DEB_RELEASE"] || "1" - end - - def define - define_dist_task - define_apt_task - define_yum_task - define_version_task - define_docker_tasks - end - - private - def env_value(name) - value = ENV[name] - raise "Specify #{name} environment variable" if value.nil? - value - end - - def debug_build? - ENV["DEBUG"] != "no" - end - - def git_directory?(directory) - candidate_paths = [".git", "HEAD"] - candidate_paths.any? do |candidate_path| - File.exist?(File.join(directory, candidate_path)) - end - end - - def latest_commit_time(git_directory) - return nil unless git_directory?(git_directory) - cd(git_directory) do - return Time.iso8601(`git log -n 1 --format=%aI`.chomp).utc - end - end - - def download(url, output_path) - if File.directory?(output_path) - base_name = url.split("/").last - output_path = File.join(output_path, base_name) - end - absolute_output_path = File.expand_path(output_path) - - unless File.exist?(absolute_output_path) - mkdir_p(File.dirname(absolute_output_path)) - rake_output_message "Downloading... #{url}" - URI(url).open do |downloaded_file| - File.open(absolute_output_path, "wb") do |output_file| - output_file.print(downloaded_file.read) - end - end - end - - absolute_output_path - end - - def substitute_content(content) - content.gsub(/@(.+?)@/) do |matched| - yield($1, matched) - end - end - - def docker_image(os, architecture) - image = "#{@package}-#{os}" - image << "-#{architecture}" if architecture - image - end - - def docker_run(os, architecture, console: false) - id = os - id = "#{id}-#{architecture}" if architecture - image = docker_image(os, architecture) - build_command_line = [ - "docker", - "build", - "--cache-from", image, - "--tag", image, - ] - run_command_line = [ - "docker", - "run", - "--rm", - "--log-driver", "none", - "--volume", "#{Dir.pwd}:/host:rw", - ] - if $stdin.tty? - run_command_line << "--interactive" - run_command_line << "--tty" - else - run_command_line.concat(["--attach", "STDOUT"]) - run_command_line.concat(["--attach", "STDERR"]) - end - build_dir = ENV["BUILD_DIR"] - if build_dir - build_dir = "#{File.expand_path(build_dir)}/#{id}" - mkdir_p(build_dir) - run_command_line.concat(["--volume", "#{build_dir}:/build:rw"]) - end - if debug_build? 
- build_command_line.concat(["--build-arg", "DEBUG=yes"]) - run_command_line.concat(["--env", "DEBUG=yes"]) - end - pass_through_env_names = [ - "DEB_BUILD_OPTIONS", - "RPM_BUILD_NCPUS", - ] - pass_through_env_names.each do |name| - value = ENV[name] - next unless value - run_command_line.concat(["--env", "#{name}=#{value}"]) - end - if File.exist?(File.join(id, "Dockerfile")) - docker_context = id - else - from = File.readlines(File.join(id, "from")).find do |line| - /^[a-z]/i =~ line - end - build_command_line.concat(["--build-arg", "FROM=#{from.chomp}"]) - docker_context = os - end - build_command_line.concat(docker_build_options(os, architecture)) - run_command_line.concat(docker_run_options(os, architecture)) - build_command_line << docker_context - run_command_line << image - run_command_line << "/host/build.sh" unless console - - sh(*build_command_line) - sh(*run_command_line) - end - - def docker_build_options(os, architecture) - [] - end - - def docker_run_options(os, architecture) - [] - end - - def docker_pull(os, architecture) - image = docker_image(os, architecture) - command_line = [ - "docker", - "pull", - image, - ] - command_line.concat(docker_pull_options(os, architecture)) - sh(*command_line) - end - - def docker_pull_options(os, architecture) - [] - end - - def docker_push(os, architecture) - image = docker_image(os, architecture) - command_line = [ - "docker", - "push", - image, - ] - command_line.concat(docker_push_options(os, architecture)) - sh(*command_line) - end - - def docker_push_options(os, architecture) - [] - end - - def define_dist_task - define_archive_task - desc "Create release package" - task :dist => [@archive_name] - end - - def enable_apt? - true - end - - def apt_targets - return [] unless enable_apt? - - targets = (ENV["APT_TARGETS"] || "").split(",") - targets = apt_targets_default if targets.empty? - - targets.find_all do |target| - Dir.exist?(File.join(apt_dir, target)) - end - end - - def apt_targets_default - # Disable arm64 targets by default for now - # because they require some setups on host. 
- [ - "debian-buster", - # "debian-buster-arm64", - "debian-bullseye", - # "debian-bullseye-arm64", - "ubuntu-bionic", - # "ubuntu-bionic-arm64", - "ubuntu-focal", - # "ubuntu-focal-arm64", - "ubuntu-groovy", - # "ubuntu-groovy-arm64", - ] - end - - def deb_archive_base_name - "#{@package}-#{@deb_archive_base_name_version}" - end - - def deb_archive_name - "#{@package}-#{@deb_upstream_version}.tar.gz" - end - - def apt_dir - "apt" - end - - def apt_prepare_debian_dir(tmp_dir, target) - source_debian_dir = nil - specific_debian_dir = "debian.#{target}" - distribution, code_name, _architecture = target.split("-", 3) - platform = [distribution, code_name].join("-") - platform_debian_dir = "debian.#{platform}" - if File.exist?(specific_debian_dir) - source_debian_dir = specific_debian_dir - elsif File.exist?(platform_debian_dir) - source_debian_dir = platform_debian_dir - else - source_debian_dir = "debian" - end - - prepared_debian_dir = "#{tmp_dir}/debian.#{target}" - cp_r(source_debian_dir, prepared_debian_dir) - control_in_path = "#{prepared_debian_dir}/control.in" - if File.exist?(control_in_path) - control_in = File.read(control_in_path) - rm_f(control_in_path) - File.open("#{prepared_debian_dir}/control", "w") do |control| - prepared_control = apt_prepare_debian_control(control_in, target) - control.print(prepared_control) - end - end - end - - def apt_prepare_debian_control(control_in, target) - message = "#{__method__} must be defined to use debian/control.in" - raise NotImplementedError, message - end - - def apt_build(console: false) - tmp_dir = "#{apt_dir}/tmp" - rm_rf(tmp_dir) - mkdir_p(tmp_dir) - cp(deb_archive_name, - File.join(tmp_dir, deb_archive_name)) - apt_targets.each do |target| - apt_prepare_debian_dir(tmp_dir, target) - end - - env_sh = "#{apt_dir}/env.sh" - File.open(env_sh, "w") do |file| - file.puts(<<-ENV) -PACKAGE=#{@package} -VERSION=#{@deb_upstream_version} - ENV - end - - apt_targets.each do |target| - cd(apt_dir) do - distribution, version, architecture = target.split("-", 3) - os = "#{distribution}-#{version}" - docker_run(os, architecture, console: console) - end - end - end - - def define_apt_task - namespace :apt do - source_build_sh = "#{__dir__}/apt/build.sh" - build_sh = "#{apt_dir}/build.sh" - repositories_dir = "#{apt_dir}/repositories" - - file build_sh => source_build_sh do - cp(source_build_sh, build_sh) - end - - directory repositories_dir - - desc "Build deb packages" - if enable_apt? - build_dependencies = [ - deb_archive_name, - build_sh, - repositories_dir, - ] - else - build_dependencies = [] - end - task :build => build_dependencies do - apt_build if enable_apt? - end - - namespace :build do - desc "Open console" - task :console => build_dependencies do - apt_build(console: true) if enable_apt? - end - end - end - - desc "Release APT repositories" - apt_tasks = [ - "apt:build", - ] - task :apt => apt_tasks - end - - def enable_yum? - true - end - - def yum_targets - return [] unless enable_yum? - - targets = (ENV["YUM_TARGETS"] || "").split(",") - targets = yum_targets_default if targets.empty? - - targets.find_all do |target| - Dir.exist?(File.join(yum_dir, target)) - end - end - - def yum_targets_default - # Disable aarch64 targets by default for now - # because they require some setups on host. 
- [ - "centos-7", - # "centos-7-aarch64", - "centos-8", - # "centos-8-aarch64", - ] - end - - def rpm_archive_base_name - "#{@package}-#{@rpm_version}" - end - - def rpm_archive_name - "#{rpm_archive_base_name}.tar.gz" - end - - def yum_dir - "yum" - end - - def yum_build_sh - "#{yum_dir}/build.sh" - end - - def yum_expand_variable(key) - case key - when "PACKAGE" - @rpm_package - when "VERSION" - @rpm_version - when "RELEASE" - @rpm_release - else - nil - end - end - - def yum_spec_in_path - "#{yum_dir}/#{@rpm_package}.spec.in" - end - - def yum_build(console: false) - tmp_dir = "#{yum_dir}/tmp" - rm_rf(tmp_dir) - mkdir_p(tmp_dir) - cp(rpm_archive_name, - File.join(tmp_dir, rpm_archive_name)) - - env_sh = "#{yum_dir}/env.sh" - File.open(env_sh, "w") do |file| - file.puts(<<-ENV) -SOURCE_ARCHIVE=#{rpm_archive_name} -PACKAGE=#{@rpm_package} -VERSION=#{@rpm_version} -RELEASE=#{@rpm_release} - ENV - end - - spec = "#{tmp_dir}/#{@rpm_package}.spec" - spec_in_data = File.read(yum_spec_in_path) - spec_data = substitute_content(spec_in_data) do |key, matched| - yum_expand_variable(key) || matched - end - File.open(spec, "w") do |spec_file| - spec_file.print(spec_data) - end - - yum_targets.each do |target| - cd(yum_dir) do - distribution, version, architecture = target.split("-", 3) - os = "#{distribution}-#{version}" - docker_run(os, architecture, console: console) - end - end - end - - def define_yum_task - namespace :yum do - source_build_sh = "#{__dir__}/yum/build.sh" - file yum_build_sh => source_build_sh do - cp(source_build_sh, yum_build_sh) - end - - repositories_dir = "#{yum_dir}/repositories" - directory repositories_dir - - desc "Build RPM packages" - if enable_yum? - build_dependencies = [ - repositories_dir, - rpm_archive_name, - yum_build_sh, - yum_spec_in_path, - ] - else - build_dependencies = [] - end - task :build => build_dependencies do - yum_build if enable_yum? - end - - namespace :build do - desc "Open console" - task :console => build_dependencies do - yum_build(console: true) if enable_yum? - end - end - end - - desc "Release Yum repositories" - yum_tasks = [ - "yum:build", - ] - task :yum => yum_tasks - end - - def define_version_task - namespace :version do - desc "Update versions" - task :update do - update_debian_changelog - update_spec - end - end - end - - def package_changelog_message - "New upstream release." - end - - def packager_name - ENV["DEBFULLNAME"] || ENV["NAME"] || guess_packager_name_from_git - end - - def guess_packager_name_from_git - name = `git config --get user.name`.chomp - return name unless name.empty? - `git log -n 1 --format=%aN`.chomp - end - - def packager_email - ENV["DEBEMAIL"] || ENV["EMAIL"] || guess_packager_email_from_git - end - - def guess_packager_email_from_git - email = `git config --get user.email`.chomp - return email unless email.empty? - `git log -n 1 --format=%aE`.chomp - end - - def update_content(path) - if File.exist?(path) - content = File.read(path) - else - content = "" - end - content = yield(content) - File.open(path, "w") do |file| - file.puts(content) - end - end - - def update_debian_changelog - return unless enable_apt? - - Dir.glob("debian*") do |debian_dir| - update_content("#{debian_dir}/changelog") do |content| - <<-CHANGELOG.rstrip -#{@package} (#{@deb_upstream_version}-#{@deb_release}) unstable; urgency=low - - * New upstream release. - - -- #{packager_name} <#{packager_email}> #{@release_time.rfc2822} - -#{content} - CHANGELOG - end - end - end - - def update_spec - return unless enable_yum? 
- - release_time = @release_time.strftime("%a %b %d %Y") - update_content(yum_spec_in_path) do |content| - content = content.sub(/^(%changelog\n)/, <<-CHANGELOG) -%changelog -* #{release_time} #{packager_name} <#{packager_email}> - #{@rpm_version}-#{@rpm_release} -- #{package_changelog_message} - - CHANGELOG - content = content.sub(/^(Release:\s+)\d+/, "\\11") - content.rstrip - end - end - - def define_docker_tasks - namespace :docker do - pull_tasks = [] - push_tasks = [] - - (apt_targets + yum_targets).each do |target| - distribution, version, architecture = target.split("-", 3) - os = "#{distribution}-#{version}" - - namespace :pull do - desc "Pull built image for #{target}" - task target do - docker_pull(os, architecture) - end - pull_tasks << "docker:pull:#{target}" - end - - namespace :push do - desc "Push built image for #{target}" - task target do - docker_push(os, architecture) - end - push_tasks << "docker:push:#{target}" - end - end - - desc "Pull built images" - task :pull => pull_tasks - - desc "Push built images" - task :push => push_tasks - end - end -end diff --git a/dev/tasks/linux-packages/travis.linux.arm64.yml b/dev/tasks/linux-packages/travis.linux.arm64.yml deleted file mode 100644 index 6078942e737f5..0000000000000 --- a/dev/tasks/linux-packages/travis.linux.arm64.yml +++ /dev/null @@ -1,149 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -arch: arm64-graviton2 -virt: vm -os: linux -dist: focal -group: edge -language: minimal - -addons: - apt: - packages: - - apt-utils - # We need Ubuntu 20.10 or later - # - createrepo_c - - devscripts - - gpg - - libgit2-dev - - python3-pip - - rake - - rpm - - # To build createrepo_c from source. - # We can remove them when we can install createrepo_c package - - cmake - - libbz2-dev - - libcurl4-openssl-dev - - libglib2.0-dev - - liblzma-dev - - libmagic-dev - - librpm-dev - - libsqlite3-dev - - libssl-dev - - libxml2-dev - - libzstd-dev - - pkg-config - - zlib1g-dev - -services: - - docker - -# don't build twice -if: tag IS blank - -env: - global: - - APT_TARGETS={{ target }} - - ARROW_VERSION={{ arrow.version }} - - BUILD_REF={{ arrow.head }} - - TRAVIS_TAG={{ task.tag }} - - YUM_TARGETS={{ target }} - -before_script: - - set -e - {{ macros.travis_checkout_arrow() }} - {{ macros.travis_docker_login() }} - - # Build createrepo_c from source. - # We can remove them when we can install createrepo_c package - - git clone --depth 1 https://github.com/rpm-software-management/createrepo_c.git - - pushd createrepo_c - - | - /usr/bin/cmake \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DENABLE_BASHCOMP=OFF \ - -DENABLE_DRPM=OFF \ - -DENABLE_PYTHON=OFF \ - -DWITH_LIBMODULEMD=OFF \ - -DWITH_ZCHUNK=OFF \ - . 
- - make -j$(nproc) - - sudo make install - - popd - - rm -rf createrepo_c - -script: - # Build packages - - pushd arrow/dev/tasks/linux-packages - - rake version:update - - | - rake docker:pull || : - - pushd apache-arrow-apt-source/apt - - | - for target in debian-* ubuntu-*; do - cp -a ${target} ${target}-arm64 - done - - popd - - pushd apache-arrow-release/yum - - | - for target in centos-*; do - cp -a ${target} ${target}-aarch64 - done - - popd - - | - rake \ - --trace \ - {{ task_namespace }}:build \ - BUILD_DIR=build \ - DEB_BUILD_OPTIONS=parallel=2 \ - RPM_BUILD_NCPUS=2 - - sudo rm -rf */*/build - - popd - # Push Docker image - - pushd arrow/dev/tasks/linux-packages - - | - docker login -u "${DOCKERHUB_USER}" \ - -p "${DOCKERHUB_TOKEN}" || : - - | - rake docker:push || : - - popd - # Test built packages - - | - (echo "Key-Type: RSA"; \ - echo "Key-Length: 4096"; \ - echo "Name-Real: Test"; \ - echo "Name-Email: test@example.com"; \ - echo "%no-protection") | \ - gpg --full-generate-key --batch - - | - GPG_KEY_ID=$(gpg --list-keys --with-colon test@example.com | grep fpr | cut -d: -f10) - - gpg --export --armor test@example.com > arrow/dev/tasks/linux-packages/KEYS - - pushd arrow/dev/tasks/linux-packages - - | - rake --trace {{ task_namespace }}:test \ - CREATEREPO=createrepo_c \ - GPG_KEY_ID=${GPG_KEY_ID} - - rm -rf {{ task_namespace }}/repositories - - popd - -after_success: - {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/**/*{}") %} - {{ macros.travis_upload_releases(patterns) }} diff --git a/dev/tasks/linux-packages/yum/build.sh b/dev/tasks/linux-packages/yum/build.sh deleted file mode 100755 index 01746803adf63..0000000000000 --- a/dev/tasks/linux-packages/yum/build.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -# -*- sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -u - -run() -{ - "$@" - if test $? -ne 0; then - echo "Failed $@" - exit 1 - fi -} - -rpmbuild_options= - -. /host/env.sh - -distribution=$(cut -d " " -f 1 /etc/redhat-release | tr "A-Z" "a-z") -if grep -q Linux /etc/redhat-release; then - distribution_version=$(cut -d " " -f 4 /etc/redhat-release) -else - distribution_version=$(cut -d " " -f 3 /etc/redhat-release) -fi -distribution_version=$(echo ${distribution_version} | sed -e 's/\..*$//g') - -architecture="$(arch)" -lib_directory=/usr/lib64 -case "${architecture}" in - i*86) - architecture=i386 - lib_directory=/usr/lib - ;; -esac - -run mkdir -p /build -run cd /build -find . 
-not -path ./ccache -a -not -path "./ccache/*" -delete
-if which ccache > /dev/null 2>&1; then
-  export CCACHE_COMPILERCHECK=content
-  export CCACHE_COMPRESS=1
-  export CCACHE_COMPRESSLEVEL=6
-  export CCACHE_MAXSIZE=500M
-  export CCACHE_DIR="${PWD}/ccache"
-  ccache --show-stats
-  if [ -d "${lib_directory}/ccache" ]; then
-    PATH="${lib_directory}/ccache:$PATH"
-  fi
-fi
-
-run mkdir -p rpmbuild
-run cd
-rm -rf rpmbuild
-run ln -fs /build/rpmbuild ./
-if [ -x /usr/bin/rpmdev-setuptree ]; then
-  rm -rf .rpmmacros
-  run rpmdev-setuptree
-else
-  run cat <<RPMMACROS > ~/.rpmmacros
-%_topdir ${HOME}/rpmbuild
-RPMMACROS
-  run mkdir -p rpmbuild/SOURCES
-  run mkdir -p rpmbuild/SPECS
-  run mkdir -p rpmbuild/BUILD
-  run mkdir -p rpmbuild/RPMS
-  run mkdir -p rpmbuild/SRPMS
-fi
-
-repositories="/host/repositories"
-repository="${repositories}/${distribution}/${distribution_version}"
-rpm_dir="${repository}/${architecture}/Packages"
-srpm_dir="${repository}/source/SRPMS"
-run mkdir -p "${rpm_dir}" "${srpm_dir}"
-
-# for debug
-# rpmbuild_options="$rpmbuild_options --define 'optflags -O0 -g3'"
-
-if [ -n "${SOURCE_ARCHIVE}" ]; then
-  case "${RELEASE}" in
-    0.dev*)
-      source_archive_base_name=$( \
-        echo ${SOURCE_ARCHIVE} | sed -e 's/\.tar\.gz$//')
-      run tar xf /host/tmp/${SOURCE_ARCHIVE} \
-        --transform="s,^[^/]*,${PACKAGE},"
-      run mv \
-        ${PACKAGE} \
-        ${source_archive_base_name}
-      run tar czf \
-        rpmbuild/SOURCES/${SOURCE_ARCHIVE} \
-        ${source_archive_base_name}
-      run rm -rf ${source_archive_base_name}
-      ;;
-    *)
-      run cp /host/tmp/${SOURCE_ARCHIVE} rpmbuild/SOURCES/
-      ;;
-  esac
-else
-  run cp /host/tmp/${PACKAGE}-${VERSION}.* rpmbuild/SOURCES/
-fi
-run cp \
-  /host/tmp/${PACKAGE}.spec \
-  rpmbuild/SPECS/
-
-run cat <<BUILD > build.sh
-#!/bin/bash
-
-rpmbuild -ba ${rpmbuild_options} rpmbuild/SPECS/${PACKAGE}.spec
-BUILD
-run chmod +x build.sh
-if [ -n "${DEVTOOLSET_VERSION:-}" ]; then
-  run cat <<WHICH_STRIP > which-strip.sh
-#!/bin/bash
-
-which strip
-WHICH_STRIP
-  run chmod +x which-strip.sh
-  run cat <<USE_DEVTOOLSET_STRIP >> ~/.rpmmacros
-%__strip $(run scl enable devtoolset-${DEVTOOLSET_VERSION} ./which-strip.sh)
-USE_DEVTOOLSET_STRIP
-  if [ "${DEBUG:-no}" = "yes" ]; then
-    run scl enable devtoolset-${DEVTOOLSET_VERSION} ./build.sh
-  else
-    run scl enable devtoolset-${DEVTOOLSET_VERSION} ./build.sh > /dev/null
-  fi
-else
-  if [ "${DEBUG:-no}" = "yes" ]; then
-    run ./build.sh
-  else
-    run ./build.sh > /dev/null
-  fi
-fi
-
-if which ccache > /dev/null 2>&1; then
-  ccache --show-stats
-fi
-
-run mv rpmbuild/RPMS/*/* "${rpm_dir}/"
-run mv rpmbuild/SRPMS/* "${srpm_dir}/"
-
-run chown -R "$(stat --format "%u:%g" "${repositories}")" "${repositories}"
diff --git a/dev/tasks/macros.jinja b/dev/tasks/macros.jinja
deleted file mode 100644
index bfbd6ec25884d..0000000000000
--- a/dev/tasks/macros.jinja
+++ /dev/null
@@ -1,198 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -{%- macro github_header() -%} -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow -on: - push: - branches: - - "*-github-*" -{% endmacro %} - -{%- macro github_checkout_arrow() -%} - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow config core.symlinks true - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Fetch Submodules and Tags - shell: bash - run: cd arrow && ci/scripts/util_checkout.sh -{% endmacro %} - -{%- macro github_login_dockerhub() -%} - - name: Login to Dockerhub - uses: docker/login-action@v1 - with: - username: {{ '${{ secrets.DOCKERHUB_USER }}' }} - password: {{ '${{ secrets.DOCKERHUB_TOKEN }}' }} -{% endmacro %} - -{%- macro github_login_ghcr() -%} - - name: Login to GitHub Container Registry - shell: bash - run: docker login ghcr.io -u {{ '${{ github.repository_owner }}' }} -p {{ '${{ secrets.CROSSBOW_GHCR_TOKEN }}' }} -{% endmacro %} - -{%- macro github_install_archery() -%} - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Install Archery - shell: bash - run: pip install -e arrow/dev/archery[all] -{% endmacro %} - -{%- macro github_upload_releases(pattern) -%} - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Setup Crossbow - shell: bash - run: pip install -e arrow/dev/archery[crossbow] - - name: Upload artifacts - shell: bash - run: | - archery crossbow \ - --queue-path $(pwd) \ - --queue-remote {{ queue_remote_url }} \ - upload-artifacts \ - --sha {{ task.branch }} \ - --tag {{ task.tag }} \ - {% if pattern is string %} - --pattern "{{ pattern }}" - {% elif pattern is iterable %} - {% for p in pattern %} - --pattern "{{ p }}" {{ "\\" if not loop.last else "" }} - {% endfor %} - {% endif %} - env: - CROSSBOW_GITHUB_TOKEN: {{ '${{ secrets.CROSSBOW_GITHUB_TOKEN }}' }} -{% endmacro %} - -{%- macro github_upload_gemfury(pattern) -%} - {%- if arrow.branch == 'master' -%} - - name: Upload package to Gemfury - shell: bash - run: | - path=$(ls {{ pattern }}) - curl -F "package=@${path}" https://${CROSSBOW_GEMFURY_TOKEN}@push.fury.io/${CROSSBOW_GEMFURY_ORG}/ - env: - CROSSBOW_GEMFURY_TOKEN: {{ '${{ secrets.CROSSBOW_GEMFURY_TOKEN }}' }} - CROSSBOW_GEMFURY_ORG: {{ '${{ secrets.CROSSBOW_GEMFURY_ORG }}' }} - {% endif %} -{% endmacro %} - -{%- macro azure_checkout_arrow() -%} - - script: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - displayName: Clone arrow -{% endmacro %} - -{%- macro azure_upload_releases(pattern) -%} - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.8' - - script: pip install -e arrow/dev/archery[crossbow] - displayName: Install Crossbow - - bash: | - archery crossbow \ - --queue-path $(pwd) \ - --queue-remote {{ queue_remote_url }} \ - upload-artifacts \ - --sha {{ task.branch }} \ - --tag {{ task.tag }} \ - {% if pattern is string %} - --pattern "{{ pattern }}" - {% elif pattern is iterable %} - {% for p in pattern %} - --pattern "{{ p }}" {{ "" if not loop.last else "" }} - {% endfor %} - {% endif %} - env: - CROSSBOW_GITHUB_TOKEN: $(CROSSBOW_GITHUB_TOKEN) - displayName: Upload 
packages as a GitHub release -{% endmacro %} - -{%- macro azure_upload_anaconda(pattern) -%} - {%- if arrow.branch == 'master' -%} - - task: CondaEnvironment@1 - inputs: - packageSpecs: 'anaconda-client' - installOptions: '-c conda-forge' - updateConda: no - - script: | - conda install -y anaconda-client - anaconda -t $(CROSSBOW_ANACONDA_TOKEN) upload --force {{ pattern }} - displayName: Upload packages to Anaconda - {% endif %} -{% endmacro %} - -{%- macro travis_checkout_arrow() -%} - - git clone --no-checkout {{ arrow.remote }} arrow - - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - - git -C arrow checkout FETCH_HEAD - - git -C arrow submodule update --init --recursive -{% endmacro %} - -{%- macro travis_install_archery() -%} - - sudo -H pip3 install --upgrade pip - - sudo -H pip3 install docker-compose - - sudo -H pip3 install -e arrow/dev/archery[docker] -{% endmacro %} - -{%- macro travis_docker_login() -%} - - echo "${DOCKERHUB_TOKEN}" | docker login --username "${DOCKERHUB_USER}" --password-stdin -{% endmacro %} - -{%- macro travis_upload_releases(pattern) -%} - - sudo -H pip3 install pygit2==1.0 - - sudo -H pip3 install -e arrow/dev/archery[crossbow] - - | - archery crossbow \ - --queue-path $(pwd) \ - --queue-remote {{ queue_remote_url }} \ - upload-artifacts \ - --sha {{ task.branch }} \ - --tag {{ task.tag }} \ - {% if pattern is string %} - --pattern "{{ pattern }}" - {% elif pattern is iterable %} - {% for p in pattern %} - --pattern "{{ p }}" {{ "\\" if not loop.last else "" }} - {% endfor %} - {% endif %} -{% endmacro %} - -{%- macro travis_upload_gemfury(pattern) -%} - {%- if arrow.branch == 'master' -%} - - | - WHEEL_PATH=$(echo arrow/python/repaired_wheels/*.whl) - curl \ - -F "package=@${WHEEL_PATH}" \ - "https://${CROSSBOW_GEMFURY_TOKEN}@push.fury.io/${CROSSBOW_GEMFURY_ORG}/" - {% endif %} -{% endmacro %} diff --git a/dev/tasks/nightlies.sample.yml b/dev/tasks/nightlies.sample.yml deleted file mode 100644 index 710f7c0ad377f..0000000000000 --- a/dev/tasks/nightlies.sample.yml +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# this travis configuration can be used to submit cron scheduled tasks -# 1. copy this file to one of crossbow's branch (master for example) with -# filename .travis.yml -# 2. setup daily cron jobs for that particular branch, see travis' -# documentation https://docs.travis-ci.com/user/cron-jobs/ - -branches: - # don't attempt to build branches intended for windows builds - except: - - /.*win.*/ - -os: linux -dist: trusty -language: generic - -before_install: - # Install Miniconda. - - echo `pwd` - - | - echo "" - echo "Installing a fresh version of Miniconda." 
-    MINICONDA_URL="https://repo.continuum.io/miniconda"
-    MINICONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
-    curl -L -O "${MINICONDA_URL}/${MINICONDA_FILE}"
-    bash $MINICONDA_FILE -b
-
-  # Configure conda.
-  - |
-    echo ""
-    echo "Configuring conda."
-    source /home/travis/miniconda3/bin/activate root
-    conda config --remove channels defaults
-    conda config --add channels defaults
-    conda config --add channels conda-forge
-    conda config --set show_channel_urls true
-
-install:
-  - pushd ..
-  # to build against a specific branch of a fork
-  # git clone -b <branch> https://github.com/<user>/arrow
-  - git clone https://github.com/apache/arrow
-  - pip install dev/archery[crossbow]
-
-script:
-  # submit packaging tasks
-  - |
-    if [ $TRAVIS_EVENT_TYPE = "cron" ]; then
-      archery crossbow submit -g conda -g wheel -g linux
-    else
-      archery crossbow submit --dry-run -g conda -g wheel -g linux
-    fi
diff --git a/dev/tasks/nuget-packages/github.linux.yml b/dev/tasks/nuget-packages/github.linux.yml
deleted file mode 100644
index cd03a7bfeea15..0000000000000
--- a/dev/tasks/nuget-packages/github.linux.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-{% import 'macros.jinja' as macros with context %}
-
-{{ macros.github_header() }}
-
-jobs:
-  package:
-    name: Package
-    runs-on: ubuntu-latest
-    steps:
-      {{ macros.github_checkout_arrow()|indent }}
-      {{ macros.github_install_archery()|indent }}
-
-      - name: Prepare version
-        run: |
-          sed -i'' -E -e \
-            "s/^ <Version>.+<\/Version>/ <Version>{{ arrow.no_rc_semver_version }}<\/Version>/" \
-            arrow/csharp/Directory.Build.props
-      - name: Build package
-        run: |
-          pushd arrow
-          archery docker run {{ run }}
-          popd
-
-      {% set patterns = ["arrow/csharp/artifacts/**/*.nupkg",
-                         "arrow/csharp/artifacts/**/*.snupkg"] %}
-      {{ macros.github_upload_releases(patterns)|indent }}
diff --git a/dev/tasks/python-sdist/github.yml b/dev/tasks/python-sdist/github.yml
deleted file mode 100644
index 68371876ab80f..0000000000000
--- a/dev/tasks/python-sdist/github.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - build: - name: "Build sdist" - runs-on: ubuntu-20.04 - steps: - {{ macros.github_checkout_arrow()|indent }} - {{ macros.github_install_archery()|indent }} - - - name: Build sdist - run: | - archery docker run python-sdist - {% if arrow.branch == 'master' %} - archery docker push python-sdist || : - {% endif %} - env: - PYARROW_VERSION: {{ arrow.no_rc_version }} - - - name: Test sdist - run: archery docker run ubuntu-python-sdist-test - env: - PYARROW_VERSION: {{ arrow.no_rc_version }} - - {{ macros.github_upload_releases("arrow/python/dist/*.tar.gz")|indent }} - {{ macros.github_upload_gemfury("arrow/python/dist/*.tar.gz")|indent }} diff --git a/dev/tasks/python-wheels/github.linux.amd64.yml b/dev/tasks/python-wheels/github.linux.amd64.yml deleted file mode 100644 index a62640787155e..0000000000000 --- a/dev/tasks/python-wheels/github.linux.amd64.yml +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - build: - name: "Build wheel for Manylinux {{ manylinux_version }}" - runs-on: ubuntu-latest - env: - # archery uses these environment variables - ARCH: amd64 - PYTHON: {{ python_version }} - - steps: - {{ macros.github_checkout_arrow()|indent }} - {{ macros.github_install_archery()|indent }} - {{ macros.github_login_dockerhub()|indent }} - - - name: Build wheel - shell: bash - run: archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-manylinux-{{ manylinux_version }} - - # TODO(kszucs): auditwheel show - - name: Test wheel - shell: bash - run: | - archery docker run python-wheel-manylinux-test-imports - archery docker run python-wheel-manylinux-test-unittests - - {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} - {{ macros.github_upload_gemfury("arrow/python/repaired_wheels/*.whl")|indent }} diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml deleted file mode 100644 index af0cc44ef0916..0000000000000 --- a/dev/tasks/python-wheels/github.osx.yml +++ /dev/null @@ -1,133 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -env: - ARROW_S3: {{ arrow_s3 }} - MACOSX_DEPLOYMENT_TARGET: {{ macos_deployment_target }} - MB_PYTHON_VERSION: {{ python_version }} - PLAT: x86_64 - PYARROW_BUILD_VERBOSE: 1 - PYARROW_VERSION: {{ arrow.no_rc_version }} - PYTHON_VERSION: {{ python_version }} - SETUPTOOLS_SCM_PRETEND_VERSION: {{ arrow.no_rc_version }} - VCPKG_DEFAULT_TRIPLET: x64-osx-static-release - VCPKG_FEATURE_FLAGS: "-manifests" - VCPKG_OVERLAY_TRIPLETS: {{ "${{ github.workspace }}/arrow/ci/vcpkg" }} - -jobs: - build: - name: Build wheel for OS X - runs-on: macos-latest - steps: - {{ macros.github_checkout_arrow()|indent }} - - - name: Install System Dependencies - run: brew install bison ninja - - # Restore from cache the previously built ports. - # If cache-miss, download and build vcpkg (aka "bootstrap vcpkg"). - - name: Restore from Cache and Install Vcpkg - # Download and build vcpkg, without installing any port. - # If content is cached already, it is a no-op. - uses: kszucs/run-vcpkg@main - with: - # Required to prevent cache eviction on crossbow's main branch - # where we build pre-build the vcpkg packages - setupOnly: true - doNotSaveCache: true - appendedCacheKey: "-macos-{{ macos_deployment_target }}" - vcpkgDirectory: {{ "${{ github.workspace }}/vcpkg" }} - vcpkgGitCommitId: fced4bef1606260f110d74de1ae1975c2b9ac549 - - - name: Patch Vcpkg Ports - run: | - vcpkg_patch_file="../arrow/ci/vcpkg/ports.patch" - cd $VCPKG_ROOT - if ! git apply --reverse --check --ignore-whitespace ${vcpkg_patch_file}; then - git apply --ignore-whitespace ${vcpkg_patch_file} - echo "Patch successfully applied!" - fi - - # Now that vcpkg is installed, it is being used to run with the desired arguments. 
- - name: Install Vcpkg Dependencies - run: | - $VCPKG_ROOT/vcpkg install \ - abseil \ - boost-filesystem \ - brotli \ - bzip2 \ - c-ares \ - curl \ - flatbuffers \ - gflags \ - glog \ - grpc \ - lz4 \ - openssl \ - orc \ - protobuf \ - rapidjson \ - re2 \ - snappy \ - thrift \ - utf8proc \ - zlib \ - zstd - - {% if arrow_s3 == "ON" %} - - name: Install AWS SDK C++ - run: | - $VCPKG_ROOT/vcpkg install \ - aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] - {% endif %} - - - name: Setup Multibuild - run: | - git clone https://github.com/matthew-brett/multibuild - git -C multibuild checkout 03950c9a7feb09d215f82d6563c4ffd91274a1e1 - - - name: Build Wheel - env: - CONFIG_PATH: /dev/null - run: | - # configure environment and install python - source multibuild/common_utils.sh - source multibuild/travis_osx_steps.sh - before_install - - # install python dependencies - pip install -r arrow/python/requirements-wheel-build.txt delocate - - # build the wheel - arrow/ci/scripts/python_wheel_macos_build.sh $(pwd)/arrow $(pwd)/build - - - name: Setup Python for Testing - uses: actions/setup-python@v2 - with: - python-version: "{{ python_version }}" - - - name: Test the Wheel - run: | - # TODO(kszucs): temporarily remove homebrew libs - unset MACOSX_DEPLOYMENT_TARGET - arrow/ci/scripts/python_wheel_macos_test.sh $(pwd)/arrow - - {{ macros.github_upload_releases("arrow/python/dist/*.whl")|indent }} - {{ macros.github_upload_gemfury("arrow/python/dist/*.whl")|indent }} diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml deleted file mode 100644 index 922533b0f827c..0000000000000 --- a/dev/tasks/python-wheels/github.windows.yml +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - build: - name: "Build wheel for Windows" - runs-on: windows-2019 - env: - # archery uses this environment variable - PYTHON: {{ python_version }} - # this is a private repository at the moment (mostly because of licensing - # consideration of windows images with visual studio), but anyone can - # recreate the image by manually building it via: - # `archery build python-wheel-windows-vs2017` - # note that we don't run docker build since there wouldn't be a cache hit - # and rebuilding the dependencies takes a fair amount of time - REPO: ghcr.io/ursacomputing/arrow - # prefer the docker cli over docker-compose - ARCHERY_USE_DOCKER_CLI: 1 - - steps: - {{ macros.github_checkout_arrow()|indent }} - {{ macros.github_login_ghcr()|indent }} - {{ macros.github_install_archery()|indent }} - - - name: Build wheel - shell: cmd - run: archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 - - - name: Test wheel - shell: cmd - run: archery docker run python-wheel-windows-test - - {{ macros.github_upload_releases("arrow/python/dist/*.whl")|indent }} - {{ macros.github_upload_gemfury("arrow/python/dist/*.whl")|indent }} diff --git a/dev/tasks/python-wheels/travis.linux.arm64.yml b/dev/tasks/python-wheels/travis.linux.arm64.yml deleted file mode 100644 index a5c0f7408d222..0000000000000 --- a/dev/tasks/python-wheels/travis.linux.arm64.yml +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -arch: arm64-graviton2 -virt: vm -os: linux -dist: focal -group: edge -language: minimal - -addons: - apt: - packages: - - libgit2-dev - - python3-pip - -services: - - docker - -# don't build twice -if: tag IS blank - -env: - global: - - BUILD_REF={{ arrow.head }} - - TRAVIS_TAG={{ task.tag }} - # archery uses these environment variables - - ARCH=arm64v8 - - PYTHON={{ python_version }} - -before_script: - - set -e - {{ macros.travis_checkout_arrow() }} - {{ macros.travis_docker_login() }} - -script: - # Install Archery and Crossbow dependencies - {{ macros.travis_install_archery() }} - - # Build and Test packages - # output something every minutes to prevent travis from killing the build - - while sleep 1m; do echo "=====[ $SECONDS seconds still running ]====="; done & - - archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-manylinux-{{ manylinux_version }} - - archery docker run python-wheel-manylinux-test-imports - - archery docker run python-wheel-manylinux-test-unittests - - kill %1 - -after_success: - # Upload wheel as github artifact - {{ macros.travis_upload_releases("arrow/python/repaired_wheels/*.whl") }} - {{ macros.travis_upload_gemfury("arrow/python/repaired_wheels/*.whl") }} - - {% if arrow.branch == 'master' %} - # Push the docker image to dockerhub - - archery docker push python-wheel-manylinux-{{ manylinux_version }} - - archery docker push python-wheel-manylinux-test-unittests - {% endif %} diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml deleted file mode 100644 index 6e022ec04361c..0000000000000 --- a/dev/tasks/r/azure.linux.yml +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -jobs: - - job: linux - pool: - vmImage: ubuntu-latest - timeoutInMinutes: 360 - steps: - - script: | - set -ex - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - displayName: Clone arrow - - - script: | - set -ex - docker -v - docker-compose -v - cd arrow - export R_ORG={{ r_org }} - export R_IMAGE={{ r_image }} - export R_TAG={{ r_tag }} - export DEVTOOLSET_VERSION={{ devtoolset_version|default("-1") }} - docker-compose pull --ignore-pull-failures r - docker-compose build r - displayName: Docker build - - - script: | - set -ex - cd arrow - export R_ORG={{ r_org }} - export R_IMAGE={{ r_image }} - export R_TAG={{ r_tag }} - export ARROW_R_DEV={{ not_cran|default("TRUE") }} - # Note that ci/scripts/r_test.sh sets NOT_CRAN=true if ARROW_R_DEV=TRUE - docker-compose run \ - -e ARROW_DATASET={{ arrow_dataset|default("") }} \ - -e ARROW_PARQUET={{ arrow_parquet|default("") }} \ - -e ARROW_S3={{ arrow_s3|default("") }} \ - -e ARROW_WITH_RE2={{ arrow_with_re2|default("") }} \ - -e ARROW_WITH_UTF8PROC={{ arrow_with_utf8proc|default("") }} \ - -e LIBARROW_MINIMAL={{ libarrow_minimal|default("") }} \ - -e LIBARROW_DOWNLOAD={{ libarrow_download|default("") }} \ - -e LIBARROW_BUILD={{ libarrow_build|default("") }} \ - -e TEST_R_WITH_ARROW={{ with_arrow|default("TRUE") }} \ - r - displayName: Docker run - - - script: | - set -ex - cat arrow/r/check/arrow.Rcheck/00install.out - displayName: Dump install logs - condition: succeededOrFailed() - - script: | - set -ex - cat arrow/r/check/arrow.Rcheck/tests/testthat.Rout* - displayName: Dump test logs - condition: succeededOrFailed() diff --git a/dev/tasks/r/github.devdocs.yml b/dev/tasks/r/github.devdocs.yml deleted file mode 100644 index 1224a2555c8a7..0000000000000 --- a/dev/tasks/r/github.devdocs.yml +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - devdocs: - name: 'R devdocs {{ "${{ matrix.os }}" }} system install: {{ "${{ matrix.system-install }}" }}' - runs-on: {{ "${{ matrix.os }}" }} - strategy: - fail-fast: false - matrix: - os: [macOS-latest, ubuntu-20.04] - # should the install method install libarrow into a system directory - # or a temporary directory. 
old is the same as a temporary - # directory, but an old version of libarrow will be installed - # into a system directory first (to make sure we can link correctly when building) - system-install: [true, false] - - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - uses: r-lib/actions/setup-r@v1 - - uses: r-lib/actions/setup-pandoc@v1 - - name: Install knitr, rmarkdown - run: | - install.packages(c("rmarkdown", "knitr", "sessioninfo")) - shell: Rscript {0} - - name: Session info - run: | - options(width = 100) - pkgs <- installed.packages()[, "Package"] - sessioninfo::session_info(pkgs, include_base = TRUE) - shell: Rscript {0} - - name: Write the install script - env: - RUN_DEVDOCS: TRUE - DEVDOCS_MACOS: {{ "${{contains(matrix.os, 'macOS')}}" }} - DEVDOCS_UBUNTU: {{ "${{contains(matrix.os, 'ubuntu')}}" }} - DEVDOCS_SYSTEM_INSTALL: {{ "${{contains(matrix.system-install, 'true')}}" }} - DEVDOCS_PRIOR_SYSTEM_INSTALL: {{ "${{contains(matrix.system-install, 'old')}}" }} - run: | - # This isn't actually rendering the docs, but will save arrow/r/vignettes/script.sh - # which can be sourced to install arrow. - rmarkdown::render("arrow/r/vignettes/developing.Rmd") - shell: Rscript {0} - - name: Install from the devdocs - env: - LIBARROW_BINARY: FALSE - ARROW_R_DEV: TRUE - run: bash arrow/r/vignettes/script.sh - shell: bash - - name: Ensure that the Arrow package is loadable and we have the correct one - run: | - echo $LD_LIBRARY_PATH - R --no-save <= 3.3 - - "3.3" - - "3.4" - - "3.5" - env: - R_ORG: "rstudio" - R_IMAGE: "r-base" - R_TAG: "{{ MATRIX }}-bionic" - ARROW_R_DEV: "TRUE" - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Free Up Disk Space - shell: bash - run: arrow/ci/scripts/util_cleanup.sh - - name: Fetch Submodules and Tags - shell: bash - run: cd arrow && ci/scripts/util_checkout.sh - - name: Docker Pull - shell: bash - run: cd arrow && docker-compose pull --ignore-pull-failures r - - name: Docker Build - shell: bash - run: cd arrow && docker-compose build r - - name: Docker Run - shell: bash - run: cd arrow && docker-compose run r - - name: Dump install logs - run: cat arrow/r/check/arrow.Rcheck/00install.out - if: always() - - name: Dump test logs - run: cat arrow/r/check/arrow.Rcheck/tests/testthat.Rout* - if: always() - - name: Save the test output - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-output - path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout* diff --git a/dev/tasks/r/github.macos-linux.local.yml b/dev/tasks/r/github.macos-linux.local.yml deleted file mode 100644 index d9a8ed94206df..0000000000000 --- a/dev/tasks/r/github.macos-linux.local.yml +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! -name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - autobrew: - name: "install from local source" - runs-on: {{ "${{ matrix.os }}" }} - strategy: - fail-fast: false - matrix: - os: [macOS-latest, ubuntu-20.04] - - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Configure non-autobrew dependencies (macos) - run: | - cd arrow/r - brew install openssl - if: contains(matrix.os, 'macOS') - - name: Configure non-autobrew dependencies (linux) - run: | - cd arrow/r - sudo apt-get update - sudo apt install libcurl4-openssl-dev libssl-dev - if: contains(matrix.os, 'ubuntu') - - uses: r-lib/actions/setup-r@v1 - - name: Install dependencies - run: | - install.packages("remotes") - remotes::install_deps("arrow/r", dependencies = TRUE) - remotes::install_cran(c("rcmdcheck", "sys", "sessioninfo")) - shell: Rscript {0} - - name: Session info - run: | - options(width = 100) - pkgs <- installed.packages()[, "Package"] - sessioninfo::session_info(pkgs, include_base = TRUE) - shell: Rscript {0} - - name: Install - env: - _R_CHECK_CRAN_INCOMING_: false - ARROW_USE_PKG_CONFIG: false - FORCE_BUNDLED_BUILD: true - LIBARROW_MINIMAL: false - TEST_R_WITH_ARROW: TRUE - ARROW_R_DEV: TRUE - run: | - cd arrow/r - R CMD INSTALL . --install-tests - - name: Run the tests - run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - - name: Dump test logs - run: cat arrow-tests/testthat.Rout* - if: failure() - - name: Save the test output - uses: actions/upload-artifact@v2 - with: - name: test-output - path: arrow-tests/testthat.Rout* - if: always() diff --git a/dev/tasks/r/github.macos.autobrew.yml b/dev/tasks/r/github.macos.autobrew.yml deleted file mode 100644 index 1b8500f64b319..0000000000000 --- a/dev/tasks/r/github.macos.autobrew.yml +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! 
-name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - autobrew: - name: "Autobrew" - runs-on: macOS-latest - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Configure autobrew script - run: | - cd arrow/r - # Put the formula inside r/ so that it's included in the package build - cp ../dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb tools/apache-arrow.rb - # Pin the current commit in the formula to test so that we're not always pulling from master - sed -i.bak -E -e 's@https://github.com/apache/arrow.git"$@{{ arrow.remote }}.git", :revision => "{{ arrow.head }}"@' tools/apache-arrow.rb && rm -f tools/apache-arrow.rb.bak - # Sometimes crossbow gives a remote URL with .git and sometimes not. Make sure there's only one - sed -i.bak -E -e 's@.git.git@.git@' tools/apache-arrow.rb && rm -f tools/apache-arrow.rb.bak - # Get minio for S3 testing - brew install minio - - uses: r-lib/actions/setup-r@v1 - - name: Install dependencies - run: | - install.packages("remotes") - remotes::install_deps("arrow/r", dependencies = TRUE) - remotes::install_cran(c("rcmdcheck", "sys", "sessioninfo")) - shell: Rscript {0} - - name: Session info - run: | - options(width = 100) - pkgs <- installed.packages()[, "Package"] - sessioninfo::session_info(pkgs, include_base = TRUE) - shell: Rscript {0} - - name: Check - env: - _R_CHECK_CRAN_INCOMING_: false - ARROW_USE_PKG_CONFIG: false - run: arrow/ci/scripts/r_test.sh arrow - - name: Dump install logs - run: cat arrow/r/check/arrow.Rcheck/00install.out - if: always() - - name: Dump test logs - run: cat arrow/r/check/arrow.Rcheck/tests/testthat.Rout* - if: always() - - name: Save the test output - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-output - path: arrow/r/check/arrow.Rcheck/tests/testthat.Rout* diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml deleted file mode 100644 index eab3e15ce9255..0000000000000 --- a/dev/tasks/tasks.yml +++ /dev/null @@ -1,1703 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -groups: - # these groups are just for convenience - # makes it easier to submit related tasks - - ############################# Packaging tasks ############################### - - conda: - - conda-* - - wheel: - - wheel-* - - linux: - - debian-* - - ubuntu-* - - centos-* - - linux-amd64: - - debian-*-amd64 - - ubuntu-*-amd64 - - centos-*-amd64 - - linux-arm64: - - debian-*-arm64 - - ubuntu-*-arm64 - - centos-*-arm64 - - gandiva: - - gandiva-* - - homebrew: - - homebrew-* - - packaging: - - conda-* - - wheel-* - - debian-* - - ubuntu-* - - centos-* - - python-sdist - - nuget - - ############################# Testing tasks ################################# - - test: - - test-* - - cpp: - - test-*cpp* - - c-glib: - - test-*c-glib* - - python: - - test-*python* - - r: - - test*-r-* - - homebrew-r-autobrew - # r-conda tasks - - conda-linux-gcc-py*-cpu-r* - - conda-osx-clang-py*-r* - - conda-win-vs2017-py*-r* - - ruby: - - test-*ruby* - - vcpkg: - - test-*vcpkg* - - integration: - - test-*dask* - - test-*hdfs* - - test-*jpype* - - test-*kartothek* - - test-*pandas* - - test-*spark* - - test-*turbodbc* - - example: - - example-* - - example-cpp: - - example-*cpp* - - verify-rc: - - verify-rc-* - - verify-rc-binaries: - - verify-rc-binaries-* - - verify-rc-wheels: - - verify-rc-wheels-* - - verify-rc-source: - - verify-rc-source-* - - verify-rc-source-macos: - - verify-rc-source-macos-* - - verify-rc-source-linux: - - verify-rc-source-linux-* - - ######################## Tasks to run regularly ############################# - - nightly: - - debian-* - - ubuntu-* - - centos-* - - conda-* - - gandiva-* - # List the homebrews explicitly because we don't care about running homebrew-cpp-autobrew - - homebrew-cpp - - homebrew-r-autobrew - - nuget - - test-* - - example-* - - wheel-* - - python-sdist - -tasks: - # arbitrary_task_name: - # template: path of jinja2 templated yml - # params: optional extra parameters - # artifacts: list of regex patterns, each needs to match a single github - # release asset, version variable is replaced in the pattern - # e.g.: - # - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0-linux-64.tar.bz2 - - ############################## Conda Linux ################################## - - conda-clean: - ci: azure - template: conda-recipes/azure.clean.yml - - # Important notes on the conda setup here: - # - # * On conda-forge the `pyarrow` and `arrow-cpp` packages are built in - # the same feedstock as the dependency matrix is the same for them as - # Python and the OS are the main dimension. The R package `r-arrow` is - # an independent feedstock as it doesn't have the Python but the - # R dimension. To limit the number of CI jobs, we are building `r-arrow` - # for R 3.6 with the Python 3.6 jobs and for R 4.0 with the Python 3.7 jobs. - # * The files in `dev/tasks/conda-recipes/.ci_support/` are automatically - # generated and to be synced regularly from the feedstock. We have no way - # yet to generate them inside the arrow repository automatically. 
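For reference, the task schema documented in the comment block above amounts to entries of the following shape; this is only an illustrative sketch, and the task name, Python version, and artifact pattern below are hypothetical placeholders modeled on the real entries that follow, not tasks defined in this file:

  example-wheel-task:
    ci: github
    template: python-wheels/github.linux.amd64.yml
    params:
      python_version: "3.8"
    artifacts:
      - example-package-{no_rc_version}-py38-linux-64.tar.bz2
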
- - conda-linux-gcc-py36-cpu-r36: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython - r_config: linux_64_r_base3.6 - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py37-cpu-r40: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython - r_config: linux_64_r_base4.0 - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py38-cpu: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py39-cpu: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py36-cuda: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_version10.2numpy1.17python3.6.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0_cuda.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0_cuda.tar.bz2 - - conda-linux-gcc-py37-cuda: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_version10.2numpy1.17python3.7.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0_cuda.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0_cuda.tar.bz2 - - conda-linux-gcc-py38-cuda: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_version10.2numpy1.17python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cuda.tar.bz2 - - conda-linux-gcc-py39-cuda: - ci: azure - template: conda-recipes/azure.linux.yml - params: - config: linux_64_cuda_compiler_version10.2numpy1.19python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.tar.bz2 - - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cuda.tar.bz2 - - conda-linux-gcc-py36-arm64: - ci: drone - template: conda-recipes/drone.yml - params: - config: linux_aarch64_python3.6.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py37-arm64: - ci: drone - template: conda-recipes/drone.yml - params: - config: linux_aarch64_python3.7.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py38-arm64: - ci: drone - template: conda-recipes/drone.yml - params: - config: linux_aarch64_python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-linux-gcc-py39-arm64: - ci: drone - template: conda-recipes/drone.yml - params: - config: linux_aarch64_python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - 
pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - ############################## Conda OSX #################################### - - conda-osx-clang-py36-r36: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_numpy1.17python3.6.____cpython - r_config: osx_64_r_base3.6 - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-osx-clang-py37-r40: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_numpy1.17python3.7.____cpython - r_config: osx_64_r_base4.0 - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-osx-clang-py38: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_numpy1.17python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-osx-clang-py39: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_64_numpy1.19python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-osx-arm64-clang-py38: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_arm64_python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-osx-arm64-clang-py39: - ci: azure - template: conda-recipes/azure.osx.yml - params: - config: osx_arm64_python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py39(h[a-z0-9]+)_0_cpu.tar.bz2 - - ############################## Conda Windows ################################ - - conda-win-vs2017-py36-r36: - ci: azure - template: conda-recipes/azure.win.yml - params: - config: win_64_cuda_compiler_versionNonenumpy1.17python3.6.____cpython - r_config: win_64_r_base3.6 - artifacts: - - arrow-cpp-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py36(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-win-vs2017-py37-r40: - ci: azure - template: conda-recipes/azure.win.yml - params: - config: win_64_cuda_compiler_versionNonenumpy1.17python3.7.____cpython - r_config: win_64_r_base4.0 - artifacts: - - arrow-cpp-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py37(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-win-vs2017-py38: - ci: azure - template: conda-recipes/azure.win.yml - params: - config: win_64_cuda_compiler_versionNonenumpy1.17python3.8.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - conda-win-vs2017-py39: - ci: azure - template: conda-recipes/azure.win.yml - params: - config: win_64_cuda_compiler_versionNonenumpy1.19python3.9.____cpython - artifacts: - - arrow-cpp-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - pyarrow-{no_rc_version}-py38(h[a-z0-9]+)_0_cpu.tar.bz2 - - -{% for python_version, python_tag, abi_tag in [("3.6", "cp36", "cp36m"), - ("3.7", "cp37", "cp37m"), - ("3.8", "cp38", "cp38"), - ("3.9", "cp39", "cp39")] %} - -{############################## Wheel Linux ##################################} - -{% for ci, arch, arch_alias, manylinux in [("github", "amd64", "x86_64", "2010"), - ("github", "amd64", "x86_64", "2014"), - ("travis", "arm64", "aarch64", 
"2014")] %} - {% set platform_tag = "manylinux{}_{}".format(manylinux, arch_alias) %} - - wheel-manylinux{{ manylinux }}-{{ python_tag }}-{{ arch }}: - ci: {{ ci }} - template: python-wheels/{{ ci }}.linux.{{ arch }}.yml - params: - python_version: {{ python_version }} - manylinux_version: {{ manylinux }} - artifacts: - - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl - -{% endfor %} - -{############################## Wheel OSX ####################################} - -# enable S3 support from macOS 10.13 so we don't need to bundle curl, crypt and ssl -{% for macos_version, macos_codename, arrow_s3 in [("10.9", "mavericks", "OFF"), - ("10.13", "high-sierra", "ON")] %} - {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} - - wheel-osx-{{ macos_codename }}-{{ python_tag }}: - ci: github - template: python-wheels/github.osx.yml - params: - python_version: {{ python_version }} - macos_deployment_target: {{ macos_version }} - arrow_s3: {{ arrow_s3 }} - artifacts: - - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl - -{% endfor %} - -{############################## Wheel Windows ################################} - - wheel-windows-{{ python_tag }}: - ci: github - template: python-wheels/github.windows.yml - params: - python_version: {{ python_version }} - artifacts: - - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-win_amd64.whl - -{% endfor %} - -{############################ Python sdist ####################################} - - python-sdist: - ci: github - template: python-sdist/github.yml - artifacts: - - pyarrow-{no_rc_version}.tar.gz - - ############################## Linux PKGS #################################### - - debian-buster-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "debian-buster" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow-apt-source_{no_rc_version}-1.debian.tar.xz - - apache-arrow-apt-source_{no_rc_version}-1.dsc - - apache-arrow-apt-source_{no_rc_version}-1_all.deb - - apache-arrow-apt-source_{no_rc_version}.orig.tar.gz - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-plasma-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-cuda-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-cuda400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb 
- - libarrow-dataset400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libplasma-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libplasma400_{no_rc_version}-1_[a-z0-9]+.deb - - plasma-store-server-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - plasma-store-server_{no_rc_version}-1_[a-z0-9]+.deb - - debian-buster-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "debian-buster-arm64" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-dataset-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - 
libarrow400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - debian-bullseye-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "debian-bullseye" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow-apt-source_{no_rc_version}-1.debian.tar.xz - - apache-arrow-apt-source_{no_rc_version}-1.dsc - - apache-arrow-apt-source_{no_rc_version}-1_all.deb - - apache-arrow-apt-source_{no_rc_version}.orig.tar.gz - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-plasma-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-cuda-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-cuda400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - 
libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libplasma-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libplasma400_{no_rc_version}-1_[a-z0-9]+.deb - - plasma-store-server-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - plasma-store-server_{no_rc_version}-1_[a-z0-9]+.deb - - debian-bullseye-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "debian-bullseye-arm64" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-dataset-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - 
libparquet400-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-bionic-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "ubuntu-bionic" - task_namespace: "apt" - env: - UBUNTU: 18.04 - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow-apt-source_{no_rc_version}-1.debian.tar.xz - - apache-arrow-apt-source_{no_rc_version}-1.dsc - - apache-arrow-apt-source_{no_rc_version}-1_all.deb - - apache-arrow-apt-source_{no_rc_version}.orig.tar.gz - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-plasma-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma400_{no_rc_version}-1_[a-z0-9]+.deb - - plasma-store-server-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - plasma-store-server_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-bionic-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "ubuntu-bionic-arm64" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-dataset-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - 
libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-focal-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "ubuntu-focal" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow-apt-source_{no_rc_version}-1.debian.tar.xz - - apache-arrow-apt-source_{no_rc_version}-1.dsc - - apache-arrow-apt-source_{no_rc_version}-1_all.deb - - apache-arrow-apt-source_{no_rc_version}.orig.tar.gz - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-plasma-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - 
libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma400_{no_rc_version}-1_[a-z0-9]+.deb - - plasma-store-server-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - plasma-store-server_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-focal-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "ubuntu-focal-arm64" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-dataset-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-groovy-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "ubuntu-groovy" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow-apt-source_{no_rc_version}-1.debian.tar.xz - - apache-arrow-apt-source_{no_rc_version}-1.dsc - - apache-arrow-apt-source_{no_rc_version}-1_all.deb - - apache-arrow-apt-source_{no_rc_version}.orig.tar.gz - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-plasma-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - 
libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-cuda400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libplasma400_{no_rc_version}-1_[a-z0-9]+.deb - - plasma-store-server-dbgsym_{no_rc_version}-1_[a-z0-9]+.d?deb - - plasma-store-server_{no_rc_version}-1_[a-z0-9]+.deb - - ubuntu-groovy-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "ubuntu-groovy-arm64" - task_namespace: "apt" - upload_extensions: - - .ddeb - - .deb - - .debian.tar.xz - - .dsc - - .orig.tar.gz - artifacts: - - apache-arrow_{no_rc_version}-1.debian.tar.xz - - apache-arrow_{no_rc_version}-1.dsc - - apache-arrow_{no_rc_version}.orig.tar.gz - - gir1.2-arrow-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-arrow-dataset-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-gandiva-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - gir1.2-parquet-1.0_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dataset400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-flight400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow-python400_{no_rc_version}-1_[a-z0-9]+.deb - - libarrow400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libgandiva400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-dev_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib-dev_{no_rc_version}-1_[a-z0-9]+.deb - - 
libparquet-glib-doc_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet-glib400_{no_rc_version}-1_[a-z0-9]+.deb - - libparquet400_{no_rc_version}-1_[a-z0-9]+.deb - - centos-7-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "centos-7" - task_namespace: yum - upload_extensions: - - .rpm - artifacts: - - apache-arrow-release-{no_rc_version}-1.el7.noarch.rpm - - apache-arrow-release-{no_rc_version}-1.el7.src.rpm - - arrow-{no_rc_version}-1.el7.src.rpm - - arrow-debuginfo-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-glib-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-glib-doc-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-glib-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-dataset-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-dataset-glib-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-dataset-glib-doc-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-dataset-glib-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-dataset-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-python-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - arrow-python-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - parquet-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - parquet-glib-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - parquet-glib-doc-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - parquet-glib-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - parquet-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-glib-devel-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-glib-doc-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-glib-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-libs-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - plasma-store-server-{no_rc_version}-1.el7.[a-z0-9_]+.rpm - - centos-8-amd64: - ci: github - template: linux-packages/github.linux.amd64.yml - params: - target: "centos-8" - task_namespace: yum - upload_extensions: - - .rpm - artifacts: - - apache-arrow-release-{no_rc_version}-1.el8.noarch.rpm - - apache-arrow-release-{no_rc_version}-1.el8.src.rpm - - arrow-{no_rc_version}-1.el8.src.rpm - - arrow-dataset-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - 
arrow-python-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - gandiva-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-store-server-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-store-server-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - centos-8-arm64: - ci: travis - template: linux-packages/travis.linux.arm64.yml - params: - target: "centos-8-aarch64" - task_namespace: yum - upload_extensions: - - .rpm - artifacts: - - arrow-{no_rc_version}-1.el8.src.rpm - - arrow-dataset-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-dataset-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-flight-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-flight-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - arrow-python-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - 
parquet-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - parquet-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-devel-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-doc-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-glib-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-libs-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-libs-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-store-server-debuginfo-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - plasma-store-server-{no_rc_version}-1.el8.[a-z0-9_]+.rpm - - ############################## Homebrew Tasks ################################ - - homebrew-cpp: - ci: github - template: homebrew-formulae/github.macos.yml - params: - formula: apache-arrow.rb - - homebrew-cpp-autobrew: - ci: github - template: homebrew-formulae/github.macos.yml - params: - formula: autobrew/apache-arrow.rb - - homebrew-r-autobrew: - # This tests that the autobrew formula + script work in practice - ci: github - template: r/github.macos.autobrew.yml - - ############################## Gandiva Tasks ################################ - - gandiva-jar-ubuntu: - ci: github - template: gandiva-jars/github.linux.yml - artifacts: - - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar - - gandiva-jar-osx: - ci: github - template: gandiva-jars/github.osx.yml - artifacts: - - arrow-gandiva-{no_rc_version}-SNAPSHOT.jar - - ############################## NuGet packages ############################### - - nuget: - ci: github - template: nuget-packages/github.linux.yml - params: - run: ubuntu-csharp - artifacts: - - Apache.Arrow.Flight.AspNetCore.{no_rc_version}.nupkg - - Apache.Arrow.Flight.AspNetCore.{no_rc_version}.snupkg - - Apache.Arrow.Flight.{no_rc_version}.nupkg - - Apache.Arrow.Flight.{no_rc_version}.snupkg - - Apache.Arrow.{no_rc_version}.nupkg - - Apache.Arrow.{no_rc_version}.snupkg - - ########################### Release verification ############################ - -{% for target in ["binary", "yum", "apt"] %} - verify-rc-binaries-{{ target }}: - ci: github - template: verify-rc/github.linux.yml - params: - env: - TEST_DEFAULT: 0 - TEST_{{ target|upper }}: 1 - artifact: "binaries" -{% endfor %} - -{% for platform in ["linux", "macos"] %} - - verify-rc-wheels-{{ platform }}: - ci: github - template: verify-rc/github.{{ platform }}.yml - params: - env: - TEST_DEFAULT: 0 - artifact: "wheels" - -{% for target in ["csharp", - "go", - "integration" - "java", - "js", - "python", - "ruby", - "rust"] %} - - verify-rc-source-{{ platform }}-{{ target }}: - ci: github - template: verify-rc/github.{{ platform }}.yml - params: - env: - INSTALL_NODE: 0 - TEST_DEFAULT: 0 - TEST_{{ target|upper }}: 1 - artifact: "source" - -{% endfor %} - -{% endfor %} - - verify-rc-source-windows: - ci: github - template: verify-rc/github.win.yml - params: - script: "verify-release-candidate.bat" - - verify-rc-wheels-windows: - ci: github - template: verify-rc/github.win.yml - params: - script: "verify-release-candidate-wheels.bat" - -{############################## Docker tests #################################} - -{% for image in ["conda-cpp", - "conda-cpp-valgrind", - "debian-c-glib", - "ubuntu-c-glib", - "debian-ruby", - "ubuntu-ruby"] %} - test-{{ image }}: - ci: github - template: docker-tests/github.linux.yml - params: - run: {{ image }} -{% endfor %} - - test-debian-10-cpp: - ci: github - template: docker-tests/github.linux.yml - params: - env: - DEBIAN: 10 - 
run: debian-cpp - - test-ubuntu-18.04-cpp: - ci: github - template: docker-tests/github.linux.yml - params: - env: - UBUNTU: 18.04 - run: ubuntu-cpp - - test-fedora-33-cpp: - ci: github - template: docker-tests/github.linux.yml - params: - env: - FEDORA: 33 - run: fedora-cpp - - test-ubuntu-18.04-cpp-release: - ci: github - template: docker-tests/github.linux.yml - params: - env: - UBUNTU: 18.04 - run: "-e ARROW_BUILD_TYPE=release ubuntu-cpp" - - test-ubuntu-18.04-cpp-static: - ci: github - template: docker-tests/github.linux.yml - params: - env: - UBUNTU: 18.04 - run: "-e ARROW_BUILD_SHARED=OFF -e ARROW_BUILD_STATIC=ON -e ARROW_TEST_LINKAGE=static ubuntu-cpp" - - test-ubuntu-20.04-cpp: - ci: github - template: docker-tests/github.linux.yml - params: - env: - UBUNTU: 20.04 - run: ubuntu-cpp - -{% for cpp_standard in [14, 17] %} - test-ubuntu-20.04-cpp-{{ cpp_standard }}: - ci: github - template: docker-tests/github.linux.yml - params: - env: - UBUNTU: 20.04 - run: "-e CMAKE_ARGS=-DCMAKE_CXX_STANDARD={{ cpp_standard }} ubuntu-cpp" -{% endfor %} - - test-ubuntu-20.04-cpp-thread-sanitizer: - ci: github - template: docker-tests/github.linux.yml - params: - env: - CLANG_TOOLS: 11 - UBUNTU: 20.04 - run: ubuntu-cpp-thread-sanitizer - -{% for python_version in ["3.6", "3.7", "3.8", "3.9"] %} - test-conda-python-{{ python_version }}: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: {{ python_version }} - run: conda-python -{% endfor %} - - test-conda-python-3.8-hypothesis: - ci: github - template: docker-tests/github.linux.yml - params: - env: - HYPOTHESIS_PROFILE: ci - PYARROW_TEST_HYPOTHESIS: ON - PYTHON: 3.8 - # limit to execute hypothesis tests only - PYTEST_ARGS: "-m hypothesis" - run: conda-python-pandas - - test-debian-10-python-3: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - DEBIAN: 10 - run: debian-python - - test-ubuntu-18.04-python-3: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - UBUNTU: 18.04 - run: ubuntu-python - - test-fedora-33-python-3: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - FEDORA: 33 - run: fedora-python - - test-r-linux-as-cran: - ci: github - template: r/github.linux.cran.yml - params: - MATRIX: {{ "${{ matrix.r_image }}" }} - - test-r-version-compatibility: - ci: github - template: r/github.linux.version.compatibility.yml - - test-r-versions: - ci: github - template: r/github.linux.versions.yml - params: - MATRIX: {{ "${{ matrix.r_version }}" }} - - test-r-install-local: - ci: github - template: r/github.macos-linux.local.yml - - test-r-devdocs: - ci: github - template: r/github.devdocs.yml - - test-r-rhub-ubuntu-gcc-release: - ci: azure - template: r/azure.linux.yml - params: - r_org: rhub - r_image: ubuntu-gcc-release - r_tag: latest - - test-r-rocker-r-base-latest: - ci: azure - template: r/azure.linux.yml - params: - r_org: rocker - r_image: r-base - r_tag: latest - - test-r-rstudio-r-base-3.6-bionic: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 3.6-bionic - - test-r-rstudio-r-base-3.6-centos8: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 3.6-centos8 - - test-r-rstudio-r-base-3.6-centos7-devtoolset-8: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 3.6-centos7 - devtoolset_version: 8 - - test-r-rstudio-r-base-3.6-opensuse15: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: 
r-base - r_tag: 3.6-opensuse15 - - test-r-rstudio-r-base-3.6-opensuse42: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 3.6-opensuse42 - - test-r-minimal-build: - ci: azure - template: r/azure.linux.yml - params: - r_org: rocker - r_image: r-base - r_tag: latest - arrow_dataset: "OFF" - arrow_parquet: "OFF" - arrow_s3: "OFF" - arrow_with_re2: "OFF" - arrow_with_utf8proc: "OFF" - libarrow_minimal: "TRUE" - - test-r-without-arrow: - ci: azure - template: r/azure.linux.yml - params: - r_org: rhub - r_image: ubuntu-gcc-release - r_tag: latest - libarrow_download: "FALSE" - libarrow_build: "FALSE" - with_arrow: "FALSE" - not_cran: "FALSE" - - test-ubuntu-18.04-r-sanitizer: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - UBUNTU: 18.04 - run: ubuntu-r-sanitizer - - test-debian-10-go-1.15: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - GO: 1.15 - run: debian-go - - test-ubuntu-20.10-docs: - ci: azure - template: docker-tests/azure.linux.yml - params: - env: - UBUNTU: "20.10" - run: ubuntu-docs - - ############################## vcpkg tests ################################## - - test-build-vcpkg-win: - ci: github - template: vcpkg-tests/github.windows.yml - - ############################## Integration tests ############################ - - test-conda-python-3.7-pandas-latest: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - PANDAS: latest - # use the latest pandas release, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-pandas - - test-conda-python-3.8-pandas-latest: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.8 - PANDAS: latest - # use the latest pandas release, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-pandas - - test-conda-python-3.8-pandas-nightly: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.8 - PANDAS: nightly - NUMPY: nightly - run: --no-leaf-cache conda-python-pandas - - test-conda-python-3.7-pandas-master: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - PANDAS: master - # use the master branch of pandas, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-pandas - - test-conda-python-3.6-pandas-0.23: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.6 - PANDAS: 0.23 - run: conda-python-pandas - - test-conda-python-3.7-dask-latest: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - DASK: latest - # use the latest dask release, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-dask - - test-conda-python-3.8-dask-master: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.8 - DASK: master - # use the master branch of dask, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-dask - - test-conda-python-3.8-jpype: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.8 - run: conda-python-jpype - - test-conda-python-3.7-turbodbc-latest: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - TURBODBC: latest - # use the latest turbodbc release, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-turbodbc - - test-conda-python-3.7-turbodbc-master: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - TURBODBC: 
master - # use the master branch of dask, so prevent reusing any cached layers - run: --no-leaf-cache conda-python-turbodbc - - test-conda-python-3.7-kartothek-latest: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - KARTOTHEK: latest - run: --no-leaf-cache conda-python-kartothek - - test-conda-python-3.7-kartothek-master: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - KARTOTHEK: master - # use the master branch of kartothek, so prevent reusing any layers - run: --no-leaf-cache conda-python-kartothek - - test-conda-python-3.7-hdfs-3.2: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - HDFS: 3.2.1 - run: conda-python-hdfs - - test-conda-python-3.7-spark-branch-3.0: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.7 - SPARK: "branch-3.0" - TEST_PYARROW_ONLY: "true" - # use the branch-3.0 of spark, so prevent reusing any layers - run: --no-leaf-cache conda-python-spark - - test-conda-python-3.8-spark-master: - ci: github - template: docker-tests/github.linux.yml - params: - env: - PYTHON: 3.8 - SPARK: master - # use the master branch of spark, so prevent reusing any layers - run: --no-leaf-cache conda-python-spark - - # Remove the "skipped-" prefix in ARROW-8475 - skipped-test-conda-cpp-hiveserver2: - ci: github - template: docker-tests/github.linux.yml - params: - run: conda-cpp-hiveserver2 - - example-cpp-minimal-build-static: - ci: github - template: cpp-examples/github.linux.yml - params: - type: minimal_build - run: static - - example-cpp-minimal-build-static-system-dependency: - ci: github - template: cpp-examples/github.linux.yml - params: - type: minimal_build - run: static-system-dependency diff --git a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat b/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat deleted file mode 100644 index f748f92f3bcf6..0000000000000 --- a/dev/tasks/vcpkg-tests/cpp-build-vcpkg.bat +++ /dev/null @@ -1,90 +0,0 @@ -@rem Licensed to the Apache Software Foundation (ASF) under one -@rem or more contributor license agreements. See the NOTICE file -@rem distributed with this work for additional information -@rem regarding copyright ownership. The ASF licenses this file -@rem to you under the Apache License, Version 2.0 (the -@rem "License"); you may not use this file except in compliance -@rem with the License. You may obtain a copy of the License at -@rem -@rem http://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, -@rem software distributed under the License is distributed on an -@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -@rem KIND, either express or implied. See the License for the -@rem specific language governing permissions and limitations -@rem under the License. - -@rem Run VsDevCmd.bat to set Visual Studio environment variables for building -@rem on the command line. 
This is the path for Visual Studio Enterprise 2019 - -call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\VsDevCmd.bat" -arch=amd64 - - -@rem Install build dependencies with vcpkg - -@rem TODO(ianmcook): change --x-manifest-root to --manifest-root after it -@rem changes in vcpkg - -vcpkg install ^ - --triplet x64-windows ^ - --x-manifest-root cpp ^ - --clean-after-build ^ - || exit /B 1 - - -@rem Set environment variables - -set ARROW_TEST_DATA=%cd%\testing\data -set PARQUET_TEST_DATA=%cd%\cpp\submodules\parquet-testing\data - - -@rem Build Arrow C++ library - -mkdir cpp\build -pushd cpp\build - -@rem TODO(ianmcook): test using --parallel %NUMBER_OF_PROCESSORS% with -@rem cmake --build instead of specifying -DARROW_CXXFLAGS="/MP" here -@rem (see https://gitlab.kitware.com/cmake/cmake/-/issues/20564) - -@rem TODO(ianmcook): Add -DARROW_BUILD_BENCHMARKS=ON after the issue described -@rem at https://github.com/google/benchmark/issues/1046 is resolved - -cmake -G "Visual Studio 16 2019" -A x64 ^ - -DARROW_BOOST_USE_SHARED=ON ^ - -DARROW_BUILD_SHARED=ON ^ - -DARROW_BUILD_STATIC=OFF ^ - -DARROW_BUILD_TESTS=ON ^ - -DARROW_CXXFLAGS="/MP" ^ - -DARROW_DATASET=ON ^ - -DARROW_DEPENDENCY_SOURCE=VCPKG ^ - -DARROW_FLIGHT=ON ^ - -DARROW_MIMALLOC=ON ^ - -DARROW_PARQUET=ON ^ - -DARROW_PYTHON=OFF ^ - -DARROW_WITH_BROTLI=ON ^ - -DARROW_WITH_BZ2=ON ^ - -DARROW_WITH_LZ4=ON ^ - -DARROW_WITH_SNAPPY=ON ^ - -DARROW_WITH_ZLIB=ON ^ - -DARROW_WITH_ZSTD=ON ^ - -DCMAKE_BUILD_TYPE=release ^ - -DCMAKE_UNITY_BUILD=ON ^ - .. || exit /B 1 - -cmake --build . --target INSTALL --config Release || exit /B 1 - - -@rem Test Arrow C++ library - -@rem TODO(ARROW-11675): Uncomment the below -@rem and troubleshoot two test failures: -@rem - TestStatisticsSortOrder/0.MinMax -@rem - TestStatistic.Int32Extremums - -@rem ctest --output-on-failure ^ -@rem --parallel %NUMBER_OF_PROCESSORS% ^ -@rem --timeout 300 || exit /B 1 - -popd diff --git a/dev/tasks/vcpkg-tests/github.windows.yml b/dev/tasks/vcpkg-tests/github.windows.yml deleted file mode 100644 index eacb6317c303a..0000000000000 --- a/dev/tasks/vcpkg-tests/github.windows.yml +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# NOTE: must set "Crossbow" as name to have the badge links working in the -# github comment reports! 
-name: Crossbow - -on: - push: - branches: - - "*-github-*" - -jobs: - test-vcpkg-win: - name: Install build deps with vcpkg and build Arrow C++ - runs-on: windows-2019 - steps: - - name: Checkout Arrow - run: | - git clone --no-checkout {{ arrow.remote }} arrow - git -C arrow fetch -t {{ arrow.remote }} {{ arrow.branch }} - git -C arrow checkout FETCH_HEAD - git -C arrow submodule update --init --recursive - - name: Remove and Reinstall vcpkg - # As of January 2021, the version of vcpkg that is preinstalled on the - # Github Actions windows-2019 image is 2020.11.12, as noted at - # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md - # This version of vcpkg has a bug that causes the installation of - # aws-cpp-sdk to fail. See details at - # https://github.com/awslabs/aws-c-common/issues/734 - # and https://github.com/microsoft/vcpkg/pull/14716. - # When running vcpkg in Github Actions on Windows, remove the - # preinstalled vcpkg and install the newest version from source. - shell: cmd - run: | - CALL vcpkg integrate remove 2>NUL - CALL C: - CALL cd \ - CALL rmdir /s /q vcpkg 2>NUL - CALL git clone https://github.com/microsoft/vcpkg.git vcpkg - CALL cd vcpkg - CALL bootstrap-vcpkg.bat -win64 -disableMetrics - CALL vcpkg integrate install - CALL setx PATH "%PATH%;C:\vcpkg" - - name: Install Dependencies with vcpkg and Build Arrow C++ - shell: cmd - run: | - CALL cd arrow - CALL dev\tasks\vcpkg-tests\cpp-build-vcpkg.bat diff --git a/dev/tasks/verify-rc/github.linux.yml b/dev/tasks/verify-rc/github.linux.yml deleted file mode 100644 index 8729426fd134d..0000000000000 --- a/dev/tasks/verify-rc/github.linux.yml +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - verify: - name: "Verify release candidate Ubuntu {{ artifact }}" - runs-on: ubuntu-20.04 - {% if env is defined %} - env: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - {{ macros.github_checkout_arrow()|indent }} - - - name: Install System Dependencies - run: | - # TODO: don't require removing newer llvms - sudo apt-get --purge remove -y llvm-9 clang-9 - sudo apt-get install -y \ - autoconf-archive \ - binfmt-support \ - bison \ - curl \ - flex \ - gtk-doc-tools \ - jq \ - libboost-all-dev \ - libgirepository1.0-dev \ - qemu-user-static \ - wget - - if [ "$TEST_JAVA" = "1" ]; then - # Maven - MAVEN_VERSION=3.6.3 - wget https://downloads.apache.org/maven/maven-3/$MAVEN_VERSION/binaries/apache-maven-$MAVEN_VERSION-bin.zip - unzip apache-maven-$MAVEN_VERSION-bin.zip - mkdir -p $HOME/java - mv apache-maven-$MAVEN_VERSION $HOME/java - export PATH=$HOME/java/apache-maven-$MAVEN_VERSION/bin:$PATH - fi - - if [ "$TEST_RUBY" = "1" ]; then - ruby --version - sudo gem install bundler - fi - - uses: actions/setup-node@v2-beta - with: - node-version: '14' - - name: Run verification - shell: bash - run: | - arrow/dev/release/verify-release-candidate.sh \ - {{ artifact }} \ - {{ release|default("1.0.0") }} {{ rc|default("0") }} diff --git a/dev/tasks/verify-rc/github.macos.yml b/dev/tasks/verify-rc/github.macos.yml deleted file mode 100644 index ab0c6563bdc05..0000000000000 --- a/dev/tasks/verify-rc/github.macos.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - verify: - name: "Verify release candidate macOS {{ artifact }}" - runs-on: macos-latest - {% if env is defined %} - env: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - {{ macros.github_checkout_arrow()|indent }} - - - name: Install System Dependencies - shell: bash - run: | - brew update - brew bundle --file=arrow/cpp/Brewfile - brew bundle --file=arrow/c_glib/Brewfile - - uses: actions/setup-node@v2-beta - with: - node-version: '14' - - name: Run verification - shell: bash - run: | - arrow/dev/release/verify-release-candidate.sh \ - {{ artifact }} \ - {{ release|default("1.0.0") }} {{ rc|default("0") }} diff --git a/dev/tasks/verify-rc/github.win.yml b/dev/tasks/verify-rc/github.win.yml deleted file mode 100644 index 5406327e87412..0000000000000 --- a/dev/tasks/verify-rc/github.win.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -{% import 'macros.jinja' as macros with context %} - -{{ macros.github_header() }} - -jobs: - verify: - name: "Verify release candidate Windows source" - runs-on: windows-2016 - {% if env is defined %} - env: - {% for key, value in env.items() %} - {{ key }}: {{ value }} - {% endfor %} - {% endif %} - - steps: - {{ macros.github_checkout_arrow()|indent }} - - - uses: conda-incubator/setup-miniconda@v2 - - name: Install System Dependencies - run: | - choco install boost-msvc-14.1 - choco install wget - - name: Run verification - shell: cmd - run: | - cd arrow - dev/release/{{ script }} {{ release|default("1.0.0") }} {{ rc|default("0") }} diff --git a/dev/test_merge_arrow_pr.py b/dev/test_merge_arrow_pr.py deleted file mode 100644 index 8fe188350822a..0000000000000 --- a/dev/test_merge_arrow_pr.py +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from collections import namedtuple - -import pytest - -import merge_arrow_pr - - -FakeIssue = namedtuple('issue', ['fields']) -FakeFields = namedtuple('fields', ['status', 'summary', 'assignee', - 'components', 'fixVersions']) -FakeAssignee = namedtuple('assignee', ['displayName']) -FakeStatus = namedtuple('status', ['name']) -FakeComponent = namedtuple('component', ['name']) -FakeVersion = namedtuple('version', ['name', 'raw']) - -RAW_VERSION_JSON = [ - {'name': 'JS-0.4.0', 'released': False}, - {'name': '0.11.0', 'released': False}, - {'name': '0.12.0', 'released': False}, - {'name': '0.10.0', 'released': True}, - {'name': '0.9.0', 'released': True} -] - - -SOURCE_VERSIONS = [FakeVersion(raw['name'], raw) - for raw in RAW_VERSION_JSON] - -TRANSITIONS = [{'name': 'Resolve Issue', 'id': 1}] - -jira_id = 'ARROW-1234' -status = FakeStatus('In Progress') -fields = FakeFields(status, 'issue summary', FakeAssignee('groundhog'), - [FakeComponent('C++'), FakeComponent('Format')], - []) -FAKE_ISSUE_1 = FakeIssue(fields) - - -class FakeJIRA: - - def __init__(self, issue=None, project_versions=None, transitions=None, - current_fix_versions=None): - self._issue = issue - self._project_versions = project_versions - self._transitions = transitions - - def issue(self, jira_id): - return self._issue - - def transitions(self, jira_id): - return self._transitions - - def transition_issue(self, jira_id, transition_id, comment=None, - fixVersions=None): - self.captured_transition = { - 'jira_id': jira_id, - 'transition_id': transition_id, - 'comment': comment, - 'fixVersions': fixVersions - } - - def get_candidate_fix_versions(self): - return SOURCE_VERSIONS, ['0.12.0'] - - def project_versions(self, project): - return self._project_versions - - -class FakeCLI: - - def __init__(self, responses=()): - self.responses = responses - self.position = 0 - - def prompt(self, prompt): - response = self.responses[self.position] - self.position += 1 - return response - - def fail(self, msg): - raise Exception(msg) - - -def test_jira_fix_versions(): - jira = FakeJIRA(project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - all_versions, default_versions = issue.get_candidate_fix_versions() - assert all_versions == SOURCE_VERSIONS - assert default_versions == ['0.11.0'] - - -def test_jira_no_suggest_patch_release(): - versions_json = [ - {'name': '0.11.1', 'released': False}, - {'name': '0.12.0', 'released': False}, - ] - - versions = [FakeVersion(raw['name'], raw) for raw in versions_json] - - jira = FakeJIRA(project_versions=versions, transitions=TRANSITIONS) - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - all_versions, default_versions = issue.get_candidate_fix_versions() - assert all_versions == versions - assert default_versions == ['0.12.0'] - - -def test_jira_parquet_no_suggest_non_cpp(): - # ARROW-7351 - versions_json = [ - {'name': 'cpp-1.5.0', 'released': True}, - {'name': 'cpp-1.6.0', 'released': False}, - {'name': 'cpp-1.7.0', 'released': False}, - {'name': '1.11.0', 'released': False}, - {'name': '1.12.0', 'released': False} - ] - - versions = [FakeVersion(raw['name'], raw) - for raw in versions_json] - - jira = FakeJIRA(project_versions=versions, transitions=TRANSITIONS) - issue = merge_arrow_pr.JiraIssue(jira, 'PARQUET-1713', 'PARQUET', - FakeCLI()) - all_versions, default_versions = issue.get_candidate_fix_versions() - assert all_versions == versions - assert default_versions == 
['cpp-1.6.0'] - - -def test_jira_invalid_issue(): - class Mock: - - def issue(self, jira_id): - raise Exception("not found") - - with pytest.raises(Exception): - merge_arrow_pr.JiraIssue(Mock(), 'ARROW-1234', 'ARROW', FakeCLI()) - - -def test_jira_resolve(): - jira = FakeJIRA(issue=FAKE_ISSUE_1, - project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - my_comment = 'my comment' - fix_versions = [SOURCE_VERSIONS[1].raw] - - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - issue.resolve(fix_versions, my_comment) - - assert jira.captured_transition == { - 'jira_id': 'ARROW-1234', - 'transition_id': 1, - 'comment': my_comment, - 'fixVersions': fix_versions - } - - -def test_jira_resolve_non_mainline(): - jira = FakeJIRA(issue=FAKE_ISSUE_1, - project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - my_comment = 'my comment' - fix_versions = [SOURCE_VERSIONS[0].raw] - - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - issue.resolve(fix_versions, my_comment) - - assert jira.captured_transition == { - 'jira_id': 'ARROW-1234', - 'transition_id': 1, - 'comment': my_comment, - 'fixVersions': fix_versions - } - - -def test_jira_resolve_released_fix_version(): - # ARROW-5083 - jira = FakeJIRA(issue=FAKE_ISSUE_1, - project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - cmd = FakeCLI(responses=['0.9.0']) - fix_versions_json = merge_arrow_pr.prompt_for_fix_version(cmd, jira) - assert fix_versions_json == [RAW_VERSION_JSON[-1]] - - -def test_multiple_authors_bad_input(): - a0 = 'Jimbob Crawfish ' - a1 = 'Jarvis McCratchett ' - a2 = 'Hank Miller ' - distinct_authors = [a0, a1] - - cmd = FakeCLI(responses=['']) - primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author( - cmd, distinct_authors) - assert primary_author == a0 - assert new_distinct_authors == [a0, a1] - - cmd = FakeCLI(responses=['oops', a1]) - primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author( - cmd, distinct_authors) - assert primary_author == a1 - assert new_distinct_authors == [a1, a0] - - cmd = FakeCLI(responses=[a2]) - primary_author, new_distinct_authors = merge_arrow_pr.get_primary_author( - cmd, distinct_authors) - assert primary_author == a2 - assert new_distinct_authors == [a2, a0, a1] - - -def test_jira_already_resolved(): - status = FakeStatus('Resolved') - fields = FakeFields(status, 'issue summary', FakeAssignee('groundhog'), - [FakeComponent('Java')], []) - issue = FakeIssue(fields) - - jira = FakeJIRA(issue=issue, - project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - fix_versions = [SOURCE_VERSIONS[0].raw] - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - - with pytest.raises(Exception, - match="ARROW-1234 already has status 'Resolved'"): - issue.resolve(fix_versions, "") - - -def test_no_unset_point_release_fix_version(): - # ARROW-6915: We have had the problem of issues marked with a point release - # having their fix versions overwritten by the merge tool. 
This verifies - # that existing patch release versions are carried over - status = FakeStatus('In Progress') - - versions_json = { - '0.14.2': {'name': '0.14.2', 'id': 1}, - '0.15.1': {'name': '0.15.1', 'id': 2}, - '0.16.0': {'name': '0.16.0', 'id': 3}, - '0.17.0': {'name': '0.17.0', 'id': 4} - } - - fields = FakeFields(status, 'summary', FakeAssignee('someone'), - [FakeComponent('Java')], - [FakeVersion(v, versions_json[v]) - for v in ['0.17.0', '0.15.1', '0.14.2']]) - issue = FakeIssue(fields) - - jira = FakeJIRA(issue=issue, project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) - - issue = merge_arrow_pr.JiraIssue(jira, 'ARROW-1234', 'ARROW', FakeCLI()) - issue.resolve([versions_json['0.16.0']], "a comment") - - assert jira.captured_transition == { - 'jira_id': 'ARROW-1234', - 'transition_id': 1, - 'comment': 'a comment', - 'fixVersions': [versions_json[v] - for v in ['0.16.0', '0.15.1', '0.14.2']] - } - - issue.resolve([versions_json['0.15.1']], "a comment") - - assert jira.captured_transition == { - 'jira_id': 'ARROW-1234', - 'transition_id': 1, - 'comment': 'a comment', - 'fixVersions': [versions_json[v] for v in ['0.15.1', '0.14.2']] - } - - -def test_jira_output_no_components(): - # ARROW-5472 - status = 'Interesting work' - components = [] - output = merge_arrow_pr.format_jira_output( - 'ARROW-1234', 'Resolved', status, FakeAssignee('Foo Bar'), - components) - - assert output == """=== JIRA ARROW-1234 === -Summary\t\tInteresting work -Assignee\tFoo Bar -Components\tNO COMPONENTS!!! -Status\t\tResolved -URL\t\thttps://issues.apache.org/jira/browse/ARROW-1234""" - - output = merge_arrow_pr.format_jira_output( - 'ARROW-1234', 'Resolved', status, FakeAssignee('Foo Bar'), - [FakeComponent('C++'), FakeComponent('Python')]) - - assert output == """=== JIRA ARROW-1234 === -Summary\t\tInteresting work -Assignee\tFoo Bar -Components\tC++, Python -Status\t\tResolved -URL\t\thttps://issues.apache.org/jira/browse/ARROW-1234""" diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py deleted file mode 100755 index 44bdf4235d1c6..0000000000000 --- a/dev/update_arrow_deps.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Script that updates the arrow dependencies in datafusion and ballista, locall -# -# installation: -# pip install tomlkit requests -# -# usage: -# python update_arrow_deps.py - -from pathlib import Path - -# use tomlkit as it preserves comments and other formatting -import tomlkit -import requests - - -# find latest arrow-rs sha -def get_arrow_sha(): - url = 'https://api.github.com/repos/apache/arrow-rs/branches/master' - response = requests.get(url) - return response.json()['commit']['sha'] - - -# Update all entries that look like -# { -# 'git': 'https://github.com/apache/arrow-rs', -# 'rev': 'c3fe3bab9905739fdda75301dab07a18c91731bd' -# } -# to point at a new SHA -def update_dependencies(dependencies, new_sha): - if dependencies is None: - return - for dep_name in dependencies: - dep = dependencies[dep_name] - if hasattr(dep, 'get'): - if dep.get('git') == 'https://github.com/apache/arrow-rs': - dep['rev'] = new_sha - - -def update_cargo_toml(cargo_toml, new_sha): - print('updating {}'.format(cargo_toml.absolute())) - with open(cargo_toml) as f: - data = f.read() - - doc = tomlkit.parse(data) - - update_dependencies(doc.get('dependencies'), new_sha) - update_dependencies(doc.get('dev-dependencies'), new_sha) - - with open(cargo_toml, 'w') as f: - f.write(tomlkit.dumps(doc)) - - -# Begin main script - -repo_root = Path(__file__).parent.parent.absolute() - - -new_sha = get_arrow_sha() - -print('Updating files in {} to use sha {}'.format(repo_root, new_sha)) - - -for cargo_toml in repo_root.rglob('Cargo.toml'): - update_cargo_toml(cargo_toml, new_sha) From 23b7898316daf25f94db77f37ea52333c4e8f90f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 29 Jun 2021 07:56:03 -0600 Subject: [PATCH 225/329] Ballista: Rename QueryStageExec to ShuffleWriterExec (#633) --- ballista/rust/core/src/execution_plans/mod.rs | 4 +-- .../src/execution_plans/shuffle_reader.rs | 2 +- .../{query_stage.rs => shuffle_writer.rs} | 30 +++++++++---------- .../src/execution_plans/unresolved_shuffle.rs | 4 +-- ballista/rust/core/src/utils.rs | 8 ++--- ballista/rust/executor/src/executor.rs | 11 +++++-- ballista/rust/scheduler/src/planner.rs | 20 ++++++------- 7 files changed, 42 insertions(+), 37 deletions(-) rename ballista/rust/core/src/execution_plans/{query_stage.rs => shuffle_writer.rs} (94%) diff --git a/ballista/rust/core/src/execution_plans/mod.rs b/ballista/rust/core/src/execution_plans/mod.rs index 1fb2010bd5456..ca4e60023ce8c 100644 --- a/ballista/rust/core/src/execution_plans/mod.rs +++ b/ballista/rust/core/src/execution_plans/mod.rs @@ -18,10 +18,10 @@ //! This module contains execution plans that are needed to distribute Datafusion's execution plans into //! several Ballista executors. 
-mod query_stage; mod shuffle_reader; +mod shuffle_writer; mod unresolved_shuffle; -pub use query_stage::QueryStageExec; pub use shuffle_reader::ShuffleReaderExec; +pub use shuffle_writer::ShuffleWriterExec; pub use unresolved_shuffle::UnresolvedShuffleExec; diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index 3a7f795f1a7fd..9ab064115acea 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -36,7 +36,7 @@ use datafusion::{ use futures::{future, Stream, StreamExt}; use log::info; -/// ShuffleReaderExec reads partitions that have already been materialized by a query stage +/// ShuffleReaderExec reads partitions that have already been materialized by a ShuffleWriterExec /// being executed by an executor #[derive(Debug, Clone)] pub struct ShuffleReaderExec { diff --git a/ballista/rust/core/src/execution_plans/query_stage.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs similarity index 94% rename from ballista/rust/core/src/execution_plans/query_stage.rs rename to ballista/rust/core/src/execution_plans/shuffle_writer.rs index 1e91540a7d898..2d8d78324d2dd 100644 --- a/ballista/rust/core/src/execution_plans/query_stage.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -//! QueryStageExec represents a section of a query plan that has consistent partitioning and -//! can be executed as one unit with each partition being executed in parallel. The output of -//! a query stage either forms the input of another query stage or can be the final result of -//! a query. +//! ShuffleWriterExec represents a section of a query plan that has consistent partitioning and +//! can be executed as one unit with each partition being executed in parallel. The output of each +//! partition is re-partitioned and streamed to disk in Arrow IPC format. Future stages of the query +//! will use the ShuffleReaderExec to read these results. use std::iter::Iterator; use std::path::PathBuf; @@ -48,12 +48,12 @@ use log::info; use std::fs::File; use uuid::Uuid; -/// QueryStageExec represents a section of a query plan that has consistent partitioning and -/// can be executed as one unit with each partition being executed in parallel. The output of -/// a query stage either forms the input of another query stage or can be the final result of -/// a query. +/// ShuffleWriterExec represents a section of a query plan that has consistent partitioning and +/// can be executed as one unit with each partition being executed in parallel. The output of each +/// partition is re-partitioned and streamed to disk in Arrow IPC format. Future stages of the query +/// will use the ShuffleReaderExec to read these results. 
#[derive(Debug, Clone)] -pub struct QueryStageExec { +pub struct ShuffleWriterExec { /// Unique ID for the job (query) that this stage is a part of job_id: String, /// Unique query stage ID within the job @@ -66,8 +66,8 @@ pub struct QueryStageExec { shuffle_output_partitioning: Option, } -impl QueryStageExec { - /// Create a new query stage +impl ShuffleWriterExec { + /// Create a new shuffle writer pub fn try_new( job_id: String, stage_id: usize, @@ -96,7 +96,7 @@ impl QueryStageExec { } #[async_trait] -impl ExecutionPlan for QueryStageExec { +impl ExecutionPlan for ShuffleWriterExec { fn as_any(&self) -> &dyn Any { self } @@ -118,7 +118,7 @@ impl ExecutionPlan for QueryStageExec { children: Vec>, ) -> Result> { assert!(children.len() == 1); - Ok(Arc::new(QueryStageExec::try_new( + Ok(Arc::new(ShuffleWriterExec::try_new( self.job_id.clone(), self.stage_id, children[0].clone(), @@ -379,7 +379,7 @@ mod tests { async fn test() -> Result<()> { let input_plan = create_input_plan()?; let work_dir = TempDir::new()?; - let query_stage = QueryStageExec::try_new( + let query_stage = ShuffleWriterExec::try_new( "jobOne".to_owned(), 1, input_plan, @@ -418,7 +418,7 @@ mod tests { async fn test_partitioned() -> Result<()> { let input_plan = create_input_plan()?; let work_dir = TempDir::new()?; - let query_stage = QueryStageExec::try_new( + let query_stage = ShuffleWriterExec::try_new( "jobOne".to_owned(), 1, input_plan, diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index 5c1b41798c5d3..9c53bc7a1d43c 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -30,10 +30,10 @@ use datafusion::{ }; use log::info; -/// UnresolvedShuffleExec represents a dependency on the results of several QueryStageExec nodes which haven't been computed yet. +/// UnresolvedShuffleExec represents a dependency on the results of several ShuffleWriterExec nodes which haven't been computed yet. /// /// An ExecutionPlan that contains an UnresolvedShuffleExec isn't ready for execution. The presence of this ExecutionPlan -/// is used as a signal so the scheduler knows it can't start computation on a specific QueryStageExec. +/// is used as a signal so the scheduler knows it can't start computation on a specific ShuffleWriterExec. 
#[derive(Debug, Clone)] pub struct UnresolvedShuffleExec { // The query stage ids which needs to be computed diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 26bdb00f34fb4..d043763dc6f1a 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use std::{fs::File, pin::Pin}; use crate::error::{BallistaError, Result}; -use crate::execution_plans::{QueryStageExec, UnresolvedShuffleExec}; +use crate::execution_plans::{ShuffleWriterExec, UnresolvedShuffleExec}; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionStats; @@ -106,7 +106,7 @@ pub async fn collect_stream( Ok(batches) } -pub fn produce_diagram(filename: &str, stages: &[Arc]) -> Result<()> { +pub fn produce_diagram(filename: &str, stages: &[Arc]) -> Result<()> { let write_file = File::create(filename)?; let mut w = BufWriter::new(&write_file); writeln!(w, "digraph G {{")?; @@ -163,8 +163,8 @@ fn build_exec_plan_diagram( "CsvExec" } else if plan.as_any().downcast_ref::().is_some() { "FilterExec" - } else if plan.as_any().downcast_ref::().is_some() { - "QueryStageExec" + } else if plan.as_any().downcast_ref::().is_some() { + "ShuffleWriterExec" } else if plan .as_any() .downcast_ref::() diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs index 90c39277e2fcf..86aaa7e9f4956 100644 --- a/ballista/rust/executor/src/executor.rs +++ b/ballista/rust/executor/src/executor.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use ballista_core::error::BallistaError; -use ballista_core::execution_plans::QueryStageExec; +use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::utils; use datafusion::arrow::record_batch::RecordBatch; use datafusion::physical_plan::ExecutionPlan; @@ -51,8 +51,13 @@ impl Executor { part: usize, plan: Arc, ) -> Result { - let exec = - QueryStageExec::try_new(job_id, stage_id, plan, self.work_dir.clone(), None)?; + let exec = ShuffleWriterExec::try_new( + job_id, + stage_id, + plan, + self.work_dir.clone(), + None, + )?; let mut stream = exec.execute(part).await?; let batches = utils::collect_stream(&mut stream).await?; // the output should be a single batch containing metadata (path and statistics) diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 32fc9a9e25ebd..70d90a4a07d03 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -25,7 +25,7 @@ use std::sync::Arc; use ballista_core::datasource::DfTableAdapter; use ballista_core::error::{BallistaError, Result}; use ballista_core::{ - execution_plans::{QueryStageExec, ShuffleReaderExec, UnresolvedShuffleExec}, + execution_plans::{ShuffleReaderExec, ShuffleWriterExec, UnresolvedShuffleExec}, serde::scheduler::PartitionLocation, }; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; @@ -39,7 +39,7 @@ use datafusion::physical_plan::windows::WindowAggExec; use datafusion::physical_plan::ExecutionPlan; use log::info; -type PartialQueryStageResult = (Arc, Vec>); +type PartialQueryStageResult = (Arc, Vec>); pub struct DistributedPlanner { next_stage_id: usize, @@ -58,16 +58,16 @@ impl Default for DistributedPlanner { } impl DistributedPlanner { - /// Returns a vector of ExecutionPlans, where the root node is a [QueryStageExec]. + /// Returns a vector of ExecutionPlans, where the root node is a [ShuffleWriterExec]. 
/// Plans that depend on the input of other plans will have leaf nodes of type [UnresolvedShuffleExec]. - /// A [QueryStageExec] is created whenever the partitioning changes. + /// A [ShuffleWriterExec] is created whenever the partitioning changes. /// /// Returns an empty vector if the execution_plan doesn't need to be sliced into several stages. pub fn plan_query_stages( &mut self, job_id: &str, execution_plan: Arc, - ) -> Result>> { + ) -> Result>> { info!("planning query stages"); let (new_plan, mut stages) = self.plan_query_stages_internal(job_id, execution_plan)?; @@ -228,8 +228,8 @@ fn create_query_stage( job_id: &str, stage_id: usize, plan: Arc, -) -> Result> { - Ok(Arc::new(QueryStageExec::try_new( +) -> Result> { + Ok(Arc::new(ShuffleWriterExec::try_new( job_id.to_owned(), stage_id, plan, @@ -285,13 +285,13 @@ mod test { } /* Expected result: - QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=1 + ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=1 HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] CsvExec: testdata/lineitem; partitions=2 - QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2 + ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2 CoalescePartitionsExec UnresolvedShuffleExec: stages=[1] - QueryStageExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3 + ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3 SortExec { input: ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_ext ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_extendedprice Multip HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] From ae745d9a8b81678d767849b032067349944d12c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 30 Jun 2021 08:26:40 +0200 Subject: [PATCH 226/329] Add query 15 to queries (#645) --- benchmarks/queries/q15.sql | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 benchmarks/queries/q15.sql diff --git a/benchmarks/queries/q15.sql b/benchmarks/queries/q15.sql new file mode 100644 index 0000000000000..b5cb49e5a5535 --- /dev/null +++ b/benchmarks/queries/q15.sql @@ -0,0 +1,34 @@ +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey; + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; + +drop view revenue0; \ No newline at end of file From 466b7c5722b61f3328fb4ed91f4b11d050eb60ff Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 30 Jun 2021 16:35:19 -0400 Subject: [PATCH 227/329] Improve error message and comments (#641) --- datafusion/src/logical_plan/dfschema.rs | 4 ++-- datafusion/src/logical_plan/expr.rs | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index b46e067a268bf..b4d864f55ebdb 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ 
b/datafusion/src/logical_plan/dfschema.rs @@ -158,8 +158,8 @@ impl DFSchema { } } Err(DataFusionError::Plan(format!( - "No field matches column '{}'", - col, + "No field matches column '{}'. Available fields: {}", + col, self ))) } diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 1bf3b65d9af00..1fab9bb875ae9 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -83,7 +83,12 @@ impl Column { } } - /// Normalize Column with qualifier based on provided dataframe schemas. + /// Normalizes `self` if is unqualified (has no relation name) + /// with an explicit qualifier from the first matching input + /// schemas. + /// + /// For example, `foo` will be normalized to `t.foo` if there is a + /// column named `foo` in a relation named `t` found in `schemas` pub fn normalize(self, schemas: &[&DFSchemaRef]) -> Result { if self.relation.is_some() { return Ok(self); @@ -1113,7 +1118,8 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { } } -/// Recursively normalize all Column expressions in a given expression tree +/// Recursively call [`Column::normalize`] on all Column expressions +/// in the `expr` expression tree. pub fn normalize_col(e: Expr, schemas: &[&DFSchemaRef]) -> Result { struct ColumnNormalizer<'a, 'b> { schemas: &'a [&'b DFSchemaRef], From e861d017ba1b9ee17c1f45d390fc9926e5af7c37 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 1 Jul 2021 04:39:32 +0800 Subject: [PATCH 228/329] bring back dev scripts for ballista (#648) --- dev/build-ballista-docker-arm64.sh | 34 +++++++++ dev/build-ballista-docker.sh | 24 +++++++ dev/build-set-env.sh | 20 ++++++ dev/build-ui.sh | 23 ++++++ dev/release/rat_exclude_files.txt | 108 +++++++++++++++++++++++++++++ dev/update_arrow_deps.py | 83 ++++++++++++++++++++++ 6 files changed, 292 insertions(+) create mode 100755 dev/build-ballista-docker-arm64.sh create mode 100755 dev/build-ballista-docker.sh create mode 100755 dev/build-set-env.sh create mode 100755 dev/build-ui.sh create mode 100644 dev/release/rat_exclude_files.txt create mode 100755 dev/update_arrow_deps.py diff --git a/dev/build-ballista-docker-arm64.sh b/dev/build-ballista-docker-arm64.sh new file mode 100755 index 0000000000000..5d951773ada41 --- /dev/null +++ b/dev/build-ballista-docker-arm64.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if [ -z "${DOCKER_REPO}" ]; then + echo "DOCKER_REPO env var must be set" + exit -1 +fi +cargo install cross +cross build --release --target aarch64-unknown-linux-gnu +rm -rf temp-ballista-docker +mkdir temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/ballista-executor temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/ballista-scheduler temp-ballista-docker +cp target/aarch64-unknown-linux-gnu/release/tpch temp-ballista-docker +mkdir temp-ballista-docker/queries/ +cp benchmarks/queries/*.sql temp-ballista-docker/queries/ +docker buildx build --push -t $DOCKER_REPO/ballista-arm64 --platform=linux/arm64 -f dev/docker/ballista-arm64.Dockerfile temp-ballista-docker +rm -rf temp-ballista-docker \ No newline at end of file diff --git a/dev/build-ballista-docker.sh b/dev/build-ballista-docker.sh new file mode 100755 index 0000000000000..bc028da9e716a --- /dev/null +++ b/dev/build-ballista-docker.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +. ./dev/build-set-env.sh +docker build -t ballista-base:$BALLISTA_VERSION -f dev/docker/ballista-base.dockerfile . +docker build -t ballista:$BALLISTA_VERSION -f dev/docker/ballista.dockerfile . diff --git a/dev/build-set-env.sh b/dev/build-set-env.sh new file mode 100755 index 0000000000000..3eb29e7ce1443 --- /dev/null +++ b/dev/build-set-env.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +export BALLISTA_VERSION=$(awk -F'[ ="]+' '$1 == "version" { print $2 }' ballista/rust/core/Cargo.toml) diff --git a/dev/build-ui.sh b/dev/build-ui.sh new file mode 100755 index 0000000000000..070839702500e --- /dev/null +++ b/dev/build-ui.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +. ./dev/build-set-env.sh +docker build -t ballista-scheduler-ui:$BALLISTA_VERSION -f dev/docker/ballista-scheduler-ui.dockerfile ballista/ui/scheduler diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt new file mode 100644 index 0000000000000..96beccd0af81e --- /dev/null +++ b/dev/release/rat_exclude_files.txt @@ -0,0 +1,108 @@ +*.npmrc +*.gitignore +*.dockerignore +.gitmodules +*_generated.h +*_generated.js +*_generated.ts +*.csv +*.json +*.snap +.github/ISSUE_TEMPLATE/*.md +.github/pull_request_template.md +ci/etc/rprofile +ci/etc/*.patch +ci/vcpkg/*.patch +CHANGELOG.md +dev/requirements*.txt +dev/archery/MANIFEST.in +dev/archery/requirements*.txt +dev/archery/archery/tests/fixtures/* +dev/archery/archery/crossbow/tests/fixtures/* +dev/release/rat_exclude_files.txt +dev/tasks/homebrew-formulae/apache-arrow.rb +dev/tasks/linux-packages/apache-arrow-apt-source/debian/apache-arrow-apt-source.install +dev/tasks/linux-packages/apache-arrow-apt-source/debian/compat +dev/tasks/linux-packages/apache-arrow-apt-source/debian/control +dev/tasks/linux-packages/apache-arrow-apt-source/debian/rules +dev/tasks/linux-packages/apache-arrow-apt-source/debian/source/format +dev/tasks/linux-packages/apache-arrow/debian/compat +dev/tasks/linux-packages/apache-arrow/debian/control.in +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-cuda-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-arrow-dataset-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-gandiva-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-parquet-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/gir1.2-plasma-1.0.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.doc-base +dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib-doc.links +dev/tasks/linux-packages/apache-arrow/debian/libarrow-glib400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda-glib400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-cuda400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.doc-base +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib-doc.links +dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset-glib400.install 
+dev/tasks/linux-packages/apache-arrow/debian/libarrow-dataset400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-flight400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-python-flight400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow-python400.install +dev/tasks/linux-packages/apache-arrow/debian/libarrow400.install +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.doc-base +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.install +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib-doc.links +dev/tasks/linux-packages/apache-arrow/debian/libgandiva-glib400.install +dev/tasks/linux-packages/apache-arrow/debian/libgandiva400.install +dev/tasks/linux-packages/apache-arrow/debian/libparquet-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.doc-base +dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.install +dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib-doc.links +dev/tasks/linux-packages/apache-arrow/debian/libparquet-glib400.install +dev/tasks/linux-packages/apache-arrow/debian/libparquet400.install +dev/tasks/linux-packages/apache-arrow/debian/libplasma-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-dev.install +dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.doc-base +dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.install +dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib-doc.links +dev/tasks/linux-packages/apache-arrow/debian/libplasma-glib400.install +dev/tasks/linux-packages/apache-arrow/debian/libplasma400.install +dev/tasks/linux-packages/apache-arrow/debian/patches/series +dev/tasks/linux-packages/apache-arrow/debian/plasma-store-server.install +dev/tasks/linux-packages/apache-arrow/debian/rules +dev/tasks/linux-packages/apache-arrow/debian/source/format +dev/tasks/linux-packages/apache-arrow/debian/watch +dev/tasks/requirements*.txt +dev/tasks/conda-recipes/* +pax_global_header +MANIFEST.in +__init__.pxd +__init__.py +requirements.txt +*.html +*.sgml +*.css +*.png +*.ico +*.svg +*.devhelp2 +*.scss +.gitattributes +rust-toolchain +benchmarks/queries/q*.sql +ballista/rust/scheduler/testdata/* +ballista/ui/scheduler/yarn.lock +python/rust-toolchain +python/requirements*.txt diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py new file mode 100755 index 0000000000000..44bdf4235d1c6 --- /dev/null +++ b/dev/update_arrow_deps.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script that updates the arrow dependencies in datafusion and ballista, locall +# +# installation: +# pip install tomlkit requests +# +# usage: +# python update_arrow_deps.py + +from pathlib import Path + +# use tomlkit as it preserves comments and other formatting +import tomlkit +import requests + + +# find latest arrow-rs sha +def get_arrow_sha(): + url = 'https://api.github.com/repos/apache/arrow-rs/branches/master' + response = requests.get(url) + return response.json()['commit']['sha'] + + +# Update all entries that look like +# { +# 'git': 'https://github.com/apache/arrow-rs', +# 'rev': 'c3fe3bab9905739fdda75301dab07a18c91731bd' +# } +# to point at a new SHA +def update_dependencies(dependencies, new_sha): + if dependencies is None: + return + for dep_name in dependencies: + dep = dependencies[dep_name] + if hasattr(dep, 'get'): + if dep.get('git') == 'https://github.com/apache/arrow-rs': + dep['rev'] = new_sha + + +def update_cargo_toml(cargo_toml, new_sha): + print('updating {}'.format(cargo_toml.absolute())) + with open(cargo_toml) as f: + data = f.read() + + doc = tomlkit.parse(data) + + update_dependencies(doc.get('dependencies'), new_sha) + update_dependencies(doc.get('dev-dependencies'), new_sha) + + with open(cargo_toml, 'w') as f: + f.write(tomlkit.dumps(doc)) + + +# Begin main script + +repo_root = Path(__file__).parent.parent.absolute() + + +new_sha = get_arrow_sha() + +print('Updating files in {} to use sha {}'.format(repo_root, new_sha)) + + +for cargo_toml in repo_root.rglob('Cargo.toml'): + update_cargo_toml(cargo_toml, new_sha) From cab3a980f26cbf14986a188e68b5c2336ea3171d Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 1 Jul 2021 04:59:28 +0800 Subject: [PATCH 229/329] add integration tests for rank, dense_rank (#638) --- datafusion/src/execution/context.rs | 24 ++--- .../physical_plan/expressions/nth_value.rs | 94 ++++++++++++++----- datafusion/tests/sql.rs | 21 ++--- .../sqls/simple_window_built_in_functions.sql | 27 ++++++ .../sqls/simple_window_full_aggregation.sql | 2 +- .../simple_window_ordered_aggregation.sql | 2 +- .../simple_window_partition_aggregation.sql | 2 +- ...ple_window_partition_order_aggregation.sql | 2 +- ...imple_window_ranked_built_in_functions.sql | 22 +++++ integration-tests/test_psql_parity.py | 4 +- 10 files changed, 146 insertions(+), 54 deletions(-) create mode 100644 integration-tests/sqls/simple_window_built_in_functions.sql create mode 100644 integration-tests/sqls/simple_window_ranked_built_in_functions.sql diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 5c41ed26eea43..5df8e20ea6060 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1335,11 +1335,11 @@ mod tests { "+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", "| c1 | c2 | ROW_NUMBER() | FIRST_VALUE(c2) | LAST_VALUE(c2) | NTH_VALUE(c2,Int64(2)) | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", 
"+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", - "| 0 | 1 | 1 | 1 | 10 | 2 | 1 | 1 | 1 | 1 | 1 |", - "| 0 | 2 | 2 | 1 | 10 | 2 | 3 | 2 | 2 | 1 | 1.5 |", - "| 0 | 3 | 3 | 1 | 10 | 2 | 6 | 3 | 3 | 1 | 2 |", - "| 0 | 4 | 4 | 1 | 10 | 2 | 10 | 4 | 4 | 1 | 2.5 |", - "| 0 | 5 | 5 | 1 | 10 | 2 | 15 | 5 | 5 | 1 | 3 |", + "| 0 | 1 | 1 | 1 | 1 | | 1 | 1 | 1 | 1 | 1 |", + "| 0 | 2 | 2 | 1 | 2 | 2 | 3 | 2 | 2 | 1 | 1.5 |", + "| 0 | 3 | 3 | 1 | 3 | 2 | 6 | 3 | 3 | 1 | 2 |", + "| 0 | 4 | 4 | 1 | 4 | 2 | 10 | 4 | 4 | 1 | 2.5 |", + "| 0 | 5 | 5 | 1 | 5 | 2 | 15 | 5 | 5 | 1 | 3 |", "+----+----+--------------+-----------------+----------------+------------------------+---------+-----------+---------+---------+---------+", ]; @@ -1392,7 +1392,7 @@ mod tests { ROW_NUMBER() OVER (PARTITION BY c2 ORDER BY c1), \ FIRST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1), \ LAST_VALUE(c2 + c1) OVER (PARTITION BY c2 ORDER BY c1), \ - NTH_VALUE(c2 + c1, 2) OVER (PARTITION BY c2 ORDER BY c1), \ + NTH_VALUE(c2 + c1, 1) OVER (PARTITION BY c2 ORDER BY c1), \ SUM(c2) OVER (PARTITION BY c2 ORDER BY c1), \ COUNT(c2) OVER (PARTITION BY c2 ORDER BY c1), \ MAX(c2) OVER (PARTITION BY c2 ORDER BY c1), \ @@ -1407,13 +1407,13 @@ mod tests { let expected = vec![ "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", - "| c1 | c2 | ROW_NUMBER() | FIRST_VALUE(c2 Plus c1) | LAST_VALUE(c2 Plus c1) | NTH_VALUE(c2 Plus c1,Int64(2)) | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", + "| c1 | c2 | ROW_NUMBER() | FIRST_VALUE(c2 Plus c1) | LAST_VALUE(c2 Plus c1) | NTH_VALUE(c2 Plus c1,Int64(1)) | SUM(c2) | COUNT(c2) | MAX(c2) | MIN(c2) | AVG(c2) |", "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", - "| 0 | 1 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | 1 |", - "| 0 | 2 | 1 | 2 | 5 | 3 | 2 | 1 | 2 | 2 | 2 |", - "| 0 | 3 | 1 | 3 | 6 | 4 | 3 | 1 | 3 | 3 | 3 |", - "| 0 | 4 | 1 | 4 | 7 | 5 | 4 | 1 | 4 | 4 | 4 |", - "| 0 | 5 | 1 | 5 | 8 | 6 | 5 | 1 | 5 | 5 | 5 |", + "| 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |", + "| 0 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 |", + "| 0 | 3 | 1 | 3 | 3 | 3 | 3 | 1 | 3 | 3 | 3 |", + "| 0 | 4 | 1 | 4 | 4 | 4 | 4 | 1 | 4 | 4 | 4 |", + "| 0 | 5 | 1 | 5 | 5 | 5 | 5 | 1 | 5 | 5 | 5 |", "+----+----+--------------+-------------------------+------------------------+--------------------------------+---------+-----------+---------+---------+---------+", ]; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index 3897ae5cb53e0..854078e232f00 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -22,9 +22,11 @@ use crate::physical_plan::window_functions::PartitionEvaluator; use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; use crate::scalar::ScalarValue; use arrow::array::{new_null_array, ArrayRef}; +use arrow::compute::kernels::window::shift; use arrow::datatypes::{DataType, Field}; use arrow::record_batch::RecordBatch; use std::any::Any; +use std::iter; use std::ops::Range; use std::sync::Arc; @@ -138,21 +140,56 @@ pub(crate) struct NthValueEvaluator { } impl PartitionEvaluator for NthValueEvaluator { - fn 
evaluate_partition(&self, partition: Range) -> Result { - let value = &self.values[0]; + fn include_rank(&self) -> bool { + true + } + + fn evaluate_partition(&self, _partition: Range) -> Result { + unreachable!("first, last, and nth_value evaluation must be called with evaluate_partition_with_rank") + } + + fn evaluate_partition_with_rank( + &self, + partition: Range, + ranks_in_partition: &[Range], + ) -> Result { + let arr = &self.values[0]; let num_rows = partition.end - partition.start; - let value = value.slice(partition.start, num_rows); - let index: usize = match self.kind { - NthValueKind::First => 0, - NthValueKind::Last => (num_rows as usize) - 1, - NthValueKind::Nth(n) => (n as usize) - 1, - }; - Ok(if index >= num_rows { - new_null_array(value.data_type(), num_rows) - } else { - let value = ScalarValue::try_from_array(&value, index)?; - value.to_array_of_size(num_rows) - }) + match self.kind { + NthValueKind::First => { + let value = ScalarValue::try_from_array(arr, partition.start)?; + Ok(value.to_array_of_size(num_rows)) + } + NthValueKind::Last => { + // because the default window frame is between unbounded preceding and current + // row with peer evaluation, hence the last rows expands until the end of the peers + let values = ranks_in_partition + .iter() + .map(|range| { + let len = range.end - range.start; + let value = ScalarValue::try_from_array(arr, range.end - 1)?; + Ok(iter::repeat(value).take(len)) + }) + .collect::>>()? + .into_iter() + .flatten(); + ScalarValue::iter_to_array(values) + } + NthValueKind::Nth(n) => { + let index = (n as usize) - 1; + if index >= num_rows { + Ok(new_null_array(arr.data_type(), num_rows)) + } else { + let value = + ScalarValue::try_from_array(arr, partition.start + index)?; + let arr = value.to_array_of_size(num_rows); + // because the default window frame is between unbounded preceding and current + // row, hence the shift because for values with indices < index they should be + // null. This changes when window frames other than default is implemented + shift(arr.as_ref(), index as i64).map_err(DataFusionError::ArrowError) + } + } + } } } @@ -164,16 +201,17 @@ mod tests { use arrow::record_batch::RecordBatch; use arrow::{array::*, datatypes::*}; - fn test_i32_result(expr: NthValue, expected: Vec) -> Result<()> { + fn test_i32_result(expr: NthValue, expected: Int32Array) -> Result<()> { let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, -2, 3, -4, 5, -6, 7, 8])); let values = vec![arr]; let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); let batch = RecordBatch::try_new(Arc::new(schema), values.clone())?; - let result = expr.create_evaluator(&batch)?.evaluate(vec![0..8])?; + let result = expr + .create_evaluator(&batch)? 
+ .evaluate_with_rank(vec![0..8], vec![0..8])?; assert_eq!(1, result.len()); let result = result[0].as_any().downcast_ref::().unwrap(); - let result = result.values(); - assert_eq!(expected, result); + assert_eq!(expected, *result); Ok(()) } @@ -184,7 +222,7 @@ mod tests { Arc::new(Column::new("arr", 0)), DataType::Int32, ); - test_i32_result(first_value, vec![1; 8])?; + test_i32_result(first_value, Int32Array::from_iter_values(vec![1; 8]))?; Ok(()) } @@ -195,7 +233,7 @@ mod tests { Arc::new(Column::new("arr", 0)), DataType::Int32, ); - test_i32_result(last_value, vec![8; 8])?; + test_i32_result(last_value, Int32Array::from_iter_values(vec![8; 8]))?; Ok(()) } @@ -207,7 +245,7 @@ mod tests { DataType::Int32, 1, )?; - test_i32_result(nth_value, vec![1; 8])?; + test_i32_result(nth_value, Int32Array::from_iter_values(vec![1; 8]))?; Ok(()) } @@ -219,7 +257,19 @@ mod tests { DataType::Int32, 2, )?; - test_i32_result(nth_value, vec![-2; 8])?; + test_i32_result( + nth_value, + Int32Array::from(vec![ + None, + Some(-2), + Some(-2), + Some(-2), + Some(-2), + Some(-2), + Some(-2), + Some(-2), + ]), + )?; Ok(()) } } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index c06a4bb1462ee..5cb5529ba80e7 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -903,7 +903,7 @@ async fn csv_query_window_with_partition_by() -> Result<()> { "-21481", "-16974", "-21481", - "-21481", + "NULL", ], vec![ "141680161", @@ -952,15 +952,8 @@ async fn csv_query_window_with_order_by() -> Result<()> { let actual = execute(&mut ctx, sql).await; let expected = vec![ vec![ - "28774375", - "61035129", - "61035129", - "1", - "61035129", - "61035129", - "61035129", - "2025611582", - "-108973366", + "28774375", "61035129", "61035129", "1", "61035129", "61035129", "61035129", + "61035129", "NULL", ], vec![ "63044568", @@ -970,7 +963,7 @@ async fn csv_query_window_with_order_by() -> Result<()> { "61035129", "-108973366", "61035129", - "2025611582", + "-108973366", "-108973366", ], vec![ @@ -981,7 +974,7 @@ async fn csv_query_window_with_order_by() -> Result<()> { "623103518", "-108973366", "61035129", - "2025611582", + "623103518", "-108973366", ], vec![ @@ -992,7 +985,7 @@ async fn csv_query_window_with_order_by() -> Result<()> { "623103518", "-1927628110", "61035129", - "2025611582", + "-1927628110", "-108973366", ], vec![ @@ -1003,7 +996,7 @@ async fn csv_query_window_with_order_by() -> Result<()> { "623103518", "-1927628110", "61035129", - "2025611582", + "-1899175111", "-108973366", ], ]; diff --git a/integration-tests/sqls/simple_window_built_in_functions.sql b/integration-tests/sqls/simple_window_built_in_functions.sql new file mode 100644 index 0000000000000..e76b383060026 --- /dev/null +++ b/integration-tests/sqls/simple_window_built_in_functions.sql @@ -0,0 +1,27 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + c9, + row_number() OVER (ORDER BY c9) row_num, + first_value(c9) OVER (ORDER BY c9) first_c9, + first_value(c9) OVER (ORDER BY c9 DESC) first_c9_desc, + last_value(c9) OVER (ORDER BY c9) last_c9, + last_value(c9) OVER (ORDER BY c9 DESC) last_c9_desc, + nth_value(c9, 2) OVER (ORDER BY c9) second_c9, + nth_value(c9, 2) OVER (ORDER BY c9 DESC) second_c9_desc +FROM test +ORDER BY c9; diff --git a/integration-tests/sqls/simple_window_full_aggregation.sql b/integration-tests/sqls/simple_window_full_aggregation.sql index 94860bc3b1835..7346f67fa4ba4 100644 --- a/integration-tests/sqls/simple_window_full_aggregation.sql +++ b/integration-tests/sqls/simple_window_full_aggregation.sql @@ -11,7 +11,7 @@ -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language gOVERning permissions and +-- See the License for the specific language governing permissions and -- limitations under the License. SELECT diff --git a/integration-tests/sqls/simple_window_ordered_aggregation.sql b/integration-tests/sqls/simple_window_ordered_aggregation.sql index d9f467b0cb09a..567c1881a3db6 100644 --- a/integration-tests/sqls/simple_window_ordered_aggregation.sql +++ b/integration-tests/sqls/simple_window_ordered_aggregation.sql @@ -11,7 +11,7 @@ -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language gOVERning permissions and +-- See the License for the specific language governing permissions and -- limitations under the License. SELECT diff --git a/integration-tests/sqls/simple_window_partition_aggregation.sql b/integration-tests/sqls/simple_window_partition_aggregation.sql index f395671db8cc8..bac4e465f626b 100644 --- a/integration-tests/sqls/simple_window_partition_aggregation.sql +++ b/integration-tests/sqls/simple_window_partition_aggregation.sql @@ -11,7 +11,7 @@ -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language gOVERning permissions and +-- See the License for the specific language governing permissions and -- limitations under the License. SELECT diff --git a/integration-tests/sqls/simple_window_partition_order_aggregation.sql b/integration-tests/sqls/simple_window_partition_order_aggregation.sql index a11a9ec6e4b1e..2702c0e2e0326 100644 --- a/integration-tests/sqls/simple_window_partition_order_aggregation.sql +++ b/integration-tests/sqls/simple_window_partition_order_aggregation.sql @@ -11,7 +11,7 @@ -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. --- See the License for the specific language gOVERning permissions and +-- See the License for the specific language governing permissions and -- limitations under the License. 
SELECT diff --git a/integration-tests/sqls/simple_window_ranked_built_in_functions.sql b/integration-tests/sqls/simple_window_ranked_built_in_functions.sql new file mode 100644 index 0000000000000..0ea6b042555cc --- /dev/null +++ b/integration-tests/sqls/simple_window_ranked_built_in_functions.sql @@ -0,0 +1,22 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +select + c9, + rank() OVER (PARTITION BY c2 ORDER BY c3) rank_by_c3, + dense_rank() OVER (PARTITION BY c2 ORDER BY c3) dense_rank_by_c3 +FROM test +ORDER BY c9; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 92670bed0c4dd..2bb8da9fd5c58 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 9, msg="tests are missed") + self.assertEqual(len(files), 11, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( @@ -82,7 +82,7 @@ def test_parity(self): ) psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname))) self.assertTrue( - np.allclose(datafusion_output, psql_output), + np.allclose(datafusion_output, psql_output, equal_nan=True), msg=f"datafusion output=\n{datafusion_output}, psql_output=\n{psql_output}", ) From fddab22aa562750f67385a961497dc020b18c4b2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 1 Jul 2021 05:00:03 +0800 Subject: [PATCH 230/329] Use repartition in window functions to speed up (#569) * implement window functions with partition by * fix partition requirement --- datafusion/src/execution/context.rs | 11 +++++ datafusion/src/physical_plan/planner.rs | 42 +++++++++++++++-- datafusion/src/physical_plan/windows.rs | 62 ++++++------------------- datafusion/src/sql/utils.rs | 24 ++++++++++ 4 files changed, 86 insertions(+), 53 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 5df8e20ea6060..436bce5952bdc 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -630,6 +630,9 @@ pub struct ExecutionConfig { /// Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel /// using the provided `concurrency` level pub repartition_aggregations: bool, + /// Should DataFusion repartition data using the partition keys to execute window functions in + /// parallel using the provided `concurrency` level + pub repartition_windows: bool, } impl Default for ExecutionConfig { @@ -659,6 +662,7 @@ impl Default for ExecutionConfig { information_schema: false, repartition_joins: true, repartition_aggregations: true, + 
repartition_windows: true, } } } @@ -749,11 +753,18 @@ impl ExecutionConfig { self.repartition_joins = enabled; self } + /// Enables or disables the use of repartitioning for aggregations to improve parallelism pub fn with_repartition_aggregations(mut self, enabled: bool) -> Self { self.repartition_aggregations = enabled; self } + + /// Enables or disables the use of repartitioning for window functions to improve parallelism + pub fn with_repartition_windows(mut self, enabled: bool) -> Self { + self.repartition_windows = enabled; + self + } } /// Holds per-execution properties and data (such as starting timestamps, etc). diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index c3bb9a80136f1..75f15653ba463 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -44,7 +44,7 @@ use crate::physical_plan::{ }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; -use crate::sql::utils::generate_sort_key; +use crate::sql::utils::{generate_sort_key, window_expr_common_partition_keys}; use crate::variable::VarType; use crate::{ error::{DataFusionError, Result}, @@ -264,6 +264,38 @@ impl DefaultPhysicalPlanner { "Impossibly got empty window expression".to_owned(), )); } + + let input_exec = self.create_initial_plan(input, ctx_state)?; + + // at this moment we are guaranteed by the logical planner + // to have all the window_expr to have equal sort key + let partition_keys = window_expr_common_partition_keys(window_expr)?; + + let can_repartition = !partition_keys.is_empty() + && ctx_state.config.concurrency > 1 + && ctx_state.config.repartition_windows; + + let input_exec = if can_repartition { + let partition_keys = partition_keys + .iter() + .map(|e| { + self.create_physical_expr( + e, + input.schema(), + &input_exec.schema(), + ctx_state, + ) + }) + .collect::>>>()?; + Arc::new(RepartitionExec::try_new( + input_exec, + Partitioning::Hash(partition_keys, ctx_state.config.concurrency), + )?) + } else { + input_exec + }; + + // add a sort phase let get_sort_keys = |expr: &Expr| match expr { Expr::WindowFunction { ref partition_by, @@ -272,7 +304,6 @@ impl DefaultPhysicalPlanner { } => generate_sort_key(partition_by, order_by), _ => unreachable!(), }; - let sort_keys = get_sort_keys(&window_expr[0]); if window_expr.len() > 1 { debug_assert!( @@ -283,7 +314,6 @@ impl DefaultPhysicalPlanner { ); } - let input_exec = self.create_initial_plan(input, ctx_state)?; let logical_input_schema = input.schema(); let input_exec = if sort_keys.is_empty() { @@ -310,7 +340,11 @@ impl DefaultPhysicalPlanner { _ => unreachable!(), }) .collect::>>()?; - Arc::new(SortExec::try_new(sort_keys, input_exec)?) + Arc::new(if can_repartition { + SortExec::new_with_partitioning(sort_keys, input_exec, true) + } else { + SortExec::try_new(sort_keys, input_exec)? 
+            })
         };

         let physical_input_schema = input_exec.schema();
diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs
index 89263767c72af..cd603fd5134ee 100644
--- a/datafusion/src/physical_plan/windows.rs
+++ b/datafusion/src/physical_plan/windows.rs
@@ -404,11 +404,22 @@ impl ExecutionPlan for WindowAggExec {

     /// Get the output partitioning of this plan
     fn output_partitioning(&self) -> Partitioning {
-        Partitioning::UnknownPartitioning(1)
+        // because we can have repartitioning using the partition keys
+        // this would be either 1 or more than 1 depending on the presence of
+        // repartitioning
+        self.input.output_partitioning()
     }

     fn required_child_distribution(&self) -> Distribution {
-        Distribution::SinglePartition
+        if self
+            .window_expr()
+            .iter()
+            .all(|expr| expr.partition_by().is_empty())
+        {
+            Distribution::SinglePartition
+        } else {
+            Distribution::UnspecifiedDistribution
+        }
     }

     fn with_new_children(
@@ -428,22 +439,7 @@ impl ExecutionPlan for WindowAggExec {
     }

     async fn execute(&self, partition: usize) -> Result<SendableRecordBatchStream> {
-        if 0 != partition {
-            return Err(DataFusionError::Internal(format!(
-                "WindowAggExec invalid partition {}",
-                partition
-            )));
-        }
-
-        // window needs to operate on a single partition currently
-        if 1 != self.input.output_partitioning().partition_count() {
-            return Err(DataFusionError::Internal(
-                "WindowAggExec requires a single input partition".to_owned(),
-            ));
-        }
-
         let input = self.input.execute(partition).await?;
-
         let stream = Box::pin(WindowAggStream::new(
             self.schema.clone(),
             self.window_expr.clone(),
@@ -580,38 +576,6 @@ mod tests {
         Ok((input, schema))
     }

-    #[tokio::test]
-    async fn window_function_input_partition() -> Result<()> {
-        let (input, schema) = create_test_schema(4)?;
-
-        let window_exec = Arc::new(WindowAggExec::try_new(
-            vec![create_window_expr(
-                &WindowFunction::AggregateFunction(AggregateFunction::Count),
-                "count".to_owned(),
-                &[col("c3", &schema)?],
-                &[],
-                &[],
-                Some(WindowFrame::default()),
-                schema.as_ref(),
-            )?],
-            input,
-            schema.clone(),
-        )?);
-
-        let result = collect(window_exec).await;
-
-        assert!(result.is_err());
-        if let Some(DataFusionError::Internal(msg)) = result.err() {
-            assert_eq!(
-                msg,
-                "WindowAggExec requires a single input partition".to_owned()
-            );
-        } else {
-            unreachable!("Expect an internal error to happen");
-        }
-        Ok(())
-    }
-
     #[tokio::test]
     async fn window_function() -> Result<()> {
         let (input, schema) = create_test_schema(1)?;
diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs
index 080f84ef10ed3..28243360c412f 100644
--- a/datafusion/src/sql/utils.rs
+++ b/datafusion/src/sql/utils.rs
@@ -462,6 +462,30 @@ pub(crate) fn generate_sort_key(
     sort_key
 }

+/// given a slice of window expressions sharing the same sort key, find their common partition
+/// keys.
+pub(crate) fn window_expr_common_partition_keys(
+    window_exprs: &[Expr],
+) -> Result<&[Expr]> {
+    let all_partition_keys = window_exprs
+        .iter()
+        .map(|expr| match expr {
+            Expr::WindowFunction { partition_by, ..
} => Ok(partition_by), + expr => Err(DataFusionError::Execution(format!( + "Impossibly got non-window expr {:?}", + expr + ))), + }) + .collect::>>()?; + let result = all_partition_keys + .iter() + .min_by_key(|s| s.len()) + .ok_or_else(|| { + DataFusionError::Execution("No window expressions found".to_owned()) + })?; + Ok(result) +} + /// group a slice of window expression expr by their order by expressions pub(crate) fn group_window_expr_by_sort_keys( window_expr: &[Expr], From 03cfcb26ad6a566dc6fabe6f93e4f3b3d416432d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 1 Jul 2021 13:29:00 -0400 Subject: [PATCH 231/329] Update API for extension planning to include logical plan (#643) * Update API for extension planning to include logical plan * Review comments --- datafusion/src/execution/context.rs | 10 +++ datafusion/src/physical_plan/mod.rs | 14 +--- datafusion/src/physical_plan/planner.rs | 85 ++++++++++++++++++++++--- datafusion/tests/user_defined_plan.rs | 9 ++- 4 files changed, 93 insertions(+), 25 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 436bce5952bdc..d5a84869ad94a 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -3376,6 +3376,16 @@ mod tests { "query not supported".to_string(), )) } + + fn create_physical_expr( + &self, + _expr: &Expr, + _input_dfschema: &crate::logical_plan::DFSchema, + _input_schema: &Schema, + _ctx_state: &ExecutionContextState, + ) -> Result> { + unimplemented!() + } } struct MyQueryPlanner {} diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 2122751abb604..307fff619478e 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -20,8 +20,6 @@ use self::{ coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, }; -use crate::execution::context::ExecutionContextState; -use crate::logical_plan::LogicalPlan; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::{ error::{DataFusionError, Result}, @@ -122,16 +120,8 @@ impl SQLMetric { } } -/// Physical query planner that converts a `LogicalPlan` to an -/// `ExecutionPlan` suitable for execution. -pub trait PhysicalPlanner { - /// Create a physical plan from a logical plan - fn create_physical_plan( - &self, - logical_plan: &LogicalPlan, - ctx_state: &ExecutionContextState, - ) -> Result>; -} +/// Physical planner interface +pub use self::planner::PhysicalPlanner; /// `ExecutionPlan` represent nodes in the DataFusion Physical Plan. /// diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 75f15653ba463..5b43ec12bbf03 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -39,9 +39,7 @@ use crate::physical_plan::sort::SortExec; use crate::physical_plan::udf; use crate::physical_plan::windows::WindowAggExec; use crate::physical_plan::{hash_utils, Partitioning}; -use crate::physical_plan::{ - AggregateExpr, ExecutionPlan, PhysicalExpr, PhysicalPlanner, WindowExpr, -}; +use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, WindowExpr}; use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::sql::utils::{generate_sort_key, window_expr_common_partition_keys}; @@ -172,16 +170,51 @@ fn physical_name(e: &Expr, input_schema: &DFSchema) -> Result { } } +/// Physical query planner that converts a `LogicalPlan` to an +/// `ExecutionPlan` suitable for execution. 
+pub trait PhysicalPlanner { + /// Create a physical plan from a logical plan + fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + ctx_state: &ExecutionContextState, + ) -> Result>; + + /// Create a physical expression from a logical expression + /// suitable for evaluation + /// + /// `expr`: the expression to convert + /// + /// `input_dfschema`: the logical plan schema for evaluating `e` + /// + /// `input_schema`: the physical schema for evaluating `e` + fn create_physical_expr( + &self, + expr: &Expr, + input_dfschema: &DFSchema, + input_schema: &Schema, + ctx_state: &ExecutionContextState, + ) -> Result>; +} + /// This trait exposes the ability to plan an [`ExecutionPlan`] out of a [`LogicalPlan`]. pub trait ExtensionPlanner { /// Create a physical plan for a [`UserDefinedLogicalNode`]. - /// This errors when the planner knows how to plan the concrete implementation of `node` - /// but errors while doing so, and `None` when the planner does not know how to plan the `node` - /// and wants to delegate the planning to another [`ExtensionPlanner`]. + /// + /// `input_dfschema`: the logical plan schema for the inputs to this node + /// + /// Returns an error when the planner knows how to plan the concrete + /// implementation of `node` but errors while doing so. + /// + /// Returns `None` when the planner does not know how to plan the + /// `node` and wants to delegate the planning to another + /// [`ExtensionPlanner`]. fn plan_extension( &self, + planner: &dyn PhysicalPlanner, node: &dyn UserDefinedLogicalNode, - inputs: &[Arc], + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], ctx_state: &ExecutionContextState, ) -> Result>>; } @@ -210,6 +243,30 @@ impl PhysicalPlanner for DefaultPhysicalPlanner { let plan = self.create_initial_plan(logical_plan, ctx_state)?; self.optimize_plan(plan, ctx_state) } + + /// Create a physical expression from a logical expression + /// suitable for evaluation + /// + /// `e`: the expression to convert + /// + /// `input_dfschema`: the logical plan schema for evaluating `e` + /// + /// `input_schema`: the physical schema for evaluating `e` + fn create_physical_expr( + &self, + expr: &Expr, + input_dfschema: &DFSchema, + input_schema: &Schema, + ctx_state: &ExecutionContextState, + ) -> Result> { + DefaultPhysicalPlanner::create_physical_expr( + self, + expr, + input_dfschema, + input_schema, + ctx_state, + ) + } } impl DefaultPhysicalPlanner { @@ -721,7 +778,7 @@ impl DefaultPhysicalPlanner { ))) } LogicalPlan::Extension { node } => { - let inputs = node + let physical_inputs = node .inputs() .into_iter() .map(|input_plan| self.create_initial_plan(input_plan, ctx_state)) @@ -733,7 +790,13 @@ impl DefaultPhysicalPlanner { if let Some(plan) = maybe_plan { Ok(Some(plan)) } else { - planner.plan_extension(node.as_ref(), &inputs, ctx_state) + planner.plan_extension( + self, + node.as_ref(), + &node.inputs(), + &physical_inputs, + ctx_state, + ) } }, )?; @@ -1644,8 +1707,10 @@ mod tests { /// Create a physical plan for an extension node fn plan_extension( &self, + _planner: &dyn PhysicalPlanner, _node: &dyn UserDefinedLogicalNode, - _inputs: &[Arc], + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], _ctx_state: &ExecutionContextState, ) -> Result>> { Ok(Some(Arc::new(NoOpExecutionPlan { diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index 22ebec8b9a994..21b49638d23a1 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -321,16 +321,19 @@ 
impl ExtensionPlanner for TopKPlanner { /// Create a physical plan for an extension node fn plan_extension( &self, + _planner: &dyn PhysicalPlanner, node: &dyn UserDefinedLogicalNode, - inputs: &[Arc], + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], _ctx_state: &ExecutionContextState, ) -> Result>> { Ok( if let Some(topk_node) = node.as_any().downcast_ref::() { - assert_eq!(inputs.len(), 1, "Inconsistent number of inputs"); + assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs"); + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); // figure out input name Some(Arc::new(TopKExec { - input: inputs[0].clone(), + input: physical_inputs[0].clone(), k: topk_node.k, })) } else { From 84fab4e6518d1250c0afd31e246517b7ddc37cf7 Mon Sep 17 00:00:00 2001 From: Todd Treece <360020+toddtreece@users.noreply.github.com> Date: Fri, 2 Jul 2021 16:58:41 -0400 Subject: [PATCH 232/329] bump arrow and parquet versions to 4.4.0 (#654) --- datafusion/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index a001fc7c58035..f1a77741064e4 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { version = "4.3", features = ["prettyprint"] } -parquet = { version = "4.3", features = ["arrow"] } +arrow = { version = "4.4", features = ["prettyprint"] } +parquet = { version = "4.4", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" From c0de9bb49adce763a81fe4885527ccef4487ef0a Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 3 Jul 2021 05:16:53 +0800 Subject: [PATCH 233/329] implement lead and lag built-in window function (#429) * add lead and lag * add integration tests * add partitioned window functions --- ballista/rust/core/src/serde/scheduler/mod.rs | 1 - .../src/physical_plan/expressions/lead_lag.rs | 181 ++++++++++++++++++ .../src/physical_plan/expressions/mod.rs | 2 + .../physical_plan/expressions/nth_value.rs | 3 +- datafusion/src/physical_plan/windows.rs | 42 ++-- .../partitioned_window_built_in_functions.sql | 29 +++ .../sqls/simple_window_built_in_functions.sql | 2 + integration-tests/test_psql_parity.py | 2 +- 8 files changed, 246 insertions(+), 16 deletions(-) create mode 100644 datafusion/src/physical_plan/expressions/lead_lag.rs create mode 100644 integration-tests/sqls/partitioned_window_built_in_functions.sql diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index 75e3ac496ff57..f66bb08189d28 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -34,7 +34,6 @@ pub mod to_proto; /// Action that can be sent to an executor #[derive(Debug, Clone)] - pub enum Action { /// Execute a query and store the results in memory ExecutePartition(ExecutePartition), diff --git a/datafusion/src/physical_plan/expressions/lead_lag.rs b/datafusion/src/physical_plan/expressions/lead_lag.rs new file mode 100644 index 0000000000000..352d97c1e1167 --- /dev/null +++ b/datafusion/src/physical_plan/expressions/lead_lag.rs @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines physical expression for `lead` and `lag` that can evaluated +//! at runtime during query execution + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::window_functions::PartitionEvaluator; +use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; +use arrow::array::ArrayRef; +use arrow::compute::kernels::window::shift; +use arrow::datatypes::{DataType, Field}; +use arrow::record_batch::RecordBatch; +use std::any::Any; +use std::ops::Range; +use std::sync::Arc; + +/// window shift expression +#[derive(Debug)] +pub struct WindowShift { + name: String, + data_type: DataType, + shift_offset: i64, + expr: Arc, +} + +/// lead() window function +pub fn lead( + name: String, + data_type: DataType, + expr: Arc, +) -> WindowShift { + WindowShift { + name, + data_type, + shift_offset: -1, + expr, + } +} + +/// lag() window function +pub fn lag( + name: String, + data_type: DataType, + expr: Arc, +) -> WindowShift { + WindowShift { + name, + data_type, + shift_offset: 1, + expr, + } +} + +impl BuiltInWindowFunctionExpr for WindowShift { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result { + let nullable = true; + Ok(Field::new(&self.name, self.data_type.clone(), nullable)) + } + + fn expressions(&self) -> Vec> { + vec![self.expr.clone()] + } + + fn name(&self) -> &str { + &self.name + } + + fn create_evaluator( + &self, + batch: &RecordBatch, + ) -> Result> { + let values = self + .expressions() + .iter() + .map(|e| e.evaluate(batch)) + .map(|r| r.map(|v| v.into_array(batch.num_rows()))) + .collect::>>()?; + Ok(Box::new(WindowShiftEvaluator { + shift_offset: self.shift_offset, + values, + })) + } +} + +pub(crate) struct WindowShiftEvaluator { + shift_offset: i64, + values: Vec, +} + +impl PartitionEvaluator for WindowShiftEvaluator { + fn evaluate_partition(&self, partition: Range) -> Result { + let value = &self.values[0]; + let value = value.slice(partition.start, partition.end - partition.start); + shift(value.as_ref(), self.shift_offset).map_err(DataFusionError::ArrowError) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Result; + use crate::physical_plan::expressions::Column; + use arrow::record_batch::RecordBatch; + use arrow::{array::*, datatypes::*}; + + fn test_i32_result(expr: WindowShift, expected: Int32Array) -> Result<()> { + let arr: ArrayRef = Arc::new(Int32Array::from(vec![1, -2, 3, -4, 5, -6, 7, 8])); + let values = vec![arr]; + let schema = Schema::new(vec![Field::new("arr", DataType::Int32, false)]); + let batch = RecordBatch::try_new(Arc::new(schema), values.clone())?; + let result = expr.create_evaluator(&batch)?.evaluate(vec![0..8])?; + assert_eq!(1, result.len()); + let result = result[0].as_any().downcast_ref::().unwrap(); + assert_eq!(expected, *result); + Ok(()) + } + + #[test] + fn lead_lag_window_shift() -> Result<()> 
{ + test_i32_result( + lead( + "lead".to_owned(), + DataType::Float32, + Arc::new(Column::new("c3", 0)), + ), + vec![ + Some(-2), + Some(3), + Some(-4), + Some(5), + Some(-6), + Some(7), + Some(8), + None, + ] + .iter() + .collect::(), + )?; + + test_i32_result( + lag( + "lead".to_owned(), + DataType::Float32, + Arc::new(Column::new("c3", 0)), + ), + vec![ + None, + Some(1), + Some(-2), + Some(3), + Some(-4), + Some(5), + Some(-6), + Some(7), + ] + .iter() + .collect::(), + )?; + Ok(()) + } +} diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index 440cb5b4ec67a..bd3dab65b05de 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -36,6 +36,7 @@ mod count; mod in_list; mod is_not_null; mod is_null; +mod lead_lag; mod literal; mod min_max; mod negative; @@ -58,6 +59,7 @@ pub use count::Count; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; +pub use lead_lag::{lag, lead}; pub use literal::{lit, Literal}; pub use min_max::{Max, Min}; pub use negative::{negative, NegativeExpr}; diff --git a/datafusion/src/physical_plan/expressions/nth_value.rs b/datafusion/src/physical_plan/expressions/nth_value.rs index 854078e232f00..7542a251f50d6 100644 --- a/datafusion/src/physical_plan/expressions/nth_value.rs +++ b/datafusion/src/physical_plan/expressions/nth_value.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Defines physical expressions that can evaluated at runtime during query execution +//! Defines physical expressions for `first_value`, `last_value`, and `nth_value` +//! that can evaluated at runtime during query execution use crate::error::{DataFusionError, Result}; use crate::physical_plan::window_functions::PartitionEvaluator; diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index cd603fd5134ee..1b783782e164b 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -21,7 +21,9 @@ use crate::error::{DataFusionError, Result}; use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::physical_plan::{ aggregates, common, - expressions::{dense_rank, rank, Literal, NthValue, PhysicalSortExpr, RowNumber}, + expressions::{ + dense_rank, lag, lead, rank, Literal, NthValue, PhysicalSortExpr, RowNumber, + }, type_coercion::coerce, window_functions::{ signature_for_built_in, BuiltInWindowFunction, BuiltInWindowFunctionExpr, @@ -100,10 +102,22 @@ fn create_built_in_window_expr( input_schema: &Schema, name: String, ) -> Result> { - match fun { - BuiltInWindowFunction::RowNumber => Ok(Arc::new(RowNumber::new(name))), - BuiltInWindowFunction::Rank => Ok(Arc::new(rank(name))), - BuiltInWindowFunction::DenseRank => Ok(Arc::new(dense_rank(name))), + Ok(match fun { + BuiltInWindowFunction::RowNumber => Arc::new(RowNumber::new(name)), + BuiltInWindowFunction::Rank => Arc::new(rank(name)), + BuiltInWindowFunction::DenseRank => Arc::new(dense_rank(name)), + BuiltInWindowFunction::Lag => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = coerced_args[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Arc::new(lag(name, data_type, arg)) + } + BuiltInWindowFunction::Lead => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = 
coerced_args[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Arc::new(lead(name, data_type, arg)) + } BuiltInWindowFunction::NthValue => { let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; let arg = coerced_args[0].clone(); @@ -118,25 +132,27 @@ fn create_built_in_window_expr( .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; let n: u32 = n as u32; let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(NthValue::nth_value(name, arg, data_type, n)?)) + Arc::new(NthValue::nth_value(name, arg, data_type, n)?) } BuiltInWindowFunction::FirstValue => { let arg = coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(NthValue::first_value(name, arg, data_type))) + Arc::new(NthValue::first_value(name, arg, data_type)) } BuiltInWindowFunction::LastValue => { let arg = coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); let data_type = args[0].data_type(input_schema)?; - Ok(Arc::new(NthValue::last_value(name, arg, data_type))) + Arc::new(NthValue::last_value(name, arg, data_type)) } - _ => Err(DataFusionError::NotImplemented(format!( - "Window function with {:?} not yet implemented", - fun - ))), - } + _ => { + return Err(DataFusionError::NotImplemented(format!( + "Window function with {:?} not yet implemented", + fun + ))) + } + }) } /// A window expr that takes the form of a built in window function diff --git a/integration-tests/sqls/partitioned_window_built_in_functions.sql b/integration-tests/sqls/partitioned_window_built_in_functions.sql new file mode 100644 index 0000000000000..f27b085f5033d --- /dev/null +++ b/integration-tests/sqls/partitioned_window_built_in_functions.sql @@ -0,0 +1,29 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+ +SELECT + c9, + row_number() OVER (PARTITION BY c2 ORDER BY c9) row_num, + lead(c9) OVER (PARTITION BY c2 ORDER BY c9) lead_c9, + lag(c9) OVER (PARTITION BY c2 ORDER BY c9) lag_c9, + first_value(c9) OVER (PARTITION BY c2 ORDER BY c9) first_c9, + first_value(c9) OVER (PARTITION BY c2 ORDER BY c9 DESC) first_c9_desc, + last_value(c9) OVER (PARTITION BY c2 ORDER BY c9) last_c9, + last_value(c9) OVER (PARTITION BY c2 ORDER BY c9 DESC) last_c9_desc, + nth_value(c9, 2) OVER (PARTITION BY c2 ORDER BY c9) second_c9, + nth_value(c9, 2) OVER (PARTITION BY c2 ORDER BY c9 DESC) second_c9_desc +FROM test +ORDER BY c9; diff --git a/integration-tests/sqls/simple_window_built_in_functions.sql b/integration-tests/sqls/simple_window_built_in_functions.sql index e76b383060026..05c34dd12fca7 100644 --- a/integration-tests/sqls/simple_window_built_in_functions.sql +++ b/integration-tests/sqls/simple_window_built_in_functions.sql @@ -17,6 +17,8 @@ SELECT c9, row_number() OVER (ORDER BY c9) row_num, + lead(c9) OVER (ORDER BY c9) lead_c9, + lag(c9) OVER (ORDER BY c9) lag_c9, first_value(c9) OVER (ORDER BY c9) first_c9, first_value(c9) OVER (ORDER BY c9 DESC) first_c9_desc, last_value(c9) OVER (ORDER BY c9) last_c9, diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 2bb8da9fd5c58..766f403f3e543 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 11, msg="tests are missed") + self.assertEqual(len(files), 12, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From 8d8558bb50786aa37b544722ee5511fb7e5cbd50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 2 Jul 2021 23:17:58 +0200 Subject: [PATCH 234/329] Add support for leading field in interval (#647) * Add support for leading field in interval * Naming * Fix clippy, tests --- datafusion/src/sql/planner.rs | 23 ++++++++++++----------- datafusion/tests/sql.rs | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index b86dc0f48c149..213ae890d7d09 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -1264,13 +1264,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { last_field: &Option, fractional_seconds_precision: &Option, ) -> Result { - if leading_field.is_some() { - return Err(DataFusionError::NotImplemented(format!( - "Unsupported Interval Expression with leading_field {:?}", - leading_field - ))); - } - if leading_precision.is_some() { return Err(DataFusionError::NotImplemented(format!( "Unsupported Interval Expression with leading_precision {:?}", @@ -1367,10 +1360,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { break; } - let (diff_month, diff_days, diff_millis) = calculate_from_part( - interval_period_str.unwrap(), - parts.next().unwrap_or("second"), - )?; + let leading_field = leading_field + .as_ref() + .map(|dt| dt.to_string()) + .unwrap_or_else(|| "second".to_string()); + + let unit = parts + .next() + .map(|part| part.to_string()) + .unwrap_or(leading_field); + + let (diff_month, diff_days, diff_millis) = + calculate_from_part(interval_period_str.unwrap(), &unit)?; result_month += diff_month as i64; diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 
5cb5529ba80e7..82c12ce217c98 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -3300,6 +3300,11 @@ async fn test_interval_expressions() -> Result<()> { "interval '5 day'", "0 years 0 mons 5 days 0 hours 0 mins 0.00 secs" ); + // Hour is ignored, this matches PostgreSQL + test_expression!( + "interval '5 day' hour", + "0 years 0 mons 5 days 0 hours 0 mins 0.00 secs" + ); test_expression!( "interval '5 day 4 hours 3 minutes 2 seconds 100 milliseconds'", "0 years 0 mons 5 days 4 hours 3 mins 2.100 secs" @@ -3308,10 +3313,18 @@ async fn test_interval_expressions() -> Result<()> { "interval '0.5 month'", "0 years 0 mons 15 days 0 hours 0 mins 0.00 secs" ); + test_expression!( + "interval '0.5' month", + "0 years 0 mons 15 days 0 hours 0 mins 0.00 secs" + ); test_expression!( "interval '1 month'", "0 years 1 mons 0 days 0 hours 0 mins 0.00 secs" ); + test_expression!( + "interval '1' MONTH", + "0 years 1 mons 0 days 0 hours 0 mins 0.00 secs" + ); test_expression!( "interval '5 month'", "0 years 5 mons 0 days 0 hours 0 mins 0.00 secs" @@ -3332,6 +3345,10 @@ async fn test_interval_expressions() -> Result<()> { "interval '2 year'", "2 years 0 mons 0 days 0 hours 0 mins 0.00 secs" ); + test_expression!( + "interval '2' year", + "2 years 0 mons 0 days 0 hours 0 mins 0.00 secs" + ); Ok(()) } From 4a65ee37abd6319ed6d342f7ff22f46f4d800a03 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 3 Jul 2021 05:18:47 +0800 Subject: [PATCH 235/329] fix python crate with the changes to logical plan builder (#650) --- python/.gitignore | 1 + python/Cargo.toml | 2 +- python/src/dataframe.rs | 16 ++++++++-------- python/tests/generic.py | 12 +++--------- python/tests/test_sql.py | 8 ++------ python/tests/test_udaf.py | 3 ++- 6 files changed, 17 insertions(+), 25 deletions(-) diff --git a/python/.gitignore b/python/.gitignore index 48fe4dbe52dde..feb402ed4c5d7 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -18,3 +18,4 @@ /target Cargo.lock venv +.venv diff --git a/python/Cargo.toml b/python/Cargo.toml index 8f1480deedbc9..777e42745de58 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,7 +31,7 @@ libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.13.2", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "c92079dfb3045a9a46d12c3bc22361a44d11b8bc" } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "fddab22aa562750f67385a961497dc020b18c4b2" } [lib] name = "datafusion" diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs index 8ceac64741e9e..89c85f958c797 100644 --- a/python/src/dataframe.rs +++ b/python/src/dataframe.rs @@ -51,7 +51,7 @@ impl DataFrame { #[args(args = "*")] fn select(&self, args: &PyTuple) -> PyResult { let expressions = expression::from_tuple(args)?; - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let builder = errors::wrap(builder.project(expressions.into_iter().map(|e| e.expr)))?; let plan = errors::wrap(builder.build())?; @@ -64,7 +64,7 @@ impl DataFrame { /// Filter according to the `predicate` expression fn filter(&self, predicate: expression::Expression) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let builder = errors::wrap(builder.filter(predicate.expr))?; let plan = errors::wrap(builder.build())?; @@ -80,7 +80,7 @@ impl 
DataFrame { group_by: Vec, aggs: Vec, ) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let builder = errors::wrap(builder.aggregate( group_by.into_iter().map(|e| e.expr), aggs.into_iter().map(|e| e.expr), @@ -96,7 +96,7 @@ impl DataFrame { /// Sort by specified sorting expressions fn sort(&self, exprs: Vec) -> PyResult { let exprs = exprs.into_iter().map(|e| e.expr); - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let builder = errors::wrap(builder.sort(exprs))?; let plan = errors::wrap(builder.build())?; Ok(DataFrame { @@ -107,7 +107,7 @@ impl DataFrame { /// Limits the plan to return at most `count` rows fn limit(&self, count: usize) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let builder = errors::wrap(builder.limit(count))?; let plan = errors::wrap(builder.build())?; @@ -141,7 +141,7 @@ impl DataFrame { /// Returns the join of two DataFrames `on`. fn join(&self, right: &DataFrame, on: Vec<&str>, how: &str) -> PyResult { - let builder = LogicalPlanBuilder::from(&self.plan); + let builder = LogicalPlanBuilder::from(self.plan.clone()); let join_type = match how { "inner" => JoinType::Inner, @@ -162,8 +162,8 @@ impl DataFrame { let builder = errors::wrap(builder.join( &right.plan, join_type, - on.as_slice(), - on.as_slice(), + on.clone(), + on, ))?; let plan = errors::wrap(builder.build())?; diff --git a/python/tests/generic.py b/python/tests/generic.py index e61542e6ab37f..5871c5e891b28 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -49,9 +49,7 @@ def data_datetime(f): datetime.datetime.now() - datetime.timedelta(days=1), datetime.datetime.now() + datetime.timedelta(days=1), ] - return pa.array( - data, type=pa.timestamp(f), mask=np.array([False, True, False]) - ) + return pa.array(data, type=pa.timestamp(f), mask=np.array([False, True, False])) def data_date32(): @@ -60,9 +58,7 @@ def data_date32(): datetime.date(1980, 1, 1), datetime.date(2030, 1, 1), ] - return pa.array( - data, type=pa.date32(), mask=np.array([False, True, False]) - ) + return pa.array(data, type=pa.date32(), mask=np.array([False, True, False])) def data_timedelta(f): @@ -71,9 +67,7 @@ def data_timedelta(f): datetime.timedelta(days=1), datetime.timedelta(seconds=1), ] - return pa.array( - data, type=pa.duration(f), mask=np.array([False, True, False]) - ) + return pa.array(data, type=pa.duration(f), mask=np.array([False, True, False])) def data_binary_other(): diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 361526d069702..62d6c0975f3db 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -112,9 +112,7 @@ def test_cast(ctx, tmp_path): "float", ] - select = ", ".join( - [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] - ) + select = ", ".join([f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)]) # can execute, which implies that we can cast ctx.sql(f"SELECT {select} FROM t").collect() @@ -143,9 +141,7 @@ def test_udf( ctx, tmp_path, fn, input_types, output_type, input_values, expected_values ): # write to disk - path = helpers.write_parquet( - tmp_path / "a.parquet", pa.array(input_values) - ) + path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(input_values)) ctx.register_parquet("t", path) ctx.register_udf("udf", fn, input_types, output_type) diff --git a/python/tests/test_udaf.py 
b/python/tests/test_udaf.py index b24c08dbc8674..103d967663c46 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +from typing import List import pyarrow as pa import pyarrow.compute as pc import pytest @@ -30,7 +31,7 @@ class Accumulator: def __init__(self): self._sum = pa.scalar(0.0) - def to_scalars(self) -> [pa.Scalar]: + def to_scalars(self) -> List[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: From d97fc9145bfc41969bd30dee3d33337ac3932aa5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 2 Jul 2021 23:28:33 -0400 Subject: [PATCH 236/329] Fix RAT check (#652) --- .github/workflows/dev.yml | 21 +++++++---- dev/release/check-rat-report.py | 59 +++++++++++++++++++++++++++++++ dev/release/rat_exclude_files.txt | 16 +++++++++ dev/release/run-rat.sh | 43 ++++++++++++++++++++++ 4 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 dev/release/check-rat-report.py create mode 100755 dev/release/run-rat.sh diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index a7e574eef97c9..8bb35f1ef871b 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -16,15 +16,11 @@ # under the License. name: Dev - -on: - # always trigger - push: - pull_request: +on: [push, pull_request] jobs: lint: - name: Lint C++, Python, R, Rust, Docker, RAT + name: Lint C++, Python, R, Rust, Docker runs-on: ubuntu-latest steps: - name: Checkout Arrow @@ -42,6 +38,19 @@ jobs: - name: Lint run: archery lint --rat + rat: + name: Release Audit Tool (RAT) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Audit licenses + run: ./dev/release/run-rat.sh . + prettier: name: Use prettier to check formatting of documents runs-on: ubuntu-latest diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py new file mode 100644 index 0000000000000..e30d72bddd7f8 --- /dev/null +++ b/dev/release/check-rat-report.py @@ -0,0 +1,59 @@ +#!/usr/bin/python +############################################################################## +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+############################################################################## +import fnmatch +import re +import sys +import xml.etree.ElementTree as ET + +if len(sys.argv) != 3: + sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % + sys.argv[0]) + sys.exit(1) + +exclude_globs_filename = sys.argv[1] +xml_filename = sys.argv[2] + +globs = [line.strip() for line in open(exclude_globs_filename, "r")] + +tree = ET.parse(xml_filename) +root = tree.getroot() +resources = root.findall('resource') + +all_ok = True +for r in resources: + approvals = r.findall('license-approval') + if not approvals or approvals[0].attrib['name'] == 'true': + continue + clean_name = re.sub('^[^/]+/', '', r.attrib['name']) + excluded = False + for g in globs: + if fnmatch.fnmatch(clean_name, g): + excluded = True + break + if not excluded: + sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % ( + clean_name, r.attrib['name'], approvals[0].attrib['name'])) + all_ok = False + +if not all_ok: + sys.exit(1) + +print('OK') +sys.exit(0) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 96beccd0af81e..5a7d3517e1960 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -106,3 +106,19 @@ ballista/rust/scheduler/testdata/* ballista/ui/scheduler/yarn.lock python/rust-toolchain python/requirements*.txt +**/testdata/* +benchmarks/queries/* +benchmarks/data/* +ci/* +**/*.svg +**/*.csv +**/*.json +venv/* +testing/* +target/* +**/target/* +Cargo.lock +**/Cargo.lock +.history +parquet-testing/* +*rat.txt diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh new file mode 100755 index 0000000000000..94fa55fbe0974 --- /dev/null +++ b/dev/release/run-rat.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +RAT_VERSION=0.13 + +# download apache rat +if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then + curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar > apache-rat-${RAT_VERSION}.jar +fi + +RAT="java -jar apache-rat-${RAT_VERSION}.jar -x " + +RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) + +# generate the rat report +$RAT $1 > rat.txt +python $RELEASE_DIR/check-rat-report.py $RELEASE_DIR/rat_exclude_files.txt rat.txt > filtered_rat.txt +cat filtered_rat.txt +UNAPPROVED=`cat filtered_rat.txt | grep "NOT APPROVED" | wc -l` + +if [ "0" -eq "${UNAPPROVED}" ]; then + echo "No unapproved licenses" +else + echo "${UNAPPROVED} unapproved licences. 
Check rat report: rat.txt" + exit 1 +fi From 9314dbb31a785e8f08bb5e65ac55f51592920b01 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 4 Jul 2021 02:53:27 -0600 Subject: [PATCH 237/329] Ballista: Implement scalable distributed joins (#634) * Refactor Ballista planner to support RepartitionExec * Improve tests and replace MergeExec with CoalescePartitionsExec in query plan output --- .../src/execution_plans/shuffle_writer.rs | 20 +- .../src/execution_plans/unresolved_shuffle.rs | 15 +- ballista/rust/core/src/utils.rs | 11 +- ballista/rust/scheduler/src/planner.rs | 189 +++++++++--------- ballista/rust/scheduler/src/test_utils.rs | 15 +- datafusion/src/lib.rs | 2 +- .../src/physical_plan/coalesce_partitions.rs | 14 +- .../src/physical_plan/hash_aggregate.rs | 2 +- datafusion/src/physical_plan/limit.rs | 2 +- datafusion/src/physical_plan/mod.rs | 2 +- datafusion/src/physical_plan/planner.rs | 2 +- datafusion/tests/sql.rs | 2 +- 12 files changed, 143 insertions(+), 133 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 2d8d78324d2dd..7fffaba13217c 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -42,7 +42,9 @@ use datafusion::arrow::ipc::writer::FileWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; use datafusion::physical_plan::hash_join::create_hashes; -use datafusion::physical_plan::{ExecutionPlan, Partitioning, RecordBatchStream}; +use datafusion::physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, +}; use futures::StreamExt; use log::info; use std::fs::File; @@ -307,6 +309,22 @@ impl ExecutionPlan for ShuffleWriterExec { )), } } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!( + f, + "ShuffleWriterExec: {:?}", + self.shuffle_output_partitioning + ) + } + } + } } fn result_schema() -> SchemaRef { diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index 9c53bc7a1d43c..49b4f7a0992c2 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -23,12 +23,13 @@ use crate::serde::scheduler::PartitionLocation; use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; +use datafusion::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; use datafusion::{ error::{DataFusionError, Result}, physical_plan::RecordBatchStream, }; use log::info; +use std::fmt::Formatter; /// UnresolvedShuffleExec represents a dependency on the results of several ShuffleWriterExec nodes which haven't been computed yet. 
/// @@ -97,4 +98,16 @@ impl ExecutionPlan for UnresolvedShuffleExec { "Ballista UnresolvedShuffleExec does not support execution".to_owned(), )) } + + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default => { + write!(f, "UnresolvedShuffleExec") + } + } + } } diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index d043763dc6f1a..8a510f4808760 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -227,16 +227,7 @@ fn build_exec_plan_diagram( /// Create a DataFusion context that is compatible with Ballista pub fn create_datafusion_context() -> ExecutionContext { - // remove Repartition rule because that isn't supported yet - let rules: Vec> = vec![ - Arc::new(CoalesceBatches::new()), - Arc::new(AddCoalescePartitionsExec::new()), - ]; - let config = ExecutionConfig::new() - .with_concurrency(1) - .with_repartition_joins(false) - .with_repartition_aggregations(false) - .with_physical_optimizer_rules(rules); + let config = ExecutionConfig::new().with_concurrency(2); // TODO: this is hack to enable partitioned joins ExecutionContext::with_config(config) } diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 70d90a4a07d03..319526142bf96 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -28,15 +28,11 @@ use ballista_core::{ execution_plans::{ShuffleReaderExec, ShuffleWriterExec, UnresolvedShuffleExec}, serde::scheduler::PartitionLocation, }; -use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; -use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; -use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; +use datafusion::execution::context::ExecutionContext; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; -use datafusion::physical_plan::hash_join::HashJoinExec; +use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::windows::WindowAggExec; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use log::info; type PartialQueryStageResult = (Arc, Vec>); @@ -71,13 +67,18 @@ impl DistributedPlanner { info!("planning query stages"); let (new_plan, mut stages) = self.plan_query_stages_internal(job_id, execution_plan)?; - stages.push(create_query_stage(job_id, self.next_stage_id(), new_plan)?); + stages.push(create_shuffle_writer( + job_id, + self.next_stage_id(), + new_plan, + None, + )?); Ok(stages) } /// Returns a potentially modified version of the input execution_plan along with the resulting query stages. 
/// This function is needed because the input execution_plan might need to be modified, but it might not hold a - /// compelte query stage (its parent might also belong to the same stage) + /// complete query stage (its parent might also belong to the same stage) fn plan_query_stages_internal( &mut self, job_id: &str, @@ -98,22 +99,17 @@ impl DistributedPlanner { } if let Some(adapter) = execution_plan.as_any().downcast_ref::() { - // remove Repartition rule because that isn't supported yet - let rules: Vec> = vec![ - Arc::new(CoalesceBatches::new()), - Arc::new(AddCoalescePartitionsExec::new()), - ]; - let config = ExecutionConfig::new().with_physical_optimizer_rules(rules); - let ctx = ExecutionContext::with_config(config); + let ctx = ExecutionContext::new(); Ok((ctx.create_physical_plan(&adapter.logical_plan)?, stages)) - } else if let Some(merge) = execution_plan + } else if let Some(coalesce) = execution_plan .as_any() .downcast_ref::() { - let query_stage = create_query_stage( + let query_stage = create_shuffle_writer( job_id, self.next_stage_id(), - merge.children()[0].clone(), + coalesce.children()[0].clone(), + None, )?; let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( vec![query_stage.stage_id()], @@ -121,35 +117,26 @@ impl DistributedPlanner { query_stage.output_partitioning().partition_count(), )); stages.push(query_stage); - Ok((merge.with_new_children(vec![unresolved_shuffle])?, stages)) - } else if let Some(agg) = - execution_plan.as_any().downcast_ref::() + Ok(( + coalesce.with_new_children(vec![unresolved_shuffle])?, + stages, + )) + } else if let Some(repart) = + execution_plan.as_any().downcast_ref::() { - //TODO should insert query stages in more generic way based on partitioning metadata - // and not specifically for this operator - match agg.mode() { - AggregateMode::Final | AggregateMode::FinalPartitioned => { - let mut new_children: Vec> = vec![]; - for child in &children { - let new_stage = create_query_stage( - job_id, - self.next_stage_id(), - child.clone(), - )?; - new_children.push(Arc::new(UnresolvedShuffleExec::new( - vec![new_stage.stage_id()], - new_stage.schema().clone(), - new_stage.output_partitioning().partition_count(), - ))); - stages.push(new_stage); - } - Ok((agg.with_new_children(new_children)?, stages)) - } - AggregateMode::Partial => Ok((agg.with_new_children(children)?, stages)), - } - } else if let Some(join) = execution_plan.as_any().downcast_ref::() - { - Ok((join.with_new_children(children)?, stages)) + let query_stage = create_shuffle_writer( + job_id, + self.next_stage_id(), + repart.children()[0].clone(), + Some(repart.partitioning().to_owned()), + )?; + let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( + vec![query_stage.stage_id()], + query_stage.schema(), + query_stage.output_partitioning().partition_count(), + )); + stages.push(query_stage); + Ok((unresolved_shuffle, stages)) } else if let Some(window) = execution_plan.as_any().downcast_ref::() { @@ -158,25 +145,7 @@ impl DistributedPlanner { window ))) } else { - // TODO check for compatible partitioning schema, not just count - if execution_plan.output_partitioning().partition_count() - != children[0].output_partitioning().partition_count() - { - let mut new_children: Vec> = vec![]; - for child in &children { - let new_stage = - create_query_stage(job_id, self.next_stage_id(), child.clone())?; - new_children.push(Arc::new(UnresolvedShuffleExec::new( - vec![new_stage.stage_id()], - new_stage.schema().clone(), - 
new_stage.output_partitioning().partition_count(), - ))); - stages.push(new_stage); - } - Ok((execution_plan.with_new_children(new_children)?, stages)) - } else { - Ok((execution_plan.with_new_children(children)?, stages)) - } + Ok((execution_plan.with_new_children(children)?, stages)) } } @@ -224,17 +193,18 @@ pub fn remove_unresolved_shuffles( Ok(stage.with_new_children(new_children)?) } -fn create_query_stage( +fn create_shuffle_writer( job_id: &str, stage_id: usize, plan: Arc, + partitioning: Option, ) -> Result> { Ok(Arc::new(ShuffleWriterExec::try_new( job_id.to_owned(), stage_id, plan, "".to_owned(), // executor will decide on the work_dir path - None, + partitioning, )?)) } @@ -245,7 +215,7 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::protobuf; - use datafusion::physical_plan::hash_aggregate::HashAggregateExec; + use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; use datafusion::physical_plan::sort::SortExec; use datafusion::physical_plan::{ coalesce_partitions::CoalescePartitionsExec, projection::ProjectionExec, @@ -262,7 +232,7 @@ mod test { } #[test] - fn test() -> Result<(), BallistaError> { + fn distributed_hash_aggregate_plan() -> Result<(), BallistaError> { let mut ctx = datafusion_test_context("testdata")?; // simplified form of TPC-H query 1 @@ -285,41 +255,72 @@ mod test { } /* Expected result: - ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=1 - HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] - CsvExec: testdata/lineitem; partitions=2 - ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=2 - CoalescePartitionsExec - UnresolvedShuffleExec: stages=[1] - ShuffleWriterExec: job=f011432e-e424-4016-915d-e3d8b84f6dbd, stage=3 - SortExec { input: ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_ext - ProjectionExec { expr: [(Column { name: "l_returnflag" }, "l_returnflag"), (Column { name: "SUM(l_extendedprice Multip - HashAggregateExec: groupBy=["l_returnflag"], aggrExpr=["SUM(l_extendedprice Multiply Int64(1)) [\"l_extendedprice * CAST(1 AS Float64)\"]"] - UnresolvedShuffleExec: stages=[2] + + ShuffleWriterExec: Some(Hash([Column { name: "l_returnflag", index: 0 }], 2)) + HashAggregateExec: mode=Partial, gby=[l_returnflag@1 as l_returnflag], aggr=[SUM(l_extendedprice Multiply Int64(1))] + CsvExec: source=Path(testdata/lineitem: [testdata/lineitem/partition0.tbl,testdata/lineitem/partition1.tbl]), has_header=false + + ShuffleWriterExec: None + ProjectionExec: expr=[l_returnflag@0 as l_returnflag, SUM(lineitem.l_extendedprice Multiply Int64(1))@1 as sum_disc_price] + HashAggregateExec: mode=FinalPartitioned, gby=[l_returnflag@0 as l_returnflag], aggr=[SUM(l_extendedprice Multiply Int64(1))] + CoalesceBatchesExec: target_batch_size=4096 + RepartitionExec: partitioning=Hash([Column { name: "l_returnflag", index: 0 }], 2) + HashAggregateExec: mode=Partial, gby=[l_returnflag@1 as l_returnflag], aggr=[SUM(l_extendedprice Multiply Int64(1))] + CsvExec: source=Path(testdata/lineitem: [testdata/lineitem/partition0.tbl,testdata/lineitem/partition1.tbl]), has_header=false + + ShuffleWriterExec: None + SortExec: [l_returnflag@0 ASC] + CoalescePartitionsExec + UnresolvedShuffleExec */ - let sort = stages[2].children()[0].clone(); - let sort = downcast_exec!(sort, SortExec); + assert_eq!(3, 
stages.len()); - let projection = sort.children()[0].clone(); - println!("{:?}", projection); - let projection = downcast_exec!(projection, ProjectionExec); + // verify stage 0 + let stage0 = stages[0].children()[0].clone(); + let partial_hash = downcast_exec!(stage0, HashAggregateExec); + assert!(*partial_hash.mode() == AggregateMode::Partial); + // verify stage 1 + let stage1 = stages[1].children()[0].clone(); + let projection = downcast_exec!(stage1, ProjectionExec); let final_hash = projection.children()[0].clone(); let final_hash = downcast_exec!(final_hash, HashAggregateExec); - - let unresolved_shuffle = final_hash.children()[0].clone(); + assert!(*final_hash.mode() == AggregateMode::FinalPartitioned); + + // verify stage 2 + let stage2 = stages[2].children()[0].clone(); + let sort = downcast_exec!(stage2, SortExec); + let coalesce_partitions = sort.children()[0].clone(); + let coalesce_partitions = + downcast_exec!(coalesce_partitions, CoalescePartitionsExec); + let unresolved_shuffle = coalesce_partitions.children()[0].clone(); let unresolved_shuffle = downcast_exec!(unresolved_shuffle, UnresolvedShuffleExec); assert_eq!(unresolved_shuffle.query_stage_ids, vec![2]); - let merge_exec = stages[1].children()[0].clone(); - let merge_exec = downcast_exec!(merge_exec, CoalescePartitionsExec); + Ok(()) + } - let unresolved_shuffle = merge_exec.children()[0].clone(); - let unresolved_shuffle = - downcast_exec!(unresolved_shuffle, UnresolvedShuffleExec); - assert_eq!(unresolved_shuffle.query_stage_ids, vec![1]); + #[test] + fn roundtrip_serde_hash_aggregate() -> Result<(), BallistaError> { + let mut ctx = datafusion_test_context("testdata")?; + + // simplified form of TPC-H query 1 + let df = ctx.sql( + "select l_returnflag, sum(l_extendedprice * 1) as sum_disc_price + from lineitem + group by l_returnflag + order by l_returnflag", + )?; + + let plan = df.to_logical_plan(); + let plan = ctx.optimize(&plan)?; + let plan = ctx.create_physical_plan(&plan)?; + + let mut planner = DistributedPlanner::new(); + let job_uuid = Uuid::new_v4(); + let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?; let partial_hash = stages[0].children()[0].clone(); let partial_hash_serde = roundtrip_operator(partial_hash.clone())?; diff --git a/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs index becb95f961acf..aa1e2b2575aa9 100644 --- a/ballista/rust/scheduler/src/test_utils.rs +++ b/ballista/rust/scheduler/src/test_utils.rs @@ -15,15 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use std::sync::Arc; - use ballista_core::error::Result; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; -use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; -use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::csv::CsvReadOptions; pub const TPCH_TABLES: &[&str] = &[ @@ -31,16 +26,8 @@ pub const TPCH_TABLES: &[&str] = &[ ]; pub fn datafusion_test_context(path: &str) -> Result { - // remove Repartition rule because that isn't supported yet - let rules: Vec> = vec![ - Arc::new(AddCoalescePartitionsExec::new()), - Arc::new(CoalesceBatches::new()), - ]; - let config = ExecutionConfig::new() - .with_physical_optimizer_rules(rules) - .with_repartition_aggregations(false); + let config = ExecutionConfig::new().with_concurrency(2); // TODO: this is hack to enable partitioned joins let mut ctx = ExecutionContext::with_config(config); - for table in TPCH_TABLES { let schema = get_tpch_schema(table); let options = CsvReadOptions::new() diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 64cc0a1349a23..5f07c171ad7ca 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -167,7 +167,7 @@ //! * Filter: [`FilterExec`](physical_plan::filter::FilterExec) //! * Hash and Grouped aggregations: [`HashAggregateExec`](physical_plan::hash_aggregate::HashAggregateExec) //! * Sort: [`SortExec`](physical_plan::sort::SortExec) -//! * Merge (partitions): [`MergeExec`](physical_plan::merge::MergeExec) +//! * Coalesce partitions: [`CoalescePartitionsExec`](physical_plan::coalesce_partitions::CoalescePartitionsExec) //! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec) //! * Scan a CSV: [`CsvExec`](physical_plan::csv::CsvExec) //! 
* Scan a Parquet: [`ParquetExec`](physical_plan::parquet::ParquetExec) diff --git a/datafusion/src/physical_plan/coalesce_partitions.rs b/datafusion/src/physical_plan/coalesce_partitions.rs index 94ff230b81259..4c040651cd0f7 100644 --- a/datafusion/src/physical_plan/coalesce_partitions.rs +++ b/datafusion/src/physical_plan/coalesce_partitions.rs @@ -46,7 +46,7 @@ pub struct CoalescePartitionsExec { } impl CoalescePartitionsExec { - /// Create a new MergeExec + /// Create a new CoalescePartitionsExec pub fn new(input: Arc) -> Self { CoalescePartitionsExec { input } } @@ -84,16 +84,16 @@ impl ExecutionPlan for CoalescePartitionsExec { match children.len() { 1 => Ok(Arc::new(CoalescePartitionsExec::new(children[0].clone()))), _ => Err(DataFusionError::Internal( - "MergeExec wrong number of children".to_string(), + "CoalescePartitionsExec wrong number of children".to_string(), )), } } async fn execute(&self, partition: usize) -> Result { - // MergeExec produces a single partition + // CoalescePartitionsExec produces a single partition if 0 != partition { return Err(DataFusionError::Internal(format!( - "MergeExec invalid partition {}", + "CoalescePartitionsExec invalid partition {}", partition ))); } @@ -101,7 +101,7 @@ impl ExecutionPlan for CoalescePartitionsExec { let input_partitions = self.input.output_partitioning().partition_count(); match input_partitions { 0 => Err(DataFusionError::Internal( - "MergeExec requires at least one input partition".to_owned(), + "CoalescePartitionsExec requires at least one input partition".to_owned(), )), 1 => { // bypass any threading if there is a single partition @@ -135,7 +135,7 @@ impl ExecutionPlan for CoalescePartitionsExec { ) -> std::fmt::Result { match t { DisplayFormatType::Default => { - write!(f, "MergeExec") + write!(f, "CoalescePartitionsExec") } } } @@ -196,7 +196,7 @@ mod tests { let merge = CoalescePartitionsExec::new(Arc::new(csv)); - // output of MergeExec should have a single partition + // output of CoalescePartitionsExec should have a single partition assert_eq!(merge.output_partitioning().partition_count(), 1); // the result should contain 4 batches (one per input partition) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index e157243dd8c2b..b4b7c224024d3 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -74,7 +74,7 @@ use super::{ }; /// Hash aggregate modes -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum AggregateMode { /// Partial aggregate that can be applied in parallel across input partitions Partial, diff --git a/datafusion/src/physical_plan/limit.rs b/datafusion/src/physical_plan/limit.rs index 361e26e5e94e1..9f4744291c499 100644 --- a/datafusion/src/physical_plan/limit.rs +++ b/datafusion/src/physical_plan/limit.rs @@ -49,7 +49,7 @@ pub struct GlobalLimitExec { } impl GlobalLimitExec { - /// Create a new MergeExec + /// Create a new GlobalLimitExec pub fn new(input: Arc, limit: usize) -> Self { GlobalLimitExec { input, limit } } diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 307fff619478e..a940cbe7963a6 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -308,7 +308,7 @@ pub async fn collect(plan: Arc) -> Result> { _ => { // merge into a single partition let plan = CoalescePartitionsExec::new(plan.clone()); - // MergeExec must produce a single partition + // 
CoalescePartitionsExec must produce a single partition assert_eq!(1, plan.output_partitioning().partition_count()); common::collect(plan.execute(0).await?).await } diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 5b43ec12bbf03..effdefcfabadc 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -736,7 +736,7 @@ impl DefaultPhysicalPlanner { input } else { // Apply a LocalLimitExec to each partition. The optimizer will also insert - // a MergeExec between the GlobalLimitExec and LocalLimitExec + // a CoalescePartitionsExec between the GlobalLimitExec and LocalLimitExec Arc::new(LocalLimitExec::new(input, limit)) }; diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 82c12ce217c98..bd73cb15610a7 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -3812,7 +3812,7 @@ async fn test_physical_plan_display_indent() { let expected = vec![ "GlobalLimitExec: limit=10", " SortExec: [the_min@2 DESC]", - " MergeExec", + " CoalescePartitionsExec", " ProjectionExec: expr=[c1@0 as c1, MAX(aggregate_test_100.c12)@1 as MAX(c12), MIN(aggregate_test_100.c12)@2 as the_min]", " HashAggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[MAX(c12), MIN(c12)]", " CoalesceBatchesExec: target_batch_size=4096", From a5b3a81127d5f93e82c80bb7ba07770e761874ae Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 4 Jul 2021 02:53:44 -0600 Subject: [PATCH 238/329] Show physical plan with metrics in benchmark (#662) --- benchmarks/src/bin/tpch.rs | 15 ++++++-- datafusion/src/physical_plan/display.rs | 50 ++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 77c69f0ce524f..a52b6d208cff4 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -42,6 +42,7 @@ use datafusion::prelude::*; use datafusion::parquet::basic::Compression; use datafusion::parquet::file::properties::WriterProperties; +use datafusion::physical_plan::display::DisplayableExecutionPlan; use structopt::StructOpt; #[cfg(feature = "snmalloc")] @@ -343,21 +344,27 @@ async fn execute_query( debug: bool, ) -> Result> { if debug { - println!("Logical plan:\n{:?}", plan); + println!("=== Logical plan ===\n{:?}\n", plan); } let plan = ctx.optimize(plan)?; if debug { - println!("Optimized logical plan:\n{:?}", plan); + println!("=== Optimized logical plan ===\n{:?}\n", plan); } let physical_plan = ctx.create_physical_plan(&plan)?; if debug { println!( - "Physical plan:\n{}", + "=== Physical plan ===\n{}\n", displayable(physical_plan.as_ref()).indent().to_string() ); } - let result = collect(physical_plan).await?; + let result = collect(physical_plan.clone()).await?; if debug { + println!( + "=== Physical plan with metrics ===\n{}\n", + DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()) + .indent() + .to_string() + ); pretty::print_batches(&result)?; } Ok(result) diff --git a/datafusion/src/physical_plan/display.rs b/datafusion/src/physical_plan/display.rs index e178ea18bb439..8498e02d50c88 100644 --- a/datafusion/src/physical_plan/display.rs +++ b/datafusion/src/physical_plan/display.rs @@ -33,13 +33,27 @@ pub enum DisplayFormatType { /// Wraps an `ExecutionPlan` with various ways to display this plan pub struct DisplayableExecutionPlan<'a> { inner: &'a dyn ExecutionPlan, + /// whether to show metrics or not + with_metrics: bool, } impl<'a> DisplayableExecutionPlan<'a> { /// Create a wrapper around an 
[`'ExecutionPlan'] which can be /// pretty printed in a variety of ways pub fn new(inner: &'a dyn ExecutionPlan) -> Self { - Self { inner } + Self { + inner, + with_metrics: false, + } + } + + /// Create a wrapper around an [`'ExecutionPlan'] which can be + /// pretty printed in a variety of ways + pub fn with_metrics(inner: &'a dyn ExecutionPlan) -> Self { + Self { + inner, + with_metrics: true, + } } /// Return a `format`able structure that produces a single line @@ -53,15 +67,26 @@ impl<'a> DisplayableExecutionPlan<'a> { /// CsvExec: source=...", /// ``` pub fn indent(&self) -> impl fmt::Display + 'a { - struct Wrapper<'a>(&'a dyn ExecutionPlan); + struct Wrapper<'a> { + plan: &'a dyn ExecutionPlan, + with_metrics: bool, + } impl<'a> fmt::Display for Wrapper<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let t = DisplayFormatType::Default; - let mut visitor = IndentVisitor { t, f, indent: 0 }; - accept(self.0, &mut visitor) + let mut visitor = IndentVisitor { + t, + f, + indent: 0, + with_metrics: self.with_metrics, + }; + accept(self.plan, &mut visitor) } } - Wrapper(self.inner) + Wrapper { + plan: self.inner, + with_metrics: self.with_metrics, + } } } @@ -71,8 +96,10 @@ struct IndentVisitor<'a, 'b> { t: DisplayFormatType, /// Write to this formatter f: &'a mut fmt::Formatter<'b>, - ///with_schema: bool, + /// Indent size indent: usize, + /// whether to show metrics or not + with_metrics: bool, } impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> { @@ -83,6 +110,17 @@ impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> { ) -> std::result::Result { write!(self.f, "{:indent$}", "", indent = self.indent * 2)?; plan.fmt_as(self.t, self.f)?; + if self.with_metrics { + write!( + self.f, + ", metrics=[{}]", + plan.metrics() + .iter() + .map(|(k, v)| format!("{}={:?}", k, v.value)) + .collect::>() + .join(", ") + )?; + } writeln!(self.f)?; self.indent += 1; Ok(true) From e036a626d3ac41fb3c414371ceb1e250af14f11a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 4 Jul 2021 02:53:56 -0600 Subject: [PATCH 239/329] Implement metrics for HashJoinExec (#664) --- datafusion/src/physical_plan/hash_join.rs | 103 ++++++++++++++-------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 195a19c54070d..f426bc9d3c3c2 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -64,7 +64,7 @@ use super::{ SendableRecordBatchStream, }; use crate::physical_plan::coalesce_batches::concat_batches; -use crate::physical_plan::PhysicalExpr; +use crate::physical_plan::{PhysicalExpr, SQLMetric}; use log::debug; // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. 
@@ -102,6 +102,35 @@ pub struct HashJoinExec { random_state: RandomState, /// Partitioning mode to use mode: PartitionMode, + /// Metrics + metrics: Arc, +} + +/// Metrics for HashJoinExec +#[derive(Debug)] +struct HashJoinMetrics { + /// Total time for joining probe-side batches to the build-side batches + join_time: Arc, + /// Number of batches consumed by this operator + input_batches: Arc, + /// Number of rows consumed by this operator + input_rows: Arc, + /// Number of batches produced by this operator + output_batches: Arc, + /// Number of rows produced by this operator + output_rows: Arc, +} + +impl HashJoinMetrics { + fn new() -> Self { + Self { + join_time: SQLMetric::time_nanos(), + input_batches: SQLMetric::counter(), + input_rows: SQLMetric::counter(), + output_batches: SQLMetric::counter(), + output_rows: SQLMetric::counter(), + } + } } #[derive(Clone, Copy, Debug, PartialEq)] @@ -154,6 +183,7 @@ impl HashJoinExec { build_side: Arc::new(Mutex::new(None)), random_state, mode: partition_mode, + metrics: Arc::new(HashJoinMetrics::new()), }) } @@ -394,6 +424,7 @@ impl ExecutionPlan for HashJoinExec { column_indices, self.random_state.clone(), visited_left_side, + self.metrics.clone(), ))) } @@ -412,6 +443,22 @@ impl ExecutionPlan for HashJoinExec { } } } + + fn metrics(&self) -> HashMap { + let mut metrics = HashMap::new(); + metrics.insert("joinTime".to_owned(), (*self.metrics.join_time).clone()); + metrics.insert( + "inputBatches".to_owned(), + (*self.metrics.input_batches).clone(), + ); + metrics.insert("inputRows".to_owned(), (*self.metrics.input_rows).clone()); + metrics.insert( + "outputBatches".to_owned(), + (*self.metrics.output_batches).clone(), + ); + metrics.insert("outputRows".to_owned(), (*self.metrics.output_rows).clone()); + metrics + } } /// Updates `hash` with new entries from [RecordBatch] evaluated against the expressions `on`, @@ -467,22 +514,14 @@ struct HashJoinStream { right: SendableRecordBatchStream, /// Information of index and left / right placement of columns column_indices: Vec, - /// number of input batches - num_input_batches: usize, - /// number of input rows - num_input_rows: usize, - /// number of batches produced - num_output_batches: usize, - /// number of rows produced - num_output_rows: usize, - /// total time for joining probe-side batches to the build-side batches - join_time: usize, /// Random state used for hashing initialization random_state: RandomState, /// Keeps track of the left side rows whether they are visited visited_left_side: Vec, // TODO: use a more memory efficient data structure, https://github.com/apache/arrow-datafusion/issues/240 /// There is nothing to process anymore and left side is processed in case of left join is_exhausted: bool, + /// Metrics + metrics: Arc, } #[allow(clippy::too_many_arguments)] @@ -497,6 +536,7 @@ impl HashJoinStream { column_indices: Vec, random_state: RandomState, visited_left_side: Vec, + metrics: Arc, ) -> Self { HashJoinStream { schema, @@ -506,14 +546,10 @@ impl HashJoinStream { left_data, right, column_indices, - num_input_batches: 0, - num_input_rows: 0, - num_output_batches: 0, - num_output_rows: 0, - join_time: 0, random_state, visited_left_side, is_exhausted: false, + metrics, } } } @@ -1215,12 +1251,14 @@ impl Stream for HashJoinStream { &self.column_indices, &self.random_state, ); - self.num_input_batches += 1; - self.num_input_rows += batch.num_rows(); + self.metrics.input_batches.add(1); + self.metrics.input_rows.add(batch.num_rows()); if let Ok((ref batch, ref left_side)) = result 
{ - self.join_time += start.elapsed().as_millis() as usize; - self.num_output_batches += 1; - self.num_output_rows += batch.num_rows(); + self.metrics + .join_time + .add(start.elapsed().as_millis() as usize); + self.metrics.output_batches.add(1); + self.metrics.output_rows.add(batch.num_rows()); match self.join_type { JoinType::Left @@ -1254,13 +1292,14 @@ impl Stream for HashJoinStream { self.join_type != JoinType::Semi, ); if let Ok(ref batch) = result { - self.num_input_batches += 1; - self.num_input_rows += batch.num_rows(); + self.metrics.input_batches.add(1); + self.metrics.input_rows.add(batch.num_rows()); if let Ok(ref batch) = result { - self.join_time += - start.elapsed().as_millis() as usize; - self.num_output_batches += 1; - self.num_output_rows += batch.num_rows(); + self.metrics + .join_time + .add(start.elapsed().as_millis() as usize); + self.metrics.output_batches.add(1); + self.metrics.output_rows.add(batch.num_rows()); } } self.is_exhausted = true; @@ -1274,16 +1313,6 @@ impl Stream for HashJoinStream { | JoinType::Right => {} } - // End of right batch, print stats in debug mode - debug!( - "Processed {} probe-side input batches containing {} rows and \ - produced {} output batches containing {} rows in {} ms", - self.num_input_batches, - self.num_input_rows, - self.num_output_batches, - self.num_output_rows, - self.join_time - ); other } }) From 58da15970dc0ec9e3c1c369fe89f6ba38e09d9c9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 5 Jul 2021 00:35:47 -0600 Subject: [PATCH 240/329] Implement metrics for shuffle read and write (#676) --- ballista/rust/core/Cargo.toml | 1 + .../src/execution_plans/shuffle_reader.rs | 26 ++++++++++-- .../src/execution_plans/shuffle_writer.rs | 42 ++++++++++++++++--- ballista/rust/core/src/serde/scheduler/mod.rs | 12 +++++- ballista/rust/core/src/utils.rs | 9 +++- ballista/rust/executor/src/executor.rs | 9 ++++ 6 files changed, 87 insertions(+), 12 deletions(-) diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index bedc0973e6ad9..3a89c75a5cd72 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -33,6 +33,7 @@ simd = ["datafusion/simd"] ahash = "0.7" async-trait = "0.1.36" futures = "0.3" +hashbrown = "0.11" log = "0.4" prost = "0.7" serde = {version = "1", features = ["derive"]} diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index 9ab064115acea..db03d3ddf0800 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -28,13 +28,17 @@ use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::error::Result as ArrowResult; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning}; +use datafusion::physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, SQLMetric, +}; use datafusion::{ error::{DataFusionError, Result}, physical_plan::RecordBatchStream, }; use futures::{future, Stream, StreamExt}; +use hashbrown::HashMap; use log::info; +use std::time::Instant; /// ShuffleReaderExec reads partitions that have already been materialized by a ShuffleWriterExec /// being executed by an executor @@ -43,6 +47,8 @@ pub struct ShuffleReaderExec { /// Each partition of a shuffle can read data from multiple locations pub(crate) partition: Vec>, pub(crate) schema: SchemaRef, + /// Time to fetch data from executor + 
fetch_time: Arc, } impl ShuffleReaderExec { @@ -51,7 +57,11 @@ impl ShuffleReaderExec { partition: Vec>, schema: SchemaRef, ) -> Result { - Ok(Self { partition, schema }) + Ok(Self { + partition, + schema, + fetch_time: SQLMetric::time_nanos(), + }) } } @@ -88,11 +98,13 @@ impl ExecutionPlan for ShuffleReaderExec { ) -> Result>> { info!("ShuffleReaderExec::execute({})", partition); + let start = Instant::now(); let partition_locations = &self.partition[partition]; let result = future::join_all(partition_locations.iter().map(fetch_partition)) .await .into_iter() .collect::>>()?; + self.fetch_time.add_elapsed(start); let result = WrappedStream::new( Box::pin(futures::stream::iter(result).flatten()), @@ -115,7 +127,7 @@ impl ExecutionPlan for ShuffleReaderExec { x.iter() .map(|l| { format!( - "[executor={} part={}:{}:{} stats={:?}]", + "[executor={} part={}:{}:{} stats={}]", l.executor_meta.id, l.partition_id.job_id, l.partition_id.stage_id, @@ -127,11 +139,17 @@ impl ExecutionPlan for ShuffleReaderExec { .join(",") }) .collect::>() - .join("\n"); + .join(", "); write!(f, "ShuffleReaderExec: partition_locations={}", loc_str) } } } + + fn metrics(&self) -> HashMap { + let mut metrics = HashMap::new(); + metrics.insert("fetchTime".to_owned(), (*self.fetch_time).clone()); + metrics + } } async fn fetch_partition( diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 7fffaba13217c..92b4448a69ec6 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -20,6 +20,7 @@ //! partition is re-partitioned and streamed to disk in Arrow IPC format. Future stages of the query //! will use the ShuffleReaderExec to read these results. +use std::fs::File; use std::iter::Iterator; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -43,11 +44,11 @@ use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; use datafusion::physical_plan::hash_join::create_hashes; use datafusion::physical_plan::{ - DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, SQLMetric, }; use futures::StreamExt; +use hashbrown::HashMap; use log::info; -use std::fs::File; use uuid::Uuid; /// ShuffleWriterExec represents a section of a query plan that has consistent partitioning and @@ -66,6 +67,22 @@ pub struct ShuffleWriterExec { work_dir: String, /// Optional shuffle output partitioning shuffle_output_partitioning: Option, + /// Shuffle write metrics + metrics: ShuffleWriteMetrics, +} + +#[derive(Debug, Clone)] +struct ShuffleWriteMetrics { + /// Time spend writing batches to shuffle files + write_time: Arc, +} + +impl ShuffleWriteMetrics { + fn new() -> Self { + Self { + write_time: SQLMetric::time_nanos(), + } + } } impl ShuffleWriterExec { @@ -83,6 +100,7 @@ impl ShuffleWriterExec { plan, work_dir, shuffle_output_partitioning, + metrics: ShuffleWriteMetrics::new(), }) } @@ -150,12 +168,16 @@ impl ExecutionPlan for ShuffleWriterExec { info!("Writing results to {}", path); // stream results to disk - let stats = utils::write_stream_to_disk(&mut stream, path) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + let stats = utils::write_stream_to_disk( + &mut stream, + path, + self.metrics.write_time.clone(), + ) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; info!( - "Executed partition {} in {} seconds. 
Statistics: {:?}", + "Executed partition {} in {} seconds. Statistics: {}", partition, now.elapsed().as_secs(), stats @@ -231,6 +253,7 @@ impl ExecutionPlan for ShuffleWriterExec { RecordBatch::try_new(input_batch.schema(), columns)?; // write batch out + let start = Instant::now(); match &mut writers[num_output_partition] { Some(w) => { w.write(&output_batch)?; @@ -251,6 +274,7 @@ impl ExecutionPlan for ShuffleWriterExec { writers[num_output_partition] = Some(writer); } } + self.metrics.write_time.add_elapsed(start); } } @@ -310,6 +334,12 @@ impl ExecutionPlan for ShuffleWriterExec { } } + fn metrics(&self) -> HashMap { + let mut metrics = HashMap::new(); + metrics.insert("writeTime".to_owned(), (*self.metrics.write_time).clone()); + metrics + } + fn fmt_as( &self, t: DisplayFormatType, diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index f66bb08189d28..cbe1a31227c68 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, fmt, sync::Arc}; use datafusion::arrow::array::{ ArrayBuilder, ArrayRef, StructArray, StructBuilder, UInt64Array, UInt64Builder, @@ -113,6 +113,16 @@ impl Default for PartitionStats { } } +impl fmt::Display for PartitionStats { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "numBatches={:?}, numRows={:?}, numBytes={:?}", + self.num_batches, self.num_rows, self.num_bytes + ) + } +} + impl PartitionStats { pub fn new( num_rows: Option, diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 8a510f4808760..f7d884d502985 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -53,15 +53,17 @@ use datafusion::physical_plan::parquet::ParquetExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::sort::SortExec; use datafusion::physical_plan::{ - AggregateExpr, ExecutionPlan, PhysicalExpr, RecordBatchStream, + AggregateExpr, ExecutionPlan, PhysicalExpr, RecordBatchStream, SQLMetric, }; use futures::{future, Stream, StreamExt}; +use std::time::Instant; /// Stream data to disk in Arrow IPC format pub async fn write_stream_to_disk( stream: &mut Pin>, path: &str, + disk_write_metric: Arc, ) -> Result { let file = File::create(&path).map_err(|e| { BallistaError::General(format!( @@ -86,9 +88,14 @@ pub async fn write_stream_to_disk( num_batches += 1; num_rows += batch.num_rows(); num_bytes += batch_size_bytes; + + let start = Instant::now(); writer.write(&batch)?; + disk_write_metric.add_elapsed(start); } + let start = Instant::now(); writer.finish()?; + disk_write_metric.add_elapsed(start); Ok(PartitionStats::new( Some(num_rows as u64), Some(num_batches), diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs index 86aaa7e9f4956..4a75448b5f06b 100644 --- a/ballista/rust/executor/src/executor.rs +++ b/ballista/rust/executor/src/executor.rs @@ -23,6 +23,7 @@ use ballista_core::error::BallistaError; use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::utils; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::ExecutionPlan; /// Ballista executor @@ -60,6 +61,14 @@ impl Executor { )?; let mut stream = 
exec.execute(part).await?; let batches = utils::collect_stream(&mut stream).await?; + + println!( + "=== Physical plan with metrics ===\n{}\n", + DisplayableExecutionPlan::with_metrics(&exec) + .indent() + .to_string() + ); + // the output should be a single batch containing metadata (path and statistics) assert!(batches.len() == 1); Ok(batches[0].clone()) From 8cbb750faab3189813e95681bc2af53f20c9f0c7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 6 Jul 2021 12:41:28 -0400 Subject: [PATCH 241/329] Add End-to-end test for parquet pruning + metrics for ParquetExec (#657) * End to end tests for parquet pruning * remove unused dep * Make the separation of per-partition and per-exec metrics clearer * Account for statistics once rather than per row group * Fix timestamps to use UTC time --- .../src/physical_optimizer/repartition.rs | 22 +- datafusion/src/physical_plan/mod.rs | 14 + datafusion/src/physical_plan/parquet.rs | 156 ++++++-- datafusion/src/test/mod.rs | 10 +- datafusion/tests/parquet_pruning.rs | 343 ++++++++++++++++++ 5 files changed, 508 insertions(+), 37 deletions(-) create mode 100644 datafusion/tests/parquet_pruning.rs diff --git a/datafusion/src/physical_optimizer/repartition.rs b/datafusion/src/physical_optimizer/repartition.rs index 011db64aaf8a2..4504c81daa06d 100644 --- a/datafusion/src/physical_optimizer/repartition.rs +++ b/datafusion/src/physical_optimizer/repartition.rs @@ -110,7 +110,9 @@ mod tests { use super::*; use crate::datasource::datasource::Statistics; - use crate::physical_plan::parquet::{ParquetExec, ParquetPartition}; + use crate::physical_plan::parquet::{ + ParquetExec, ParquetExecMetrics, ParquetPartition, + }; use crate::physical_plan::projection::ProjectionExec; #[test] @@ -119,12 +121,13 @@ mod tests { let parquet_project = ProjectionExec::try_new( vec![], Arc::new(ParquetExec::new( - vec![ParquetPartition { - filenames: vec!["x".to_string()], - statistics: Statistics::default(), - }], + vec![ParquetPartition::new( + vec!["x".to_string()], + Statistics::default(), + )], schema, None, + ParquetExecMetrics::new(), None, 2048, None, @@ -156,12 +159,13 @@ mod tests { Arc::new(ProjectionExec::try_new( vec![], Arc::new(ParquetExec::new( - vec![ParquetPartition { - filenames: vec!["x".to_string()], - statistics: Statistics::default(), - }], + vec![ParquetPartition::new( + vec!["x".to_string()], + Statistics::default(), + )], schema, None, + ParquetExecMetrics::new(), None, 2048, None, diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index a940cbe7963a6..d89eb11885041 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -297,6 +297,20 @@ pub fn visit_execution_plan( Ok(()) } +/// Recursively gateher all execution metrics from this plan and all of its input plans +pub fn plan_metrics(plan: Arc) -> HashMap { + fn get_metrics_inner( + plan: &dyn ExecutionPlan, + mut metrics: HashMap, + ) -> HashMap { + metrics.extend(plan.metrics().into_iter()); + plan.children().into_iter().fold(metrics, |metrics, child| { + get_metrics_inner(child.as_ref(), metrics) + }) + } + get_metrics_inner(plan.as_ref(), HashMap::new()) +} + /// Execute the [ExecutionPlan] and collect the results in memory pub async fn collect(plan: Arc) -> Result> { match plan.output_partitioning().partition_count() { diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 3d20a9bf98c19..f31b921d663b0 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ 
b/datafusion/src/physical_plan/parquet.rs @@ -40,6 +40,8 @@ use arrow::{ error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, }; +use hashbrown::HashMap; +use log::debug; use parquet::file::{ metadata::RowGroupMetaData, reader::{FileReader, SerializedFileReader}, @@ -59,6 +61,8 @@ use crate::datasource::datasource::{ColumnStatistics, Statistics}; use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; +use super::SQLMetric; + /// Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] pub struct ParquetExec { @@ -72,6 +76,8 @@ pub struct ParquetExec { batch_size: usize, /// Statistics for the data set (sum of statistics for all partitions) statistics: Statistics, + /// metrics for the overall execution + metrics: ParquetExecMetrics, /// Optional predicate builder predicate_builder: Option, /// Optional limit of the number of rows @@ -93,6 +99,24 @@ pub struct ParquetPartition { pub filenames: Vec, /// Statistics for this partition pub statistics: Statistics, + /// Execution metrics + metrics: ParquetPartitionMetrics, +} + +/// Stores metrics about the overall parquet execution +#[derive(Debug, Clone)] +pub struct ParquetExecMetrics { + /// Numer of times the pruning predicate could not be created + pub predicate_creation_errors: Arc, +} + +/// Stores metrics about the parquet execution for a particular ParquetPartition +#[derive(Debug, Clone)] +struct ParquetPartitionMetrics { + /// Numer of times the predicate could not be evaluated + pub predicate_evaluation_errors: Arc, + /// Number of row groups pruned using + pub row_groups_pruned: Arc, } impl ParquetExec { @@ -140,6 +164,8 @@ impl ParquetExec { max_concurrency: usize, limit: Option, ) -> Result { + debug!("Creating ParquetExec, filenames: {:?}, projection {:?}, predicate: {:?}, limit: {:?}", + filenames, projection, predicate, limit); // build a list of Parquet partitions with statistics and gather all unique schemas // used in this data set let mut schemas: Vec = vec![]; @@ -205,10 +231,7 @@ impl ParquetExec { }; // remove files that are not needed in case of limit filenames.truncate(total_files); - partitions.push(ParquetPartition { - filenames, - statistics, - }); + partitions.push(ParquetPartition::new(filenames, statistics)); if limit_exhausted { break; } @@ -225,14 +248,27 @@ impl ParquetExec { ))); } let schema = Arc::new(schemas.pop().unwrap()); + let metrics = ParquetExecMetrics::new(); + let predicate_builder = predicate.and_then(|predicate_expr| { - PruningPredicate::try_new(&predicate_expr, schema.clone()).ok() + match PruningPredicate::try_new(&predicate_expr, schema.clone()) { + Ok(predicate_builder) => Some(predicate_builder), + Err(e) => { + debug!( + "Could not create pruning predicate for {:?}: {}", + predicate_expr, e + ); + metrics.predicate_creation_errors.add(1); + None + } + } }); Ok(Self::new( partitions, schema, projection, + metrics, predicate_builder, batch_size, limit, @@ -244,6 +280,7 @@ impl ParquetExec { partitions: Vec, schema: SchemaRef, projection: Option>, + metrics: ParquetExecMetrics, predicate_builder: Option, batch_size: usize, limit: Option, @@ -307,6 +344,7 @@ impl ParquetExec { partitions, schema: Arc::new(projected_schema), projection, + metrics, predicate_builder, batch_size, statistics, @@ -341,6 +379,7 @@ impl ParquetPartition { Self { filenames, statistics, + metrics: ParquetPartitionMetrics::new(), } } @@ -355,6 +394,25 @@ impl ParquetPartition { } } +impl ParquetExecMetrics { + /// Create new metrics + pub fn new() -> Self { 
+ Self { + predicate_creation_errors: SQLMetric::counter(), + } + } +} + +impl ParquetPartitionMetrics { + /// Create new metrics + pub fn new() -> Self { + Self { + predicate_evaluation_errors: SQLMetric::counter(), + row_groups_pruned: SQLMetric::counter(), + } + } +} + #[async_trait] impl ExecutionPlan for ParquetExec { /// Return a reference to Any that can be used for downcasting @@ -398,7 +456,9 @@ impl ExecutionPlan for ParquetExec { Receiver>, ) = channel(2); - let filenames = self.partitions[partition].filenames.clone(); + let partition = &self.partitions[partition]; + let filenames = partition.filenames.clone(); + let metrics = partition.metrics.clone(); let projection = self.projection.clone(); let predicate_builder = self.predicate_builder.clone(); let batch_size = self.batch_size; @@ -407,6 +467,7 @@ impl ExecutionPlan for ParquetExec { task::spawn_blocking(move || { if let Err(e) = read_files( &filenames, + metrics, &projection, &predicate_builder, batch_size, @@ -448,6 +509,31 @@ impl ExecutionPlan for ParquetExec { } } } + + fn metrics(&self) -> HashMap { + self.partitions + .iter() + .flat_map(|p| { + [ + ( + format!( + "numPredicateEvaluationErrors for {}", + p.filenames.join(",") + ), + p.metrics.predicate_evaluation_errors.as_ref().clone(), + ), + ( + format!("numRowGroupsPruned for {}", p.filenames.join(",")), + p.metrics.row_groups_pruned.as_ref().clone(), + ), + ] + }) + .chain(std::iter::once(( + "numPredicateCreationErrors".to_string(), + self.metrics.predicate_creation_errors.as_ref().clone(), + ))) + .collect() + } } fn send_result( @@ -547,6 +633,7 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { fn build_row_group_predicate( predicate_builder: &PruningPredicate, + metrics: ParquetPartitionMetrics, row_group_metadata: &[RowGroupMetaData], ) -> Box bool> { let parquet_schema = predicate_builder.schema().as_ref(); @@ -555,21 +642,28 @@ fn build_row_group_predicate( row_group_metadata, parquet_schema, }; - let predicate_values = predicate_builder.prune(&pruning_stats); - let predicate_values = match predicate_values { - Ok(values) => values, + match predicate_values { + Ok(values) => { + // NB: false means don't scan row group + let num_pruned = values.iter().filter(|&v| !v).count(); + metrics.row_groups_pruned.add(num_pruned); + Box::new(move |_, i| values[i]) + } // stats filter array could not be built // return a closure which will not filter out any row groups - _ => return Box::new(|_r, _i| true), - }; - - Box::new(move |_, i| predicate_values[i]) + Err(e) => { + debug!("Error evaluating row group predicate values {}", e); + metrics.predicate_evaluation_errors.add(1); + Box::new(|_r, _i| true) + } + } } fn read_files( filenames: &[String], + metrics: ParquetPartitionMetrics, projection: &[usize], predicate_builder: &Option, batch_size: usize, @@ -583,6 +677,7 @@ fn read_files( if let Some(predicate_builder) = predicate_builder { let row_group_predicate = build_row_group_predicate( predicate_builder, + metrics.clone(), file_reader.metadata().row_groups(), ); file_reader.filter_row_groups(&row_group_predicate); @@ -757,8 +852,11 @@ mod tests { vec![ParquetStatistics::int32(Some(11), Some(20), None, 0, false)], ); let row_group_metadata = vec![rgm1, rgm2]; - let row_group_predicate = - build_row_group_predicate(&predicate_builder, &row_group_metadata); + let row_group_predicate = build_row_group_predicate( + &predicate_builder, + ParquetPartitionMetrics::new(), + &row_group_metadata, + ); let row_group_filter = row_group_metadata .iter() 
.enumerate() @@ -787,8 +885,11 @@ mod tests { vec![ParquetStatistics::int32(Some(11), Some(20), None, 0, false)], ); let row_group_metadata = vec![rgm1, rgm2]; - let row_group_predicate = - build_row_group_predicate(&predicate_builder, &row_group_metadata); + let row_group_predicate = build_row_group_predicate( + &predicate_builder, + ParquetPartitionMetrics::new(), + &row_group_metadata, + ); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -832,8 +933,11 @@ mod tests { ], ); let row_group_metadata = vec![rgm1, rgm2]; - let row_group_predicate = - build_row_group_predicate(&predicate_builder, &row_group_metadata); + let row_group_predicate = build_row_group_predicate( + &predicate_builder, + ParquetPartitionMetrics::new(), + &row_group_metadata, + ); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -847,8 +951,11 @@ mod tests { // this bypasses the entire predicate expression and no row groups are filtered out let expr = col("c1").gt(lit(15)).or(col("c2").modulus(lit(2))); let predicate_builder = PruningPredicate::try_new(&expr, schema)?; - let row_group_predicate = - build_row_group_predicate(&predicate_builder, &row_group_metadata); + let row_group_predicate = build_row_group_predicate( + &predicate_builder, + ParquetPartitionMetrics::new(), + &row_group_metadata, + ); let row_group_filter = row_group_metadata .iter() .enumerate() @@ -891,8 +998,11 @@ mod tests { ], ); let row_group_metadata = vec![rgm1, rgm2]; - let row_group_predicate = - build_row_group_predicate(&predicate_builder, &row_group_metadata); + let row_group_predicate = build_row_group_predicate( + &predicate_builder, + ParquetPartitionMetrics::new(), + &row_group_metadata, + ); let row_group_filter = row_group_metadata .iter() .enumerate() diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index 7ca7cc12d9efb..df3aec4a68502 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -251,11 +251,11 @@ pub fn make_timestamps() -> RecordBatch { let arr_names = StringArray::from(names); let schema = Schema::new(vec![ - Field::new("nanos", arr_nanos.data_type().clone(), false), - Field::new("micros", arr_micros.data_type().clone(), false), - Field::new("millis", arr_millis.data_type().clone(), false), - Field::new("secs", arr_secs.data_type().clone(), false), - Field::new("name", arr_names.data_type().clone(), false), + Field::new("nanos", arr_nanos.data_type().clone(), true), + Field::new("micros", arr_micros.data_type().clone(), true), + Field::new("millis", arr_millis.data_type().clone(), true), + Field::new("secs", arr_secs.data_type().clone(), true), + Field::new("name", arr_names.data_type().clone(), true), ]); let schema = Arc::new(schema); diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs new file mode 100644 index 0000000000000..86b3946e47121 --- /dev/null +++ b/datafusion/tests/parquet_pruning.rs @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This file contains an end to end test of parquet pruning. It writes +// data into a parquet file and then +use std::sync::Arc; + +use arrow::{ + array::{ + Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, + }, + datatypes::{Field, Schema}, + record_batch::RecordBatch, + util::pretty::pretty_format_batches, +}; +use chrono::Duration; +use datafusion::{ + physical_plan::{plan_metrics, SQLMetric}, + prelude::ExecutionContext, +}; +use hashbrown::HashMap; +use parquet::{arrow::ArrowWriter, file::properties::WriterProperties}; +use tempfile::NamedTempFile; + +#[tokio::test] +async fn prune_timestamps_nanos() { + let output = ContextWithParquet::new() + .await + .query("SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')") + .await; + println!("{}", output.description()); + // TODO This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 10, "{}", output.description()); +} + +#[tokio::test] +async fn prune_timestamps_micros() { + let output = ContextWithParquet::new() + .await + .query( + "SELECT * FROM t where micros < to_timestamp_micros('2020-01-02 01:01:11Z')", + ) + .await; + println!("{}", output.description()); + // TODO This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 10, "{}", output.description()); +} + +#[tokio::test] +async fn prune_timestamps_millis() { + let output = ContextWithParquet::new() + .await + .query( + "SELECT * FROM t where millis < to_timestamp_millis('2020-01-02 01:01:11Z')", + ) + .await; + println!("{}", output.description()); + // TODO This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 10, "{}", output.description()); +} + +#[tokio::test] +async fn prune_timestamps_seconds() { + let output = ContextWithParquet::new() + .await + .query( + "SELECT * FROM t where seconds < to_timestamp_seconds('2020-01-02 01:01:11Z')", + ) + .await; + println!("{}", output.description()); + // TODO This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 10, "{}", output.description()); +} + +// ---------------------- +// Begin test fixture +// ---------------------- + +/// Test fixture that has an execution context that has an external +/// table "t" registered, pointing at a parquet file made with +/// `make_test_file` +struct ContextWithParquet { + file: NamedTempFile, + ctx: ExecutionContext, +} + +/// The output of running one of the test cases +struct TestOutput { + /// The input string + sql: String, + /// Normalized metrics (filename replaced by a constant) + metrics: HashMap, + /// number of rows in results + result_rows: usize, + 
/// the contents of the input, as a string + pretty_input: String, + /// the raw results, as a string + pretty_results: String, +} + +impl TestOutput { + /// retrieve the value of the named metric, if any + fn metric_value(&self, metric_name: &str) -> Option { + self.metrics.get(metric_name).map(|m| m.value()) + } + + /// The number of times the pruning predicate evaluation errors + fn predicate_evaluation_errors(&self) -> Option { + self.metric_value("numPredicateEvaluationErrors for PARQUET_FILE") + } + + /// The number of times the pruning predicate evaluation errors + fn row_groups_pruned(&self) -> Option { + self.metric_value("numRowGroupsPruned for PARQUET_FILE") + } + + fn description(&self) -> String { + let metrics = self + .metrics + .iter() + .map(|(name, val)| format!(" {} = {:?}", name, val)) + .collect::>(); + + format!( + "Input:\n{}\nQuery:\n{}\nOutput:\n{}\nMetrics:\n{}", + self.pretty_input, + self.sql, + self.pretty_results, + metrics.join("\n") + ) + } +} + +/// Creates an execution context that has an external table "t" +/// registered pointing at a parquet file made with `make_test_file` +impl ContextWithParquet { + async fn new() -> Self { + let file = make_test_file().await; + + // now, setup a the file as a data source and run a query against it + let mut ctx = ExecutionContext::new(); + let parquet_path = file.path().to_string_lossy(); + ctx.register_parquet("t", &parquet_path) + .expect("registering"); + + Self { file, ctx } + } + + /// Runs the specified SQL query and returns the number of output + /// rows and normalized execution metrics + async fn query(&mut self, sql: &str) -> TestOutput { + println!("Planning sql {}", sql); + + let input = self + .ctx + .sql("SELECT * from t") + .expect("planning") + .collect() + .await + .expect("getting input"); + let pretty_input = pretty_format_batches(&input).unwrap(); + + let logical_plan = self.ctx.sql(sql).expect("planning").to_logical_plan(); + + let logical_plan = self.ctx.optimize(&logical_plan).expect("optimizing plan"); + let execution_plan = self + .ctx + .create_physical_plan(&logical_plan) + .expect("creating physical plan"); + + let results = datafusion::physical_plan::collect(execution_plan.clone()) + .await + .expect("Running"); + + // replace the path name, which varies test to test,a with some + // constant for test comparisons + let path = self.file.path(); + let path_name = path.to_string_lossy(); + let metrics = plan_metrics(execution_plan) + .into_iter() + .map(|(name, metric)| { + (name.replace(path_name.as_ref(), "PARQUET_FILE"), metric) + }) + .collect(); + + let result_rows = results.iter().map(|b| b.num_rows()).sum(); + + let pretty_results = pretty_format_batches(&results).unwrap(); + + let sql = sql.to_string(); + TestOutput { + sql, + metrics, + result_rows, + pretty_input, + pretty_results, + } + } +} + +/// Create a test parquet file with varioud data types +async fn make_test_file() -> NamedTempFile { + let output_file = tempfile::Builder::new() + .prefix("parquet_pruning") + .suffix(".parquet") + .tempfile() + .expect("tempfile creation"); + + let props = WriterProperties::builder() + .set_max_row_group_size(5) + .build(); + + let batches = vec![ + make_batch(Duration::seconds(0)), + make_batch(Duration::seconds(10)), + make_batch(Duration::minutes(10)), + make_batch(Duration::days(10)), + ]; + let schema = batches[0].schema(); + + let mut writer = ArrowWriter::try_new( + output_file + .as_file() + .try_clone() + .expect("cloning file descriptor"), + schema, + Some(props), + ) + 
.unwrap(); + + for batch in batches { + writer.write(&batch).expect("writing batch"); + } + writer.close().unwrap(); + + output_file +} + +/// Return record batch with a few rows of data for all of the supported timestamp types +/// values with the specified offset +/// +/// Columns are named: +/// "nanos" --> TimestampNanosecondArray +/// "micros" --> TimestampMicrosecondArray +/// "millis" --> TimestampMillisecondArray +/// "seconds" --> TimestampSecondArray +/// "names" --> StringArray +pub fn make_batch(offset: Duration) -> RecordBatch { + let ts_strings = vec![ + Some("2020-01-01T01:01:01.0000000000001"), + Some("2020-01-01T01:02:01.0000000000001"), + Some("2020-01-01T02:01:01.0000000000001"), + None, + Some("2020-01-02T01:01:01.0000000000001"), + ]; + + let offset_nanos = offset.num_nanoseconds().expect("non overflow nanos"); + + let ts_nanos = ts_strings + .into_iter() + .map(|t| { + t.map(|t| { + offset_nanos + + t.parse::() + .unwrap() + .timestamp_nanos() + }) + }) + .collect::>(); + + let ts_micros = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000)) + .collect::>(); + + let ts_millis = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000)) + .collect::>(); + + let ts_seconds = ts_nanos + .iter() + .map(|t| t.as_ref().map(|ts_nanos| ts_nanos / 1000000000)) + .collect::>(); + + let names = ts_nanos + .iter() + .enumerate() + .map(|(i, _)| format!("Row {} + {}", i, offset)) + .collect::>(); + + let arr_nanos = TimestampNanosecondArray::from_opt_vec(ts_nanos, None); + let arr_micros = TimestampMicrosecondArray::from_opt_vec(ts_micros, None); + let arr_millis = TimestampMillisecondArray::from_opt_vec(ts_millis, None); + let arr_seconds = TimestampSecondArray::from_opt_vec(ts_seconds, None); + + let names = names.iter().map(|s| s.as_str()).collect::>(); + let arr_names = StringArray::from(names); + + let schema = Schema::new(vec![ + Field::new("nanos", arr_nanos.data_type().clone(), true), + Field::new("micros", arr_micros.data_type().clone(), true), + Field::new("millis", arr_millis.data_type().clone(), true), + Field::new("seconds", arr_seconds.data_type().clone(), true), + Field::new("name", arr_names.data_type().clone(), true), + ]); + let schema = Arc::new(schema); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(arr_nanos), + Arc::new(arr_micros), + Arc::new(arr_millis), + Arc::new(arr_seconds), + Arc::new(arr_names), + ], + ) + .unwrap() +} From fdf41ad509ddc63e9bbd422768eff9810dea4da4 Mon Sep 17 00:00:00 2001 From: rdettai Date: Tue, 6 Jul 2021 19:45:20 +0200 Subject: [PATCH 242/329] [fix] benchmark run with compose (#666) --- benchmarks/README.md | 2 +- benchmarks/docker-compose.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 0b5ccfc16e466..a63761b6c2b3d 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -140,7 +140,7 @@ docker-compose up Then you can run the benchmark with: ```bash -docker-compose run ballista-client cargo run benchmark ballista --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl +docker-compose run ballista-client bash -c '/tpch benchmark ballista --host ballista-scheduler --port 50050 --query 1 --path /data --format tbl' ``` ## Expected output diff --git a/benchmarks/docker-compose.yaml b/benchmarks/docker-compose.yaml index 74c6703f30b1c..e025ea360e76c 100644 --- a/benchmarks/docker-compose.yaml +++ b/benchmarks/docker-compose.yaml @@ -41,12 +41,10 @@ services: ballista-client: image: 
ballista:0.5.0-SNAPSHOT command: "/bin/sh" # do nothing - working_dir: /ballista/benchmarks/tpch environment: - RUST_LOG=info volumes: - ./data:/data - - ../..:/ballista depends_on: - ballista-scheduler - ballista-executor From 0368f59016b943448124f72d1f70b4108c45860e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 7 Jul 2021 13:53:29 +0200 Subject: [PATCH 243/329] Allow non-equijoin filters in join condition (#660) * Allow non-equijoin filters in join condition * Revert change to query * Fix, only do for inner join * Add test * docs update * Update test name Co-authored-by: Andrew Lamb * Add negative test Co-authored-by: Andrew Lamb --- datafusion/src/sql/planner.rs | 83 +++++++++++++++++++++++++---------- datafusion/tests/sql.rs | 22 ++++++++++ 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 213ae890d7d09..e34f0e6c9b674 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -368,15 +368,34 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // parse ON expression let expr = self.sql_to_rex(sql_expr, &join_schema)?; + // expression that didn't match equi-join pattern + let mut filter = vec![]; + // extract join keys - extract_join_keys(&expr, &mut keys)?; + extract_join_keys(&expr, &mut keys, &mut filter); let (left_keys, right_keys): (Vec, Vec) = keys.into_iter().unzip(); // return the logical plan representing the join - LogicalPlanBuilder::from(left) - .join(right, join_type, left_keys, right_keys)? + let join = LogicalPlanBuilder::from(left) + .join(right, join_type, left_keys, right_keys)?; + + if filter.is_empty() { + join.build() + } else if join_type == JoinType::Inner { + join.filter( + filter + .iter() + .skip(1) + .fold(filter[0].clone(), |acc, e| acc.and(e.clone())), + )? .build() + } else { + Err(DataFusionError::NotImplemented(format!( + "Unsupported expressions in {:?} JOIN: {:?}", + join_type, filter + ))) + } } JoinConstraint::Using(idents) => { let keys: Vec = idents @@ -1550,39 +1569,41 @@ fn remove_join_expressions( } } -/// Parse equijoin ON condition which could be a single Eq or multiple conjunctive Eqs +/// Extracts equijoin ON condition be a single Eq or multiple conjunctive Eqs +/// Filters matching this pattern are added to `accum` +/// Filters that don't match this pattern are added to `accum_filter` +/// Examples: /// -/// Examples +/// foo = bar => accum=[(foo, bar)] accum_filter=[] +/// foo = bar AND bar = baz => accum=[(foo, bar), (bar, baz)] accum_filter=[] +/// foo = bar AND baz > 1 => accum=[(foo, bar)] accum_filter=[baz > 1] /// -/// foo = bar -/// foo = bar AND bar = baz AND ... 
-/// -fn extract_join_keys(expr: &Expr, accum: &mut Vec<(Column, Column)>) -> Result<()> { +fn extract_join_keys( + expr: &Expr, + accum: &mut Vec<(Column, Column)>, + accum_filter: &mut Vec, +) { match expr { Expr::BinaryExpr { left, op, right } => match op { Operator::Eq => match (left.as_ref(), right.as_ref()) { (Expr::Column(l), Expr::Column(r)) => { accum.push((l.clone(), r.clone())); - Ok(()) } - other => Err(DataFusionError::SQL(ParserError(format!( - "Unsupported expression '{:?}' in JOIN condition", - other - )))), + _other => { + accum_filter.push(expr.clone()); + } }, Operator::And => { - extract_join_keys(left, accum)?; - extract_join_keys(right, accum) + extract_join_keys(left, accum, accum_filter); + extract_join_keys(right, accum, accum_filter); + } + _other => { + accum_filter.push(expr.clone()); } - other => Err(DataFusionError::SQL(ParserError(format!( - "Unsupported expression '{:?}' in JOIN condition", - other - )))), }, - other => Err(DataFusionError::SQL(ParserError(format!( - "Unsupported expression '{:?}' in JOIN condition", - other - )))), + _other => { + accum_filter.push(expr.clone()); + } } } @@ -2702,6 +2723,20 @@ mod tests { quick_test(sql, expected); } + #[test] + fn equijoin_unsupported_expression() { + let sql = "SELECT id, order_id \ + FROM person \ + JOIN orders \ + ON id = customer_id AND order_id > 1 "; + let expected = "Projection: #person.id, #orders.order_id\ + \n Filter: #orders.order_id Gt Int64(1)\ + \n Join: #person.id = #orders.customer_id\ + \n TableScan: person projection=None\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn join_with_table_name() { let sql = "SELECT id, order_id \ diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index bd73cb15610a7..f6f8b6f041e6e 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1687,6 +1687,28 @@ async fn equijoin() -> Result<()> { Ok(()) } +#[tokio::test] +async fn equijoin_and_other_condition() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let sql = + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id AND t2_name >= 'y' ORDER BY t1_id"; + let actual = execute(&mut ctx, sql).await; + let expected = vec![vec!["11", "a", "z"], vec!["22", "b", "y"]]; + assert_eq!(expected, actual); + Ok(()) +} + +#[tokio::test] +async fn equijoin_and_unsupported_condition() -> Result<()> { + let ctx = create_join_context("t1_id", "t2_id")?; + let sql = + "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id AND t2_name >= 'y' ORDER BY t1_id"; + let res = ctx.create_logical_plan(sql); + assert!(res.is_err()); + assert_eq!(format!("{}", res.unwrap_err()), "This feature is not implemented: Unsupported expressions in Left JOIN: [#t2.t2_name GtEq Utf8(\"y\")]"); + Ok(()) +} + #[tokio::test] async fn left_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; From 36647662c69b2635cce300b03e5462b39bacd2a4 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 7 Jul 2021 14:02:12 +0200 Subject: [PATCH 244/329] use `Weak` ptr to break catalog list <> info schema cyclic reference (#681) Fixes #680. 
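
A minimal sketch of the pattern this change applies (the type names below are simplified stand-ins, not the actual DataFusion definitions): the information-schema wrapper keeps only a `Weak` back-reference to the catalog list and upgrades it on access, so dropping the last strong `Arc` frees the whole graph instead of leaking a reference cycle.

    use std::sync::{Arc, Weak};

    // Stand-in for the catalog list that owns the catalogs.
    struct CatalogList;

    // Stand-in for the information-schema wrapper: it holds only a weak
    // back-reference, so it does not keep the catalog list alive by itself.
    struct InformationSchema {
        catalog_list: Weak<CatalogList>,
    }

    fn main() {
        let catalog_list = Arc::new(CatalogList);

        let schema = InformationSchema {
            catalog_list: Arc::downgrade(&catalog_list),
        };

        // While a strong reference exists, the weak pointer can be upgraded.
        assert!(schema.catalog_list.upgrade().is_some());

        // Once the last strong reference is dropped, upgrading fails and the
        // catalog list is freed; there is no cycle left to leak.
        drop(catalog_list);
        assert!(schema.catalog_list.upgrade().is_none());
    }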
--- datafusion/src/catalog/information_schema.rs | 16 ++++++----- datafusion/src/execution/context.rs | 28 ++++++++++++++++++-- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/datafusion/src/catalog/information_schema.rs b/datafusion/src/catalog/information_schema.rs index fd7fcb4b901a6..cd1e612245ec0 100644 --- a/datafusion/src/catalog/information_schema.rs +++ b/datafusion/src/catalog/information_schema.rs @@ -19,7 +19,10 @@ //! //! Information Schema](https://en.wikipedia.org/wiki/Information_schema) -use std::{any, sync::Arc}; +use std::{ + any, + sync::{Arc, Weak}, +}; use arrow::{ array::{StringBuilder, UInt64Builder}, @@ -41,14 +44,14 @@ const COLUMNS: &str = "columns"; /// Wraps another [`CatalogProvider`] and adds a "information_schema" /// schema that can introspect on tables in the catalog_list pub(crate) struct CatalogWithInformationSchema { - catalog_list: Arc, + catalog_list: Weak, /// wrapped provider inner: Arc, } impl CatalogWithInformationSchema { pub(crate) fn new( - catalog_list: Arc, + catalog_list: Weak, inner: Arc, ) -> Self { Self { @@ -73,9 +76,10 @@ impl CatalogProvider for CatalogWithInformationSchema { fn schema(&self, name: &str) -> Option> { if name.eq_ignore_ascii_case(INFORMATION_SCHEMA) { - Some(Arc::new(InformationSchemaProvider { - catalog_list: self.catalog_list.clone(), - })) + Weak::upgrade(&self.catalog_list).map(|catalog_list| { + Arc::new(InformationSchemaProvider { catalog_list }) + as Arc + }) } else { self.inner.schema(name) } diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index d5a84869ad94a..6a26e0401bb87 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -144,7 +144,7 @@ impl ExecutionContext { let default_catalog: Arc = if config.information_schema { Arc::new(CatalogWithInformationSchema::new( - catalog_list.clone(), + Arc::downgrade(&catalog_list), Arc::new(default_catalog), )) } else { @@ -346,7 +346,7 @@ impl ExecutionContext { let state = self.state.lock().unwrap(); let catalog = if state.config.information_schema { Arc::new(CatalogWithInformationSchema::new( - state.catalog_list.clone(), + Arc::downgrade(&state.catalog_list), catalog, )) } else { @@ -924,6 +924,7 @@ mod tests { use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use std::fs::File; + use std::sync::Weak; use std::thread::{self, JoinHandle}; use std::{io::prelude::*, sync::Mutex}; use tempfile::TempDir; @@ -3364,6 +3365,29 @@ mod tests { assert_batches_sorted_eq!(expected, &result); } + #[tokio::test] + async fn catalogs_not_leaked() { + // the information schema used to introduce cyclic Arcs + let ctx = ExecutionContext::with_config( + ExecutionConfig::new().with_information_schema(true), + ); + + // register a single catalog + let catalog = Arc::new(MemoryCatalogProvider::new()); + let catalog_weak = Arc::downgrade(&catalog); + ctx.register_catalog("my_catalog", catalog); + + let catalog_list_weak = { + let state = ctx.state.lock().unwrap(); + Arc::downgrade(&state.catalog_list) + }; + + drop(ctx); + + assert_eq!(Weak::strong_count(&catalog_list_weak), 0); + assert_eq!(Weak::strong_count(&catalog_weak), 0); + } + struct MyPhysicalPlanner {} impl PhysicalPlanner for MyPhysicalPlanner { From 18c581c4dbfbc3b5d135b3bc0d1cdb5c16af9c78 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Wed, 7 Jul 2021 05:06:12 -0700 Subject: [PATCH 245/329] fix join column handling logic for `On` and `Using` constraints (#605) * fix join column handling logic for `On` and `Using` constraints * 
handling join column expansion during USING JOIN planning get rid of shared field and move column expansion logic into plan builder and optimizer. * add more comments & fix clippy * add more comment * reduce duplicate code in join predicate pushdown --- ballista/rust/core/proto/ballista.proto | 10 +- .../core/src/serde/logical_plan/from_proto.rs | 41 ++-- .../core/src/serde/logical_plan/to_proto.rs | 15 +- ballista/rust/core/src/serde/mod.rs | 46 +++- .../src/serde/physical_plan/from_proto.rs | 16 +- .../rust/core/src/serde/physical_plan/mod.rs | 3 +- .../core/src/serde/physical_plan/to_proto.rs | 13 +- benchmarks/queries/q7.sql | 2 +- datafusion/src/execution/context.rs | 90 +++++++ datafusion/src/execution/dataframe_impl.rs | 23 +- datafusion/src/logical_plan/builder.rs | 94 ++------ datafusion/src/logical_plan/dfschema.rs | 149 ++++++------ datafusion/src/logical_plan/expr.rs | 94 ++++++-- datafusion/src/logical_plan/mod.rs | 8 +- datafusion/src/logical_plan/plan.rs | 54 ++++- datafusion/src/optimizer/filter_push_down.rs | 226 ++++++++++++++---- .../src/optimizer/projection_push_down.rs | 88 ++++++- datafusion/src/optimizer/utils.rs | 9 +- datafusion/src/physical_plan/hash_join.rs | 198 ++++++++------- datafusion/src/physical_plan/hash_utils.rs | 57 +---- datafusion/src/physical_plan/planner.rs | 13 +- datafusion/src/sql/planner.rs | 76 +++--- datafusion/src/test/mod.rs | 11 +- 23 files changed, 836 insertions(+), 500 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index e3788066d33fc..4696d21852fc2 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -378,12 +378,18 @@ enum JoinType { ANTI = 5; } +enum JoinConstraint { + ON = 0; + USING = 1; +} + message JoinNode { LogicalPlanNode left = 1; LogicalPlanNode right = 2; JoinType join_type = 3; - repeated Column left_join_column = 4; - repeated Column right_join_column = 5; + JoinConstraint join_constraint = 4; + repeated Column left_join_column = 5; + repeated Column right_join_column = 6; } message LimitNode { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index a1136cf4a7d6e..cad0543923081 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -26,8 +26,8 @@ use datafusion::logical_plan::window_frames::{ }; use datafusion::logical_plan::{ abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin, - sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinType, LogicalPlan, - LogicalPlanBuilder, Operator, + sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinConstraint, JoinType, + LogicalPlan, LogicalPlanBuilder, Operator, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::csv::CsvReadOptions; @@ -257,23 +257,32 @@ impl TryInto for &protobuf::LogicalPlanNode { join.join_type )) })?; - let join_type = match join_type { - protobuf::JoinType::Inner => JoinType::Inner, - protobuf::JoinType::Left => JoinType::Left, - protobuf::JoinType::Right => JoinType::Right, - protobuf::JoinType::Full => JoinType::Full, - protobuf::JoinType::Semi => JoinType::Semi, - protobuf::JoinType::Anti => JoinType::Anti, - }; - LogicalPlanBuilder::from(convert_box_required!(join.left)?) 
- .join( + let join_constraint = protobuf::JoinConstraint::from_i32( + join.join_constraint, + ) + .ok_or_else(|| { + proto_error(format!( + "Received a JoinNode message with unknown JoinConstraint {}", + join.join_constraint + )) + })?; + + let builder = LogicalPlanBuilder::from(convert_box_required!(join.left)?); + let builder = match join_constraint.into() { + JoinConstraint::On => builder.join( &convert_box_required!(join.right)?, - join_type, + join_type.into(), left_keys, right_keys, - )? - .build() - .map_err(|e| e.into()) + )?, + JoinConstraint::Using => builder.join_using( + &convert_box_required!(join.right)?, + join_type.into(), + left_keys, + )?, + }; + + builder.build().map_err(|e| e.into()) } } } diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 4049622b83dc5..07d7a59c114c6 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -26,7 +26,7 @@ use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUn use datafusion::datasource::CsvFile; use datafusion::logical_plan::{ window_frames::{WindowFrame, WindowFrameBound, WindowFrameUnits}, - Column, Expr, JoinType, LogicalPlan, + Column, Expr, JoinConstraint, JoinType, LogicalPlan, }; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::functions::BuiltinScalarFunction; @@ -804,26 +804,23 @@ impl TryInto for &LogicalPlan { right, on, join_type, + join_constraint, .. } => { let left: protobuf::LogicalPlanNode = left.as_ref().try_into()?; let right: protobuf::LogicalPlanNode = right.as_ref().try_into()?; - let join_type = match join_type { - JoinType::Inner => protobuf::JoinType::Inner, - JoinType::Left => protobuf::JoinType::Left, - JoinType::Right => protobuf::JoinType::Right, - JoinType::Full => protobuf::JoinType::Full, - JoinType::Semi => protobuf::JoinType::Semi, - JoinType::Anti => protobuf::JoinType::Anti, - }; let (left_join_column, right_join_column) = on.iter().map(|(l, r)| (l.into(), r.into())).unzip(); + let join_type: protobuf::JoinType = join_type.to_owned().into(); + let join_constraint: protobuf::JoinConstraint = + join_constraint.to_owned().into(); Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::Join(Box::new( protobuf::JoinNode { left: Some(Box::new(left)), right: Some(Box::new(right)), join_type: join_type.into(), + join_constraint: join_constraint.into(), left_join_column, right_join_column, }, diff --git a/ballista/rust/core/src/serde/mod.rs b/ballista/rust/core/src/serde/mod.rs index af83660baab56..1df0675ecae54 100644 --- a/ballista/rust/core/src/serde/mod.rs +++ b/ballista/rust/core/src/serde/mod.rs @@ -20,7 +20,7 @@ use std::{convert::TryInto, io::Cursor}; -use datafusion::logical_plan::Operator; +use datafusion::logical_plan::{JoinConstraint, JoinType, Operator}; use datafusion::physical_plan::aggregates::AggregateFunction; use datafusion::physical_plan::window_functions::BuiltInWindowFunction; @@ -291,3 +291,47 @@ impl Into for protobuf::PrimitiveScalarT } } } + +impl From for JoinType { + fn from(t: protobuf::JoinType) -> Self { + match t { + protobuf::JoinType::Inner => JoinType::Inner, + protobuf::JoinType::Left => JoinType::Left, + protobuf::JoinType::Right => JoinType::Right, + protobuf::JoinType::Full => JoinType::Full, + protobuf::JoinType::Semi => JoinType::Semi, + protobuf::JoinType::Anti => JoinType::Anti, + } + } +} + +impl From for protobuf::JoinType { + fn 
from(t: JoinType) -> Self { + match t { + JoinType::Inner => protobuf::JoinType::Inner, + JoinType::Left => protobuf::JoinType::Left, + JoinType::Right => protobuf::JoinType::Right, + JoinType::Full => protobuf::JoinType::Full, + JoinType::Semi => protobuf::JoinType::Semi, + JoinType::Anti => protobuf::JoinType::Anti, + } + } +} + +impl From for JoinConstraint { + fn from(t: protobuf::JoinConstraint) -> Self { + match t { + protobuf::JoinConstraint::On => JoinConstraint::On, + protobuf::JoinConstraint::Using => JoinConstraint::Using, + } + } +} + +impl From for protobuf::JoinConstraint { + fn from(t: JoinConstraint) -> Self { + match t { + JoinConstraint::On => protobuf::JoinConstraint::On, + JoinConstraint::Using => protobuf::JoinConstraint::Using, + } + } +} diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 717ee209dbe91..12c1743c0747c 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -35,7 +35,9 @@ use datafusion::catalog::catalog::{ use datafusion::execution::context::{ ExecutionConfig, ExecutionContextState, ExecutionProps, }; -use datafusion::logical_plan::{window_frames::WindowFrame, DFSchema, Expr}; +use datafusion::logical_plan::{ + window_frames::WindowFrame, DFSchema, Expr, JoinConstraint, JoinType, +}; use datafusion::physical_plan::aggregates::{create_aggregate_expr, AggregateFunction}; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; @@ -57,7 +59,6 @@ use datafusion::physical_plan::{ filter::FilterExec, functions::{self, BuiltinScalarFunction, ScalarFunctionExpr}, hash_join::HashJoinExec, - hash_utils::JoinType, limit::{GlobalLimitExec, LocalLimitExec}, parquet::ParquetExec, projection::ProjectionExec, @@ -348,14 +349,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { hashjoin.join_type )) })?; - let join_type = match join_type { - protobuf::JoinType::Inner => JoinType::Inner, - protobuf::JoinType::Left => JoinType::Left, - protobuf::JoinType::Right => JoinType::Right, - protobuf::JoinType::Full => JoinType::Full, - protobuf::JoinType::Semi => JoinType::Semi, - protobuf::JoinType::Anti => JoinType::Anti, - }; + let partition_mode = protobuf::PartitionMode::from_i32(hashjoin.partition_mode) .ok_or_else(|| { @@ -372,7 +366,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { left, right, on, - &join_type, + &join_type.into(), partition_mode, )?)) } diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index a393d7fdab1f7..3bf7e9c3063b5 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -27,7 +27,7 @@ mod roundtrip_tests { compute::kernels::sort::SortOptions, datatypes::{DataType, Field, Schema}, }, - logical_plan::Operator, + logical_plan::{JoinType, Operator}, physical_plan::{ empty::EmptyExec, expressions::{binary, col, lit, InListExpr, NotExpr}, @@ -35,7 +35,6 @@ mod roundtrip_tests { filter::FilterExec, hash_aggregate::{AggregateMode, HashAggregateExec}, hash_join::{HashJoinExec, PartitionMode}, - hash_utils::JoinType, limit::{GlobalLimitExec, LocalLimitExec}, sort::SortExec, AggregateExpr, ColumnarValue, Distribution, ExecutionPlan, Partitioning, diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs 
b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 0fc27850074c3..875dbf213441d 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -26,6 +26,7 @@ use std::{ sync::Arc, }; +use datafusion::logical_plan::JoinType; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::csv::CsvExec; use datafusion::physical_plan::expressions::{ @@ -35,7 +36,6 @@ use datafusion::physical_plan::expressions::{CastExpr, TryCastExpr}; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::hash_aggregate::AggregateMode; use datafusion::physical_plan::hash_join::{HashJoinExec, PartitionMode}; -use datafusion::physical_plan::hash_utils::JoinType; use datafusion::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion::physical_plan::parquet::ParquetExec; use datafusion::physical_plan::projection::ProjectionExec; @@ -135,18 +135,13 @@ impl TryInto for Arc { }), }) .collect(); - let join_type = match exec.join_type() { - JoinType::Inner => protobuf::JoinType::Inner, - JoinType::Left => protobuf::JoinType::Left, - JoinType::Right => protobuf::JoinType::Right, - JoinType::Full => protobuf::JoinType::Full, - JoinType::Semi => protobuf::JoinType::Semi, - JoinType::Anti => protobuf::JoinType::Anti, - }; + let join_type: protobuf::JoinType = exec.join_type().to_owned().into(); + let partition_mode = match exec.partition_mode() { PartitionMode::CollectLeft => protobuf::PartitionMode::CollectLeft, PartitionMode::Partitioned => protobuf::PartitionMode::Partitioned, }; + Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::HashJoin(Box::new( protobuf::HashJoinExecNode { diff --git a/benchmarks/queries/q7.sql b/benchmarks/queries/q7.sql index d53877c8dde68..512e5be55a2d9 100644 --- a/benchmarks/queries/q7.sql +++ b/benchmarks/queries/q7.sql @@ -36,4 +36,4 @@ group by order by supp_nation, cust_nation, - l_year; \ No newline at end of file + l_year; diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 6a26e0401bb87..d2dcec5f47d73 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -1278,6 +1278,96 @@ mod tests { Ok(()) } + #[tokio::test] + async fn left_join_using() -> Result<()> { + let results = execute( + "SELECT t1.c1, t2.c2 FROM test t1 JOIN test t2 USING (c2) ORDER BY t2.c2", + 1, + ) + .await?; + assert_eq!(results.len(), 1); + + let expected = vec![ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 0 | 1 |", + "| 0 | 2 |", + "| 0 | 3 |", + "| 0 | 4 |", + "| 0 | 5 |", + "| 0 | 6 |", + "| 0 | 7 |", + "| 0 | 8 |", + "| 0 | 9 |", + "| 0 | 10 |", + "+----+----+", + ]; + + assert_batches_eq!(expected, &results); + Ok(()) + } + + #[tokio::test] + async fn left_join_using_join_key_projection() -> Result<()> { + let results = execute( + "SELECT t1.c1, t1.c2, t2.c2 FROM test t1 JOIN test t2 USING (c2) ORDER BY t2.c2", + 1, + ) + .await?; + assert_eq!(results.len(), 1); + + let expected = vec![ + "+----+----+----+", + "| c1 | c2 | c2 |", + "+----+----+----+", + "| 0 | 1 | 1 |", + "| 0 | 2 | 2 |", + "| 0 | 3 | 3 |", + "| 0 | 4 | 4 |", + "| 0 | 5 | 5 |", + "| 0 | 6 | 6 |", + "| 0 | 7 | 7 |", + "| 0 | 8 | 8 |", + "| 0 | 9 | 9 |", + "| 0 | 10 | 10 |", + "+----+----+----+", + ]; + + assert_batches_eq!(expected, &results); + Ok(()) + } + + #[tokio::test] + async fn left_join() -> Result<()> { + let results = execute( + "SELECT t1.c1, t1.c2, t2.c2 FROM 
test t1 JOIN test t2 ON t1.c2 = t2.c2 ORDER BY t1.c2", + 1, + ) + .await?; + assert_eq!(results.len(), 1); + + let expected = vec![ + "+----+----+----+", + "| c1 | c2 | c2 |", + "+----+----+----+", + "| 0 | 1 | 1 |", + "| 0 | 2 | 2 |", + "| 0 | 3 | 3 |", + "| 0 | 4 | 4 |", + "| 0 | 5 | 5 |", + "| 0 | 6 | 6 |", + "| 0 | 7 | 7 |", + "| 0 | 8 | 8 |", + "| 0 | 9 | 9 |", + "| 0 | 10 | 10 |", + "+----+----+----+", + ]; + + assert_batches_eq!(expected, &results); + Ok(()) + } + #[tokio::test] async fn window() -> Result<()> { let results = execute( diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 7cf779740c473..4edd01c2c0a99 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -264,7 +264,7 @@ mod tests { #[tokio::test] async fn join() -> Result<()> { let left = test_table()?.select_columns(&["c1", "c2"])?; - let right = test_table()?.select_columns(&["c1", "c3"])?; + let right = test_table_with_name("c2")?.select_columns(&["c1", "c3"])?; let left_rows = left.collect().await?; let right_rows = right.collect().await?; let join = left.join(right, JoinType::Inner, &["c1"], &["c1"])?; @@ -315,7 +315,7 @@ mod tests { #[test] fn registry() -> Result<()> { let mut ctx = ExecutionContext::new(); - register_aggregate_csv(&mut ctx)?; + register_aggregate_csv(&mut ctx, "aggregate_test_100")?; // declare the udf let my_fn: ScalarFunctionImplementation = @@ -366,21 +366,28 @@ mod tests { /// Create a logical plan from a SQL query fn create_plan(sql: &str) -> Result { let mut ctx = ExecutionContext::new(); - register_aggregate_csv(&mut ctx)?; + register_aggregate_csv(&mut ctx, "aggregate_test_100")?; ctx.create_logical_plan(sql) } - fn test_table() -> Result> { + fn test_table_with_name(name: &str) -> Result> { let mut ctx = ExecutionContext::new(); - register_aggregate_csv(&mut ctx)?; - ctx.table("aggregate_test_100") + register_aggregate_csv(&mut ctx, name)?; + ctx.table(name) + } + + fn test_table() -> Result> { + test_table_with_name("aggregate_test_100") } - fn register_aggregate_csv(ctx: &mut ExecutionContext) -> Result<()> { + fn register_aggregate_csv( + ctx: &mut ExecutionContext, + table_name: &str, + ) -> Result<()> { let schema = test::aggr_test_schema(); let testdata = crate::test_util::arrow_test_data(); ctx.register_csv( - "aggregate_test_100", + table_name, &format!("{}/csv/aggregate_test_100.csv", testdata), CsvReadOptions::new().schema(schema.as_ref()), )?; diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 1a53e2185a4bc..41f29c4b99052 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -40,7 +40,6 @@ use crate::logical_plan::{ columnize_expr, normalize_col, normalize_cols, Column, DFField, DFSchema, DFSchemaRef, Partitioning, }; -use std::collections::HashSet; /// Default table name for unnamed table pub const UNNAMED_TABLE: &str = "?table?"; @@ -217,7 +216,6 @@ impl LogicalPlanBuilder { /// * An invalid expression is used (e.g. 
a `sort` expression) pub fn project(&self, expr: impl IntoIterator) -> Result { let input_schema = self.plan.schema(); - let all_schemas = self.plan.all_schemas(); let mut projected_expr = vec![]; for e in expr { match e { @@ -227,10 +225,8 @@ impl LogicalPlanBuilder { .push(Expr::Column(input_schema.field(i).qualified_column())) }); } - _ => projected_expr.push(columnize_expr( - normalize_col(e, &all_schemas)?, - input_schema, - )), + _ => projected_expr + .push(columnize_expr(normalize_col(e, &self.plan)?, input_schema)), } } @@ -247,7 +243,7 @@ impl LogicalPlanBuilder { /// Apply a filter pub fn filter(&self, expr: Expr) -> Result { - let expr = normalize_col(expr, &self.plan.all_schemas())?; + let expr = normalize_col(expr, &self.plan)?; Ok(Self::from(LogicalPlan::Filter { predicate: expr, input: Arc::new(self.plan.clone()), @@ -264,9 +260,8 @@ impl LogicalPlanBuilder { /// Apply a sort pub fn sort(&self, exprs: impl IntoIterator) -> Result { - let schemas = self.plan.all_schemas(); Ok(Self::from(LogicalPlan::Sort { - expr: normalize_cols(exprs, &schemas)?, + expr: normalize_cols(exprs, &self.plan)?, input: Arc::new(self.plan.clone()), })) } @@ -292,20 +287,15 @@ impl LogicalPlanBuilder { let left_keys: Vec = left_keys .into_iter() - .map(|c| c.into().normalize(&self.plan.all_schemas())) + .map(|c| c.into().normalize(&self.plan)) .collect::>()?; let right_keys: Vec = right_keys .into_iter() - .map(|c| c.into().normalize(&right.all_schemas())) + .map(|c| c.into().normalize(right)) .collect::>()?; let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); - let join_schema = build_join_schema( - self.plan.schema(), - right.schema(), - &on, - &join_type, - &JoinConstraint::On, - )?; + let join_schema = + build_join_schema(self.plan.schema(), right.schema(), &join_type)?; Ok(Self::from(LogicalPlan::Join { left: Arc::new(self.plan.clone()), @@ -327,21 +317,16 @@ impl LogicalPlanBuilder { let left_keys: Vec = using_keys .clone() .into_iter() - .map(|c| c.into().normalize(&self.plan.all_schemas())) + .map(|c| c.into().normalize(&self.plan)) .collect::>()?; let right_keys: Vec = using_keys .into_iter() - .map(|c| c.into().normalize(&right.all_schemas())) + .map(|c| c.into().normalize(right)) .collect::>()?; let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); - let join_schema = build_join_schema( - self.plan.schema(), - right.schema(), - &on, - &join_type, - &JoinConstraint::Using, - )?; + let join_schema = + build_join_schema(self.plan.schema(), right.schema(), &join_type)?; Ok(Self::from(LogicalPlan::Join { left: Arc::new(self.plan.clone()), @@ -394,9 +379,8 @@ impl LogicalPlanBuilder { group_expr: impl IntoIterator, aggr_expr: impl IntoIterator, ) -> Result { - let schemas = self.plan.all_schemas(); - let group_expr = normalize_cols(group_expr, &schemas)?; - let aggr_expr = normalize_cols(aggr_expr, &schemas)?; + let group_expr = normalize_cols(group_expr, &self.plan)?; + let aggr_expr = normalize_cols(aggr_expr, &self.plan)?; let all_expr = group_expr.iter().chain(aggr_expr.iter()); validate_unique_names("Aggregations", all_expr.clone(), self.plan.schema())?; @@ -440,33 +424,12 @@ impl LogicalPlanBuilder { pub fn build_join_schema( left: &DFSchema, right: &DFSchema, - on: &[(Column, Column)], join_type: &JoinType, - join_constraint: &JoinConstraint, ) -> Result { let fields: Vec = match join_type { - JoinType::Inner | JoinType::Left | JoinType::Full => { - let duplicate_keys = match join_constraint { - JoinConstraint::On => on - 
.iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.clone()) - .collect::>(), - // using join requires unique join columns in the output schema, so we mark all - // right join keys as duplicate - JoinConstraint::Using => { - on.iter().map(|on| on.1.clone()).collect::>() - } - }; - + JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { + let right_fields = right.fields().iter(); let left_fields = left.fields().iter(); - - // remove right-side join keys if they have the same names as the left-side - let right_fields = right - .fields() - .iter() - .filter(|f| !duplicate_keys.contains(&f.qualified_column())); - // left then right left_fields.chain(right_fields).cloned().collect() } @@ -474,31 +437,6 @@ pub fn build_join_schema( // Only use the left side for the schema left.fields().clone() } - JoinType::Right => { - let duplicate_keys = match join_constraint { - JoinConstraint::On => on - .iter() - .filter(|(l, r)| l == r) - .map(|on| on.1.clone()) - .collect::>(), - // using join requires unique join columns in the output schema, so we mark all - // left join keys as duplicate - JoinConstraint::Using => { - on.iter().map(|on| on.0.clone()).collect::>() - } - }; - - // remove left-side join keys if they have the same names as the right-side - let left_fields = left - .fields() - .iter() - .filter(|f| !duplicate_keys.contains(&f.qualified_column())); - - let right_fields = right.fields().iter(); - - // left then right - left_fields.chain(right_fields).cloned().collect() - } }; DFSchema::new(fields) diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index b4d864f55ebdb..b4bde87f3471f 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -48,6 +48,7 @@ impl DFSchema { pub fn new(fields: Vec) -> Result { let mut qualified_names = HashSet::new(); let mut unqualified_names = HashSet::new(); + for field in &fields { if let Some(qualifier) = field.qualifier() { if !qualified_names.insert((qualifier, field.name())) { @@ -94,10 +95,7 @@ impl DFSchema { schema .fields() .iter() - .map(|f| DFField { - field: f.clone(), - qualifier: Some(qualifier.to_owned()), - }) + .map(|f| DFField::from_qualified(qualifier, f.clone())) .collect(), ) } @@ -149,47 +147,80 @@ impl DFSchema { ))) } - /// Find the index of the column with the given qualifer and name - pub fn index_of_column(&self, col: &Column) -> Result { - for i in 0..self.fields.len() { - let field = &self.fields[i]; - if field.qualifier() == col.relation.as_ref() && field.name() == &col.name { - return Ok(i); - } + fn index_of_column_by_name( + &self, + qualifier: Option<&str>, + name: &str, + ) -> Result { + let matches: Vec = self + .fields + .iter() + .enumerate() + .filter(|(_, field)| match (qualifier, &field.qualifier) { + // field to lookup is qualified. + // current field is qualified and not shared between relations, compare both + // qualifer and name. + (Some(q), Some(field_q)) => q == field_q && field.name() == name, + // field to lookup is qualified but current field is unqualified. + (Some(_), None) => false, + // field to lookup is unqualified, no need to compare qualifier + _ => field.name() == name, + }) + .map(|(idx, _)| idx) + .collect(); + + match matches.len() { + 0 => Err(DataFusionError::Plan(format!( + "No field named '{}.{}'. 
Valid fields are {}.", + qualifier.unwrap_or(""), + name, + self.get_field_names() + ))), + 1 => Ok(matches[0]), + _ => Err(DataFusionError::Internal(format!( + "Ambiguous reference to qualified field named '{}.{}'", + qualifier.unwrap_or(""), + name + ))), } - Err(DataFusionError::Plan(format!( - "No field matches column '{}'. Available fields: {}", - col, self - ))) + } + + /// Find the index of the column with the given qualifier and name + pub fn index_of_column(&self, col: &Column) -> Result { + self.index_of_column_by_name(col.relation.as_deref(), &col.name) } /// Find the field with the given name pub fn field_with_name( &self, - relation_name: Option<&str>, + qualifier: Option<&str>, name: &str, - ) -> Result { - if let Some(relation_name) = relation_name { - self.field_with_qualified_name(relation_name, name) + ) -> Result<&DFField> { + if let Some(qualifier) = qualifier { + self.field_with_qualified_name(qualifier, name) } else { self.field_with_unqualified_name(name) } } - /// Find the field with the given name - pub fn field_with_unqualified_name(&self, name: &str) -> Result { - let matches: Vec<&DFField> = self - .fields + /// Find all fields match the given name + pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&DFField> { + self.fields .iter() .filter(|field| field.name() == name) - .collect(); + .collect() + } + + /// Find the field with the given name + pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> { + let matches = self.fields_with_unqualified_name(name); match matches.len() { 0 => Err(DataFusionError::Plan(format!( "No field with unqualified name '{}'. Valid fields are {}.", name, self.get_field_names() ))), - 1 => Ok(matches[0].to_owned()), + 1 => Ok(matches[0]), _ => Err(DataFusionError::Plan(format!( "Ambiguous reference to field named '{}'", name @@ -200,33 +231,15 @@ impl DFSchema { /// Find the field with the given qualified name pub fn field_with_qualified_name( &self, - relation_name: &str, + qualifier: &str, name: &str, - ) -> Result { - let matches: Vec<&DFField> = self - .fields - .iter() - .filter(|field| { - field.qualifier == Some(relation_name.to_string()) && field.name() == name - }) - .collect(); - match matches.len() { - 0 => Err(DataFusionError::Plan(format!( - "No field named '{}.{}'. 
Valid fields are {}.", - relation_name, - name, - self.get_field_names() - ))), - 1 => Ok(matches[0].to_owned()), - _ => Err(DataFusionError::Internal(format!( - "Ambiguous reference to qualified field named '{}.{}'", - relation_name, name - ))), - } + ) -> Result<&DFField> { + let idx = self.index_of_column_by_name(Some(qualifier), name)?; + Ok(self.field(idx)) } /// Find the field with the given qualified column - pub fn field_from_qualified_column(&self, column: &Column) -> Result { + pub fn field_from_column(&self, column: &Column) -> Result<&DFField> { match &column.relation { Some(r) => self.field_with_qualified_name(r, &column.name), None => self.field_with_unqualified_name(&column.name), @@ -247,31 +260,20 @@ impl DFSchema { fields: self .fields .into_iter() - .map(|f| { - if f.qualifier().is_some() { - DFField::new( - None, - f.name(), - f.data_type().to_owned(), - f.is_nullable(), - ) - } else { - f - } - }) + .map(|f| f.strip_qualifier()) .collect(), } } /// Replace all field qualifier with new value in schema - pub fn replace_qualifier(self, qualifer: &str) -> Self { + pub fn replace_qualifier(self, qualifier: &str) -> Self { DFSchema { fields: self .fields .into_iter() .map(|f| { DFField::new( - Some(qualifer), + Some(qualifier), f.name(), f.data_type().to_owned(), f.is_nullable(), @@ -328,10 +330,7 @@ impl TryFrom for DFSchema { schema .fields() .iter() - .map(|f| DFField { - field: f.clone(), - qualifier: None, - }) + .map(|f| DFField::from(f.clone())) .collect(), ) } @@ -454,8 +453,8 @@ impl DFField { /// Returns a string to the `DFField`'s qualified name pub fn qualified_name(&self) -> String { - if let Some(relation_name) = &self.qualifier { - format!("{}.{}", relation_name, self.field.name()) + if let Some(qualifier) = &self.qualifier { + format!("{}.{}", qualifier, self.field.name()) } else { self.field.name().to_owned() } @@ -469,6 +468,14 @@ impl DFField { } } + /// Builds an unqualified column based on self + pub fn unqualified_column(&self) -> Column { + Column { + relation: None, + name: self.field.name().to_string(), + } + } + /// Get the optional qualifier pub fn qualifier(&self) -> Option<&String> { self.qualifier.as_ref() @@ -478,6 +485,12 @@ impl DFField { pub fn field(&self) -> &Field { &self.field } + + /// Return field with qualifier stripped + pub fn strip_qualifier(mut self) -> Self { + self.qualifier = None; + self + } } #[cfg(test)] diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 1fab9bb875ae9..9454d7593c3f3 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -20,7 +20,7 @@ pub use super::Operator; use crate::error::{DataFusionError, Result}; -use crate::logical_plan::{window_frames, DFField, DFSchema, DFSchemaRef}; +use crate::logical_plan::{window_frames, DFField, DFSchema, LogicalPlan}; use crate::physical_plan::{ aggregates, expressions::binary_operator_data_type, functions, udf::ScalarUDF, window_functions, @@ -29,7 +29,7 @@ use crate::{physical_plan::udaf::AggregateUDF, scalar::ScalarValue}; use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; use arrow::{compute::can_cast_types, datatypes::DataType}; use functions::{ReturnTypeFunction, ScalarFunctionImplementation, Signature}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::sync::Arc; @@ -89,14 +89,46 @@ impl Column { /// /// For example, `foo` will be normalized to `t.foo` if there is a /// column named `foo` in a relation named `t` found in 
`schemas` - pub fn normalize(self, schemas: &[&DFSchemaRef]) -> Result { + pub fn normalize(self, plan: &LogicalPlan) -> Result { if self.relation.is_some() { return Ok(self); } - for schema in schemas { - if let Ok(field) = schema.field_with_unqualified_name(&self.name) { - return Ok(field.qualified_column()); + let schemas = plan.all_schemas(); + let using_columns = plan.using_columns()?; + + for schema in &schemas { + let fields = schema.fields_with_unqualified_name(&self.name); + match fields.len() { + 0 => continue, + 1 => { + return Ok(fields[0].qualified_column()); + } + _ => { + // More than 1 fields in this schema have their names set to self.name. + // + // This should only happen when a JOIN query with USING constraint references + // join columns using unqualified column name. For example: + // + // ```sql + // SELECT id FROM t1 JOIN t2 USING(id) + // ``` + // + // In this case, both `t1.id` and `t2.id` will match unqualified column `id`. + // We will use the relation from the first matched field to normalize self. + + // Compare matched fields with one USING JOIN clause at a time + for using_col in &using_columns { + let all_matched = fields + .iter() + .all(|f| using_col.contains(&f.qualified_column())); + // All matched fields belong to the same using column set, in orther words + // the same join clause. We simply pick the qualifer from the first match. + if all_matched { + return Ok(fields[0].qualified_column()); + } + } + } } } @@ -321,9 +353,7 @@ impl Expr { pub fn get_type(&self, schema: &DFSchema) -> Result { match self { Expr::Alias(expr, _) => expr.get_type(schema), - Expr::Column(c) => { - Ok(schema.field_from_qualified_column(c)?.data_type().clone()) - } + Expr::Column(c) => Ok(schema.field_from_column(c)?.data_type().clone()), Expr::ScalarVariable(_) => Ok(DataType::Utf8), Expr::Literal(l) => Ok(l.get_datatype()), Expr::Case { when_then_expr, .. } => when_then_expr[0].1.get_type(schema), @@ -395,9 +425,7 @@ impl Expr { pub fn nullable(&self, input_schema: &DFSchema) -> Result { match self { Expr::Alias(expr, _) => expr.nullable(input_schema), - Expr::Column(c) => { - Ok(input_schema.field_from_qualified_column(c)?.is_nullable()) - } + Expr::Column(c) => Ok(input_schema.field_from_column(c)?.is_nullable()), Expr::Literal(value) => Ok(value.is_null()), Expr::ScalarVariable(_) => Ok(true), Expr::Case { @@ -1118,36 +1146,56 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { } } +/// Recursively replace all Column expressions in a given expression tree with Column expressions +/// provided by the hash map argument. +pub fn replace_col(e: Expr, replace_map: &HashMap<&Column, &Column>) -> Result { + struct ColumnReplacer<'a> { + replace_map: &'a HashMap<&'a Column, &'a Column>, + } + + impl<'a> ExprRewriter for ColumnReplacer<'a> { + fn mutate(&mut self, expr: Expr) -> Result { + if let Expr::Column(c) = &expr { + match self.replace_map.get(c) { + Some(new_c) => Ok(Expr::Column((*new_c).to_owned())), + None => Ok(expr), + } + } else { + Ok(expr) + } + } + } + + e.rewrite(&mut ColumnReplacer { replace_map }) +} + /// Recursively call [`Column::normalize`] on all Column expressions /// in the `expr` expression tree. 
-pub fn normalize_col(e: Expr, schemas: &[&DFSchemaRef]) -> Result { - struct ColumnNormalizer<'a, 'b> { - schemas: &'a [&'b DFSchemaRef], +pub fn normalize_col(e: Expr, plan: &LogicalPlan) -> Result { + struct ColumnNormalizer<'a> { + plan: &'a LogicalPlan, } - impl<'a, 'b> ExprRewriter for ColumnNormalizer<'a, 'b> { + impl<'a> ExprRewriter for ColumnNormalizer<'a> { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::Column(c) = expr { - Ok(Expr::Column(c.normalize(self.schemas)?)) + Ok(Expr::Column(c.normalize(self.plan)?)) } else { Ok(expr) } } } - e.rewrite(&mut ColumnNormalizer { schemas }) + e.rewrite(&mut ColumnNormalizer { plan }) } /// Recursively normalize all Column expressions in a list of expression trees #[inline] pub fn normalize_cols( exprs: impl IntoIterator, - schemas: &[&DFSchemaRef], + plan: &LogicalPlan, ) -> Result> { - exprs - .into_iter() - .map(|e| normalize_col(e, schemas)) - .collect() + exprs.into_iter().map(|e| normalize_col(e, plan)).collect() } /// Create an expression to represent the min() aggregate function diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 69d03d22bb21a..86a2f567d7de4 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -41,10 +41,10 @@ pub use expr::{ cos, count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, in_list, initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, min, normalize_col, normalize_cols, now, octet_length, or, random, regexp_match, - regexp_replace, repeat, replace, reverse, right, round, rpad, rtrim, sha224, sha256, - sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, - to_hex, translate, trim, trunc, upper, when, Column, Expr, ExprRewriter, - ExpressionVisitor, Literal, Recursion, + regexp_replace, repeat, replace, replace_col, reverse, right, round, rpad, rtrim, + sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, + substr, sum, tan, to_hex, translate, trim, trunc, upper, when, Column, Expr, + ExprRewriter, ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 99f0fa14a2d97..b954b6a97950c 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -21,9 +21,11 @@ use super::display::{GraphvizVisitor, IndentVisitor}; use super::expr::{Column, Expr}; use super::extension::UserDefinedLogicalNode; use crate::datasource::TableProvider; +use crate::error::DataFusionError; use crate::logical_plan::dfschema::DFSchemaRef; use crate::sql::parser::FileType; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use std::collections::HashSet; use std::{ fmt::{self, Display}, sync::Arc, @@ -354,6 +356,43 @@ impl LogicalPlan { | LogicalPlan::CreateExternalTable { .. } => vec![], } } + + /// returns all `Using` join columns in a logical plan + pub fn using_columns(&self) -> Result>, DataFusionError> { + struct UsingJoinColumnVisitor { + using_columns: Vec>, + } + + impl PlanVisitor for UsingJoinColumnVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &LogicalPlan) -> Result { + if let LogicalPlan::Join { + join_constraint: JoinConstraint::Using, + on, + .. 
+ } = plan + { + self.using_columns.push( + on.iter() + .map(|entry| { + std::iter::once(entry.0.clone()) + .chain(std::iter::once(entry.1.clone())) + }) + .flatten() + .collect::>(), + ); + } + Ok(true) + } + } + + let mut visitor = UsingJoinColumnVisitor { + using_columns: vec![], + }; + self.accept(&mut visitor)?; + Ok(visitor.using_columns) + } } /// Logical partitioning schemes supported by the repartition operator. @@ -709,10 +748,21 @@ impl LogicalPlan { } Ok(()) } - LogicalPlan::Join { on: ref keys, .. } => { + LogicalPlan::Join { + on: ref keys, + join_constraint, + .. + } => { let join_expr: Vec = keys.iter().map(|(l, r)| format!("{} = {}", l, r)).collect(); - write!(f, "Join: {}", join_expr.join(", ")) + match join_constraint { + JoinConstraint::On => { + write!(f, "Join: {}", join_expr.join(", ")) + } + JoinConstraint::Using => { + write!(f, "Join: Using {}", join_expr.join(", ")) + } + } } LogicalPlan::CrossJoin { .. } => { write!(f, "CrossJoin:") diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index c1d81fe629345..76d8c05bed4c6 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -16,7 +16,7 @@ use crate::datasource::datasource::TableProviderFilterPushDown; use crate::execution::context::ExecutionProps; -use crate::logical_plan::{and, Column, LogicalPlan}; +use crate::logical_plan::{and, replace_col, Column, LogicalPlan}; use crate::logical_plan::{DFSchema, Expr}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; @@ -96,12 +96,21 @@ fn get_join_predicates<'a>( let left_columns = &left .fields() .iter() - .map(|f| f.qualified_column()) + .map(|f| { + std::iter::once(f.qualified_column()) + // we need to push down filter using unqualified column as well + .chain(std::iter::once(f.unqualified_column())) + }) + .flatten() .collect::>(); let right_columns = &right .fields() .iter() - .map(|f| f.qualified_column()) + .map(|f| { + std::iter::once(f.qualified_column()) + .chain(std::iter::once(f.unqualified_column())) + }) + .flatten() .collect::>(); let filters = state @@ -232,6 +241,38 @@ fn split_members<'a>(predicate: &'a Expr, predicates: &mut Vec<&'a Expr>) { } } +fn optimize_join( + mut state: State, + plan: &LogicalPlan, + left: &LogicalPlan, + right: &LogicalPlan, +) -> Result { + let (pushable_to_left, pushable_to_right, keep) = + get_join_predicates(&state, left.schema(), right.schema()); + + let mut left_state = state.clone(); + left_state.filters = keep_filters(&left_state.filters, &pushable_to_left); + let left = optimize(left, left_state)?; + + let mut right_state = state.clone(); + right_state.filters = keep_filters(&right_state.filters, &pushable_to_right); + let right = optimize(right, right_state)?; + + // create a new Join with the new `left` and `right` + let expr = plan.expressions(); + let plan = utils::from_plan(plan, &expr, &[left, right])?; + + if keep.0.is_empty() { + Ok(plan) + } else { + // wrap the join on the filter whose predicates must be kept + let plan = add_filter(plan, &keep.0); + state.filters = remove_filters(&state.filters, &keep.1); + + Ok(plan) + } +} + fn optimize(plan: &LogicalPlan, mut state: State) -> Result { match plan { LogicalPlan::Explain { .. } => { @@ -336,32 +377,68 @@ fn optimize(plan: &LogicalPlan, mut state: State) -> Result { .collect::>(); issue_filters(state, used_columns, plan) } - LogicalPlan::Join { left, right, .. } - | LogicalPlan::CrossJoin { left, right, .. 
} => { - let (pushable_to_left, pushable_to_right, keep) = - get_join_predicates(&state, left.schema(), right.schema()); - - let mut left_state = state.clone(); - left_state.filters = keep_filters(&left_state.filters, &pushable_to_left); - let left = optimize(left, left_state)?; - - let mut right_state = state.clone(); - right_state.filters = keep_filters(&right_state.filters, &pushable_to_right); - let right = optimize(right, right_state)?; - - // create a new Join with the new `left` and `right` - let expr = plan.expressions(); - let plan = utils::from_plan(plan, &expr, &[left, right])?; + LogicalPlan::CrossJoin { left, right, .. } => { + optimize_join(state, plan, left, right) + } + LogicalPlan::Join { + left, right, on, .. + } => { + // duplicate filters for joined columns so filters can be pushed down to both sides. + // Take the following query as an example: + // + // ```sql + // SELECT * FROM t1 JOIN t2 on t1.id = t2.uid WHERE t1.id > 1 + // ``` + // + // `t1.id > 1` predicate needs to be pushed down to t1 table scan, while + // `t2.uid > 1` predicate needs to be pushed down to t2 table scan. + // + // Join clauses with `Using` constraints also take advantage of this logic to make sure + // predicates reference the shared join columns are pushed to both sides. + let join_side_filters = state + .filters + .iter() + .filter_map(|(predicate, columns)| { + let mut join_cols_to_replace = HashMap::new(); + for col in columns.iter() { + for (l, r) in on { + if col == l { + join_cols_to_replace.insert(col, r); + break; + } else if col == r { + join_cols_to_replace.insert(col, l); + break; + } + } + } - if keep.0.is_empty() { - Ok(plan) - } else { - // wrap the join on the filter whose predicates must be kept - let plan = add_filter(plan, &keep.0); - state.filters = remove_filters(&state.filters, &keep.1); + if join_cols_to_replace.is_empty() { + return None; + } - Ok(plan) - } + let join_side_predicate = + match replace_col(predicate.clone(), &join_cols_to_replace) { + Ok(p) => p, + Err(e) => { + return Some(Err(e)); + } + }; + + let join_side_columns = columns + .clone() + .into_iter() + // replace keys in join_cols_to_replace with values in resulting column + // set + .filter(|c| !join_cols_to_replace.contains_key(c)) + .chain(join_cols_to_replace.iter().map(|(_, v)| (*v).clone())) + .collect(); + + Some(Ok((join_side_predicate, join_side_columns))) + }) + .collect::>>()?; + state.filters.extend(join_side_filters); + + optimize_join(state, plan, left, right) } LogicalPlan::TableScan { source, @@ -878,12 +955,13 @@ mod tests { Ok(()) } - /// post-join predicates on a column common to both sides is pushed to both sides + /// post-on-join predicates on a column common to both sides is pushed to both sides #[test] - fn filter_join_on_common_independent() -> Result<()> { + fn filter_on_join_on_common_independent() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(table_scan.clone()).build()?; - let right = LogicalPlanBuilder::from(table_scan) + let left = LogicalPlanBuilder::from(table_scan).build()?; + let right_table_scan = test_table_scan_with_name("test2")?; + let right = LogicalPlanBuilder::from(right_table_scan) .project(vec![col("a")])? 
.build()?; let plan = LogicalPlanBuilder::from(left) @@ -901,20 +979,61 @@ mod tests { format!("{:?}", plan), "\ Filter: #test.a LtEq Int64(1)\ - \n Join: #test.a = #test.a\ + \n Join: #test.a = #test2.a\ \n TableScan: test projection=None\ - \n Projection: #test.a\ - \n TableScan: test projection=None" + \n Projection: #test2.a\ + \n TableScan: test2 projection=None" ); // filter sent to side before the join let expected = "\ - Join: #test.a = #test.a\ + Join: #test.a = #test2.a\ \n Filter: #test.a LtEq Int64(1)\ \n TableScan: test projection=None\ - \n Projection: #test.a\ - \n Filter: #test.a LtEq Int64(1)\ - \n TableScan: test projection=None"; + \n Projection: #test2.a\ + \n Filter: #test2.a LtEq Int64(1)\ + \n TableScan: test2 projection=None"; + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + /// post-using-join predicates on a column common to both sides is pushed to both sides + #[test] + fn filter_using_join_on_common_independent() -> Result<()> { + let table_scan = test_table_scan()?; + let left = LogicalPlanBuilder::from(table_scan).build()?; + let right_table_scan = test_table_scan_with_name("test2")?; + let right = LogicalPlanBuilder::from(right_table_scan) + .project(vec![col("a")])? + .build()?; + let plan = LogicalPlanBuilder::from(left) + .join_using( + &right, + JoinType::Inner, + vec![Column::from_name("a".to_string())], + )? + .filter(col("a").lt_eq(lit(1i64)))? + .build()?; + + // not part of the test, just good to know: + assert_eq!( + format!("{:?}", plan), + "\ + Filter: #test.a LtEq Int64(1)\ + \n Join: Using #test.a = #test2.a\ + \n TableScan: test projection=None\ + \n Projection: #test2.a\ + \n TableScan: test2 projection=None" + ); + + // filter sent to side before the join + let expected = "\ + Join: Using #test.a = #test2.a\ + \n Filter: #test.a LtEq Int64(1)\ + \n TableScan: test projection=None\ + \n Projection: #test2.a\ + \n Filter: #test2.a LtEq Int64(1)\ + \n TableScan: test2 projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) } @@ -923,10 +1042,11 @@ mod tests { #[test] fn filter_join_on_common_dependent() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(table_scan.clone()) + let left = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("c")])? .build()?; - let right = LogicalPlanBuilder::from(table_scan) + let right_table_scan = test_table_scan_with_name("test2")?; + let right = LogicalPlanBuilder::from(right_table_scan) .project(vec![col("a"), col("b")])? .build()?; let plan = LogicalPlanBuilder::from(left) @@ -944,12 +1064,12 @@ mod tests { assert_eq!( format!("{:?}", plan), "\ - Filter: #test.c LtEq #test.b\ - \n Join: #test.a = #test.a\ + Filter: #test.c LtEq #test2.b\ + \n Join: #test.a = #test2.a\ \n Projection: #test.a, #test.c\ \n TableScan: test projection=None\ - \n Projection: #test.a, #test.b\ - \n TableScan: test projection=None" + \n Projection: #test2.a, #test2.b\ + \n TableScan: test2 projection=None" ); // expected is equal: no push-down @@ -962,12 +1082,14 @@ mod tests { #[test] fn filter_join_on_one_side() -> Result<()> { let table_scan = test_table_scan()?; - let left = LogicalPlanBuilder::from(table_scan.clone()) + let left = LogicalPlanBuilder::from(table_scan) .project(vec![col("a"), col("b")])? .build()?; - let right = LogicalPlanBuilder::from(table_scan) + let table_scan_right = test_table_scan_with_name("test2")?; + let right = LogicalPlanBuilder::from(table_scan_right) .project(vec![col("a"), col("c")])? 
.build()?; + let plan = LogicalPlanBuilder::from(left) .join( &right, @@ -983,20 +1105,20 @@ mod tests { format!("{:?}", plan), "\ Filter: #test.b LtEq Int64(1)\ - \n Join: #test.a = #test.a\ + \n Join: #test.a = #test2.a\ \n Projection: #test.a, #test.b\ \n TableScan: test projection=None\ - \n Projection: #test.a, #test.c\ - \n TableScan: test projection=None" + \n Projection: #test2.a, #test2.c\ + \n TableScan: test2 projection=None" ); let expected = "\ - Join: #test.a = #test.a\ + Join: #test.a = #test2.a\ \n Projection: #test.a, #test.b\ \n Filter: #test.b LtEq Int64(1)\ \n TableScan: test projection=None\ - \n Projection: #test.a, #test.c\ - \n TableScan: test projection=None"; + \n Projection: #test2.a, #test2.c\ + \n TableScan: test2 projection=None"; assert_optimized_plan_eq(&plan, expected); Ok(()) } diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 3c8f1ee4ceb58..0272b9f7872cf 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -216,9 +216,7 @@ fn optimize_plan( let schema = build_join_schema( optimized_left.schema(), optimized_right.schema(), - on, join_type, - join_constraint, )?; Ok(LogicalPlan::Join { @@ -499,7 +497,7 @@ mod tests { } #[test] - fn join_schema_trim() -> Result<()> { + fn join_schema_trim_full_join_column_projection() -> Result<()> { let table_scan = test_table_scan()?; let schema = Schema::new(vec![Field::new("c1", DataType::UInt32, false)]); @@ -511,7 +509,7 @@ mod tests { .project(vec![col("a"), col("b"), col("c1")])? .build()?; - // make sure projections are pushed down to table scan + // make sure projections are pushed down to both table scans let expected = "Projection: #test.a, #test.b, #test2.c1\ \n Join: #test.a = #test2.c1\ \n TableScan: test projection=Some([0, 1])\ @@ -521,7 +519,48 @@ mod tests { let formatted_plan = format!("{:?}", optimized_plan); assert_eq!(formatted_plan, expected); - // make sure schema for join node doesn't include c1 column + // make sure schema for join node include both join columns + let optimized_join = optimized_plan.inputs()[0]; + assert_eq!( + **optimized_join.schema(), + DFSchema::new(vec![ + DFField::new(Some("test"), "a", DataType::UInt32, false), + DFField::new(Some("test"), "b", DataType::UInt32, false), + DFField::new(Some("test2"), "c1", DataType::UInt32, false), + ])?, + ); + + Ok(()) + } + + #[test] + fn join_schema_trim_partial_join_column_projection() -> Result<()> { + // test join column push down without explicit column projections + + let table_scan = test_table_scan()?; + + let schema = Schema::new(vec![Field::new("c1", DataType::UInt32, false)]); + let table2_scan = + LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .join(&table2_scan, JoinType::Left, vec!["a"], vec!["c1"])? + // projecting joined column `a` should push the right side column `c1` projection as + // well into test2 table even though `c1` is not referenced in projection. + .project(vec![col("a"), col("b")])? 
+ .build()?; + + // make sure projections are pushed down to both table scans + let expected = "Projection: #test.a, #test.b\ + \n Join: #test.a = #test2.c1\ + \n TableScan: test projection=Some([0, 1])\ + \n TableScan: test2 projection=Some([0])"; + + let optimized_plan = optimize(&plan)?; + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + + // make sure schema for join node include both join columns let optimized_join = optimized_plan.inputs()[0]; assert_eq!( **optimized_join.schema(), @@ -535,6 +574,45 @@ mod tests { Ok(()) } + #[test] + fn join_schema_trim_using_join() -> Result<()> { + // shared join colums from using join should be pushed to both sides + + let table_scan = test_table_scan()?; + + let schema = Schema::new(vec![Field::new("a", DataType::UInt32, false)]); + let table2_scan = + LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .join_using(&table2_scan, JoinType::Left, vec!["a"])? + .project(vec![col("a"), col("b")])? + .build()?; + + // make sure projections are pushed down to table scan + let expected = "Projection: #test.a, #test.b\ + \n Join: Using #test.a = #test2.a\ + \n TableScan: test projection=Some([0, 1])\ + \n TableScan: test2 projection=Some([0])"; + + let optimized_plan = optimize(&plan)?; + let formatted_plan = format!("{:?}", optimized_plan); + assert_eq!(formatted_plan, expected); + + // make sure schema for join node include both join columns + let optimized_join = optimized_plan.inputs()[0]; + assert_eq!( + **optimized_join.schema(), + DFSchema::new(vec![ + DFField::new(Some("test"), "a", DataType::UInt32, false), + DFField::new(Some("test"), "b", DataType::UInt32, false), + DFField::new(Some("test2"), "a", DataType::UInt32, false), + ])?, + ); + + Ok(()) + } + #[test] fn cast() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index ae3e196c22251..1d19f0681b350 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -215,13 +215,8 @@ pub fn from_plan( on, .. 
} => { - let schema = build_join_schema( - inputs[0].schema(), - inputs[1].schema(), - on, - join_type, - join_constraint, - )?; + let schema = + build_join_schema(inputs[0].schema(), inputs[1].schema(), join_type)?; Ok(LogicalPlan::Join { left: Arc::new(inputs[0].clone()), right: Arc::new(inputs[1].clone()), diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index f426bc9d3c3c2..00ca1539d714f 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -55,9 +55,10 @@ use arrow::array::{ use super::expressions::Column; use super::{ coalesce_partitions::CoalescePartitionsExec, - hash_utils::{build_join_schema, check_join_is_valid, JoinOn, JoinType}, + hash_utils::{build_join_schema, check_join_is_valid, JoinOn}, }; use crate::error::{DataFusionError, Result}; +use crate::logical_plan::JoinType; use super::{ DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, @@ -165,12 +166,7 @@ impl HashJoinExec { let right_schema = right.schema(); check_join_is_valid(&left_schema, &right_schema, &on)?; - let schema = Arc::new(build_join_schema( - &left_schema, - &right_schema, - &on, - join_type, - )); + let schema = Arc::new(build_join_schema(&left_schema, &right_schema, join_type)); let random_state = RandomState::with_seeds(0, 0, 0, 0); @@ -1437,16 +1433,16 @@ mod tests { join_collect(left.clone(), right.clone(), on.clone(), &JoinType::Inner) .await?; - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 3 | 5 | 9 | 20 | 80 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 5 | 9 | 20 | 5 | 80 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1478,16 +1474,16 @@ mod tests { ) .await?; - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 3 | 5 | 9 | 20 | 80 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 5 | 9 | 20 | 5 | 80 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1555,18 +1551,18 @@ mod tests { let (columns, batches) = join_collect(left, right, on, &JoinType::Inner).await?; - assert_eq!(columns, vec!["a1", "b2", "c1", "c2"]); + assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]); assert_eq!(batches.len(), 1); let expected = vec![ - "+----+----+----+----+", - "| a1 | b2 | c1 | c2 |", - "+----+----+----+----+", - "| 1 | 1 | 7 | 70 |", - "| 2 | 2 | 8 | 80 |", - "| 2 | 2 | 9 | 80 |", - "+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b2 | c1 | a1 | b2 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 1 | 7 | 1 | 1 | 70 |", + "| 2 | 2 | 8 | 2 | 2 | 80 |", + "| 2 | 2 | 9 | 2 | 2 | 80 |", + "+----+----+----+----+----+----+", ]; 
assert_batches_sorted_eq!(expected, &batches); @@ -1607,18 +1603,18 @@ mod tests { let (columns, batches) = join_collect(left, right, on, &JoinType::Inner).await?; - assert_eq!(columns, vec!["a1", "b2", "c1", "c2"]); + assert_eq!(columns, vec!["a1", "b2", "c1", "a1", "b2", "c2"]); assert_eq!(batches.len(), 1); let expected = vec![ - "+----+----+----+----+", - "| a1 | b2 | c1 | c2 |", - "+----+----+----+----+", - "| 1 | 1 | 7 | 70 |", - "| 2 | 2 | 8 | 80 |", - "| 2 | 2 | 9 | 80 |", - "+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b2 | c1 | a1 | b2 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 1 | 7 | 1 | 1 | 70 |", + "| 2 | 2 | 8 | 2 | 2 | 80 |", + "| 2 | 2 | 9 | 2 | 2 | 80 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1655,7 +1651,7 @@ mod tests { let join = join(left, right, on, &JoinType::Inner)?; let columns = columns(&join.schema()); - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); // first part let stream = join.execute(0).await?; @@ -1663,11 +1659,11 @@ mod tests { assert_eq!(batches.len(), 1); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1676,12 +1672,12 @@ mod tests { let batches = common::collect(stream).await?; assert_eq!(batches.len(), 1); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 2 | 5 | 8 | 30 | 90 |", - "| 3 | 5 | 9 | 30 | 90 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 2 | 5 | 8 | 30 | 5 | 90 |", + "| 3 | 5 | 9 | 30 | 5 | 90 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1721,21 +1717,21 @@ mod tests { let join = join(left, right, on, &JoinType::Left).unwrap(); let columns = columns(&join.schema()); - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let stream = join.execute(0).await.unwrap(); let batches = common::collect(stream).await.unwrap(); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "| 1 | 4 | 7 | 10 | 70 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 3 | 7 | 9 | | |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 7 | 9 | | 7 | |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1801,19 +1797,19 @@ mod tests { let join = join(left, right, on, &JoinType::Left).unwrap(); let columns = columns(&join.schema()); - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let stream = join.execute(0).await.unwrap(); let batches = common::collect(stream).await.unwrap(); let expected = vec![ - 
"+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | | |", - "| 2 | 5 | 8 | | |", - "| 3 | 7 | 9 | | |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | | 4 | |", + "| 2 | 5 | 8 | | 5 | |", + "| 3 | 7 | 9 | | 7 | |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1874,16 +1870,16 @@ mod tests { let (columns, batches) = join_collect(left.clone(), right.clone(), on.clone(), &JoinType::Left) .await?; - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 3 | 7 | 9 | | |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 7 | 9 | | 7 | |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -1914,16 +1910,16 @@ mod tests { &JoinType::Left, ) .await?; - assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | b1 | c1 | a2 | c2 |", - "+----+----+----+----+----+", - "| 1 | 4 | 7 | 10 | 70 |", - "| 2 | 5 | 8 | 20 | 80 |", - "| 3 | 7 | 9 | | |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "| 3 | 7 | 9 | | 7 | |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -2025,16 +2021,16 @@ mod tests { let (columns, batches) = join_collect(left, right, on, &JoinType::Right).await?; - assert_eq!(columns, vec!["a1", "c1", "a2", "b1", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+", - "| | | 30 | 6 | 90 |", - "| 1 | 7 | 10 | 4 | 70 |", - "| 2 | 8 | 20 | 5 | 80 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| | 6 | | 30 | 6 | 90 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); @@ -2062,16 +2058,16 @@ mod tests { let (columns, batches) = partitioned_join_collect(left, right, on, &JoinType::Right).await?; - assert_eq!(columns, vec!["a1", "c1", "a2", "b1", "c2"]); + assert_eq!(columns, vec!["a1", "b1", "c1", "a2", "b1", "c2"]); let expected = vec![ - "+----+----+----+----+----+", - "| a1 | c1 | a2 | b1 | c2 |", - "+----+----+----+----+----+", - "| | | 30 | 6 | 90 |", - "| 1 | 7 | 10 | 4 | 70 |", - "| 2 | 8 | 20 | 5 | 80 |", - "+----+----+----+----+----+", + "+----+----+----+----+----+----+", + "| a1 | b1 | c1 | a2 | b1 | c2 |", + "+----+----+----+----+----+----+", + "| | 6 | | 30 | 6 | 90 |", + "| 1 | 4 | 7 | 10 | 4 | 70 |", + "| 2 | 5 | 8 | 20 | 5 | 80 |", + "+----+----+----+----+----+----+", ]; assert_batches_sorted_eq!(expected, &batches); diff --git 
a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 0cf0b9212cd21..9243affe9cfc3 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -21,25 +21,9 @@ use crate::error::{DataFusionError, Result}; use arrow::datatypes::{Field, Schema}; use std::collections::HashSet; +use crate::logical_plan::JoinType; use crate::physical_plan::expressions::Column; -/// All valid types of joins. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum JoinType { - /// Inner Join - Inner, - /// Left Join - Left, - /// Right Join - Right, - /// Full Join - Full, - /// Semi Join - Semi, - /// Anti Join - Anti, -} - /// The on clause of the join, as vector of (left, right) columns. pub type JoinOn = Vec<(Column, Column)>; /// Reference for JoinOn. @@ -104,46 +88,11 @@ fn check_join_set_is_valid( /// Creates a schema for a join operation. /// The fields from the left side are first -pub fn build_join_schema( - left: &Schema, - right: &Schema, - on: JoinOnRef, - join_type: &JoinType, -) -> Schema { +pub fn build_join_schema(left: &Schema, right: &Schema, join_type: &JoinType) -> Schema { let fields: Vec = match join_type { - JoinType::Inner | JoinType::Left | JoinType::Full => { - // remove right-side join keys if they have the same names as the left-side - let duplicate_keys = &on - .iter() - .filter(|(l, r)| l.name() == r.name()) - .map(|on| on.1.name()) - .collect::>(); - + JoinType::Inner | JoinType::Left | JoinType::Full | JoinType::Right => { let left_fields = left.fields().iter(); - - let right_fields = right - .fields() - .iter() - .filter(|f| !duplicate_keys.contains(f.name().as_str())); - - // left then right - left_fields.chain(right_fields).cloned().collect() - } - JoinType::Right => { - // remove left-side join keys if they have the same names as the right-side - let duplicate_keys = &on - .iter() - .filter(|(l, r)| l.name() == r.name()) - .map(|on| on.1.name()) - .collect::>(); - - let left_fields = left - .fields() - .iter() - .filter(|f| !duplicate_keys.contains(f.name().as_str())); - let right_fields = right.fields().iter(); - // left then right left_fields.chain(right_fields).cloned().collect() } diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index effdefcfabadc..73b2f362989f6 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -40,7 +40,6 @@ use crate::physical_plan::udf; use crate::physical_plan::windows::WindowAggExec; use crate::physical_plan::{hash_utils, Partitioning}; use crate::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr, WindowExpr}; -use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::sql::utils::{generate_sort_key, window_expr_common_partition_keys}; use crate::variable::VarType; @@ -661,14 +660,6 @@ impl DefaultPhysicalPlanner { let physical_left = self.create_initial_plan(left, ctx_state)?; let right_df_schema = right.schema(); let physical_right = self.create_initial_plan(right, ctx_state)?; - let physical_join_type = match join_type { - JoinType::Inner => hash_utils::JoinType::Inner, - JoinType::Left => hash_utils::JoinType::Left, - JoinType::Right => hash_utils::JoinType::Right, - JoinType::Full => hash_utils::JoinType::Full, - JoinType::Semi => hash_utils::JoinType::Semi, - JoinType::Anti => hash_utils::JoinType::Anti, - }; let join_on = keys .iter() .map(|(l, r)| { @@ -702,7 +693,7 @@ impl DefaultPhysicalPlanner { Partitioning::Hash(right_expr, 
ctx_state.config.concurrency), )?), join_on, - &physical_join_type, + join_type, PartitionMode::Partitioned, )?)) } else { @@ -710,7 +701,7 @@ impl DefaultPhysicalPlanner { physical_left, physical_right, join_on, - &physical_join_type, + join_type, PartitionMode::CollectLeft, )?)) } diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index e34f0e6c9b674..f89ba3f659c88 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -27,8 +27,8 @@ use crate::datasource::TableProvider; use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::logical_plan::Expr::Alias; use crate::logical_plan::{ - and, lit, union_with_alias, Column, DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, - Operator, PlanType, StringifiedPlan, ToDFSchema, + and, col, lit, normalize_col, union_with_alias, Column, DFSchema, Expr, LogicalPlan, + LogicalPlanBuilder, Operator, PlanType, StringifiedPlan, ToDFSchema, }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; @@ -496,12 +496,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let right_schema = right.schema(); let mut join_keys = vec![]; for (l, r) in &possible_join_keys { - if left_schema.field_from_qualified_column(l).is_ok() - && right_schema.field_from_qualified_column(r).is_ok() + if left_schema.field_from_column(l).is_ok() + && right_schema.field_from_column(r).is_ok() { join_keys.push((l.clone(), r.clone())); - } else if left_schema.field_from_qualified_column(r).is_ok() - && right_schema.field_from_qualified_column(l).is_ok() + } else if left_schema.field_from_column(r).is_ok() + && right_schema.field_from_column(l).is_ok() { join_keys.push((r.clone(), l.clone())); } @@ -579,7 +579,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // SELECT c1 AS m FROM t HAVING c1 > 10; // SELECT c1, MAX(c2) AS m FROM t GROUP BY c1 HAVING MAX(c2) > 10; // - resolve_aliases_to_exprs(&having_expr, &alias_map) + let having_expr = resolve_aliases_to_exprs(&having_expr, &alias_map)?; + normalize_col(having_expr, &projected_plan) }) .transpose()?; @@ -603,6 +604,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let group_by_expr = resolve_positions_to_exprs(&group_by_expr, &select_exprs) .unwrap_or(group_by_expr); + let group_by_expr = normalize_col(group_by_expr, &projected_plan)?; self.validate_schema_satisfies_exprs( plan.schema(), &[group_by_expr.clone()], @@ -681,13 +683,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) -> Result> { let input_schema = plan.schema(); - Ok(projection + projection .iter() .map(|expr| self.sql_select_to_rex(expr, input_schema)) .collect::>>()? 
.iter() .flat_map(|expr| expand_wildcard(expr, input_schema)) - .collect::>()) + .map(|expr| normalize_col(expr, plan)) + .collect::>>() } /// Wrap a plan in a projection @@ -835,20 +838,29 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { find_column_exprs(exprs) .iter() .try_for_each(|col| match col { - Expr::Column(col) => { - match &col.relation { - Some(r) => schema.field_with_qualified_name(r, &col.name), - None => schema.field_with_unqualified_name(&col.name), + Expr::Column(col) => match &col.relation { + Some(r) => { + schema.field_with_qualified_name(r, &col.name)?; + Ok(()) + } + None => { + if !schema.fields_with_unqualified_name(&col.name).is_empty() { + Ok(()) + } else { + Err(DataFusionError::Plan(format!( + "No field with unqualified name '{}'", + &col.name + ))) + } } - .map_err(|_| { - DataFusionError::Plan(format!( - "Invalid identifier '{}' for schema {}", - col, - schema.to_string() - )) - })?; - Ok(()) } + .map_err(|_: DataFusionError| { + DataFusionError::Plan(format!( + "Invalid identifier '{}' for schema {}", + col, + schema.to_string() + )) + }), _ => Err(DataFusionError::Internal("Not a column".to_string())), }) } @@ -926,11 +938,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let var_names = vec![id.value.clone()]; Ok(Expr::ScalarVariable(var_names)) } else { - Ok(Expr::Column( - schema - .field_with_unqualified_name(&id.value)? - .qualified_column(), - )) + // create a column expression based on raw user input, this column will be + // normalized with qualifer later by the SQL planner. + Ok(col(&id.value)) } } @@ -1672,7 +1682,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), + DataFusionError::Plan(msg) if msg.contains("Invalid identifier '#doesnotexist' for schema "), )); } @@ -1730,7 +1740,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), + DataFusionError::Plan(msg) if msg.contains("Invalid identifier '#doesnotexist' for schema "), )); } @@ -1740,7 +1750,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'x'"), + DataFusionError::Plan(msg) if msg.contains("Invalid identifier '#x' for schema "), )); } @@ -2211,7 +2221,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), + DataFusionError::Plan(msg) if msg.contains("Invalid identifier '#doesnotexist' for schema "), )); } @@ -2301,7 +2311,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), + DataFusionError::Plan(msg) if msg.contains("Column #doesnotexist not found in provided schemas"), )); } @@ -2311,7 +2321,7 @@ mod tests { let err = logical_plan(sql).expect_err("query should have failed"); assert!(matches!( err, - DataFusionError::Plan(msg) if msg.contains("No field with unqualified name 'doesnotexist'"), + DataFusionError::Plan(msg) if msg.contains("Invalid identifier '#doesnotexist' for schema "), )); } @@ -2757,7 +2767,7 @@ mod tests { JOIN person as person2 \ USING (id)"; 
let expected = "Projection: #person.first_name, #person.id\ - \n Join: #person.id = #person2.id\ + \n Join: Using #person.id = #person2.id\ \n TableScan: person projection=None\ \n TableScan: person2 projection=None"; quick_test(sql, expected); diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index df3aec4a68502..b791551133e7e 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -110,14 +110,19 @@ pub fn aggr_test_schema() -> SchemaRef { ])) } -/// some tests share a common table -pub fn test_table_scan() -> Result { +/// some tests share a common table with different names +pub fn test_table_scan_with_name(name: &str) -> Result { let schema = Schema::new(vec![ Field::new("a", DataType::UInt32, false), Field::new("b", DataType::UInt32, false), Field::new("c", DataType::UInt32, false), ]); - LogicalPlanBuilder::scan_empty(Some("test"), &schema, None)?.build() + LogicalPlanBuilder::scan_empty(Some(name), &schema, None)?.build() +} + +/// some tests share a common table +pub fn test_table_scan() -> Result { + test_table_scan_with_name("test") } pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { From bbc9c6c68a03a19e4f385663b7c7ab795748f16e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jul 2021 09:00:32 -0400 Subject: [PATCH 246/329] Fix test output due to logical merge conflict (#694) --- datafusion/tests/sql.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index f6f8b6f041e6e..9c7d0795edb91 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1705,7 +1705,7 @@ async fn equijoin_and_unsupported_condition() -> Result<()> { "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id AND t2_name >= 'y' ORDER BY t1_id"; let res = ctx.create_logical_plan(sql); assert!(res.is_err()); - assert_eq!(format!("{}", res.unwrap_err()), "This feature is not implemented: Unsupported expressions in Left JOIN: [#t2.t2_name GtEq Utf8(\"y\")]"); + assert_eq!(format!("{}", res.unwrap_err()), "This feature is not implemented: Unsupported expressions in Left JOIN: [#t2_name GtEq Utf8(\"y\")]"); Ok(()) } From 9f8e265e6df502a3badd8f9eff2f62a47515eb7b Mon Sep 17 00:00:00 2001 From: Cui Wenzheng Date: Wed, 7 Jul 2021 21:01:05 +0800 Subject: [PATCH 247/329] fix typo in DEVELOPERS.md (#692) --- DEVELOPERS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 85384680c02eb..3ee2f7da09c5f 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -37,8 +37,8 @@ Testing setup: - `git submodule init` - `git submodule update` -- `export PARQUET_TEST_DATA=parquet_testing/` -- `export ARROW_TEST_DATA=testing/data/` +- `export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data/` +- `export ARROW_TEST_DATA=$(pwd)/testing/data/` ## How to add a new scalar function From 79d60f9b678e9a2351fc83511399663985e39cf6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 Jul 2021 09:43:44 -0400 Subject: [PATCH 248/329] Remove qualifiers on pushed down predicates / Fix parquet pruning (#689) * Remove qualifiers on pushed down predicates * Add test for normalizing and unnormalizing columns * Fix logical conflict --- datafusion/src/logical_plan/expr.rs | 149 ++++++++++++++++++++++-- datafusion/src/logical_plan/mod.rs | 4 +- datafusion/src/physical_plan/planner.rs | 13 ++- datafusion/tests/parquet_pruning.rs | 24 ++-- 4 files changed, 164 insertions(+), 26 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs 
b/datafusion/src/logical_plan/expr.rs index 9454d7593c3f3..59c99797e0cd8 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -90,14 +90,22 @@ impl Column { /// For example, `foo` will be normalized to `t.foo` if there is a /// column named `foo` in a relation named `t` found in `schemas` pub fn normalize(self, plan: &LogicalPlan) -> Result { + let schemas = plan.all_schemas(); + let using_columns = plan.using_columns()?; + self.normalize_with_schemas(&schemas, &using_columns) + } + + // Internal implementation of normalize + fn normalize_with_schemas( + self, + schemas: &[&Arc], + using_columns: &[HashSet], + ) -> Result { if self.relation.is_some() { return Ok(self); } - let schemas = plan.all_schemas(); - let using_columns = plan.using_columns()?; - - for schema in &schemas { + for schema in schemas { let fields = schema.fields_with_unqualified_name(&self.name); match fields.len() { 0 => continue, @@ -118,7 +126,7 @@ impl Column { // We will use the relation from the first matched field to normalize self. // Compare matched fields with one USING JOIN clause at a time - for using_col in &using_columns { + for using_col in using_columns { let all_matched = fields .iter() .all(|f| using_col.contains(&f.qualified_column())); @@ -1171,22 +1179,39 @@ pub fn replace_col(e: Expr, replace_map: &HashMap<&Column, &Column>) -> Result Result { +pub fn normalize_col(expr: Expr, plan: &LogicalPlan) -> Result { + normalize_col_with_schemas(expr, &plan.all_schemas(), &plan.using_columns()?) +} + +/// Recursively call [`Column::normalize`] on all Column expressions +/// in the `expr` expression tree. +fn normalize_col_with_schemas( + expr: Expr, + schemas: &[&Arc], + using_columns: &[HashSet], +) -> Result { struct ColumnNormalizer<'a> { - plan: &'a LogicalPlan, + schemas: &'a [&'a Arc], + using_columns: &'a [HashSet], } impl<'a> ExprRewriter for ColumnNormalizer<'a> { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::Column(c) = expr { - Ok(Expr::Column(c.normalize(self.plan)?)) + Ok(Expr::Column(c.normalize_with_schemas( + self.schemas, + self.using_columns, + )?)) } else { Ok(expr) } } } - e.rewrite(&mut ColumnNormalizer { plan }) + expr.rewrite(&mut ColumnNormalizer { + schemas, + using_columns, + }) } /// Recursively normalize all Column expressions in a list of expression trees @@ -1198,6 +1223,38 @@ pub fn normalize_cols( exprs.into_iter().map(|e| normalize_col(e, plan)).collect() } +/// Recursively 'unnormalize' (remove all qualifiers) from an +/// expression tree. +/// +/// For example, if there were expressions like `foo.bar` this would +/// rewrite it to just `bar`. 
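For illustration, a minimal sketch (separate from the patch itself) of the qualifier round trip these helpers provide. The table name `t` and its schema are invented for the example; the calls assume the `normalize_col`/`unnormalize_col` API added in this commit.

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::logical_plan::{col, normalize_col, unnormalize_col, LogicalPlanBuilder};

fn qualifier_round_trip() -> Result<()> {
    // Hypothetical single-column relation registered as "t".
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    let plan = LogicalPlanBuilder::scan_empty(Some("t"), &schema, None)?.build()?;

    // A bare column written by the user is resolved to `#t.a`
    // against the plan's schemas ...
    let qualified = normalize_col(col("a"), &plan)?;

    // ... and stripped back to `#a` by unnormalize_col, so consumers that
    // know nothing about relation names never see the qualifier.
    assert_eq!(unnormalize_col(qualified), col("a"));
    Ok(())
}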
+pub fn unnormalize_col(expr: Expr) -> Expr { + struct RemoveQualifier {} + + impl ExprRewriter for RemoveQualifier { + fn mutate(&mut self, expr: Expr) -> Result { + if let Expr::Column(col) = expr { + //let Column { relation: _, name } = col; + Ok(Expr::Column(Column { + relation: None, + name: col.name, + })) + } else { + Ok(expr) + } + } + } + + expr.rewrite(&mut RemoveQualifier {}) + .expect("Unnormalize is infallable") +} + +/// Recursively un-normalize all Column expressions in a list of expression trees +#[inline] +pub fn unnormalize_cols(exprs: impl IntoIterator) -> Vec { + exprs.into_iter().map(unnormalize_col).collect() +} + /// Create an expression to represent the min() aggregate function pub fn min(expr: Expr) -> Expr { Expr::AggregateFunction { @@ -1810,4 +1867,78 @@ mod tests { } } } + + #[test] + fn normalize_cols() { + let expr = col("a") + col("b") + col("c"); + + // Schemas with some matching and some non matching cols + let schema_a = + DFSchema::new(vec![make_field("tableA", "a"), make_field("tableA", "aa")]) + .unwrap(); + let schema_c = + DFSchema::new(vec![make_field("tableC", "cc"), make_field("tableC", "c")]) + .unwrap(); + let schema_b = DFSchema::new(vec![make_field("tableB", "b")]).unwrap(); + // non matching + let schema_f = + DFSchema::new(vec![make_field("tableC", "f"), make_field("tableC", "ff")]) + .unwrap(); + let schemas = vec![schema_c, schema_f, schema_b, schema_a] + .into_iter() + .map(Arc::new) + .collect::>(); + let schemas = schemas.iter().collect::>(); + + let normalized_expr = normalize_col_with_schemas(expr, &schemas, &[]).unwrap(); + assert_eq!( + normalized_expr, + col("tableA.a") + col("tableB.b") + col("tableC.c") + ); + } + + #[test] + fn normalize_cols_priority() { + let expr = col("a") + col("b"); + // Schemas with multiple matches for column a, first takes priority + let schema_a = DFSchema::new(vec![make_field("tableA", "a")]).unwrap(); + let schema_b = DFSchema::new(vec![make_field("tableB", "b")]).unwrap(); + let schema_a2 = DFSchema::new(vec![make_field("tableA2", "a")]).unwrap(); + let schemas = vec![schema_a2, schema_b, schema_a] + .into_iter() + .map(Arc::new) + .collect::>(); + let schemas = schemas.iter().collect::>(); + + let normalized_expr = normalize_col_with_schemas(expr, &schemas, &[]).unwrap(); + assert_eq!(normalized_expr, col("tableA2.a") + col("tableB.b")); + } + + #[test] + fn normalize_cols_non_exist() { + // test normalizing columns when the name doesn't exist + let expr = col("a") + col("b"); + let schema_a = DFSchema::new(vec![make_field("tableA", "a")]).unwrap(); + let schemas = vec![schema_a].into_iter().map(Arc::new).collect::>(); + let schemas = schemas.iter().collect::>(); + + let error = normalize_col_with_schemas(expr, &schemas, &[]) + .unwrap_err() + .to_string(); + assert_eq!( + error, + "Error during planning: Column #b not found in provided schemas" + ); + } + + #[test] + fn unnormalize_cols() { + let expr = col("tableA.a") + col("tableB.b"); + let unnormalized_expr = unnormalize_col(expr); + assert_eq!(unnormalized_expr, col("a") + col("b")); + } + + fn make_field(relation: &str, column: &str) -> DFField { + DFField::new(Some(relation), column, DataType::Int8, false) + } } diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 86a2f567d7de4..2c751abdad349 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -43,8 +43,8 @@ pub use expr::{ min, normalize_col, normalize_cols, now, octet_length, or, random, regexp_match, 
regexp_replace, repeat, replace, replace_col, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, - substr, sum, tan, to_hex, translate, trim, trunc, upper, when, Column, Expr, - ExprRewriter, ExpressionVisitor, Literal, Recursion, + substr, sum, tan, to_hex, translate, trim, trunc, unnormalize_col, unnormalize_cols, + upper, when, Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 73b2f362989f6..df4168370003a 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -23,8 +23,9 @@ use super::{ }; use crate::execution::context::ExecutionContextState; use crate::logical_plan::{ - DFSchema, Expr, LogicalPlan, Operator, Partitioning as LogicalPartitioning, PlanType, - StringifiedPlan, UserDefinedLogicalNode, + unnormalize_cols, DFSchema, Expr, LogicalPlan, Operator, + Partitioning as LogicalPartitioning, PlanType, StringifiedPlan, + UserDefinedLogicalNode, }; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions; @@ -311,7 +312,13 @@ impl DefaultPhysicalPlanner { filters, limit, .. - } => source.scan(projection, batch_size, filters, *limit), + } => { + // Remove all qualifiers from the scan as the provider + // doesn't know (nor should care) how the relation was + // referred to in the query + let filters = unnormalize_cols(filters.iter().cloned()); + source.scan(projection, batch_size, &filters, *limit) + } LogicalPlan::Window { input, window_expr, .. } => { diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs index 86b3946e47121..f5486afc7aa4a 100644 --- a/datafusion/tests/parquet_pruning.rs +++ b/datafusion/tests/parquet_pruning.rs @@ -44,9 +44,9 @@ async fn prune_timestamps_nanos() { .query("SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')") .await; println!("{}", output.description()); - // TODO This should prune one metrics without error - assert_eq!(output.predicate_evaluation_errors(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(0)); + // This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(1)); assert_eq!(output.result_rows, 10, "{}", output.description()); } @@ -59,9 +59,9 @@ async fn prune_timestamps_micros() { ) .await; println!("{}", output.description()); - // TODO This should prune one metrics without error - assert_eq!(output.predicate_evaluation_errors(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(0)); + // This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(1)); assert_eq!(output.result_rows, 10, "{}", output.description()); } @@ -74,9 +74,9 @@ async fn prune_timestamps_millis() { ) .await; println!("{}", output.description()); - // TODO This should prune one metrics without error - assert_eq!(output.predicate_evaluation_errors(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(0)); + // This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(1)); assert_eq!(output.result_rows, 10, "{}", output.description()); } @@ -89,9 +89,9 @@ async fn prune_timestamps_seconds() { ) 
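To make the planner change above concrete, a small sketch (not part of the patch; the table name and predicate are invented) of what a TableProvider now receives from `scan`: the pushed-down filter is qualified inside the plan, and `unnormalize_cols` strips the qualifier so pruning logic can match the file schema's bare column names.

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::logical_plan::{col, lit, normalize_col, unnormalize_cols, LogicalPlanBuilder};

fn filters_seen_by_provider() -> Result<()> {
    // Hypothetical table registered as "t" with a single column "nanos".
    let schema = Schema::new(vec![Field::new("nanos", DataType::Int64, false)]);
    let plan = LogicalPlanBuilder::scan_empty(Some("t"), &schema, None)?.build()?;

    // Inside the plan the pushed-down filter is qualified: `#t.nanos < 100`.
    let pushed = vec![normalize_col(col("nanos").lt(lit(100)), &plan)?];

    // The provider's scan() gets the unqualified form `#nanos < 100`, which
    // is the shape a Parquet file's own schema can be matched against.
    let filters = unnormalize_cols(pushed);
    assert_eq!(filters, vec![col("nanos").lt(lit(100))]);
    Ok(())
}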
.await; println!("{}", output.description()); - // TODO This should prune one metrics without error - assert_eq!(output.predicate_evaluation_errors(), Some(1)); - assert_eq!(output.row_groups_pruned(), Some(0)); + // This should prune one metrics without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(1)); assert_eq!(output.result_rows, 10, "{}", output.description()); } From f94f6391845c844980fd4fb3171a743bf5d182b2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 8 Jul 2021 19:37:49 +0800 Subject: [PATCH 249/329] add more integration tests (#668) --- .../sqls/self_join_with_alias.sql | 22 +++++++++++++++++++ integration-tests/sqls/simple_union_all.sql | 17 ++++++++++++++ integration-tests/test_psql_parity.py | 2 +- 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 integration-tests/sqls/self_join_with_alias.sql create mode 100644 integration-tests/sqls/simple_union_all.sql diff --git a/integration-tests/sqls/self_join_with_alias.sql b/integration-tests/sqls/self_join_with_alias.sql new file mode 100644 index 0000000000000..54c39888dffed --- /dev/null +++ b/integration-tests/sqls/self_join_with_alias.sql @@ -0,0 +1,22 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +SELECT + t1.c9 result +FROM test t1 +INNER JOIN test t2 +ON t1.c9 = t2.c9 +ORDER BY result; diff --git a/integration-tests/sqls/simple_union_all.sql b/integration-tests/sqls/simple_union_all.sql new file mode 100644 index 0000000000000..65557b8d263fd --- /dev/null +++ b/integration-tests/sqls/simple_union_all.sql @@ -0,0 +1,17 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+ +SELECT 1 num UNION ALL SELECT 2 num ORDER BY num; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 766f403f3e543..39cfdee77fbdd 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -74,7 +74,7 @@ class PsqlParityTest(unittest.TestCase): def test_parity(self): root = Path(os.path.dirname(__file__)) / "sqls" files = set(root.glob("*.sql")) - self.assertEqual(len(files), 12, msg="tests are missed") + self.assertEqual(len(files), 14, msg="tests are missed") for fname in files: with self.subTest(fname=fname): datafusion_output = pd.read_csv( From 024bd89603dea13e63b70c92274116edbe36c4f9 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 8 Jul 2021 12:38:23 +0100 Subject: [PATCH 250/329] perf: Improve materialisation performance of SortPreservingMergeExec (#691) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add benchmarks for SortPreservingMergeExec * perf: minimise array data extend calls The `SortPreservingMergeStream` operator merges two streams together by creating an output record batch that is build from the contents of the input. Previously each row of input would be pushed into the output sink even if though the API supports pushing batches of rows. This commit implements the logic to push batches of rows from inputs where possible. Performance benchmarks show an improvement of between 3-12%. ``` group master pr ----- ------ -- interleave_batches 1.04 637.5±51.84µs ? ?/sec 1.00 615.5±12.13µs ? ?/sec merge_batches_no_overlap_large 1.12 454.9±2.90µs ? ?/sec 1.00 404.9±10.94µs ? ?/sec merge_batches_no_overlap_small 1.14 485.1±6.67µs ? ?/sec 1.00 425.7±9.33µs ? ?/sec merge_batches_small_into_large 1.14 263.0±8.85µs ? ?/sec 1.00 229.7±5.23µs ? ?/sec merge_batches_some_overlap_large 1.05 532.5±8.33µs ? ?/sec 1.00 508.3±14.24µs ? ?/sec merge_batches_some_overlap_small 1.06 546.9±12.82µs ? ?/sec 1.00 516.9±13.20µs ? ?/sec ``` * test: more test coverage * refactor: update batch size --- datafusion/Cargo.toml | 4 + datafusion/benches/physical_plan.rs | 176 +++++++++++++++ .../physical_plan/sort_preserving_merge.rs | 202 ++++++++++++++---- 3 files changed, 341 insertions(+), 41 deletions(-) create mode 100644 datafusion/benches/physical_plan.rs diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index f1a77741064e4..845de6213f4d3 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -95,3 +95,7 @@ harness = false [[bench]] name = "scalar" harness = false + +[[bench]] +name = "physical_plan" +harness = false \ No newline at end of file diff --git a/datafusion/benches/physical_plan.rs b/datafusion/benches/physical_plan.rs new file mode 100644 index 0000000000000..9222ae131b8ff --- /dev/null +++ b/datafusion/benches/physical_plan.rs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +use criterion::{BatchSize, Criterion}; +extern crate arrow; +extern crate datafusion; + +use std::{iter::FromIterator, sync::Arc}; + +use arrow::{ + array::{ArrayRef, Int64Array, StringArray}, + record_batch::RecordBatch, +}; +use tokio::runtime::Runtime; + +use datafusion::physical_plan::{ + collect, + expressions::{col, PhysicalSortExpr}, + memory::MemoryExec, + sort_preserving_merge::SortPreservingMergeExec, +}; + +// Initialise the operator using the provided record batches and the sort key +// as inputs. All record batches must have the same schema. +fn sort_preserving_merge_operator(batches: Vec, sort: &[&str]) { + let schema = batches[0].schema(); + + let sort = sort + .iter() + .map(|name| PhysicalSortExpr { + expr: col(name, &schema).unwrap(), + options: Default::default(), + }) + .collect::>(); + + let exec = MemoryExec::try_new( + &batches.into_iter().map(|rb| vec![rb]).collect::>(), + schema, + None, + ) + .unwrap(); + let merge = Arc::new(SortPreservingMergeExec::new(sort, Arc::new(exec), 8192)); + + let rt = Runtime::new().unwrap(); + rt.block_on(collect(merge)).unwrap(); +} + +// Produces `n` record batches of row size `m`. Each record batch will have +// identical contents except for if the `batch_offset` is set. In that case the +// values for column "d" in each subsequent record batch will be offset in +// value. +// +// The `rows_per_key` value controls how many rows are generated per "key", +// which is defined as columns a, b and c. +fn batches( + n: usize, + m: usize, + rows_per_sort_key: usize, + batch_offset: usize, +) -> Vec { + let mut rbs = Vec::with_capacity(n); + let mut curr_batch_offset = 0; + + for _ in 0..n { + let mut col_a = Vec::with_capacity(m); + let mut col_b = Vec::with_capacity(m); + let mut col_c = Vec::with_capacity(m); + let mut col_d = Vec::with_capacity(m); + + let mut j = 0; + let mut current_rows_per_sort_key = 0; + + for i in 0..m { + if current_rows_per_sort_key == rows_per_sort_key { + current_rows_per_sort_key = 0; + j = i; + } + + col_a.push(Some(format!("a-{:?}", j))); + col_b.push(Some(format!("b-{:?}", j))); + col_c.push(Some(format!("c-{:?}", j))); + col_d.push(Some((i + curr_batch_offset) as i64)); + + current_rows_per_sort_key += 1; + } + + col_a.sort(); + col_b.sort(); + col_c.sort(); + + let col_a: ArrayRef = Arc::new(StringArray::from_iter(col_a)); + let col_b: ArrayRef = Arc::new(StringArray::from_iter(col_b)); + let col_c: ArrayRef = Arc::new(StringArray::from_iter(col_c)); + let col_d: ArrayRef = Arc::new(Int64Array::from(col_d)); + + let rb = RecordBatch::try_from_iter(vec![ + ("a", col_a), + ("b", col_b), + ("c", col_c), + ("d", col_d), + ]) + .unwrap(); + rbs.push(rb); + + curr_batch_offset += batch_offset; + } + + rbs +} + +fn criterion_benchmark(c: &mut Criterion) { + let small_batch = batches(1, 100, 10, 0).remove(0); + let large_batch = batches(1, 1000, 1, 0).remove(0); + + let benches = vec![ + // Two batches with identical rows. They will need to be merged together + // with one row from each batch being taken until both batches are + // drained. 
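As a side note on the sort_preserving_merge.rs change below, the run-coalescing idea from the commit message can be sketched in isolation (simplified standalone code, not the actual implementation; buffer and row indices are invented): consecutive rows drawn from the same input buffer collapse into one (buffer, start, end) range, so MutableArrayData::extend is called once per contiguous run rather than once per row.

// Simplified sketch of coalescing sorted (buffer_idx, row_idx) picks into ranges.
fn coalesce_runs(rows: &[(usize, usize)]) -> Vec<(usize, usize, usize)> {
    let mut runs = Vec::new();
    let mut iter = rows.iter();
    let (mut buf, mut start) = match iter.next() {
        Some(&(b, r)) => (b, r),
        None => return runs,
    };
    let mut end = start + 1;
    for &(b, r) in iter {
        if b == buf && r == end {
            // Same buffer, next row: extend the current run.
            end += 1;
        } else {
            // Run broken: emit it and start a new one.
            runs.push((buf, start, end));
            buf = b;
            start = r;
            end = r + 1;
        }
    }
    runs.push((buf, start, end));
    runs
}

// Five single-row copies become two range copies:
// coalesce_runs(&[(0, 0), (0, 1), (0, 2), (1, 5), (1, 6)])
//     == vec![(0, 0, 3), (1, 5, 7)]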
+ ("interleave_batches", batches(2, 1000, 10, 1)), + // Two batches with a small overlapping region of rows for each unique + // sort key. + ("merge_batches_some_overlap_small", batches(2, 1000, 10, 5)), + // Two batches with a large overlapping region of rows for each unique + // sort key. + ( + "merge_batches_some_overlap_large", + batches(2, 1000, 250, 125), + ), + // Two batches with no overlapping region of rows for each unique + // sort key. For a given unique sort key all rows are drained from one + // batch, then all the rows for the same key from the second batch. + // This repeats until all rows are drained. There are a small number of + // rows (10) for each unique sort key. + ("merge_batches_no_overlap_small", batches(2, 1000, 10, 12)), + // As above but this time there are a larger number of rows (250) for + // each unique sort key - still no overlaps. + ("merge_batches_no_overlap_large", batches(2, 1000, 250, 252)), + // Merges two batches where one batch is significantly larger than the + // other. + ( + "merge_batches_small_into_large", + vec![large_batch, small_batch], + ), + ]; + + for (name, input) in benches { + c.bench_function(name, move |b| { + b.iter_batched( + || input.clone(), + |input| { + sort_preserving_merge_operator(input, &["a", "b", "c", "d"]); + }, + BatchSize::LargeInput, + ) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs index 316f0509960dd..0949c3c6a8cf6 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -24,22 +24,23 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use arrow::array::{ArrayRef, MutableArrayData}; -use arrow::compute::SortOptions; +use arrow::{ + array::{make_array as make_arrow_array, ArrayRef, MutableArrayData}, + compute::SortOptions, + datatypes::SchemaRef, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, +}; use async_trait::async_trait; use futures::channel::mpsc; use futures::stream::FusedStream; use futures::{Stream, StreamExt}; -use crate::arrow::datatypes::SchemaRef; -use crate::arrow::error::ArrowError; -use crate::arrow::{error::Result as ArrowResult, record_batch::RecordBatch}; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::common::spawn_execution; -use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::{ - DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, - RecordBatchStream, SendableRecordBatchStream, + common::spawn_execution, expressions::PhysicalSortExpr, DisplayFormatType, + Distribution, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream, + SendableRecordBatchStream, }; /// Sort preserving merge execution plan @@ -425,19 +426,38 @@ impl SortPreservingMergeStream { self.in_progress.len(), ); - for row_index in &self.in_progress { - let buffer_idx = + if self.in_progress.is_empty() { + return make_arrow_array(array_data.freeze()); + } + + let first = &self.in_progress[0]; + let mut buffer_idx = + stream_to_buffer_idx[first.stream_idx] + first.cursor_idx; + let mut start_row_idx = first.row_idx; + let mut end_row_idx = start_row_idx + 1; + + for row_index in self.in_progress.iter().skip(1) { + let next_buffer_idx = stream_to_buffer_idx[row_index.stream_idx] + row_index.cursor_idx; - // TODO: Coalesce contiguous writes - array_data.extend( - 
buffer_idx, - row_index.row_idx, - row_index.row_idx + 1, - ); + if next_buffer_idx == buffer_idx && row_index.row_idx == end_row_idx { + // subsequent row in same batch + end_row_idx += 1; + continue; + } + + // emit current batch of rows for current buffer + array_data.extend(buffer_idx, start_row_idx, end_row_idx); + + // start new batch of rows + buffer_idx = next_buffer_idx; + start_row_idx = row_index.row_idx; + end_row_idx = start_row_idx + 1; } - arrow::array::make_array(array_data.freeze()) + // emit final batch of rows + array_data.extend(buffer_idx, start_row_idx, end_row_idx); + make_arrow_array(array_data.freeze()) }) .collect(); @@ -555,7 +575,54 @@ mod tests { use tokio_stream::StreamExt; #[tokio::test] - async fn test_merge() { + async fn test_merge_interleave() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("b"), + Some("d"), + Some("f"), + Some("h"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + _test_merge( + b1, + b2, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01 00:00:00.000000008 |", + "| 10 | b | 1970-01-01 00:00:00.000000004 |", + "| 2 | c | 1970-01-01 00:00:00.000000007 |", + "| 20 | d | 1970-01-01 00:00:00.000000006 |", + "| 7 | e | 1970-01-01 00:00:00.000000006 |", + "| 70 | f | 1970-01-01 00:00:00.000000002 |", + "| 9 | g | 1970-01-01 00:00:00.000000005 |", + "| 90 | h | 1970-01-01 00:00:00.000000002 |", + "| 30 | j | 1970-01-01 00:00:00.000000006 |", // input b2 before b1 + "| 3 | j | 1970-01-01 00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + ) + .await; + } + + #[tokio::test] + async fn test_merge_some_overlap() { let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ Some("a"), @@ -564,21 +631,92 @@ mod tests { Some("d"), Some("e"), ])); - let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 4])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30, 100, 110])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + Some("f"), + Some("g"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + _test_merge( + b1, + b2, + &[ + "+-----+---+-------------------------------+", + "| a | b | c |", + "+-----+---+-------------------------------+", + "| 1 | a | 1970-01-01 00:00:00.000000008 |", + "| 2 | b | 1970-01-01 00:00:00.000000007 |", + "| 70 | c | 1970-01-01 00:00:00.000000004 |", + "| 7 | c | 1970-01-01 00:00:00.000000006 |", + "| 9 | d | 1970-01-01 
00:00:00.000000005 |", + "| 90 | d | 1970-01-01 00:00:00.000000006 |", + "| 30 | e | 1970-01-01 00:00:00.000000002 |", + "| 3 | e | 1970-01-01 00:00:00.000000008 |", + "| 100 | f | 1970-01-01 00:00:00.000000002 |", + "| 110 | g | 1970-01-01 00:00:00.000000006 |", + "+-----+---+-------------------------------+", + ], + ) + .await; + } + + #[tokio::test] + async fn test_merge_no_overlap() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), Some("d"), Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("f"), Some("g"), Some("h"), Some("i"), + Some("j"), ])); let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); - let schema = b1.schema(); + _test_merge( + b1, + b2, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01 00:00:00.000000008 |", + "| 2 | b | 1970-01-01 00:00:00.000000007 |", + "| 7 | c | 1970-01-01 00:00:00.000000006 |", + "| 9 | d | 1970-01-01 00:00:00.000000005 |", + "| 3 | e | 1970-01-01 00:00:00.000000008 |", + "| 10 | f | 1970-01-01 00:00:00.000000004 |", + "| 20 | g | 1970-01-01 00:00:00.000000006 |", + "| 70 | h | 1970-01-01 00:00:00.000000002 |", + "| 90 | i | 1970-01-01 00:00:00.000000002 |", + "| 30 | j | 1970-01-01 00:00:00.000000006 |", + "+----+---+-------------------------------+", + ], + ) + .await; + } + + async fn _test_merge(b1: RecordBatch, b2: RecordBatch, exp: &[&str]) { + let schema = b1.schema(); let sort = vec![ PhysicalSortExpr { expr: col("b", &schema).unwrap(), @@ -595,25 +733,7 @@ mod tests { let collected = collect(merge).await.unwrap(); assert_eq!(collected.len(), 1); - assert_batches_eq!( - &[ - "+---+---+-------------------------------+", - "| a | b | c |", - "+---+---+-------------------------------+", - "| 1 | a | 1970-01-01 00:00:00.000000008 |", - "| 2 | b | 1970-01-01 00:00:00.000000007 |", - "| 7 | c | 1970-01-01 00:00:00.000000006 |", - "| 1 | d | 1970-01-01 00:00:00.000000004 |", - "| 9 | d | 1970-01-01 00:00:00.000000005 |", - "| 3 | e | 1970-01-01 00:00:00.000000004 |", - "| 2 | e | 1970-01-01 00:00:00.000000006 |", - "| 3 | g | 1970-01-01 00:00:00.000000002 |", - "| 4 | h | 1970-01-01 00:00:00.000000002 |", - "| 5 | i | 1970-01-01 00:00:00.000000006 |", - "+---+---+-------------------------------+", - ], - collected.as_slice() - ); + assert_batches_eq!(exp, collected.as_slice()); } async fn sorted_merge( From 7378bb4de1dbcb008e68a01e1f0f046c6a17cade Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Jul 2021 07:52:33 -0400 Subject: [PATCH 251/329] Fix build with 1.52.1 (#696) --- datafusion/src/physical_plan/parquet.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index f31b921d663b0..63e11d5106bac 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -514,7 +514,7 @@ impl ExecutionPlan for ParquetExec { self.partitions .iter() .flat_map(|p| { - [ + vec![ ( format!( "numPredicateEvaluationErrors for {}", From 
63727df03472cd5ee0cff85371b62dfae6131493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Fri, 9 Jul 2021 11:28:32 +0200 Subject: [PATCH 252/329] Avoid sleeping between tasks (#698) --- ballista/rust/executor/src/execution_loop.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index 17a8d8c2002a8..17f6e4dd5d359 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -49,6 +49,10 @@ pub async fn poll_loop( let task_status: Vec = sample_tasks_status(&mut task_status_receiver).await; + // Keeps track of whether we received task in last iteration + // to avoid going in sleep mode between polling + let mut active_job = false; + let poll_work_result: anyhow::Result< tonic::Response, tonic::Status, @@ -73,14 +77,18 @@ pub async fn poll_loop( task, ) .await; + active_job = true; + } else { + active_job = false; } } Err(error) => { warn!("Executor registration failed. If this continues to happen the executor might be marked as dead by the scheduler. Error: {}", error); } } - - tokio::time::sleep(Duration::from_millis(250)).await; + if !active_job { + tokio::time::sleep(Duration::from_millis(100)).await; + } } } From 7d2456743470142753d940f1829db5258e417c27 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 10 Jul 2021 05:27:01 -0400 Subject: [PATCH 253/329] Fix date32 and date64 parquet row group pruning, tests for same (#690) --- .../src/physical_plan/expressions/binary.rs | 3 + datafusion/src/scalar.rs | 3 + datafusion/tests/parquet_pruning.rs | 210 ++++++++++++++++-- 3 files changed, 193 insertions(+), 23 deletions(-) diff --git a/datafusion/src/physical_plan/expressions/binary.rs b/datafusion/src/physical_plan/expressions/binary.rs index 102b701633853..3394113a9c40d 100644 --- a/datafusion/src/physical_plan/expressions/binary.rs +++ b/datafusion/src/physical_plan/expressions/binary.rs @@ -269,6 +269,9 @@ macro_rules! 
binary_array_op_scalar { DataType::Date32 => { compute_op_scalar!($LEFT, $RIGHT, $OP, Date32Array) } + DataType::Date64 => { + compute_op_scalar!($LEFT, $RIGHT, $OP, Date64Array) + } other => Err(DataFusionError::Internal(format!( "Data type {:?} not supported for scalar operation on dyn array", other diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index c23674bd59db0..f94a090a538a1 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -900,6 +900,7 @@ impl TryFrom for i64 { fn try_from(value: ScalarValue) -> Result { match value { ScalarValue::Int64(Some(inner_value)) + | ScalarValue::Date64(Some(inner_value)) | ScalarValue::TimestampNanosecond(Some(inner_value)) | ScalarValue::TimestampMicrosecond(Some(inner_value)) | ScalarValue::TimestampMillisecond(Some(inner_value)) @@ -939,6 +940,8 @@ impl TryFrom<&DataType> for ScalarValue { DataType::UInt64 => ScalarValue::UInt64(None), DataType::Utf8 => ScalarValue::Utf8(None), DataType::LargeUtf8 => ScalarValue::LargeUtf8(None), + DataType::Date32 => ScalarValue::Date32(None), + DataType::Date64 => ScalarValue::Date64(None), DataType::Timestamp(TimeUnit::Second, _) => { ScalarValue::TimestampSecond(None) } diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs index f5486afc7aa4a..8ad7974280f09 100644 --- a/datafusion/tests/parquet_pruning.rs +++ b/datafusion/tests/parquet_pruning.rs @@ -21,17 +21,20 @@ use std::sync::Arc; use arrow::{ array::{ - Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, + Array, Date32Array, Date64Array, StringArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }, datatypes::{Field, Schema}, record_batch::RecordBatch, util::pretty::pretty_format_batches, }; -use chrono::Duration; +use chrono::{Datelike, Duration}; use datafusion::{ + datasource::{parquet::ParquetTable, TableProvider}, + logical_plan::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder}, physical_plan::{plan_metrics, SQLMetric}, prelude::ExecutionContext, + scalar::ScalarValue, }; use hashbrown::HashMap; use parquet::{arrow::ArrowWriter, file::properties::WriterProperties}; @@ -39,7 +42,7 @@ use tempfile::NamedTempFile; #[tokio::test] async fn prune_timestamps_nanos() { - let output = ContextWithParquet::new() + let output = ContextWithParquet::new(Scenario::Timestamps) .await .query("SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')") .await; @@ -52,7 +55,7 @@ async fn prune_timestamps_nanos() { #[tokio::test] async fn prune_timestamps_micros() { - let output = ContextWithParquet::new() + let output = ContextWithParquet::new(Scenario::Timestamps) .await .query( "SELECT * FROM t where micros < to_timestamp_micros('2020-01-02 01:01:11Z')", @@ -67,7 +70,7 @@ async fn prune_timestamps_micros() { #[tokio::test] async fn prune_timestamps_millis() { - let output = ContextWithParquet::new() + let output = ContextWithParquet::new(Scenario::Timestamps) .await .query( "SELECT * FROM t where millis < to_timestamp_millis('2020-01-02 01:01:11Z')", @@ -82,7 +85,7 @@ async fn prune_timestamps_millis() { #[tokio::test] async fn prune_timestamps_seconds() { - let output = ContextWithParquet::new() + let output = ContextWithParquet::new(Scenario::Timestamps) .await .query( "SELECT * FROM t where seconds < to_timestamp_seconds('2020-01-02 01:01:11Z')", @@ -95,15 +98,60 @@ async fn prune_timestamps_seconds() { assert_eq!(output.result_rows, 10, "{}", 
output.description()); } +#[tokio::test] +async fn prune_date32() { + let output = ContextWithParquet::new(Scenario::Dates) + .await + .query("SELECT * FROM t where date32 < cast('2020-01-02' as date)") + .await; + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(3)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + +#[tokio::test] +async fn prune_date64() { + // work around for not being able to cast Date32 to Date64 automatically + let date = "2020-01-02" + .parse::() + .unwrap() + .and_time(chrono::NaiveTime::from_hms(0, 0, 0)); + let date = ScalarValue::Date64(Some(date.timestamp_millis())); + + let output = ContextWithParquet::new(Scenario::Dates) + .await + .query_with_expr(col("date64").lt(lit(date))) + // .query( + // "SELECT * FROM t where date64 < caste('2020-01-02' as date)", + // query results in Plan("'Date64 < Date32' can't be evaluated because there isn't a common type to coerce the types to") + // ) + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(3)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + // ---------------------- // Begin test fixture // ---------------------- +/// What data to use +enum Scenario { + Timestamps, + Dates, +} + /// Test fixture that has an execution context that has an external /// table "t" registered, pointing at a parquet file made with /// `make_test_file` struct ContextWithParquet { file: NamedTempFile, + provider: Arc, ctx: ExecutionContext, } @@ -156,24 +204,54 @@ impl TestOutput { /// Creates an execution context that has an external table "t" /// registered pointing at a parquet file made with `make_test_file` +/// and the appropriate scenario impl ContextWithParquet { - async fn new() -> Self { - let file = make_test_file().await; + async fn new(scenario: Scenario) -> Self { + let file = make_test_file(scenario).await; // now, setup a the file as a data source and run a query against it let mut ctx = ExecutionContext::new(); let parquet_path = file.path().to_string_lossy(); - ctx.register_parquet("t", &parquet_path) - .expect("registering"); - Self { file, ctx } + let table = ParquetTable::try_new(parquet_path, 4).unwrap(); + + let provider = Arc::new(table); + ctx.register_table("t", provider.clone()).unwrap(); + + Self { + file, + provider, + ctx, + } + } + + /// runs a query like "SELECT * from t WHERE and returns + /// the number of output rows and normalized execution metrics + async fn query_with_expr(&mut self, expr: Expr) -> TestOutput { + let sql = format!("EXPR only: {:?}", expr); + let logical_plan = LogicalPlanBuilder::scan("t", self.provider.clone(), None) + .unwrap() + .filter(expr) + .unwrap() + .build() + .unwrap(); + self.run_test(logical_plan, sql).await } /// Runs the specified SQL query and returns the number of output /// rows and normalized execution metrics async fn query(&mut self, sql: &str) -> TestOutput { println!("Planning sql {}", sql); + let logical_plan = self.ctx.sql(sql).expect("planning").to_logical_plan(); + self.run_test(logical_plan, sql).await + } + /// runs the logical plan + async fn run_test( + &mut self, + logical_plan: LogicalPlan, + sql: impl Into, + ) -> TestOutput { let input = self .ctx .sql("SELECT * from t") @@ -183,8 +261,6 @@ impl ContextWithParquet 
{ .expect("getting input"); let pretty_input = pretty_format_batches(&input).unwrap(); - let logical_plan = self.ctx.sql(sql).expect("planning").to_logical_plan(); - let logical_plan = self.ctx.optimize(&logical_plan).expect("optimizing plan"); let execution_plan = self .ctx @@ -210,7 +286,7 @@ impl ContextWithParquet { let pretty_results = pretty_format_batches(&results).unwrap(); - let sql = sql.to_string(); + let sql = sql.into(); TestOutput { sql, metrics, @@ -222,7 +298,7 @@ impl ContextWithParquet { } /// Create a test parquet file with varioud data types -async fn make_test_file() -> NamedTempFile { +async fn make_test_file(scenario: Scenario) -> NamedTempFile { let output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") @@ -233,12 +309,25 @@ async fn make_test_file() -> NamedTempFile { .set_max_row_group_size(5) .build(); - let batches = vec![ - make_batch(Duration::seconds(0)), - make_batch(Duration::seconds(10)), - make_batch(Duration::minutes(10)), - make_batch(Duration::days(10)), - ]; + let batches = match scenario { + Scenario::Timestamps => { + vec![ + make_timestamp_batch(Duration::seconds(0)), + make_timestamp_batch(Duration::seconds(10)), + make_timestamp_batch(Duration::minutes(10)), + make_timestamp_batch(Duration::days(10)), + ] + } + Scenario::Dates => { + vec![ + make_date_batch(Duration::days(0)), + make_date_batch(Duration::days(10)), + make_date_batch(Duration::days(300)), + make_date_batch(Duration::days(3600)), + ] + } + }; + let schema = batches[0].schema(); let mut writer = ArrowWriter::try_new( @@ -268,7 +357,7 @@ async fn make_test_file() -> NamedTempFile { /// "millis" --> TimestampMillisecondArray /// "seconds" --> TimestampSecondArray /// "names" --> StringArray -pub fn make_batch(offset: Duration) -> RecordBatch { +fn make_timestamp_batch(offset: Duration) -> RecordBatch { let ts_strings = vec![ Some("2020-01-01T01:01:01.0000000000001"), Some("2020-01-01T01:02:01.0000000000001"), @@ -341,3 +430,78 @@ pub fn make_batch(offset: Duration) -> RecordBatch { ) .unwrap() } + +/// Return record batch with a few rows of data for all of the supported date +/// types with the specified offset (in days) +/// +/// Columns are named: +/// "date32" --> Date32Array +/// "date64" --> Date64Array +/// "names" --> StringArray +fn make_date_batch(offset: Duration) -> RecordBatch { + let date_strings = vec![ + Some("2020-01-01"), + Some("2020-01-02"), + Some("2020-01-03"), + None, + Some("2020-01-04"), + ]; + + let names = date_strings + .iter() + .enumerate() + .map(|(i, val)| format!("Row {} + {}: {:?}", i, offset, val)) + .collect::>(); + + // Copied from `cast.rs` cast kernel due to lack of temporal kernels + // https://github.com/apache/arrow-rs/issues/527 + const EPOCH_DAYS_FROM_CE: i32 = 719_163; + + let date_seconds = date_strings + .iter() + .map(|t| { + t.map(|t| { + let t = t.parse::().unwrap(); + let t = t + offset; + t.num_days_from_ce() - EPOCH_DAYS_FROM_CE + }) + }) + .collect::>(); + + let date_millis = date_strings + .into_iter() + .map(|t| { + t.map(|t| { + let t = t + .parse::() + .unwrap() + .and_time(chrono::NaiveTime::from_hms(0, 0, 0)); + let t = t + offset; + t.timestamp_millis() + }) + }) + .collect::>(); + + let arr_date32 = Date32Array::from(date_seconds); + let arr_date64 = Date64Array::from(date_millis); + + let names = names.iter().map(|s| s.as_str()).collect::>(); + let arr_names = StringArray::from(names); + + let schema = Schema::new(vec![ + Field::new("date32", arr_date32.data_type().clone(), true), + 
Field::new("date64", arr_date64.data_type().clone(), true), + Field::new("name", arr_names.data_type().clone(), true), + ]); + let schema = Arc::new(schema); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(arr_date32), + Arc::new(arr_date64), + Arc::new(arr_names), + ], + ) + .unwrap() +} From f7dff76a52e58e8dc270073a99ee7c23c5abc21a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 12 Jul 2021 07:48:32 -0600 Subject: [PATCH 254/329] Ballista: Make shuffle partitions configurable in benchmarks (#702) --- ballista/rust/client/src/context.rs | 54 +++++-- ballista/rust/client/src/prelude.rs | 2 + ballista/rust/core/proto/ballista.proto | 4 +- ballista/rust/core/src/config.rs | 183 ++++++++++++++++++++++ ballista/rust/core/src/lib.rs | 1 + ballista/rust/core/src/utils.rs | 6 +- ballista/rust/scheduler/src/lib.rs | 22 ++- ballista/rust/scheduler/src/test_utils.rs | 3 +- benchmarks/src/bin/tpch.rs | 23 ++- 9 files changed, 271 insertions(+), 27 deletions(-) create mode 100644 ballista/rust/core/src/config.rs diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index aca712e1d8782..b8210cbc26266 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -23,15 +23,15 @@ use std::sync::{Arc, Mutex}; use std::{collections::HashMap, convert::TryInto}; use std::{fs, time::Duration}; -use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient; -use ballista_core::serde::protobuf::PartitionLocation; +use ballista_core::config::BallistaConfig; use ballista_core::serde::protobuf::{ - execute_query_params::Query, job_status, ExecuteQueryParams, GetJobStatusParams, - GetJobStatusResult, + execute_query_params::Query, job_status, scheduler_grpc_client::SchedulerGrpcClient, + ExecuteQueryParams, GetJobStatusParams, GetJobStatusResult, KeyValuePair, + PartitionLocation, }; -use ballista_core::utils::WrappedStream; use ballista_core::{ client::BallistaClient, datasource::DfTableAdapter, utils::create_datafusion_context, + utils::WrappedStream, }; use datafusion::arrow::datatypes::Schema; @@ -45,6 +45,8 @@ use futures::StreamExt; use log::{error, info}; struct BallistaContextState { + /// Ballista configuration + config: BallistaConfig, /// Scheduler host scheduler_host: String, /// Scheduler port @@ -54,8 +56,13 @@ struct BallistaContextState { } impl BallistaContextState { - pub fn new(scheduler_host: String, scheduler_port: u16) -> Self { + pub fn new( + scheduler_host: String, + scheduler_port: u16, + config: &BallistaConfig, + ) -> Self { Self { + config: config.clone(), scheduler_host, scheduler_port, tables: HashMap::new(), @@ -64,6 +71,7 @@ impl BallistaContextState { #[cfg(feature = "standalone")] pub async fn new_standalone( + config: &BallistaConfig, concurrent_tasks: usize, ) -> ballista_core::error::Result { info!("Running in local mode. 
Scheduler will be run in-proc"); @@ -87,11 +95,16 @@ impl BallistaContextState { ballista_executor::new_standalone_executor(scheduler, concurrent_tasks).await?; Ok(Self { + config: config.clone(), scheduler_host: "localhost".to_string(), scheduler_port: addr.port(), tables: HashMap::new(), }) } + + pub fn config(&self) -> &BallistaConfig { + &self.config + } } pub struct BallistaContext { @@ -100,8 +113,8 @@ pub struct BallistaContext { impl BallistaContext { /// Create a context for executing queries against a remote Ballista scheduler instance - pub fn remote(host: &str, port: u16) -> Self { - let state = BallistaContextState::new(host.to_owned(), port); + pub fn remote(host: &str, port: u16, config: &BallistaConfig) -> Self { + let state = BallistaContextState::new(host.to_owned(), port, config); Self { state: Arc::new(Mutex::new(state)), @@ -110,9 +123,11 @@ impl BallistaContext { #[cfg(feature = "standalone")] pub async fn standalone( + config: &BallistaConfig, concurrent_tasks: usize, ) -> ballista_core::error::Result { - let state = BallistaContextState::new_standalone(concurrent_tasks).await?; + let state = + BallistaContextState::new_standalone(config, concurrent_tasks).await?; Ok(Self { state: Arc::new(Mutex::new(state)), @@ -127,7 +142,7 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let mut ctx = create_datafusion_context(); + let mut ctx = create_datafusion_context(&self.state.lock().unwrap().config()); let df = ctx.read_parquet(path.to_str().unwrap())?; Ok(df) } @@ -144,7 +159,7 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let mut ctx = create_datafusion_context(); + let mut ctx = create_datafusion_context(&self.state.lock().unwrap().config()); let df = ctx.read_csv(path.to_str().unwrap(), options)?; Ok(df) } @@ -176,9 +191,9 @@ impl BallistaContext { /// Create a DataFrame from a SQL statement pub fn sql(&self, sql: &str) -> Result> { // use local DataFusion context for now but later this might call the scheduler - let mut ctx = create_datafusion_context(); // register tables let state = self.state.lock().unwrap(); + let mut ctx = create_datafusion_context(&state.config()); for (name, plan) in &state.tables { let plan = ctx.optimize(plan)?; let execution_plan = ctx.create_physical_plan(&plan)?; @@ -217,10 +232,11 @@ impl BallistaContext { &self, plan: &LogicalPlan, ) -> Result>> { - let scheduler_url = { + let (scheduler_url, config) = { let state = self.state.lock().unwrap(); - - format!("http://{}:{}", state.scheduler_host, state.scheduler_port) + let scheduler_url = + format!("http://{}:{}", state.scheduler_host, state.scheduler_port); + (scheduler_url, state.config.clone()) }; info!("Connecting to Ballista scheduler at {}", scheduler_url); @@ -238,6 +254,14 @@ impl BallistaContext { .try_into() .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?, )), + settings: config + .settings() + .iter() + .map(|(k, v)| KeyValuePair { + key: k.to_owned(), + value: v.to_owned(), + }) + .collect::>(), }) .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? diff --git a/ballista/rust/client/src/prelude.rs b/ballista/rust/client/src/prelude.rs index 2f940aef4c976..d162d0c017bd4 100644 --- a/ballista/rust/client/src/prelude.rs +++ b/ballista/rust/client/src/prelude.rs @@ -18,6 +18,8 @@ //! 
Ballista Prelude (common imports) pub use crate::context::BallistaContext; +pub use ballista_core::config::BallistaConfig; +pub use ballista_core::config::BALLISTA_DEFAULT_SHUFFLE_PARTITIONS; pub use ballista_core::error::{BallistaError, Result}; pub use futures::StreamExt; diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 4696d21852fc2..b1c153de64c24 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -803,7 +803,9 @@ message ExecuteQueryParams { oneof query { LogicalPlanNode logical_plan = 1; string sql = 2; - }} + } + repeated KeyValuePair settings = 3; +} message ExecuteSqlParams { string sql = 1; diff --git a/ballista/rust/core/src/config.rs b/ballista/rust/core/src/config.rs new file mode 100644 index 0000000000000..dcc0bdb06cded --- /dev/null +++ b/ballista/rust/core/src/config.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +//! Ballista configuration + +use std::collections::HashMap; + +use crate::error::{BallistaError, Result}; + +use datafusion::arrow::datatypes::DataType; +use log::warn; + +pub const BALLISTA_DEFAULT_SHUFFLE_PARTITIONS: &str = "ballista.shuffle.partitions"; + +/// Configuration option meta-data +#[derive(Debug, Clone)] +pub struct ConfigEntry { + name: String, + description: String, + data_type: DataType, + default_value: Option, +} + +impl ConfigEntry { + fn new( + name: String, + description: String, + data_type: DataType, + default_value: Option, + ) -> Self { + Self { + name, + description, + data_type, + default_value, + } + } +} + +/// Ballista configuration builder +pub struct BallistaConfigBuilder { + settings: HashMap, +} + +impl Default for BallistaConfigBuilder { + /// Create a new config builder + fn default() -> Self { + Self { + settings: HashMap::new(), + } + } +} + +impl BallistaConfigBuilder { + /// Create a new config with an additional setting + pub fn set(&self, k: &str, v: &str) -> Self { + let mut settings = self.settings.clone(); + settings.insert(k.to_owned(), v.to_owned()); + Self { settings } + } + + pub fn build(&self) -> Result { + BallistaConfig::with_settings(self.settings.clone()) + } +} + +/// Ballista configuration +#[derive(Debug, Clone)] +pub struct BallistaConfig { + /// Settings stored in map for easy serde + settings: HashMap, +} + +impl BallistaConfig { + /// Create a default configuration + pub fn new() -> Result { + Self::with_settings(HashMap::new()) + } + + /// Create a configuration builder + pub fn builder() -> BallistaConfigBuilder { + BallistaConfigBuilder::default() + } + + /// Create a new configuration based on key-value pairs + pub fn with_settings(settings: HashMap) -> Result { + let supported_entries = BallistaConfig::valid_entries(); 
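
A minimal, hypothetical usage sketch of the configuration API added in this commit (editorial illustration, not part of the committed diff): it combines `BallistaConfig::builder`, the `BALLISTA_DEFAULT_SHUFFLE_PARTITIONS` key, and the new `BallistaContext::remote` signature that takes a `&BallistaConfig`. The scheduler host and port below are placeholders.

    use ballista::prelude::*;

    fn connect_with_config() -> Result<BallistaContext> {
        // Ask for 16 shuffle partitions instead of the default of 2; the value
        // is parsed and validated when build() runs, so a non-numeric string
        // would be rejected with a BallistaError.
        let config = BallistaConfig::builder()
            .set(BALLISTA_DEFAULT_SHUFFLE_PARTITIONS, "16")
            .build()?;
        // Placeholder scheduler address; the settings carried by the config are
        // later forwarded to the scheduler as KeyValuePair entries when a query
        // is submitted.
        Ok(BallistaContext::remote("localhost", 50051, &config))
    }
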
+ for (name, entry) in &supported_entries { + if let Some(v) = settings.get(name) { + // validate that we can parse the user-supplied value + let _ = v.parse::().map_err(|e| BallistaError::General(format!("Failed to parse user-supplied value '{}' for configuration setting '{}': {:?}", name, v, e)))?; + } else if let Some(v) = entry.default_value.clone() { + let _ = v.parse::().map_err(|e| BallistaError::General(format!("Failed to parse default value '{}' for configuration setting '{}': {:?}", name, v, e)))?; + } else { + return Err(BallistaError::General(format!( + "No value specified for mandatory configuration setting '{}'", + name + ))); + } + } + + Ok(Self { settings }) + } + + /// All available configuration options + pub fn valid_entries() -> HashMap { + let entries = vec![ + ConfigEntry::new(BALLISTA_DEFAULT_SHUFFLE_PARTITIONS.to_string(), + "Sets the default number of partitions to create when repartitioning query stages".to_string(), + DataType::UInt16, Some("2".to_string())), + ]; + entries + .iter() + .map(|e| (e.name.clone(), e.clone())) + .collect::>() + } + + pub fn settings(&self) -> &HashMap { + &self.settings + } + + pub fn default_shuffle_partitions(&self) -> usize { + self.get_usize_setting(BALLISTA_DEFAULT_SHUFFLE_PARTITIONS) + } + + fn get_usize_setting(&self, key: &str) -> usize { + if let Some(v) = self.settings.get(key) { + // infallible because we validate all configs in the constructor + v.parse().unwrap() + } else { + let entries = Self::valid_entries(); + // infallible because we validate all configs in the constructor + let v = entries.get(key).unwrap().default_value.as_ref().unwrap(); + v.parse().unwrap() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_config() -> Result<()> { + let config = BallistaConfig::new()?; + assert_eq!(2, config.default_shuffle_partitions()); + Ok(()) + } + + #[test] + fn custom_config() -> Result<()> { + let config = BallistaConfig::builder() + .set(BALLISTA_DEFAULT_SHUFFLE_PARTITIONS, "123") + .build()?; + assert_eq!(123, config.default_shuffle_partitions()); + Ok(()) + } + + #[test] + fn custom_config_invalid() -> Result<()> { + let config = BallistaConfig::builder() + .set(BALLISTA_DEFAULT_SHUFFLE_PARTITIONS, "true") + .build(); + assert!(config.is_err()); + assert_eq!("General(\"Failed to parse user-supplied value 'ballista.shuffle.partitions' for configuration setting 'true': ParseIntError { kind: InvalidDigit }\")", format!("{:?}", config.unwrap_err())); + Ok(()) + } +} diff --git a/ballista/rust/core/src/lib.rs b/ballista/rust/core/src/lib.rs index 425dbab34c132..2a8486945ad0a 100644 --- a/ballista/rust/core/src/lib.rs +++ b/ballista/rust/core/src/lib.rs @@ -24,6 +24,7 @@ pub fn print_version() { } pub mod client; +pub mod config; pub mod datasource; pub mod error; pub mod execution_plans; diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index f7d884d502985..7e9a55af1a777 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -27,6 +27,7 @@ use crate::execution_plans::{ShuffleWriterExec, UnresolvedShuffleExec}; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionStats; +use crate::config::BallistaConfig; use datafusion::arrow::error::Result as ArrowResult; use datafusion::arrow::{ array::{ @@ -233,8 +234,9 @@ fn build_exec_plan_diagram( } /// Create a DataFusion context that is compatible with Ballista -pub fn create_datafusion_context() -> ExecutionContext { - let config = 
ExecutionConfig::new().with_concurrency(2); // TODO: this is hack to enable partitioned joins +pub fn create_datafusion_context(config: &BallistaConfig) -> ExecutionContext { + let config = + ExecutionConfig::new().with_concurrency(config.default_shuffle_partitions()); ExecutionContext::with_config(config) } diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 3bd4c03aa9c33..905437d4d980f 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -79,6 +79,7 @@ use rand::{distributions::Alphanumeric, thread_rng, Rng}; use tonic::{Request, Response}; use self::state::{ConfigBackendClient, SchedulerState}; +use ballista_core::config::BallistaConfig; use ballista_core::utils::create_datafusion_context; use datafusion::physical_plan::parquet::ParquetExec; use std::time::{Instant, SystemTime, UNIX_EPOCH}; @@ -290,7 +291,22 @@ impl SchedulerGrpc for SchedulerServer { &self, request: Request, ) -> std::result::Result, tonic::Status> { - if let ExecuteQueryParams { query: Some(query) } = request.into_inner() { + if let ExecuteQueryParams { + query: Some(query), + settings, + } = request.into_inner() + { + // parse config + let mut config_builder = BallistaConfig::builder(); + for kv_pair in &settings { + config_builder = config_builder.set(&kv_pair.key, &kv_pair.value); + } + let config = config_builder.build().map_err(|e| { + let msg = format!("Could not parse configs: {}", e); + error!("{}", msg); + tonic::Status::internal(msg) + })?; + let plan = match query { Query::LogicalPlan(logical_plan) => { // parse protobuf @@ -303,7 +319,7 @@ impl SchedulerGrpc for SchedulerServer { Query::Sql(sql) => { //TODO we can't just create a new context because we need a context that has // tables registered from previous SQL statements that have been executed - let mut ctx = create_datafusion_context(); + let mut ctx = create_datafusion_context(&config); let df = ctx.sql(&sql).map_err(|e| { let msg = format!("Error parsing SQL: {}", e); error!("{}", msg); @@ -339,7 +355,7 @@ impl SchedulerGrpc for SchedulerServer { let job_id_spawn = job_id.clone(); tokio::spawn(async move { // create physical plan using DataFusion - let datafusion_ctx = create_datafusion_context(); + let datafusion_ctx = create_datafusion_context(&config); macro_rules! 
fail_job { ($code :expr) => {{ match $code { diff --git a/ballista/rust/scheduler/src/test_utils.rs b/ballista/rust/scheduler/src/test_utils.rs index aa1e2b2575aa9..5b7b685d7be9e 100644 --- a/ballista/rust/scheduler/src/test_utils.rs +++ b/ballista/rust/scheduler/src/test_utils.rs @@ -26,7 +26,8 @@ pub const TPCH_TABLES: &[&str] = &[ ]; pub fn datafusion_test_context(path: &str) -> Result { - let config = ExecutionConfig::new().with_concurrency(2); // TODO: this is hack to enable partitioned joins + let default_shuffle_partitions = 2; + let config = ExecutionConfig::new().with_concurrency(default_shuffle_partitions); let mut ctx = ExecutionContext::with_config(config); for table in TPCH_TABLES { let schema = get_tpch_schema(table); diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index a52b6d208cff4..169319d30beef 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -28,21 +28,21 @@ use std::{ use futures::StreamExt; use ballista::context::BallistaContext; +use ballista::prelude::{BallistaConfig, BALLISTA_DEFAULT_SHUFFLE_PARTITIONS}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::util::pretty; - use datafusion::datasource::parquet::ParquetTable; use datafusion::datasource::{CsvFile, MemTable, TableProvider}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_plan::LogicalPlan; -use datafusion::physical_plan::{collect, displayable}; -use datafusion::prelude::*; - use datafusion::parquet::basic::Compression; use datafusion::parquet::file::properties::WriterProperties; use datafusion::physical_plan::display::DisplayableExecutionPlan; +use datafusion::physical_plan::{collect, displayable}; +use datafusion::prelude::*; + use structopt::StructOpt; #[cfg(feature = "snmalloc")] @@ -94,6 +94,10 @@ struct BallistaBenchmarkOpt { /// Ballista executor port #[structopt(long = "port")] port: Option, + + /// Number of shuffle partitions + #[structopt(short, long, default_value = "2")] + shuffle_partitions: usize, } #[derive(Debug, StructOpt, Clone)] @@ -252,7 +256,16 @@ async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result Result<()> { println!("Running benchmarks with the following options: {:?}", opt); - let ctx = BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap()); + let config = BallistaConfig::builder() + .set( + BALLISTA_DEFAULT_SHUFFLE_PARTITIONS, + &format!("{}", opt.shuffle_partitions), + ) + .build() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + + let ctx = + BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap(), &config); // register tables with Ballista context let path = opt.path.to_str().unwrap(); From 0a05acf06bfe5c8f3f899cdc4eb5970acfc11f38 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 12 Jul 2021 08:18:01 -0700 Subject: [PATCH 255/329] avoid iterator materialization in column index lookup (#703) --- datafusion/src/logical_plan/dfschema.rs | 26 +++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index b4bde87f3471f..217e2de6d4ff3 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -152,7 +152,7 @@ impl DFSchema { qualifier: Option<&str>, name: &str, ) -> Result { - let matches: Vec = self + let mut matches = self .fields .iter() .enumerate() @@ -164,24 +164,26 @@ impl DFSchema { // field to lookup is 
qualified but current field is unqualified. (Some(_), None) => false, // field to lookup is unqualified, no need to compare qualifier - _ => field.name() == name, + (None, Some(_)) | (None, None) => field.name() == name, }) - .map(|(idx, _)| idx) - .collect(); + .map(|(idx, _)| idx); - match matches.len() { - 0 => Err(DataFusionError::Plan(format!( + match matches.next() { + None => Err(DataFusionError::Plan(format!( "No field named '{}.{}'. Valid fields are {}.", qualifier.unwrap_or(""), name, self.get_field_names() ))), - 1 => Ok(matches[0]), - _ => Err(DataFusionError::Internal(format!( - "Ambiguous reference to qualified field named '{}.{}'", - qualifier.unwrap_or(""), - name - ))), + Some(idx) => match matches.next() { + None => Ok(idx), + // found more than one matches + Some(_) => Err(DataFusionError::Internal(format!( + "Ambiguous reference to qualified field named '{}.{}'", + qualifier.unwrap_or(""), + name + ))), + }, } } From 24f32caea26f7a574da5bf9ae35f60e87057f6f9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 12 Jul 2021 13:47:21 -0600 Subject: [PATCH 256/329] Ballista: Shuffle write bug fix (#714) * shuffle write bug fix * Rename variable * windows * fix bug in windows-specific assertion * revert accidental change --- .../src/execution_plans/shuffle_writer.rs | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 92b4448a69ec6..83d40aecd0d7e 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -234,7 +234,7 @@ impl ExecutionPlan for ShuffleWriterExec { indices[(*hash % num_output_partitions as u64) as usize] .push(index as u64) } - for (num_output_partition, partition_indices) in + for (output_partition, partition_indices) in indices.into_iter().enumerate() { let indices = partition_indices.into(); @@ -254,13 +254,13 @@ impl ExecutionPlan for ShuffleWriterExec { // write batch out let start = Instant::now(); - match &mut writers[num_output_partition] { + match &mut writers[output_partition] { Some(w) => { w.write(&output_batch)?; } None => { let mut path = path.clone(); - path.push(&format!("{}", partition)); + path.push(&format!("{}", output_partition)); std::fs::create_dir_all(&path)?; path.push("data.arrow"); @@ -271,7 +271,7 @@ impl ExecutionPlan for ShuffleWriterExec { ShuffleWriter::new(path, stream.schema().as_ref())?; writer.write(&output_batch)?; - writers[num_output_partition] = Some(writer); + writers[output_partition] = Some(writer); } } self.metrics.write_time.add_elapsed(start); @@ -419,20 +419,22 @@ impl ShuffleWriter { mod tests { use super::*; use datafusion::arrow::array::{StringArray, StructArray, UInt32Array, UInt64Array}; + use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::expressions::Column; + use datafusion::physical_plan::limit::GlobalLimitExec; use datafusion::physical_plan::memory::MemoryExec; use tempfile::TempDir; #[tokio::test] async fn test() -> Result<()> { - let input_plan = create_input_plan()?; + let input_plan = Arc::new(CoalescePartitionsExec::new(create_input_plan()?)); let work_dir = TempDir::new()?; let query_stage = ShuffleWriterExec::try_new( "jobOne".to_owned(), 1, input_plan, work_dir.into_path().to_str().unwrap().to_owned(), - None, + Some(Partitioning::Hash(vec![Arc::new(Column::new("a", 0))], 2)), )?; let mut stream = 
query_stage.execute(0).await?; let batches = utils::collect_stream(&mut stream) @@ -441,17 +443,28 @@ mod tests { assert_eq!(1, batches.len()); let batch = &batches[0]; assert_eq!(3, batch.num_columns()); - assert_eq!(1, batch.num_rows()); + assert_eq!(2, batch.num_rows()); let path = batch.columns()[1] .as_any() .downcast_ref::() .unwrap(); - let file = path.value(0); - assert!(file.ends_with("data.arrow")); + + let file0 = path.value(0); + assert!( + file0.ends_with("/jobOne/1/0/data.arrow") + || file0.ends_with("\\jobOne\\1\\0\\data.arrow") + ); + let file1 = path.value(1); + assert!( + file1.ends_with("/jobOne/1/1/data.arrow") + || file1.ends_with("\\jobOne\\1\\1\\data.arrow") + ); + let stats = batch.columns()[2] .as_any() .downcast_ref::() .unwrap(); + let num_rows = stats .column_by_name("num_rows") .unwrap() @@ -459,6 +472,8 @@ mod tests { .downcast_ref::() .unwrap(); assert_eq!(4, num_rows.value(0)); + assert_eq!(4, num_rows.value(1)); + Ok(()) } From 2176ff592e38147c1c779b5660ec73db9c161a67 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 12 Jul 2021 18:23:43 -0600 Subject: [PATCH 257/329] Implement serde for ShuffleWriterExec (#712) --- ballista/rust/core/proto/ballista.proto | 10 ++++++ .../src/execution_plans/shuffle_writer.rs | 7 ++-- .../src/serde/physical_plan/from_proto.rs | 32 +++++++++++++++++- .../rust/core/src/serde/physical_plan/mod.rs | 16 +++++++++ .../core/src/serde/physical_plan/to_proto.rs | 33 ++++++++++++++++++- 5 files changed, 94 insertions(+), 4 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index b1c153de64c24..5b3e93e379e33 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -424,6 +424,7 @@ message PhysicalPlanNode { UnresolvedShuffleExecNode unresolved = 15; RepartitionExecNode repartition = 16; WindowAggExecNode window = 17; + ShuffleWriterExecNode shuffle_writer = 18; } } @@ -629,6 +630,15 @@ message HashAggregateExecNode { Schema input_schema = 7; } +message ShuffleWriterExecNode { + //TODO it seems redundant to provide job and stage id here since we also have them + // in the TaskDefinition that wraps this plan + string job_id = 1; + uint32 stage_id = 2; + PhysicalPlanNode input = 3; + PhysicalHashRepartition output_partitioning = 4; +} + message ShuffleReaderExecNode { repeated ShuffleReaderPartition partition = 1; Schema schema = 2; diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 83d40aecd0d7e..d5c7d8f284496 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -126,7 +126,10 @@ impl ExecutionPlan for ShuffleWriterExec { } fn output_partitioning(&self) -> Partitioning { - self.plan.output_partitioning() + match &self.shuffle_output_partitioning { + Some(p) => p.clone(), + _ => Partitioning::UnknownPartitioning(1), + } } fn children(&self) -> Vec> { @@ -143,7 +146,7 @@ impl ExecutionPlan for ShuffleWriterExec { self.stage_id, children[0].clone(), self.work_dir.clone(), - None, + self.shuffle_output_partitioning.clone(), )?)) } diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 12c1743c0747c..a1a60bde0cecf 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -22,7 +22,9 @@ use std::convert::{TryFrom, 
TryInto}; use std::sync::Arc; use crate::error::BallistaError; -use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; +use crate::execution_plans::{ + ShuffleReaderExec, ShuffleWriterExec, UnresolvedShuffleExec, +}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::protobuf::ShuffleReaderPartition; use crate::serde::scheduler::PartitionLocation; @@ -370,6 +372,34 @@ impl TryInto> for &protobuf::PhysicalPlanNode { partition_mode, )?)) } + PhysicalPlanType::ShuffleWriter(shuffle_writer) => { + let input: Arc = + convert_box_required!(shuffle_writer.input)?; + + let output_partitioning = match &shuffle_writer.output_partitioning { + Some(hash_part) => { + let expr = hash_part + .hash_expr + .iter() + .map(|e| e.try_into()) + .collect::>, _>>()?; + + Some(Partitioning::Hash( + expr, + hash_part.partition_count.try_into().unwrap(), + )) + } + None => None, + }; + + Ok(Arc::new(ShuffleWriterExec::try_new( + shuffle_writer.job_id.clone(), + shuffle_writer.stage_id as usize, + input, + "".to_string(), // this is intentional but hacky - the executor will fill this in + output_partitioning, + )?)) + } PhysicalPlanType::ShuffleReader(shuffle_reader) => { let schema = Arc::new(convert_required!(shuffle_reader.schema)?); let partition_location: Vec> = shuffle_reader diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index 3bf7e9c3063b5..f544859fa7b2b 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -45,6 +45,7 @@ mod roundtrip_tests { use super::super::super::error::Result; use super::super::protobuf; + use crate::execution_plans::ShuffleWriterExec; fn roundtrip_test(exec_plan: Arc) -> Result<()> { let proto: protobuf::PhysicalPlanNode = exec_plan.clone().try_into()?; @@ -184,4 +185,19 @@ mod roundtrip_tests { Arc::new(EmptyExec::new(false, schema)), )?)) } + + #[test] + fn roundtrip_shuffle_writer() -> Result<()> { + let field_a = Field::new("a", DataType::Int64, false); + let field_b = Field::new("b", DataType::Int64, false); + let schema = Arc::new(Schema::new(vec![field_a, field_b])); + + roundtrip_test(Arc::new(ShuffleWriterExec::try_new( + "job123".to_string(), + 123, + Arc::new(EmptyExec::new(false, schema)), + "".to_string(), + Some(Partitioning::Hash(vec![Arc::new(Column::new("a", 0))], 4)), + )?)) + } } diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 875dbf213441d..cdd33f9384131 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -55,7 +55,9 @@ use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr}; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; use protobuf::physical_plan_node::PhysicalPlanType; -use crate::execution_plans::{ShuffleReaderExec, UnresolvedShuffleExec}; +use crate::execution_plans::{ + ShuffleReaderExec, ShuffleWriterExec, UnresolvedShuffleExec, +}; use crate::serde::protobuf::repartition_exec_node::PartitionMethod; use crate::serde::scheduler::PartitionLocation; use crate::serde::{protobuf, BallistaError}; @@ -356,6 +358,35 @@ impl TryInto for Arc { }, ))), }) + } else if let Some(exec) = plan.downcast_ref::() { + let input: protobuf::PhysicalPlanNode = + exec.children()[0].to_owned().try_into()?; + Ok(protobuf::PhysicalPlanNode { + physical_plan_type: 
Some(PhysicalPlanType::ShuffleWriter(Box::new( + protobuf::ShuffleWriterExecNode { + job_id: exec.job_id().to_string(), + stage_id: exec.stage_id() as u32, + input: Some(Box::new(input)), + output_partitioning: match exec.output_partitioning() { + Partitioning::Hash(exprs, partition_count) => { + Some(protobuf::PhysicalHashRepartition { + hash_expr: exprs + .iter() + .map(|expr| expr.clone().try_into()) + .collect::, BallistaError>>()?, + partition_count: partition_count as u64, + }) + } + other => { + return Err(BallistaError::General(format!( + "physical_plan::to_proto() invalid partitioning for ShuffleWriterExec: {:?}", + other + ))) + } + }, + }, + ))), + }) } else if let Some(exec) = plan.downcast_ref::() { Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Unresolved( From 9ba22ef2a9bcc22864f4d6c1de90ecdc2fb4204b Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 13 Jul 2021 21:44:59 +0800 Subject: [PATCH 258/329] Use pytest in integration test (#715) * use pytest * update to use pytest --- .github/workflows/rust.yml | 4 +- integration-tests/requirements.in | 18 ++++ integration-tests/requirements.txt | 118 ++++++++++++++++++++++++++ integration-tests/test_psql_parity.py | 35 ++++---- 4 files changed, 153 insertions(+), 22 deletions(-) create mode 100644 integration-tests/requirements.in create mode 100644 integration-tests/requirements.txt diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4a994bfb6b6c9..2454d10fe03c4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -163,7 +163,7 @@ jobs: - name: Install Python dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install --upgrade numpy==1.20.3 pandas==1.2.4 + python -m pip install -r integration-tests/requirements.txt - name: Allow access of psql run: | # make sure psql can access the server @@ -194,7 +194,7 @@ jobs: - name: Build datafusion-cli run: cargo build --bin datafusion-cli - name: Test Psql Parity - run: python -m unittest -v integration-tests/test_psql_parity.py + run: python -m pytest -v integration-tests/test_psql_parity.py env: POSTGRES_HOST: localhost POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }} diff --git a/integration-tests/requirements.in b/integration-tests/requirements.in new file mode 100644 index 0000000000000..782329d05e36f --- /dev/null +++ b/integration-tests/requirements.in @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+pytest +numpy +pandas diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt new file mode 100644 index 0000000000000..3f8b11372ecf5 --- /dev/null +++ b/integration-tests/requirements.txt @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is autogenerated by pip-compile with python 3.8 +# To update, run: +# +# pip-compile --generate-hashes +# +attrs==21.2.0 \ + --hash=sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1 \ + --hash=sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb + # via pytest +iniconfig==1.1.1 \ + --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ + --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 + # via pytest +numpy==1.21.0 \ + --hash=sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e \ + --hash=sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e \ + --hash=sha256:3537b967b350ad17633b35c2f4b1a1bbd258c018910b518c30b48c8e41272717 \ + --hash=sha256:3c40e6b860220ed862e8097b8f81c9af6d7405b723f4a7af24a267b46f90e461 \ + --hash=sha256:598fe100b2948465cf3ed64b1a326424b5e4be2670552066e17dfaa67246011d \ + --hash=sha256:620732f42259eb2c4642761bd324462a01cdd13dd111740ce3d344992dd8492f \ + --hash=sha256:709884863def34d72b183d074d8ba5cfe042bc3ff8898f1ffad0209161caaa99 \ + --hash=sha256:75579acbadbf74e3afd1153da6177f846212ea2a0cc77de53523ae02c9256513 \ + --hash=sha256:7c55407f739f0bfcec67d0df49103f9333edc870061358ac8a8c9e37ea02fcd2 \ + --hash=sha256:a1f2fb2da242568af0271455b89aee0f71e4e032086ee2b4c5098945d0e11cf6 \ + --hash=sha256:a290989cd671cd0605e9c91a70e6df660f73ae87484218e8285c6522d29f6e38 \ + --hash=sha256:ac4fd578322842dbda8d968e3962e9f22e862b6ec6e3378e7415625915e2da4d \ + --hash=sha256:ad09f55cc95ed8d80d8ab2052f78cc21cb231764de73e229140d81ff49d8145e \ + --hash=sha256:b9205711e5440954f861ceeea8f1b415d7dd15214add2e878b4d1cf2bcb1a914 \ + --hash=sha256:bba474a87496d96e61461f7306fba2ebba127bed7836212c360f144d1e72ac54 \ + --hash=sha256:bebab3eaf0641bba26039fb0b2c5bf9b99407924b53b1ea86e03c32c64ef5aef \ + --hash=sha256:cc367c86eb87e5b7c9592935620f22d13b090c609f1b27e49600cd033b529f54 \ + --hash=sha256:ccc6c650f8700ce1e3a77668bb7c43e45c20ac06ae00d22bdf6760b38958c883 \ + --hash=sha256:cf680682ad0a3bef56dae200dbcbac2d57294a73e5b0f9864955e7dd7c2c2491 \ + --hash=sha256:d2910d0a075caed95de1a605df00ee03b599de5419d0b95d55342e9a33ad1fb3 \ + --hash=sha256:d5caa946a9f55511e76446e170bdad1d12d6b54e17a2afe7b189112ed4412bb8 \ + --hash=sha256:d89b0dc7f005090e32bb4f9bf796e1dcca6b52243caf1803fdd2b748d8561f63 \ + --hash=sha256:d95d16204cd51ff1a1c8d5f9958ce90ae190be81d348b514f9be39f878b8044a \ + --hash=sha256:e4d5a86a5257843a18fb1220c5f1c199532bc5d24e849ed4b0289fb59fbd4d8f \ + 
--hash=sha256:e58ddb53a7b4959932f5582ac455ff90dcb05fac3f8dcc8079498d43afbbde6c \ + --hash=sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce \ + --hash=sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd \ + --hash=sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9 + # via + # -r requirements.in + # pandas +packaging==21.0 \ + --hash=sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7 \ + --hash=sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14 + # via pytest +pandas==1.3.0 \ + --hash=sha256:08eeff3da6a188e24db7f292b39a8ca9e073bf841fbbeadb946b3ad5c19d843e \ + --hash=sha256:1ff13eed501e07e7fb26a4ea18a846b6e5d7de549b497025601fd9ccb7c1d123 \ + --hash=sha256:522bfea92f3ef6207cadc7428bda1e7605dae0383b8065030e7b5d0266717b48 \ + --hash=sha256:7897326cae660eee69d501cbfa950281a193fcf407393965e1bc07448e1cc35a \ + --hash=sha256:798675317d0e4863a92a9a6bc5bd2490b5f6fef8c17b95f29e2e33f28bef9eca \ + --hash=sha256:7d3cd2c99faa94d717ca00ea489264a291ad7209453dffbf059bfb7971fd3a61 \ + --hash=sha256:823737830364d0e2af8c3912a28ba971296181a07950873492ed94e12d28c405 \ + --hash=sha256:872aa91e0f9ca913046ab639d4181a899f5e592030d954d28c2529b88756a736 \ + --hash=sha256:88864c1e28353b958b1f30e4193818519624ad9a1776921622a6a2a016d5d807 \ + --hash=sha256:92835113a67cbd34747c198d41f09f4b63f6fe11ca5643baebc7ab1e30e89e95 \ + --hash=sha256:98efc2d4983d5bb47662fe2d97b2c81b91566cb08b266490918b9c7d74a5ef64 \ + --hash=sha256:b10d7910ae9d7920a5ff7816d794d99acbc361f7b16a0f017d4fa83ced8cb55e \ + --hash=sha256:c554e6c9cf2d5ea1aba5979cc837b3649539ced0e18ece186f055450c86622e2 \ + --hash=sha256:c746876cdd8380be0c3e70966d4566855901ac9aaa5e4b9ccaa5ca5311457d11 \ + --hash=sha256:c81b8d91e9ae861eb4406b4e0f8d4dabbc105b9c479b3d1e921fba1d35b5b62a \ + --hash=sha256:e6b75091fa54a53db3927b4d1bc997c23c5ba6f87acdfe1ee5a92c38c6b2ed6a \ + --hash=sha256:ed4fc66f23fe17c93a5d439230ca2d6b5f8eac7154198d327dbe8a16d98f3f10 \ + --hash=sha256:f058c786e7b0a9e7fa5e0b9f4422e0ccdd3bf3aa3053c18d77ed2a459bd9a45a \ + --hash=sha256:fe7a549d10ca534797095586883a5c17d140d606747591258869c56e14d1b457 + # via -r requirements.in +pluggy==0.13.1 \ + --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 \ + --hash=sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d + # via pytest +py==1.10.0 \ + --hash=sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3 \ + --hash=sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a + # via pytest +pyparsing==2.4.7 \ + --hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 \ + --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b + # via packaging +pytest==6.2.4 \ + --hash=sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b \ + --hash=sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890 + # via -r requirements.in +python-dateutil==2.8.1 \ + --hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \ + --hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a + # via pandas +pytz==2021.1 \ + --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da \ + --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 + # via pandas +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + 
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via python-dateutil +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f + # via pytest diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index 39cfdee77fbdd..a160d3e320ce1 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -14,13 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd -import numpy as np import io import os import subprocess from pathlib import Path -import unittest + +import numpy as np +import pandas as pd +import pytest pg_db, pg_user, pg_host, pg_port = [ os.environ.get(i) @@ -70,22 +71,16 @@ def generate_csv_from_psql(fname: str): ) -class PsqlParityTest(unittest.TestCase): - def test_parity(self): - root = Path(os.path.dirname(__file__)) / "sqls" - files = set(root.glob("*.sql")) - self.assertEqual(len(files), 14, msg="tests are missed") - for fname in files: - with self.subTest(fname=fname): - datafusion_output = pd.read_csv( - io.BytesIO(generate_csv_from_datafusion(fname)) - ) - psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname))) - self.assertTrue( - np.allclose(datafusion_output, psql_output, equal_nan=True), - msg=f"datafusion output=\n{datafusion_output}, psql_output=\n{psql_output}", - ) +root = Path(os.path.dirname(__file__)) / "sqls" +test_files = set(root.glob("*.sql")) + +class TestPsqlParity: + def test_tests_count(self): + assert len(test_files) == 14, "tests are missed" -if __name__ == "__main__": - unittest.main() + @pytest.mark.parametrize("fname", test_files) + def test_sql_file(self, fname): + datafusion_output = pd.read_csv(io.BytesIO(generate_csv_from_datafusion(fname))) + psql_output = pd.read_csv(io.BytesIO(generate_csv_from_psql(fname))) + np.testing.assert_allclose(datafusion_output, psql_output, equal_nan=True) From 75a376f0ff2e8236c07a3b0a16374b7e3855c194 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Tue, 13 Jul 2021 06:46:31 -0700 Subject: [PATCH 259/329] dedup using join column in wildcard expansion (#678) * dedup using join column in wildcard expansion * reuse expand_wildcard in logical plan builder --- datafusion/src/logical_plan/builder.rs | 72 ++++++++++++++++++++++++-- datafusion/src/logical_plan/expr.rs | 2 +- datafusion/src/logical_plan/mod.rs | 2 +- datafusion/src/logical_plan/plan.rs | 2 +- datafusion/src/sql/planner.rs | 34 +++++++++--- datafusion/src/sql/utils.rs | 14 +---- 6 files changed, 99 insertions(+), 27 deletions(-) diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 41f29c4b99052..85c4aea99ff5f 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -17,7 +17,10 @@ //! This module provides a builder for creating LogicalPlans -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use arrow::{ datatypes::{Schema, SchemaRef}, @@ -220,10 +223,7 @@ impl LogicalPlanBuilder { for e in expr { match e { Expr::Wildcard => { - (0..input_schema.fields().len()).for_each(|i| { - projected_expr - .push(Expr::Column(input_schema.field(i).qualified_column())) - }); + projected_expr.extend(expand_wildcard(input_schema, &self.plan)?) 
} _ => projected_expr .push(columnize_expr(normalize_col(e, &self.plan)?, input_schema)), @@ -508,6 +508,47 @@ pub fn union_with_alias( }) } +/// Resolves an `Expr::Wildcard` to a collection of `Expr::Column`'s. +pub(crate) fn expand_wildcard( + schema: &DFSchema, + plan: &LogicalPlan, +) -> Result> { + let using_columns = plan.using_columns()?; + let columns_to_skip = using_columns + .into_iter() + // For each USING JOIN condition, only expand to one column in projection + .map(|cols| { + let mut cols = cols.into_iter().collect::>(); + // sort join columns to make sure we consistently keep the same + // qualified column + cols.sort(); + cols.into_iter().skip(1) + }) + .flatten() + .collect::>(); + + if columns_to_skip.is_empty() { + Ok(schema + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect::>()) + } else { + Ok(schema + .fields() + .iter() + .filter_map(|f| { + let col = f.qualified_column(); + if !columns_to_skip.contains(&col) { + Some(Expr::Column(col)) + } else { + None + } + }) + .collect::>()) + } +} + #[cfg(test)] mod tests { use arrow::datatypes::{DataType, Field}; @@ -587,6 +628,27 @@ mod tests { Ok(()) } + #[test] + fn plan_using_join_wildcard_projection() -> Result<()> { + let t2 = LogicalPlanBuilder::scan_empty(Some("t2"), &employee_schema(), None)? + .build()?; + + let plan = LogicalPlanBuilder::scan_empty(Some("t1"), &employee_schema(), None)? + .join_using(&t2, JoinType::Inner, vec!["id"])? + .project(vec![Expr::Wildcard])? + .build()?; + + // id column should only show up once in projection + let expected = "Projection: #t1.id, #t1.first_name, #t1.last_name, #t1.state, #t1.salary, #t2.first_name, #t2.last_name, #t2.state, #t2.salary\ + \n Join: Using #t1.id = #t2.id\ + \n TableScan: t1 projection=None\ + \n TableScan: t2 projection=None"; + + assert_eq!(expected, format!("{:?}", plan)); + + Ok(()) + } + #[test] fn plan_builder_union_combined_single_union() -> Result<()> { let plan = LogicalPlanBuilder::scan_empty( diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 59c99797e0cd8..2eee140f47fe5 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -34,7 +34,7 @@ use std::fmt; use std::sync::Arc; /// A named reference to a qualified field in a schema. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { /// relation/table name. pub relation: Option, diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index 2c751abdad349..f381e316669e4 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -21,7 +21,7 @@ //! Logical query plans can then be optimized and executed directly, or translated into //! physical query plans and executed. 
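
An illustrative sketch (editorial, not part of the diff) of the user-visible effect of this change, written against the `ExecutionContext` SQL API of this codebase; the tables `t1(id, x)` and `t2(id, y)` are hypothetical and their registration is elided.

    use datafusion::prelude::*;

    async fn wildcard_over_using_join(
        mut ctx: ExecutionContext,
    ) -> datafusion::error::Result<()> {
        // Before this change, `*` expanded to t1.id, t1.x, t2.id, t2.y.
        // With expand_wildcard, only one copy of the USING key survives,
        // so the projection becomes t1.id, t1.x, t2.y.
        let df = ctx.sql("SELECT * FROM t1 JOIN t2 USING (id)")?;
        let _batches = df.collect().await?;
        Ok(())
    }

This mirrors the `plan_using_join_wildcard_projection` test added to the builder above.
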
-mod builder; +pub(crate) mod builder; mod dfschema; mod display; mod expr; diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index b954b6a97950c..2504dfaa6f236 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -25,8 +25,8 @@ use crate::error::DataFusionError; use crate::logical_plan::dfschema::DFSchemaRef; use crate::sql::parser::FileType; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use std::collections::HashSet; use std::{ + collections::HashSet, fmt::{self, Display}, sync::Arc, }; diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index f89ba3f659c88..41b4e20f15f3d 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -27,8 +27,9 @@ use crate::datasource::TableProvider; use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::logical_plan::Expr::Alias; use crate::logical_plan::{ - and, col, lit, normalize_col, union_with_alias, Column, DFSchema, Expr, LogicalPlan, - LogicalPlanBuilder, Operator, PlanType, StringifiedPlan, ToDFSchema, + and, builder::expand_wildcard, col, lit, normalize_col, union_with_alias, Column, + DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, StringifiedPlan, + ToDFSchema, }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; @@ -56,7 +57,7 @@ use sqlparser::parser::ParserError::ParserError; use super::{ parser::DFParser, utils::{ - can_columns_satisfy_exprs, expand_wildcard, expr_as_column_expr, extract_aliases, + can_columns_satisfy_exprs, expr_as_column_expr, extract_aliases, find_aggregate_exprs, find_column_exprs, find_window_exprs, group_window_expr_by_sort_keys, rebase_expr, resolve_aliases_to_exprs, resolve_positions_to_exprs, @@ -687,9 +688,17 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .iter() .map(|expr| self.sql_select_to_rex(expr, input_schema)) .collect::>>()? - .iter() - .flat_map(|expr| expand_wildcard(expr, input_schema)) - .map(|expr| normalize_col(expr, plan)) + .into_iter() + .map(|expr| { + Ok(match expr { + Expr::Wildcard => expand_wildcard(input_schema, plan)?, + _ => vec![normalize_col(expr, plan)?], + }) + }) + .flat_map(|res| match res { + Ok(v) => v.into_iter().map(Ok).collect(), + Err(e) => vec![Err(e)], + }) .collect::>>() } @@ -2773,6 +2782,19 @@ mod tests { quick_test(sql, expected); } + #[test] + fn project_wildcard_on_join_with_using() { + let sql = "SELECT * \ + FROM lineitem \ + JOIN lineitem as lineitem2 \ + USING (l_item_id)"; + let expected = "Projection: #lineitem.l_item_id, #lineitem.l_description, #lineitem.price, #lineitem2.l_description, #lineitem2.price\ + \n Join: Using #lineitem.l_item_id = #lineitem2.l_item_id\ + \n TableScan: lineitem projection=None\ + \n TableScan: lineitem2 projection=None"; + quick_test(sql, expected); + } + #[test] fn equijoin_explicit_syntax_3_tables() { let sql = "SELECT id, order_id, l_description \ diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 28243360c412f..41bcd205800df 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -17,7 +17,7 @@ //! SQL Utility Functions -use crate::logical_plan::{DFSchema, Expr, LogicalPlan}; +use crate::logical_plan::{Expr, LogicalPlan}; use crate::scalar::ScalarValue; use crate::{ error::{DataFusionError, Result}, @@ -25,18 +25,6 @@ use crate::{ }; use std::collections::HashMap; -/// Resolves an `Expr::Wildcard` to a collection of `Expr::Column`'s. 
-pub(crate) fn expand_wildcard(expr: &Expr, schema: &DFSchema) -> Vec { - match expr { - Expr::Wildcard => schema - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect::>(), - _ => vec![expr.clone()], - } -} - /// Collect all deeply nested `Expr::AggregateFunction` and /// `Expr::AggregateUDF`. They are returned in order of occurrence (depth /// first), with duplicates omitted. From fd50dd808213e219559ccd142bcb88f2194ff4a5 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Tue, 13 Jul 2021 10:26:20 -0700 Subject: [PATCH 260/329] replace once iter chain with array::IntoIter (#704) * replace once iter chain with array::IntoIter * use new array IntoIter implementation from rust 1.53 --- datafusion/src/logical_plan/plan.rs | 8 +++----- datafusion/src/optimizer/filter_push_down.rs | 13 +++++++++---- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 2504dfaa6f236..4749840ed4c11 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -375,12 +375,10 @@ impl LogicalPlan { { self.using_columns.push( on.iter() - .map(|entry| { - std::iter::once(entry.0.clone()) - .chain(std::iter::once(entry.1.clone())) - }) + .map(|entry| [&entry.0, &entry.1]) .flatten() - .collect::>(), + .cloned() + .collect::>(), ); } Ok(true) diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 76d8c05bed4c6..399923e87218b 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -97,9 +97,11 @@ fn get_join_predicates<'a>( .fields() .iter() .map(|f| { - std::iter::once(f.qualified_column()) + [ + f.qualified_column(), // we need to push down filter using unqualified column as well - .chain(std::iter::once(f.unqualified_column())) + f.unqualified_column(), + ] }) .flatten() .collect::>(); @@ -107,8 +109,11 @@ fn get_join_predicates<'a>( .fields() .iter() .map(|f| { - std::iter::once(f.qualified_column()) - .chain(std::iter::once(f.unqualified_column())) + [ + f.qualified_column(), + // we need to push down filter using unqualified column as well + f.unqualified_column(), + ] }) .flatten() .collect::>(); From 002ca5d1e81dd45247b90f0f6e340ff5fec3a747 Mon Sep 17 00:00:00 2001 From: Javier Goday Date: Wed, 14 Jul 2021 22:22:17 +0200 Subject: [PATCH 261/329] Lead/lag window function with offset and default value arguments (#687) --- .../src/physical_plan/expressions/lead_lag.rs | 94 ++++++++++++++++++- datafusion/src/physical_plan/type_coercion.rs | 35 +++++-- .../src/physical_plan/window_functions.rs | 14 ++- datafusion/src/physical_plan/windows.rs | 67 ++++++++++++- .../simple_window_lead_built_in_functions.sql | 27 ++++++ integration-tests/test_psql_parity.py | 2 +- 6 files changed, 221 insertions(+), 18 deletions(-) create mode 100644 integration-tests/sqls/simple_window_lead_built_in_functions.sql diff --git a/datafusion/src/physical_plan/expressions/lead_lag.rs b/datafusion/src/physical_plan/expressions/lead_lag.rs index 352d97c1e1167..d1f6c197a1862 100644 --- a/datafusion/src/physical_plan/expressions/lead_lag.rs +++ b/datafusion/src/physical_plan/expressions/lead_lag.rs @@ -21,11 +21,13 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::window_functions::PartitionEvaluator; use crate::physical_plan::{window_functions::BuiltInWindowFunctionExpr, PhysicalExpr}; +use crate::scalar::ScalarValue; use arrow::array::ArrayRef; -use 
arrow::compute::kernels::window::shift; +use arrow::compute::cast; use arrow::datatypes::{DataType, Field}; use arrow::record_batch::RecordBatch; use std::any::Any; +use std::ops::Neg; use std::ops::Range; use std::sync::Arc; @@ -36,6 +38,7 @@ pub struct WindowShift { data_type: DataType, shift_offset: i64, expr: Arc, + default_value: Option, } /// lead() window function @@ -43,12 +46,15 @@ pub fn lead( name: String, data_type: DataType, expr: Arc, + shift_offset: Option, + default_value: Option, ) -> WindowShift { WindowShift { name, data_type, - shift_offset: -1, + shift_offset: shift_offset.map(|v| v.neg()).unwrap_or(-1), expr, + default_value, } } @@ -57,12 +63,15 @@ pub fn lag( name: String, data_type: DataType, expr: Arc, + shift_offset: Option, + default_value: Option, ) -> WindowShift { WindowShift { name, data_type, - shift_offset: 1, + shift_offset: shift_offset.unwrap_or(1), expr, + default_value, } } @@ -98,6 +107,7 @@ impl BuiltInWindowFunctionExpr for WindowShift { Ok(Box::new(WindowShiftEvaluator { shift_offset: self.shift_offset, values, + default_value: self.default_value.clone(), })) } } @@ -105,13 +115,63 @@ impl BuiltInWindowFunctionExpr for WindowShift { pub(crate) struct WindowShiftEvaluator { shift_offset: i64, values: Vec, + default_value: Option, +} + +fn create_empty_array( + value: &Option, + data_type: &DataType, + size: usize, +) -> Result { + use arrow::array::new_null_array; + let array = value + .as_ref() + .map(|scalar| scalar.to_array_of_size(size)) + .unwrap_or_else(|| new_null_array(data_type, size)); + if array.data_type() != data_type { + cast(&array, data_type).map_err(DataFusionError::ArrowError) + } else { + Ok(array) + } +} + +// TODO: change the original arrow::compute::kernels::window::shift impl to support an optional default value +fn shift_with_default_value( + array: &ArrayRef, + offset: i64, + value: &Option, +) -> Result { + use arrow::compute::concat; + + let value_len = array.len() as i64; + if offset == 0 { + Ok(arrow::array::make_array(array.data_ref().clone())) + } else if offset == i64::MIN || offset.abs() >= value_len { + create_empty_array(value, array.data_type(), array.len()) + } else { + let slice_offset = (-offset).clamp(0, value_len) as usize; + let length = array.len() - offset.abs() as usize; + let slice = array.slice(slice_offset, length); + + // Generate array with remaining `null` items + let nulls = offset.abs() as usize; + let default_values = create_empty_array(value, slice.data_type(), nulls)?; + // Concatenate both arrays, add nulls after if shift > 0 else before + if offset > 0 { + concat(&[default_values.as_ref(), slice.as_ref()]) + .map_err(DataFusionError::ArrowError) + } else { + concat(&[slice.as_ref(), default_values.as_ref()]) + .map_err(DataFusionError::ArrowError) + } + } } impl PartitionEvaluator for WindowShiftEvaluator { fn evaluate_partition(&self, partition: Range) -> Result { let value = &self.values[0]; let value = value.slice(partition.start, partition.end - partition.start); - shift(value.as_ref(), self.shift_offset).map_err(DataFusionError::ArrowError) + shift_with_default_value(&value, self.shift_offset, &self.default_value) } } @@ -142,6 +202,8 @@ mod tests { "lead".to_owned(), DataType::Float32, Arc::new(Column::new("c3", 0)), + None, + None, ), vec![ Some(-2), @@ -162,6 +224,8 @@ mod tests { "lead".to_owned(), DataType::Float32, Arc::new(Column::new("c3", 0)), + None, + None, ), vec![ None, @@ -176,6 +240,28 @@ mod tests { .iter() .collect::(), )?; + + test_i32_result( + lag( + 
"lead".to_owned(), + DataType::Int32, + Arc::new(Column::new("c3", 0)), + None, + Some(ScalarValue::Int32(Some(100))), + ), + vec![ + Some(100), + Some(1), + Some(-2), + Some(3), + Some(-4), + Some(5), + Some(-6), + Some(7), + ] + .iter() + .collect::(), + )?; Ok(()) } } diff --git a/datafusion/src/physical_plan/type_coercion.rs b/datafusion/src/physical_plan/type_coercion.rs index ffd8f20064f78..c8387bbd71e1a 100644 --- a/datafusion/src/physical_plan/type_coercion.rs +++ b/datafusion/src/physical_plan/type_coercion.rs @@ -128,13 +128,11 @@ fn get_valid_types( } vec![(0..*number).map(|i| current_types[i].clone()).collect()] } - Signature::OneOf(types) => { - let mut r = vec![]; - for s in types { - r.extend(get_valid_types(s, current_types)?); - } - r - } + Signature::OneOf(types) => types + .iter() + .filter_map(|t| get_valid_types(t, current_types).ok()) + .flatten() + .collect::>(), }; Ok(valid_types) @@ -367,4 +365,27 @@ mod tests { Ok(()) } + + #[test] + fn test_get_valid_types_one_of() -> Result<()> { + let signature = Signature::OneOf(vec![Signature::Any(1), Signature::Any(2)]); + + let invalid_types = get_valid_types( + &signature, + &[DataType::Int32, DataType::Int32, DataType::Int32], + )?; + assert_eq!(invalid_types.len(), 0); + + let args = vec![DataType::Int32, DataType::Int32]; + let valid_types = get_valid_types(&signature, &args)?; + assert_eq!(valid_types.len(), 1); + assert_eq!(valid_types[0], args); + + let args = vec![DataType::Int32]; + let valid_types = get_valid_types(&signature, &args)?; + assert_eq!(valid_types.len(), 1); + assert_eq!(valid_types[0], args); + + Ok(()) + } } diff --git a/datafusion/src/physical_plan/window_functions.rs b/datafusion/src/physical_plan/window_functions.rs index 99805b6d29414..e2b460644479c 100644 --- a/datafusion/src/physical_plan/window_functions.rs +++ b/datafusion/src/physical_plan/window_functions.rs @@ -201,10 +201,16 @@ pub(super) fn signature_for_built_in(fun: &BuiltInWindowFunction) -> Signature { | BuiltInWindowFunction::DenseRank | BuiltInWindowFunction::PercentRank | BuiltInWindowFunction::CumeDist => Signature::Any(0), - BuiltInWindowFunction::Lag - | BuiltInWindowFunction::Lead - | BuiltInWindowFunction::FirstValue - | BuiltInWindowFunction::LastValue => Signature::Any(1), + BuiltInWindowFunction::Lag | BuiltInWindowFunction::Lead => { + Signature::OneOf(vec![ + Signature::Any(1), + Signature::Any(2), + Signature::Any(3), + ]) + } + BuiltInWindowFunction::FirstValue | BuiltInWindowFunction::LastValue => { + Signature::Any(1) + } BuiltInWindowFunction::Ntile => Signature::Exact(vec![DataType::UInt64]), BuiltInWindowFunction::NthValue => Signature::Any(2), } diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs index 1b783782e164b..a1f4b7ace530b 100644 --- a/datafusion/src/physical_plan/windows.rs +++ b/datafusion/src/physical_plan/windows.rs @@ -32,6 +32,7 @@ use crate::physical_plan::{ Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, WindowExpr, }; +use crate::scalar::ScalarValue; use arrow::compute::concat; use arrow::{ array::ArrayRef, @@ -96,6 +97,19 @@ pub fn create_window_expr( }) } +fn get_scalar_value_from_args( + args: &[Arc], + index: usize, +) -> Option { + args.get(index).map(|v| { + v.as_any() + .downcast_ref::() + .unwrap() + .value() + .clone() + }) +} + fn create_built_in_window_expr( fun: &BuiltInWindowFunction, args: &[Arc], @@ -110,13 +124,21 @@ fn create_built_in_window_expr( let 
coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; let arg = coerced_args[0].clone(); let data_type = args[0].data_type(input_schema)?; - Arc::new(lag(name, data_type, arg)) + let shift_offset = get_scalar_value_from_args(&coerced_args, 1) + .map(|v| v.try_into()) + .and_then(|v| v.ok()); + let default_value = get_scalar_value_from_args(&coerced_args, 2); + Arc::new(lag(name, data_type, arg, shift_offset, default_value)) } BuiltInWindowFunction::Lead => { let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; let arg = coerced_args[0].clone(); let data_type = args[0].data_type(input_schema)?; - Arc::new(lead(name, data_type, arg)) + let shift_offset = get_scalar_value_from_args(&coerced_args, 1) + .map(|v| v.try_into()) + .and_then(|v| v.ok()); + let default_value = get_scalar_value_from_args(&coerced_args, 2); + Arc::new(lead(name, data_type, arg, shift_offset, default_value)) } BuiltInWindowFunction::NthValue => { let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; @@ -592,6 +614,47 @@ mod tests { Ok((input, schema)) } + #[test] + fn test_create_window_exp_lead_no_args() -> Result<()> { + let (_, schema) = create_test_schema(1)?; + + let expr = create_window_expr( + &WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lead), + "prev".to_owned(), + &[col("c2", &schema)?], + &[], + &[], + Some(WindowFrame::default()), + schema.as_ref(), + )?; + + assert_eq!(expr.name(), "prev"); + + Ok(()) + } + + #[test] + fn test_create_window_exp_lead_with_args() -> Result<()> { + let (_, schema) = create_test_schema(1)?; + + let expr = create_window_expr( + &WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lead), + "prev".to_owned(), + &[ + col("c2", &schema)?, + Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), + ], + &[], + &[], + Some(WindowFrame::default()), + schema.as_ref(), + )?; + + assert_eq!(expr.name(), "prev"); + + Ok(()) + } + #[tokio::test] async fn window_function() -> Result<()> { let (input, schema) = create_test_schema(1)?; diff --git a/integration-tests/sqls/simple_window_lead_built_in_functions.sql b/integration-tests/sqls/simple_window_lead_built_in_functions.sql new file mode 100644 index 0000000000000..67df05b68c1aa --- /dev/null +++ b/integration-tests/sqls/simple_window_lead_built_in_functions.sql @@ -0,0 +1,27 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
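-- A short reading of the arguments exercised below, following the lead/lag
-- changes above: the second argument is the row offset, the third is the
-- default value returned when the offset runs past the partition bounds,
-- and a negative offset to LAG reads forward, mirroring LEAD.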
+ +SELECT + c8, + LEAD(c8) OVER () next_c8, + LEAD(c8, 10, 10) OVER() next_10_c8, + LEAD(c8, 100, 10) OVER() next_out_of_bounds_c8, + LAG(c8) OVER() prev_c8, + LAG(c8, -2, 0) OVER() AS prev_2_c8, + LAG(c8, -200, 10) OVER() AS prev_out_of_bounds_c8 + +FROM test +ORDER BY c8; diff --git a/integration-tests/test_psql_parity.py b/integration-tests/test_psql_parity.py index a160d3e320ce1..a85a2c2f4b372 100644 --- a/integration-tests/test_psql_parity.py +++ b/integration-tests/test_psql_parity.py @@ -77,7 +77,7 @@ def generate_csv_from_psql(fname: str): class TestPsqlParity: def test_tests_count(self): - assert len(test_files) == 14, "tests are missed" + assert len(test_files) == 15, "tests are missed" @pytest.mark.parametrize("fname", test_files) def test_sql_file(self, fname): From e24155ee026f69798a19ad73000106cd03ba5cfa Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 15 Jul 2021 19:47:00 +0800 Subject: [PATCH 262/329] split up windows functions into a dedicated module with separate files (#724) * split up window functions * add docs --- datafusion/src/physical_plan/windows.rs | 717 ------------------ .../src/physical_plan/windows/aggregate.rs | 180 +++++ .../src/physical_plan/windows/built_in.rs | 103 +++ datafusion/src/physical_plan/windows/mod.rs | 258 +++++++ .../physical_plan/windows/window_agg_exec.rs | 260 +++++++ 5 files changed, 801 insertions(+), 717 deletions(-) delete mode 100644 datafusion/src/physical_plan/windows.rs create mode 100644 datafusion/src/physical_plan/windows/aggregate.rs create mode 100644 datafusion/src/physical_plan/windows/built_in.rs create mode 100644 datafusion/src/physical_plan/windows/mod.rs create mode 100644 datafusion/src/physical_plan/windows/window_agg_exec.rs diff --git a/datafusion/src/physical_plan/windows.rs b/datafusion/src/physical_plan/windows.rs deleted file mode 100644 index a1f4b7ace530b..0000000000000 --- a/datafusion/src/physical_plan/windows.rs +++ /dev/null @@ -1,717 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Execution plan for window functions - -use crate::error::{DataFusionError, Result}; -use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; -use crate::physical_plan::{ - aggregates, common, - expressions::{ - dense_rank, lag, lead, rank, Literal, NthValue, PhysicalSortExpr, RowNumber, - }, - type_coercion::coerce, - window_functions::{ - signature_for_built_in, BuiltInWindowFunction, BuiltInWindowFunctionExpr, - WindowFunction, - }, - Accumulator, AggregateExpr, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, - RecordBatchStream, SendableRecordBatchStream, WindowExpr, -}; -use crate::scalar::ScalarValue; -use arrow::compute::concat; -use arrow::{ - array::ArrayRef, - datatypes::{Field, Schema, SchemaRef}, - error::{ArrowError, Result as ArrowResult}, - record_batch::RecordBatch, -}; -use async_trait::async_trait; -use futures::stream::Stream; -use futures::Future; -use pin_project_lite::pin_project; -use std::any::Any; -use std::convert::TryInto; -use std::ops::Range; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -/// Window execution plan -#[derive(Debug)] -pub struct WindowAggExec { - /// Input plan - input: Arc, - /// Window function expression - window_expr: Vec>, - /// Schema after the window is run - schema: SchemaRef, - /// Schema before the window - input_schema: SchemaRef, -} - -/// Create a physical expression for window function -pub fn create_window_expr( - fun: &WindowFunction, - name: String, - args: &[Arc], - partition_by: &[Arc], - order_by: &[PhysicalSortExpr], - window_frame: Option, - input_schema: &Schema, -) -> Result> { - Ok(match fun { - WindowFunction::AggregateFunction(fun) => Arc::new(AggregateWindowExpr { - aggregate: aggregates::create_aggregate_expr( - fun, - false, - args, - input_schema, - name, - )?, - partition_by: partition_by.to_vec(), - order_by: order_by.to_vec(), - window_frame, - }), - WindowFunction::BuiltInWindowFunction(fun) => Arc::new(BuiltInWindowExpr { - fun: fun.clone(), - expr: create_built_in_window_expr(fun, args, input_schema, name)?, - partition_by: partition_by.to_vec(), - order_by: order_by.to_vec(), - window_frame, - }), - }) -} - -fn get_scalar_value_from_args( - args: &[Arc], - index: usize, -) -> Option { - args.get(index).map(|v| { - v.as_any() - .downcast_ref::() - .unwrap() - .value() - .clone() - }) -} - -fn create_built_in_window_expr( - fun: &BuiltInWindowFunction, - args: &[Arc], - input_schema: &Schema, - name: String, -) -> Result> { - Ok(match fun { - BuiltInWindowFunction::RowNumber => Arc::new(RowNumber::new(name)), - BuiltInWindowFunction::Rank => Arc::new(rank(name)), - BuiltInWindowFunction::DenseRank => Arc::new(dense_rank(name)), - BuiltInWindowFunction::Lag => { - let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; - let arg = coerced_args[0].clone(); - let data_type = args[0].data_type(input_schema)?; - let shift_offset = get_scalar_value_from_args(&coerced_args, 1) - .map(|v| v.try_into()) - .and_then(|v| v.ok()); - let default_value = get_scalar_value_from_args(&coerced_args, 2); - Arc::new(lag(name, data_type, arg, shift_offset, default_value)) - } - BuiltInWindowFunction::Lead => { - let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; - let arg = coerced_args[0].clone(); - let data_type = args[0].data_type(input_schema)?; - let shift_offset = get_scalar_value_from_args(&coerced_args, 1) - .map(|v| v.try_into()) - .and_then(|v| v.ok()); - let default_value = get_scalar_value_from_args(&coerced_args, 2); 
- Arc::new(lead(name, data_type, arg, shift_offset, default_value)) - } - BuiltInWindowFunction::NthValue => { - let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; - let arg = coerced_args[0].clone(); - let n = coerced_args[1] - .as_any() - .downcast_ref::() - .unwrap() - .value(); - let n: i64 = n - .clone() - .try_into() - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let n: u32 = n as u32; - let data_type = args[0].data_type(input_schema)?; - Arc::new(NthValue::nth_value(name, arg, data_type, n)?) - } - BuiltInWindowFunction::FirstValue => { - let arg = - coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); - let data_type = args[0].data_type(input_schema)?; - Arc::new(NthValue::first_value(name, arg, data_type)) - } - BuiltInWindowFunction::LastValue => { - let arg = - coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); - let data_type = args[0].data_type(input_schema)?; - Arc::new(NthValue::last_value(name, arg, data_type)) - } - _ => { - return Err(DataFusionError::NotImplemented(format!( - "Window function with {:?} not yet implemented", - fun - ))) - } - }) -} - -/// A window expr that takes the form of a built in window function -#[derive(Debug)] -pub struct BuiltInWindowExpr { - fun: BuiltInWindowFunction, - expr: Arc, - partition_by: Vec>, - order_by: Vec, - window_frame: Option, -} - -impl WindowExpr for BuiltInWindowExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - self.expr.name() - } - - fn field(&self) -> Result { - self.expr.field() - } - - fn expressions(&self) -> Vec> { - self.expr.expressions() - } - - fn partition_by(&self) -> &[Arc] { - &self.partition_by - } - - fn order_by(&self) -> &[PhysicalSortExpr] { - &self.order_by - } - - fn evaluate(&self, batch: &RecordBatch) -> Result { - let evaluator = self.expr.create_evaluator(batch)?; - let num_rows = batch.num_rows(); - let partition_points = - self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; - let results = if evaluator.include_rank() { - let sort_partition_points = - self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; - evaluator.evaluate_with_rank(partition_points, sort_partition_points)? - } else { - evaluator.evaluate(partition_points)? - }; - let results = results.iter().map(|i| i.as_ref()).collect::>(); - concat(&results).map_err(DataFusionError::ArrowError) - } -} - -/// Given a partition range, and the full list of sort partition points, given that the sort -/// partition points are sorted using [partition columns..., order columns...], the split -/// boundaries would align (what's sorted on [partition columns...] would definitely be sorted -/// on finer columns), so this will use binary search to find ranges that are within the -/// partition range and return the valid slice. -pub(crate) fn find_ranges_in_range<'a>( - partition_range: &Range, - sort_partition_points: &'a [Range], -) -> &'a [Range] { - let start_idx = sort_partition_points - .partition_point(|sort_range| sort_range.start < partition_range.start); - let end_idx = start_idx - + sort_partition_points[start_idx..] 
- .partition_point(|sort_range| sort_range.end <= partition_range.end); - &sort_partition_points[start_idx..end_idx] -} - -/// A window expr that takes the form of an aggregate function -#[derive(Debug)] -pub struct AggregateWindowExpr { - aggregate: Arc, - partition_by: Vec>, - order_by: Vec, - window_frame: Option, -} - -impl AggregateWindowExpr { - /// the aggregate window function operates based on window frame, and by default the mode is - /// "range". - fn evaluation_mode(&self) -> WindowFrameUnits { - self.window_frame.unwrap_or_default().units - } - - /// create a new accumulator based on the underlying aggregation function - fn create_accumulator(&self) -> Result { - let accumulator = self.aggregate.create_accumulator()?; - Ok(AggregateWindowAccumulator { accumulator }) - } - - /// peer based evaluation based on the fact that batch is pre-sorted given the sort columns - /// and then per partition point we'll evaluate the peer group (e.g. SUM or MAX gives the same - /// results for peers) and concatenate the results. - fn peer_based_evaluate(&self, batch: &RecordBatch) -> Result { - let num_rows = batch.num_rows(); - let partition_points = - self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; - let sort_partition_points = - self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; - let values = self.evaluate_args(batch)?; - let results = partition_points - .iter() - .map(|partition_range| { - let sort_partition_points = - find_ranges_in_range(partition_range, &sort_partition_points); - let mut window_accumulators = self.create_accumulator()?; - sort_partition_points - .iter() - .map(|range| window_accumulators.scan_peers(&values, range)) - .collect::>>() - }) - .collect::>>>()? - .into_iter() - .flatten() - .collect::>(); - let results = results.iter().map(|i| i.as_ref()).collect::>(); - concat(&results).map_err(DataFusionError::ArrowError) - } - - fn group_based_evaluate(&self, _batch: &RecordBatch) -> Result { - Err(DataFusionError::NotImplemented(format!( - "Group based evaluation for {} is not yet implemented", - self.name() - ))) - } - - fn row_based_evaluate(&self, _batch: &RecordBatch) -> Result { - Err(DataFusionError::NotImplemented(format!( - "Row based evaluation for {} is not yet implemented", - self.name() - ))) - } -} - -impl WindowExpr for AggregateWindowExpr { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn name(&self) -> &str { - self.aggregate.name() - } - - fn field(&self) -> Result { - self.aggregate.field() - } - - fn expressions(&self) -> Vec> { - self.aggregate.expressions() - } - - fn partition_by(&self) -> &[Arc] { - &self.partition_by - } - - fn order_by(&self) -> &[PhysicalSortExpr] { - &self.order_by - } - - /// evaluate the window function values against the batch - fn evaluate(&self, batch: &RecordBatch) -> Result { - match self.evaluation_mode() { - WindowFrameUnits::Range => self.peer_based_evaluate(batch), - WindowFrameUnits::Rows => self.row_based_evaluate(batch), - WindowFrameUnits::Groups => self.group_based_evaluate(batch), - } - } -} - -/// Aggregate window accumulator utilizes the accumulator from aggregation and do a accumulative sum -/// across evaluation arguments based on peer equivalences. 
-#[derive(Debug)] -struct AggregateWindowAccumulator { - accumulator: Box, -} - -impl AggregateWindowAccumulator { - /// scan one peer group of values (as arguments to window function) given by the value_range - /// and return evaluation result that are of the same number of rows. - fn scan_peers( - &mut self, - values: &[ArrayRef], - value_range: &Range, - ) -> Result { - if value_range.is_empty() { - return Err(DataFusionError::Internal( - "Value range cannot be empty".to_owned(), - )); - } - let len = value_range.end - value_range.start; - let values = values - .iter() - .map(|v| v.slice(value_range.start, len)) - .collect::>(); - self.accumulator.update_batch(&values)?; - let value = self.accumulator.evaluate()?; - Ok(value.to_array_of_size(len)) - } -} - -fn create_schema( - input_schema: &Schema, - window_expr: &[Arc], -) -> Result { - let mut fields = Vec::with_capacity(input_schema.fields().len() + window_expr.len()); - for expr in window_expr { - fields.push(expr.field()?); - } - fields.extend_from_slice(input_schema.fields()); - Ok(Schema::new(fields)) -} - -impl WindowAggExec { - /// Create a new execution plan for window aggregates - pub fn try_new( - window_expr: Vec>, - input: Arc, - input_schema: SchemaRef, - ) -> Result { - let schema = create_schema(&input_schema, &window_expr)?; - let schema = Arc::new(schema); - Ok(WindowAggExec { - input, - window_expr, - schema, - input_schema, - }) - } - - /// Window expressions - pub fn window_expr(&self) -> &[Arc] { - &self.window_expr - } - - /// Input plan - pub fn input(&self) -> &Arc { - &self.input - } - - /// Get the input schema before any window functions are applied - pub fn input_schema(&self) -> SchemaRef { - self.input_schema.clone() - } -} - -#[async_trait] -impl ExecutionPlan for WindowAggExec { - /// Return a reference to Any that can be used for downcasting - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn children(&self) -> Vec> { - vec![self.input.clone()] - } - - /// Get the output partitioning of this plan - fn output_partitioning(&self) -> Partitioning { - // because we can have repartitioning using the partition keys - // this would be either 1 or more than 1 depending on the presense of - // repartitioning - self.input.output_partitioning() - } - - fn required_child_distribution(&self) -> Distribution { - if self - .window_expr() - .iter() - .all(|expr| expr.partition_by().is_empty()) - { - Distribution::SinglePartition - } else { - Distribution::UnspecifiedDistribution - } - } - - fn with_new_children( - &self, - children: Vec>, - ) -> Result> { - match children.len() { - 1 => Ok(Arc::new(WindowAggExec::try_new( - self.window_expr.clone(), - children[0].clone(), - self.input_schema.clone(), - )?)), - _ => Err(DataFusionError::Internal( - "WindowAggExec wrong number of children".to_owned(), - )), - } - } - - async fn execute(&self, partition: usize) -> Result { - let input = self.input.execute(partition).await?; - let stream = Box::pin(WindowAggStream::new( - self.schema.clone(), - self.window_expr.clone(), - input, - )); - Ok(stream) - } -} - -pin_project! 
{ - /// stream for window aggregation plan - pub struct WindowAggStream { - schema: SchemaRef, - #[pin] - output: futures::channel::oneshot::Receiver>, - finished: bool, - } -} - -/// Compute the window aggregate columns -fn compute_window_aggregates( - window_expr: Vec>, - batch: &RecordBatch, -) -> Result> { - window_expr - .iter() - .map(|window_expr| window_expr.evaluate(batch)) - .collect() -} - -impl WindowAggStream { - /// Create a new WindowAggStream - pub fn new( - schema: SchemaRef, - window_expr: Vec>, - input: SendableRecordBatchStream, - ) -> Self { - let (tx, rx) = futures::channel::oneshot::channel(); - let schema_clone = schema.clone(); - tokio::spawn(async move { - let schema = schema_clone.clone(); - let result = WindowAggStream::process(input, window_expr, schema).await; - tx.send(result) - }); - - Self { - output: rx, - finished: false, - schema, - } - } - - async fn process( - input: SendableRecordBatchStream, - window_expr: Vec>, - schema: SchemaRef, - ) -> ArrowResult { - let input_schema = input.schema(); - let batches = common::collect(input) - .await - .map_err(DataFusionError::into_arrow_external_error)?; - let batch = common::combine_batches(&batches, input_schema.clone())?; - if let Some(batch) = batch { - // calculate window cols - let mut columns = compute_window_aggregates(window_expr, &batch) - .map_err(DataFusionError::into_arrow_external_error)?; - // combine with the original cols - // note the setup of window aggregates is that they newly calculated window - // expressions are always prepended to the columns - columns.extend_from_slice(batch.columns()); - RecordBatch::try_new(schema, columns) - } else { - Ok(RecordBatch::new_empty(schema)) - } - } -} - -impl Stream for WindowAggStream { - type Item = ArrowResult; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - if self.finished { - return Poll::Ready(None); - } - - // is the output ready? 
- let this = self.project(); - let output_poll = this.output.poll(cx); - - match output_poll { - Poll::Ready(result) => { - *this.finished = true; - // check for error in receiving channel and unwrap actual result - let result = match result { - Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))), // error receiving - Ok(result) => Some(result), - }; - Poll::Ready(result) - } - Poll::Pending => Poll::Pending, - } - } -} - -impl RecordBatchStream for WindowAggStream { - /// Get the schema - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::physical_plan::aggregates::AggregateFunction; - use crate::physical_plan::collect; - use crate::physical_plan::csv::{CsvExec, CsvReadOptions}; - use crate::physical_plan::expressions::col; - use crate::test; - use arrow::array::*; - - fn create_test_schema(partitions: usize) -> Result<(Arc, SchemaRef)> { - let schema = test::aggr_test_schema(); - let path = test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; - let csv = CsvExec::try_new( - &path, - CsvReadOptions::new().schema(&schema), - None, - 1024, - None, - )?; - - let input = Arc::new(csv); - Ok((input, schema)) - } - - #[test] - fn test_create_window_exp_lead_no_args() -> Result<()> { - let (_, schema) = create_test_schema(1)?; - - let expr = create_window_expr( - &WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lead), - "prev".to_owned(), - &[col("c2", &schema)?], - &[], - &[], - Some(WindowFrame::default()), - schema.as_ref(), - )?; - - assert_eq!(expr.name(), "prev"); - - Ok(()) - } - - #[test] - fn test_create_window_exp_lead_with_args() -> Result<()> { - let (_, schema) = create_test_schema(1)?; - - let expr = create_window_expr( - &WindowFunction::BuiltInWindowFunction(BuiltInWindowFunction::Lead), - "prev".to_owned(), - &[ - col("c2", &schema)?, - Arc::new(Literal::new(ScalarValue::Int64(Some(1)))), - ], - &[], - &[], - Some(WindowFrame::default()), - schema.as_ref(), - )?; - - assert_eq!(expr.name(), "prev"); - - Ok(()) - } - - #[tokio::test] - async fn window_function() -> Result<()> { - let (input, schema) = create_test_schema(1)?; - - let window_exec = Arc::new(WindowAggExec::try_new( - vec![ - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Count), - "count".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Some(WindowFrame::default()), - schema.as_ref(), - )?, - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Max), - "max".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Some(WindowFrame::default()), - schema.as_ref(), - )?, - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Min), - "min".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Some(WindowFrame::default()), - schema.as_ref(), - )?, - ], - input, - schema.clone(), - )?); - - let result: Vec = collect(window_exec).await?; - assert_eq!(result.len(), 1); - - let columns = result[0].columns(); - - // c3 is small int - - let count: &UInt64Array = as_primitive_array(&columns[0]); - assert_eq!(count.value(0), 100); - assert_eq!(count.value(99), 100); - - let max: &Int8Array = as_primitive_array(&columns[1]); - assert_eq!(max.value(0), 125); - assert_eq!(max.value(99), 125); - - let min: &Int8Array = as_primitive_array(&columns[2]); - assert_eq!(min.value(0), -117); - assert_eq!(min.value(99), -117); - - Ok(()) - } -} diff --git a/datafusion/src/physical_plan/windows/aggregate.rs b/datafusion/src/physical_plan/windows/aggregate.rs new 
file mode 100644 index 0000000000000..f7c29ba6aff72 --- /dev/null +++ b/datafusion/src/physical_plan/windows/aggregate.rs @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Physical exec for aggregate window function expressions. + +use crate::error::{DataFusionError, Result}; +use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; +use crate::physical_plan::windows::find_ranges_in_range; +use crate::physical_plan::{ + expressions::PhysicalSortExpr, Accumulator, AggregateExpr, PhysicalExpr, WindowExpr, +}; +use arrow::compute::concat; +use arrow::record_batch::RecordBatch; +use arrow::{array::ArrayRef, datatypes::Field}; +use std::any::Any; +use std::iter::IntoIterator; +use std::ops::Range; +use std::sync::Arc; + +/// A window expr that takes the form of an aggregate function +#[derive(Debug)] +pub struct AggregateWindowExpr { + aggregate: Arc, + partition_by: Vec>, + order_by: Vec, + window_frame: Option, +} + +impl AggregateWindowExpr { + /// create a new aggregate window function expression + pub(super) fn new( + aggregate: Arc, + partition_by: &[Arc], + order_by: &[PhysicalSortExpr], + window_frame: Option, + ) -> Self { + Self { + aggregate, + partition_by: partition_by.to_vec(), + order_by: order_by.to_vec(), + window_frame, + } + } + + /// the aggregate window function operates based on window frame, and by default the mode is + /// "range". + fn evaluation_mode(&self) -> WindowFrameUnits { + self.window_frame.unwrap_or_default().units + } + + /// create a new accumulator based on the underlying aggregation function + fn create_accumulator(&self) -> Result { + let accumulator = self.aggregate.create_accumulator()?; + Ok(AggregateWindowAccumulator { accumulator }) + } + + /// peer based evaluation based on the fact that batch is pre-sorted given the sort columns + /// and then per partition point we'll evaluate the peer group (e.g. SUM or MAX gives the same + /// results for peers) and concatenate the results. + fn peer_based_evaluate(&self, batch: &RecordBatch) -> Result { + let num_rows = batch.num_rows(); + let partition_points = + self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; + let sort_partition_points = + self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; + let values = self.evaluate_args(batch)?; + let results = partition_points + .iter() + .map(|partition_range| { + let sort_partition_points = + find_ranges_in_range(partition_range, &sort_partition_points); + let mut window_accumulators = self.create_accumulator()?; + sort_partition_points + .iter() + .map(|range| window_accumulators.scan_peers(&values, range)) + .collect::>>() + }) + .collect::>>>()? 
+ .into_iter() + .flatten() + .collect::>(); + let results = results.iter().map(|i| i.as_ref()).collect::>(); + concat(&results).map_err(DataFusionError::ArrowError) + } + + fn group_based_evaluate(&self, _batch: &RecordBatch) -> Result { + Err(DataFusionError::NotImplemented(format!( + "Group based evaluation for {} is not yet implemented", + self.name() + ))) + } + + fn row_based_evaluate(&self, _batch: &RecordBatch) -> Result { + Err(DataFusionError::NotImplemented(format!( + "Row based evaluation for {} is not yet implemented", + self.name() + ))) + } +} + +impl WindowExpr for AggregateWindowExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + self.aggregate.name() + } + + fn field(&self) -> Result { + self.aggregate.field() + } + + fn expressions(&self) -> Vec> { + self.aggregate.expressions() + } + + fn partition_by(&self) -> &[Arc] { + &self.partition_by + } + + fn order_by(&self) -> &[PhysicalSortExpr] { + &self.order_by + } + + /// evaluate the window function values against the batch + fn evaluate(&self, batch: &RecordBatch) -> Result { + match self.evaluation_mode() { + WindowFrameUnits::Range => self.peer_based_evaluate(batch), + WindowFrameUnits::Rows => self.row_based_evaluate(batch), + WindowFrameUnits::Groups => self.group_based_evaluate(batch), + } + } +} + +/// Aggregate window accumulator utilizes the accumulator from aggregation and do a accumulative sum +/// across evaluation arguments based on peer equivalences. +#[derive(Debug)] +struct AggregateWindowAccumulator { + accumulator: Box, +} + +impl AggregateWindowAccumulator { + /// scan one peer group of values (as arguments to window function) given by the value_range + /// and return evaluation result that are of the same number of rows. + fn scan_peers( + &mut self, + values: &[ArrayRef], + value_range: &Range, + ) -> Result { + if value_range.is_empty() { + return Err(DataFusionError::Internal( + "Value range cannot be empty".to_owned(), + )); + } + let len = value_range.end - value_range.start; + let values = values + .iter() + .map(|v| v.slice(value_range.start, len)) + .collect::>(); + self.accumulator.update_batch(&values)?; + let value = self.accumulator.evaluate()?; + Ok(value.to_array_of_size(len)) + } +} diff --git a/datafusion/src/physical_plan/windows/built_in.rs b/datafusion/src/physical_plan/windows/built_in.rs new file mode 100644 index 0000000000000..82040de6ef5c3 --- /dev/null +++ b/datafusion/src/physical_plan/windows/built_in.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Physical exec for built-in window function expressions. 
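// A minimal sketch of the `Range<usize>` "partition points" that both the
// aggregate and built-in window exprs consume; the real computation lives on
// the `WindowExpr` trait, so this helper is hypothetical and only illustrates
// the shape of the ranges.
fn partition_points<T: PartialEq>(values: &[T]) -> Vec<std::ops::Range<usize>> {
    let mut ranges = Vec::new();
    let mut start = 0;
    for i in 1..=values.len() {
        // close the current range at the end of the input or when the value changes
        if i == values.len() || values[i] != values[start] {
            ranges.push(start..i);
            start = i;
        }
    }
    ranges
}
// partition_points(&["a", "a", "b", "b", "b"]) returns [0..2, 2..5]. For a RANGE
// frame the aggregate accumulator above then scans each peer range in turn, e.g.
// SUM over values [1, 1, 2] with peer ranges [0..2, 2..3] yields [2, 2, 3]:
// every row receives the running total as of the end of its peer group.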
+ +use crate::error::{DataFusionError, Result}; +use crate::logical_plan::window_frames::WindowFrame; +use crate::physical_plan::{ + expressions::PhysicalSortExpr, + window_functions::{BuiltInWindowFunction, BuiltInWindowFunctionExpr}, + PhysicalExpr, WindowExpr, +}; +use arrow::compute::concat; +use arrow::record_batch::RecordBatch; +use arrow::{array::ArrayRef, datatypes::Field}; +use std::any::Any; +use std::sync::Arc; + +/// A window expr that takes the form of a built in window function +#[derive(Debug)] +pub struct BuiltInWindowExpr { + fun: BuiltInWindowFunction, + expr: Arc, + partition_by: Vec>, + order_by: Vec, + window_frame: Option, +} + +impl BuiltInWindowExpr { + /// create a new built-in window function expression + pub(super) fn new( + fun: BuiltInWindowFunction, + expr: Arc, + partition_by: &[Arc], + order_by: &[PhysicalSortExpr], + window_frame: Option, + ) -> Self { + Self { + fun, + expr, + partition_by: partition_by.to_vec(), + order_by: order_by.to_vec(), + window_frame, + } + } +} + +impl WindowExpr for BuiltInWindowExpr { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + self.expr.name() + } + + fn field(&self) -> Result { + self.expr.field() + } + + fn expressions(&self) -> Vec> { + self.expr.expressions() + } + + fn partition_by(&self) -> &[Arc] { + &self.partition_by + } + + fn order_by(&self) -> &[PhysicalSortExpr] { + &self.order_by + } + + fn evaluate(&self, batch: &RecordBatch) -> Result { + let evaluator = self.expr.create_evaluator(batch)?; + let num_rows = batch.num_rows(); + let partition_points = + self.evaluate_partition_points(num_rows, &self.partition_columns(batch)?)?; + let results = if evaluator.include_rank() { + let sort_partition_points = + self.evaluate_partition_points(num_rows, &self.sort_columns(batch)?)?; + evaluator.evaluate_with_rank(partition_points, sort_partition_points)? + } else { + evaluator.evaluate(partition_points)? + }; + let results = results.iter().map(|i| i.as_ref()).collect::>(); + concat(&results).map_err(DataFusionError::ArrowError) + } +} diff --git a/datafusion/src/physical_plan/windows/mod.rs b/datafusion/src/physical_plan/windows/mod.rs new file mode 100644 index 0000000000000..194aa8de5bb5c --- /dev/null +++ b/datafusion/src/physical_plan/windows/mod.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Physical expressions for window functions + +use crate::error::{DataFusionError, Result}; +use crate::logical_plan::window_frames::WindowFrame; +use crate::physical_plan::{ + aggregates, + expressions::{ + dense_rank, lag, lead, rank, Literal, NthValue, PhysicalSortExpr, RowNumber, + }, + type_coercion::coerce, + window_functions::{ + signature_for_built_in, BuiltInWindowFunction, BuiltInWindowFunctionExpr, + WindowFunction, + }, + PhysicalExpr, WindowExpr, +}; +use crate::scalar::ScalarValue; +use arrow::datatypes::Schema; +use std::convert::TryInto; +use std::ops::Range; +use std::sync::Arc; + +mod aggregate; +mod built_in; +mod window_agg_exec; + +pub use aggregate::AggregateWindowExpr; +pub use built_in::BuiltInWindowExpr; +pub use window_agg_exec::WindowAggExec; + +/// Create a physical expression for window function +pub fn create_window_expr( + fun: &WindowFunction, + name: String, + args: &[Arc], + partition_by: &[Arc], + order_by: &[PhysicalSortExpr], + window_frame: Option, + input_schema: &Schema, +) -> Result> { + Ok(match fun { + WindowFunction::AggregateFunction(fun) => Arc::new(AggregateWindowExpr::new( + aggregates::create_aggregate_expr(fun, false, args, input_schema, name)?, + partition_by, + order_by, + window_frame, + )), + WindowFunction::BuiltInWindowFunction(fun) => Arc::new(BuiltInWindowExpr::new( + fun.clone(), + create_built_in_window_expr(fun, args, input_schema, name)?, + partition_by, + order_by, + window_frame, + )), + }) +} + +fn get_scalar_value_from_args( + args: &[Arc], + index: usize, +) -> Option { + args.get(index).map(|v| { + v.as_any() + .downcast_ref::() + .unwrap() + .value() + .clone() + }) +} + +fn create_built_in_window_expr( + fun: &BuiltInWindowFunction, + args: &[Arc], + input_schema: &Schema, + name: String, +) -> Result> { + Ok(match fun { + BuiltInWindowFunction::RowNumber => Arc::new(RowNumber::new(name)), + BuiltInWindowFunction::Rank => Arc::new(rank(name)), + BuiltInWindowFunction::DenseRank => Arc::new(dense_rank(name)), + BuiltInWindowFunction::Lag => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = coerced_args[0].clone(); + let data_type = args[0].data_type(input_schema)?; + let shift_offset = get_scalar_value_from_args(&coerced_args, 1) + .map(|v| v.try_into()) + .and_then(|v| v.ok()); + let default_value = get_scalar_value_from_args(&coerced_args, 2); + Arc::new(lag(name, data_type, arg, shift_offset, default_value)) + } + BuiltInWindowFunction::Lead => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = coerced_args[0].clone(); + let data_type = args[0].data_type(input_schema)?; + let shift_offset = get_scalar_value_from_args(&coerced_args, 1) + .map(|v| v.try_into()) + .and_then(|v| v.ok()); + let default_value = get_scalar_value_from_args(&coerced_args, 2); + Arc::new(lead(name, data_type, arg, shift_offset, default_value)) + } + BuiltInWindowFunction::NthValue => { + let coerced_args = coerce(args, input_schema, &signature_for_built_in(fun))?; + let arg = coerced_args[0].clone(); + let n = coerced_args[1] + .as_any() + .downcast_ref::() + .unwrap() + .value(); + let n: i64 = n + .clone() + .try_into() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + let n: u32 = n as u32; + let data_type = args[0].data_type(input_schema)?; + Arc::new(NthValue::nth_value(name, arg, data_type, n)?) 
+ } + BuiltInWindowFunction::FirstValue => { + let arg = + coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Arc::new(NthValue::first_value(name, arg, data_type)) + } + BuiltInWindowFunction::LastValue => { + let arg = + coerce(args, input_schema, &signature_for_built_in(fun))?[0].clone(); + let data_type = args[0].data_type(input_schema)?; + Arc::new(NthValue::last_value(name, arg, data_type)) + } + _ => { + return Err(DataFusionError::NotImplemented(format!( + "Window function with {:?} not yet implemented", + fun + ))) + } + }) +} + +/// Given a partition range, and the full list of sort partition points, given that the sort +/// partition points are sorted using [partition columns..., order columns...], the split +/// boundaries would align (what's sorted on [partition columns...] would definitely be sorted +/// on finer columns), so this will use binary search to find ranges that are within the +/// partition range and return the valid slice. +pub(crate) fn find_ranges_in_range<'a>( + partition_range: &Range, + sort_partition_points: &'a [Range], +) -> &'a [Range] { + let start_idx = sort_partition_points + .partition_point(|sort_range| sort_range.start < partition_range.start); + let end_idx = start_idx + + sort_partition_points[start_idx..] + .partition_point(|sort_range| sort_range.end <= partition_range.end); + &sort_partition_points[start_idx..end_idx] +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::physical_plan::aggregates::AggregateFunction; + use crate::physical_plan::collect; + use crate::physical_plan::csv::{CsvExec, CsvReadOptions}; + use crate::physical_plan::expressions::col; + use crate::test; + use arrow::array::*; + use arrow::datatypes::SchemaRef; + use arrow::record_batch::RecordBatch; + + fn create_test_schema(partitions: usize) -> Result<(Arc, SchemaRef)> { + let schema = test::aggr_test_schema(); + let path = test::create_partitioned_csv("aggregate_test_100.csv", partitions)?; + let csv = CsvExec::try_new( + &path, + CsvReadOptions::new().schema(&schema), + None, + 1024, + None, + )?; + + let input = Arc::new(csv); + Ok((input, schema)) + } + + #[tokio::test] + async fn window_function() -> Result<()> { + let (input, schema) = create_test_schema(1)?; + + let window_exec = Arc::new(WindowAggExec::try_new( + vec![ + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Count), + "count".to_owned(), + &[col("c3", &schema)?], + &[], + &[], + Some(WindowFrame::default()), + schema.as_ref(), + )?, + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Max), + "max".to_owned(), + &[col("c3", &schema)?], + &[], + &[], + Some(WindowFrame::default()), + schema.as_ref(), + )?, + create_window_expr( + &WindowFunction::AggregateFunction(AggregateFunction::Min), + "min".to_owned(), + &[col("c3", &schema)?], + &[], + &[], + Some(WindowFrame::default()), + schema.as_ref(), + )?, + ], + input, + schema.clone(), + )?); + + let result: Vec = collect(window_exec).await?; + assert_eq!(result.len(), 1); + + let columns = result[0].columns(); + + // c3 is small int + + let count: &UInt64Array = as_primitive_array(&columns[0]); + assert_eq!(count.value(0), 100); + assert_eq!(count.value(99), 100); + + let max: &Int8Array = as_primitive_array(&columns[1]); + assert_eq!(max.value(0), 125); + assert_eq!(max.value(99), 125); + + let min: &Int8Array = as_primitive_array(&columns[2]); + assert_eq!(min.value(0), -117); + assert_eq!(min.value(99), -117); + + Ok(()) 
+ } +} diff --git a/datafusion/src/physical_plan/windows/window_agg_exec.rs b/datafusion/src/physical_plan/windows/window_agg_exec.rs new file mode 100644 index 0000000000000..2ff1f34ce4c7f --- /dev/null +++ b/datafusion/src/physical_plan/windows/window_agg_exec.rs @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Stream and channel implementations for window function expressions. + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{ + common, Distribution, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, WindowExpr, +}; +use arrow::{ + array::ArrayRef, + datatypes::{Schema, SchemaRef}, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use futures::stream::Stream; +use futures::Future; +use pin_project_lite::pin_project; +use std::any::Any; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +/// Window execution plan +#[derive(Debug)] +pub struct WindowAggExec { + /// Input plan + input: Arc, + /// Window function expression + window_expr: Vec>, + /// Schema after the window is run + schema: SchemaRef, + /// Schema before the window + input_schema: SchemaRef, +} + +impl WindowAggExec { + /// Create a new execution plan for window aggregates + pub fn try_new( + window_expr: Vec>, + input: Arc, + input_schema: SchemaRef, + ) -> Result { + let schema = create_schema(&input_schema, &window_expr)?; + let schema = Arc::new(schema); + Ok(WindowAggExec { + input, + window_expr, + schema, + input_schema, + }) + } + + /// Window expressions + pub fn window_expr(&self) -> &[Arc] { + &self.window_expr + } + + /// Input plan + pub fn input(&self) -> &Arc { + &self.input + } + + /// Get the input schema before any window functions are applied + pub fn input_schema(&self) -> SchemaRef { + self.input_schema.clone() + } +} + +#[async_trait] +impl ExecutionPlan for WindowAggExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + // because we can have repartitioning using the partition keys + // this would be either 1 or more than 1 depending on the presense of + // repartitioning + self.input.output_partitioning() + } + + fn required_child_distribution(&self) -> Distribution { + if self + .window_expr() + .iter() + .all(|expr| expr.partition_by().is_empty()) + { + Distribution::SinglePartition + } else { + Distribution::UnspecifiedDistribution + } + } + + fn with_new_children( + &self, + children: Vec>, + ) -> Result> { 
+ match children.len() { + 1 => Ok(Arc::new(WindowAggExec::try_new( + self.window_expr.clone(), + children[0].clone(), + self.input_schema.clone(), + )?)), + _ => Err(DataFusionError::Internal( + "WindowAggExec wrong number of children".to_owned(), + )), + } + } + + async fn execute(&self, partition: usize) -> Result { + let input = self.input.execute(partition).await?; + let stream = Box::pin(WindowAggStream::new( + self.schema.clone(), + self.window_expr.clone(), + input, + )); + Ok(stream) + } +} + +fn create_schema( + input_schema: &Schema, + window_expr: &[Arc], +) -> Result { + let mut fields = Vec::with_capacity(input_schema.fields().len() + window_expr.len()); + for expr in window_expr { + fields.push(expr.field()?); + } + fields.extend_from_slice(input_schema.fields()); + Ok(Schema::new(fields)) +} + +/// Compute the window aggregate columns +fn compute_window_aggregates( + window_expr: Vec>, + batch: &RecordBatch, +) -> Result> { + window_expr + .iter() + .map(|window_expr| window_expr.evaluate(batch)) + .collect() +} + +pin_project! { + /// stream for window aggregation plan + pub struct WindowAggStream { + schema: SchemaRef, + #[pin] + output: futures::channel::oneshot::Receiver>, + finished: bool, + } +} + +impl WindowAggStream { + /// Create a new WindowAggStream + pub fn new( + schema: SchemaRef, + window_expr: Vec>, + input: SendableRecordBatchStream, + ) -> Self { + let (tx, rx) = futures::channel::oneshot::channel(); + let schema_clone = schema.clone(); + tokio::spawn(async move { + let schema = schema_clone.clone(); + let result = WindowAggStream::process(input, window_expr, schema).await; + tx.send(result) + }); + + Self { + output: rx, + finished: false, + schema, + } + } + + async fn process( + input: SendableRecordBatchStream, + window_expr: Vec>, + schema: SchemaRef, + ) -> ArrowResult { + let input_schema = input.schema(); + let batches = common::collect(input) + .await + .map_err(DataFusionError::into_arrow_external_error)?; + let batch = common::combine_batches(&batches, input_schema.clone())?; + if let Some(batch) = batch { + // calculate window cols + let mut columns = compute_window_aggregates(window_expr, &batch) + .map_err(DataFusionError::into_arrow_external_error)?; + // combine with the original cols + // note the setup of window aggregates is that they newly calculated window + // expressions are always prepended to the columns + columns.extend_from_slice(batch.columns()); + RecordBatch::try_new(schema, columns) + } else { + Ok(RecordBatch::new_empty(schema)) + } + } +} + +impl Stream for WindowAggStream { + type Item = ArrowResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.finished { + return Poll::Ready(None); + } + + // is the output ready? 
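+ // The window computation was spawned onto a Tokio task in `WindowAggStream::new`
+ // and delivers its single result through the oneshot channel polled here; once
+ // it resolves, `finished` is set so the next poll returns `None`.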
+ let this = self.project(); + let output_poll = this.output.poll(cx); + + match output_poll { + Poll::Ready(result) => { + *this.finished = true; + // check for error in receiving channel and unwrap actual result + let result = match result { + Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))), // error receiving + Ok(result) => Some(result), + }; + Poll::Ready(result) + } + Poll::Pending => Poll::Pending, + } + } +} + +impl RecordBatchStream for WindowAggStream { + /// Get the schema + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} From 1dd1bd5c949d429244783c34320d952b90a2e266 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 15 Jul 2021 19:47:41 +0800 Subject: [PATCH 263/329] provide more details on required .parquet file extension error message (#729) * provide more details on required .parquet file extension * Update datafusion/src/physical_plan/parquet.rs --- datafusion/src/physical_plan/parquet.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index 63e11d5106bac..f606b5315281e 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -135,7 +135,7 @@ impl ParquetExec { let filenames = common::build_file_list(path, ".parquet")?; if filenames.is_empty() { Err(DataFusionError::Plan(format!( - "No Parquet files found at path {}", + "No Parquet files (with .parquet extension) found at path {}", path ))) } else { From 1dd5afeee5abb60b0bcfaccdbb8fd2625a58cf5e Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Fri, 16 Jul 2021 19:26:32 +0800 Subject: [PATCH 264/329] implement FromStr for FileType (#728) --- datafusion/src/sql/parser.rs | 40 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/datafusion/src/sql/parser.rs b/datafusion/src/sql/parser.rs index 3637e882d2fd2..bb2f9e6bbb24e 100644 --- a/datafusion/src/sql/parser.rs +++ b/datafusion/src/sql/parser.rs @@ -25,6 +25,7 @@ use sqlparser::{ parser::{Parser, ParserError}, tokenizer::{Token, Tokenizer}, }; +use std::str::FromStr; // Use `Parser::expected` instead, if possible macro_rules! 
parser_err { @@ -44,6 +45,22 @@ pub enum FileType { CSV, } +impl FromStr for FileType { + type Err = ParserError; + + fn from_str(s: &str) -> Result { + match s.to_uppercase().as_str() { + "PARQUET" => Ok(Self::Parquet), + "NDJSON" => Ok(Self::NdJson), + "CSV" => Ok(Self::CSV), + other => Err(ParserError::ParserError(format!( + "expect one of PARQUET, NDJSON, or CSV, found: {}", + other + ))), + } + } +} + /// DataFusion extension DDL for `CREATE EXTERNAL TABLE` #[derive(Debug, Clone, PartialEq)] pub struct CreateExternalTable { @@ -268,12 +285,7 @@ impl<'a> DFParser<'a> { /// Parses the set of valid formats fn parse_file_format(&mut self) -> Result { match self.parser.next_token() { - Token::Word(w) => match &*w.value { - "PARQUET" => Ok(FileType::Parquet), - "NDJSON" => Ok(FileType::NdJson), - "CSV" => Ok(FileType::CSV), - _ => self.expected("one of PARQUET, NDJSON, or CSV", Token::Word(w)), - }, + Token::Word(w) => w.value.parse(), unexpected => self.expected("one of PARQUET, NDJSON, or CSV", unexpected), } } @@ -367,13 +379,21 @@ mod tests { }); expect_parse_ok(sql, expected)?; + // positive case: it is ok for parquet files to be other than upper case + let sql = "CREATE EXTERNAL TABLE t STORED AS parqueT LOCATION 'foo.parquet'"; + let expected = Statement::CreateExternalTable(CreateExternalTable { + name: "t".into(), + columns: vec![], + file_type: FileType::Parquet, + has_header: false, + location: "foo.parquet".into(), + }); + expect_parse_ok(sql, expected)?; + // Error cases: Invalid type let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS UNKNOWN_TYPE LOCATION 'foo.csv'"; - expect_parse_error( - sql, - "Expected one of PARQUET, NDJSON, or CSV, found: UNKNOWN_TYPE", - ); + expect_parse_error(sql, "expect one of PARQUET, NDJSON, or CSV"); Ok(()) } From a1c794cec233f7fe34f34c7d64f529625a507669 Mon Sep 17 00:00:00 2001 From: Cui Wenzheng Date: Fri, 16 Jul 2021 19:27:03 +0800 Subject: [PATCH 265/329] fix return type conflict when calling builtin math fuctions (#716) --- datafusion/src/execution/context.rs | 76 ++++++++++++++++++- datafusion/src/physical_plan/functions.rs | 25 ++++-- .../src/physical_plan/math_expressions.rs | 2 +- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index d2dcec5f47d73..d4d3a8a14ac69 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -916,9 +916,10 @@ mod tests { physical_plan::expressions::AvgAccumulator, }; use arrow::array::{ - Array, ArrayRef, BinaryArray, DictionaryArray, Float64Array, Int32Array, - Int64Array, LargeBinaryArray, LargeStringArray, StringArray, - TimestampNanosecondArray, + Array, ArrayRef, BinaryArray, DictionaryArray, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, + LargeStringArray, StringArray, TimestampNanosecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use arrow::compute::add; use arrow::datatypes::*; @@ -2364,6 +2365,75 @@ mod tests { assert_batches_sorted_eq!(expected, &results); } + #[tokio::test] + async fn case_builtin_math_expression() { + let mut ctx = ExecutionContext::new(); + + let type_values = vec![ + ( + DataType::Int8, + Arc::new(Int8Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::Int16, + Arc::new(Int16Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::Int32, + Arc::new(Int32Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::Int64, + Arc::new(Int64Array::from(vec![1])) as ArrayRef, + ), + ( + 
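            // unsigned and floating point widths follow the same pattern, so
            // every numeric input type is covered by the sqrt() query below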
DataType::UInt8, + Arc::new(UInt8Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::UInt16, + Arc::new(UInt16Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::UInt32, + Arc::new(UInt32Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::UInt64, + Arc::new(UInt64Array::from(vec![1])) as ArrayRef, + ), + ( + DataType::Float32, + Arc::new(Float32Array::from(vec![1.0_f32])) as ArrayRef, + ), + ( + DataType::Float64, + Arc::new(Float64Array::from(vec![1.0_f64])) as ArrayRef, + ), + ]; + + for (data_type, array) in type_values.iter() { + let schema = + Arc::new(Schema::new(vec![Field::new("v", data_type.clone(), false)])); + let batch = + RecordBatch::try_new(schema.clone(), vec![array.clone()]).unwrap(); + let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + let expected = vec![ + "+---------+", + "| sqrt(v) |", + "+---------+", + "| 1 |", + "+---------+", + ]; + let results = plan_and_collect(&mut ctx, "SELECT sqrt(v) FROM t") + .await + .unwrap(); + + assert_batches_sorted_eq!(expected, &results); + } + } + #[tokio::test] async fn case_sensitive_identifiers_user_defined_functions() -> Result<()> { let mut ctx = ExecutionContext::new(); diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 01f7e95a0ee99..d856ca4bd6062 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -468,7 +468,18 @@ pub fn return_type( | BuiltinScalarFunction::Sin | BuiltinScalarFunction::Sqrt | BuiltinScalarFunction::Tan - | BuiltinScalarFunction::Trunc => Ok(DataType::Float64), + | BuiltinScalarFunction::Trunc => { + if arg_types.is_empty() { + return Err(DataFusionError::Internal(format!( + "builtin scalar function {} does not support empty arguments", + fun + ))); + } + match arg_types[0] { + DataType::Float32 => Ok(DataType::Float32), + _ => Ok(DataType::Float64), + } + } } } @@ -1427,8 +1438,8 @@ mod tests { }; use arrow::{ array::{ - Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeListArray, Float64Array, - Int32Array, StringArray, UInt32Array, UInt64Array, + Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeListArray, Float32Array, + Float64Array, Int32Array, StringArray, UInt32Array, UInt64Array, }, datatypes::Field, record_batch::RecordBatch, @@ -1857,10 +1868,10 @@ mod tests { test_function!( Exp, &[lit(ScalarValue::Float32(Some(1.0)))], - Ok(Some((1.0_f32).exp() as f64)), - f64, - Float64, - Float64Array + Ok(Some((1.0_f32).exp())), + f32, + Float32, + Float32Array ); test_function!( InitCap, diff --git a/datafusion/src/physical_plan/math_expressions.rs b/datafusion/src/physical_plan/math_expressions.rs index cfc239cde6613..eabacfc6eb183 100644 --- a/datafusion/src/physical_plan/math_expressions.rs +++ b/datafusion/src/physical_plan/math_expressions.rs @@ -60,7 +60,7 @@ macro_rules! 
unary_primitive_array_op { }, ColumnarValue::Scalar(a) => match a { ScalarValue::Float32(a) => Ok(ColumnarValue::Scalar( - ScalarValue::Float64(a.map(|x| x.$FUNC() as f64)), + ScalarValue::Float32(a.map(|x| x.$FUNC())), )), ScalarValue::Float64(a) => Ok(ColumnarValue::Scalar( ScalarValue::Float64(a.map(|x| x.$FUNC())), From e65d49ca7d05916591c051de1fccc0764dd18323 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Jul 2021 06:55:18 -0600 Subject: [PATCH 266/329] UnresolvedShuffleExec should represent a single shuffle (#727) --- ballista/rust/core/proto/ballista.proto | 2 +- .../src/execution_plans/unresolved_shuffle.rs | 14 ++-- .../src/serde/physical_plan/from_proto.rs | 6 +- .../core/src/serde/physical_plan/to_proto.rs | 6 +- ballista/rust/core/src/utils.rs | 12 ++- ballista/rust/scheduler/src/planner.rs | 30 ++++--- ballista/rust/scheduler/src/state/mod.rs | 83 +++++++++---------- 7 files changed, 68 insertions(+), 85 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 5b3e93e379e33..1c2328eed44d3 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -542,7 +542,7 @@ message PhysicalNegativeNode { } message UnresolvedShuffleExecNode { - repeated uint32 query_stage_ids = 1; + uint32 stage_id = 1; Schema schema = 2; uint32 partition_count = 3; } diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index 49b4f7a0992c2..cb351eec561a1 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -31,14 +31,14 @@ use datafusion::{ use log::info; use std::fmt::Formatter; -/// UnresolvedShuffleExec represents a dependency on the results of several ShuffleWriterExec nodes which haven't been computed yet. +/// UnresolvedShuffleExec represents a dependency on the results of a ShuffleWriterExec node which hasn't computed yet. /// /// An ExecutionPlan that contains an UnresolvedShuffleExec isn't ready for execution. The presence of this ExecutionPlan -/// is used as a signal so the scheduler knows it can't start computation on a specific ShuffleWriterExec. +/// is used as a signal so the scheduler knows it can't start computation until the dependent shuffle has completed. 
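/// Once the producing stage completes, the scheduler rewrites this placeholder
/// into a `ShuffleReaderExec` that points at the finished shuffle partitions.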
#[derive(Debug, Clone)] pub struct UnresolvedShuffleExec { // The query stage ids which needs to be computed - pub query_stage_ids: Vec, + pub stage_id: usize, // The schema this node will have once it is replaced with a ShuffleReaderExec pub schema: SchemaRef, @@ -49,13 +49,9 @@ pub struct UnresolvedShuffleExec { impl UnresolvedShuffleExec { /// Create a new UnresolvedShuffleExec - pub fn new( - query_stage_ids: Vec, - schema: SchemaRef, - partition_count: usize, - ) -> Self { + pub fn new(stage_id: usize, schema: SchemaRef, partition_count: usize) -> Self { Self { - query_stage_ids, + stage_id, schema, partition_count, } diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index a1a60bde0cecf..4b0a9844773ca 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -464,11 +464,7 @@ impl TryInto> for &protobuf::PhysicalPlanNode { PhysicalPlanType::Unresolved(unresolved_shuffle) => { let schema = Arc::new(convert_required!(unresolved_shuffle.schema)?); Ok(Arc::new(UnresolvedShuffleExec { - query_stage_ids: unresolved_shuffle - .query_stage_ids - .iter() - .map(|id| *id as usize) - .collect(), + stage_id: unresolved_shuffle.stage_id as usize, schema, partition_count: unresolved_shuffle.partition_count as usize, })) diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index cdd33f9384131..0429efb7c0174 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -391,11 +391,7 @@ impl TryInto for Arc { Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::Unresolved( protobuf::UnresolvedShuffleExecNode { - query_stage_ids: exec - .query_stage_ids - .iter() - .map(|id| *id as u32) - .collect(), + stage_id: exec.stage_id as u32, schema: Some(exec.schema().as_ref().into()), partition_count: exec.partition_count as u32, }, diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 7e9a55af1a777..8b1cf61a55ee6 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -209,13 +209,11 @@ fn build_exec_plan_diagram( for child in plan.children() { if let Some(shuffle) = child.as_any().downcast_ref::() { if !draw_entity { - for y in &shuffle.query_stage_ids { - writeln!( - w, - "\tstage_{}_exec_1 -> stage_{}_exec_{};", - y, stage_id, node_id - )?; - } + writeln!( + w, + "\tstage_{}_exec_1 -> stage_{}_exec_{};", + shuffle.stage_id, stage_id, node_id + )?; } } else { // relationships within same entity diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 319526142bf96..3f90da238b7fe 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -112,7 +112,7 @@ impl DistributedPlanner { None, )?; let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( - vec![query_stage.stage_id()], + query_stage.stage_id(), query_stage.schema(), query_stage.output_partitioning().partition_count(), )); @@ -131,7 +131,7 @@ impl DistributedPlanner { Some(repart.partitioning().to_owned()), )?; let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( - vec![query_stage.stage_id()], + query_stage.stage_id(), query_stage.schema(), query_stage.output_partitioning().partition_count(), )); @@ -166,19 +166,17 @@ pub fn remove_unresolved_shuffles( 
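    // With a single stage per UnresolvedShuffleExec, the partition locations
    // for that stage are looked up directly instead of being accumulated
    // across a list of stage ids.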
child.as_any().downcast_ref::() { let mut relevant_locations = vec![]; - for id in &unresolved_shuffle.query_stage_ids { - relevant_locations.append( - &mut partition_locations - .get(id) - .ok_or_else(|| { - BallistaError::General( - "Missing partition location. Could not remove unresolved shuffles" - .to_owned(), - ) - })? - .clone(), - ); - } + relevant_locations.append( + &mut partition_locations + .get(&unresolved_shuffle.stage_id) + .ok_or_else(|| { + BallistaError::General( + "Missing partition location. Could not remove unresolved shuffles" + .to_owned(), + ) + })? + .clone(), + ); new_children.push(Arc::new(ShuffleReaderExec::try_new( relevant_locations, unresolved_shuffle.schema().clone(), @@ -297,7 +295,7 @@ mod test { let unresolved_shuffle = coalesce_partitions.children()[0].clone(); let unresolved_shuffle = downcast_exec!(unresolved_shuffle, UnresolvedShuffleExec); - assert_eq!(unresolved_shuffle.query_stage_ids, vec![2]); + assert_eq!(unresolved_shuffle.stage_id, 2); Ok(()) } diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index cbee3f1bef690..3ddbced226849 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -302,48 +302,47 @@ impl SchedulerState { Vec>, > = HashMap::new(); for unresolved_shuffle in unresolved_shuffles { - for stage_id in unresolved_shuffle.query_stage_ids { - for partition_id in 0..unresolved_shuffle.partition_count { - let referenced_task = tasks - .get(&get_task_status_key( - &self.namespace, - &partition.job_id, - stage_id, - partition_id, - )) - .unwrap(); - let task_is_dead = self - .reschedule_dead_task(&referenced_task, &executors) - .await?; - if task_is_dead { - continue 'tasks; - } else if let Some(task_status::Status::Completed( - CompletedTask { executor_id }, - )) = &referenced_task.status - { - let empty = vec![]; - let locations = - partition_locations.entry(stage_id).or_insert(empty); - let executor_meta = executors - .iter() - .find(|exec| exec.id == *executor_id) - .unwrap() - .clone(); - locations.push(vec![ - ballista_core::serde::scheduler::PartitionLocation { - partition_id: - ballista_core::serde::scheduler::PartitionId { - job_id: partition.job_id.clone(), - stage_id, - partition_id, - }, - executor_meta, - partition_stats: PartitionStats::default(), - }, - ]); - } else { - continue 'tasks; - } + for partition_id in 0..unresolved_shuffle.partition_count { + let referenced_task = tasks + .get(&get_task_status_key( + &self.namespace, + &partition.job_id, + unresolved_shuffle.stage_id, + partition_id, + )) + .unwrap(); + let task_is_dead = self + .reschedule_dead_task(&referenced_task, &executors) + .await?; + if task_is_dead { + continue 'tasks; + } else if let Some(task_status::Status::Completed( + CompletedTask { executor_id }, + )) = &referenced_task.status + { + let empty = vec![]; + let locations = partition_locations + .entry(unresolved_shuffle.stage_id) + .or_insert(empty); + let executor_meta = executors + .iter() + .find(|exec| exec.id == *executor_id) + .unwrap() + .clone(); + locations.push(vec![ + ballista_core::serde::scheduler::PartitionLocation { + partition_id: + ballista_core::serde::scheduler::PartitionId { + job_id: partition.job_id.clone(), + stage_id: unresolved_shuffle.stage_id, + partition_id, + }, + executor_meta, + partition_stats: PartitionStats::default(), + }, + ]); + } else { + continue 'tasks; } } } From 2f5bc94a9206ab6b0bf9f443c4a756d1f01d566e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Jul 
2021 07:52:20 -0600 Subject: [PATCH 267/329] MINOR: Remove unused Ballista query execution code path (#732) * Remove unused code path * Remove proto defs --- ballista/rust/core/proto/ballista.proto | 6 -- ballista/rust/core/src/client.rs | 51 ------------ .../core/src/serde/scheduler/from_proto.rs | 18 ---- ballista/rust/core/src/serde/scheduler/mod.rs | 2 - .../rust/core/src/serde/scheduler/to_proto.rs | 4 - ballista/rust/executor/src/flight_service.rs | 82 +------------------ 6 files changed, 2 insertions(+), 161 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 1c2328eed44d3..0575460cfca35 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -699,12 +699,6 @@ message KeyValuePair { message Action { oneof ActionType { - // Execute a logical query plan - LogicalPlanNode query = 1; - - // Execute one partition of a physical query plan - ExecutePartition execute_partition = 2; - // Fetch a partition from an executor PartitionId fetch_partition = 3; } diff --git a/ballista/rust/core/src/client.rs b/ballista/rust/core/src/client.rs index c8267c8194c20..2df4145783553 100644 --- a/ballista/rust/core/src/client.rs +++ b/ballista/rust/core/src/client.rs @@ -75,57 +75,6 @@ impl BallistaClient { Ok(Self { flight_client }) } - /// Execute one partition of a physical query plan against the executor - pub async fn execute_partition( - &mut self, - job_id: String, - stage_id: usize, - partition_id: Vec, - plan: Arc, - ) -> Result> { - let action = Action::ExecutePartition(ExecutePartition { - job_id, - stage_id, - partition_id, - plan, - shuffle_locations: Default::default(), - }); - let stream = self.execute_action(&action).await?; - let batches = collect(stream).await?; - - batches - .iter() - .map(|batch| { - if batch.num_rows() != 1 { - Err(BallistaError::General( - "execute_partition received wrong number of rows".to_owned(), - )) - } else { - let path = batch - .column(0) - .as_any() - .downcast_ref::() - .expect( - "execute_partition expected column 0 to be a StringArray", - ); - - let stats = batch - .column(1) - .as_any() - .downcast_ref::() - .expect( - "execute_partition expected column 1 to be a StructArray", - ); - - Ok(ExecutePartitionResult::new( - path.value(0), - PartitionStats::from_arrow_struct_array(stats), - )) - } - }) - .collect::>>() - } - /// Fetch a partition from an executor pub async fn fetch_partition( &mut self, diff --git a/ballista/rust/core/src/serde/scheduler/from_proto.rs b/ballista/rust/core/src/serde/scheduler/from_proto.rs index 4631b2e4d8638..73f8f53956de8 100644 --- a/ballista/rust/core/src/serde/scheduler/from_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/from_proto.rs @@ -32,24 +32,6 @@ impl TryInto for protobuf::Action { fn try_into(self) -> Result { match self.action_type { - Some(ActionType::ExecutePartition(partition)) => { - Ok(Action::ExecutePartition(ExecutePartition::new( - partition.job_id, - partition.stage_id as usize, - partition.partition_id.iter().map(|n| *n as usize).collect(), - partition - .plan - .as_ref() - .ok_or_else(|| { - BallistaError::General( - "PhysicalPlanNode in ExecutePartition is missing" - .to_owned(), - ) - })? 
- .try_into()?, - HashMap::new(), - ))) - } Some(ActionType::FetchPartition(partition)) => { Ok(Action::FetchPartition(partition.try_into()?)) } diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index cbe1a31227c68..fa2c1b890e844 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -35,8 +35,6 @@ pub mod to_proto; /// Action that can be sent to an executor #[derive(Debug, Clone)] pub enum Action { - /// Execute a query and store the results in memory - ExecutePartition(ExecutePartition), /// Collect a shuffle partition FetchPartition(PartitionId), } diff --git a/ballista/rust/core/src/serde/scheduler/to_proto.rs b/ballista/rust/core/src/serde/scheduler/to_proto.rs index 40ca907a8a717..c3f2046305cf9 100644 --- a/ballista/rust/core/src/serde/scheduler/to_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/to_proto.rs @@ -29,10 +29,6 @@ impl TryInto for Action { fn try_into(self) -> Result { match self { - Action::ExecutePartition(partition) => Ok(protobuf::Action { - action_type: Some(ActionType::ExecutePartition(partition.try_into()?)), - settings: vec![], - }), Action::FetchPartition(partition_id) => Ok(protobuf::Action { action_type: Some(ActionType::FetchPartition(partition_id.into())), settings: vec![], diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 99424b6e8db46..7325287f074f5 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -25,7 +25,7 @@ use std::sync::Arc; use crate::executor::Executor; use ballista_core::error::BallistaError; use ballista_core::serde::decode_protobuf; -use ballista_core::serde::scheduler::{Action as BallistaAction, PartitionStats}; +use ballista_core::serde::scheduler::Action as BallistaAction; use arrow_flight::{ flight_service_server::FlightService, Action, ActionType, Criteria, Empty, @@ -33,18 +33,13 @@ use arrow_flight::{ PutResult, SchemaResult, Ticket, }; use datafusion::arrow::{ - datatypes::{DataType, Field, Schema}, - error::ArrowError, - ipc::reader::FileReader, - ipc::writer::IpcWriteOptions, + error::ArrowError, ipc::reader::FileReader, ipc::writer::IpcWriteOptions, record_batch::RecordBatch, }; -use datafusion::physical_plan::displayable; use futures::{Stream, StreamExt}; use log::{info, warn}; use std::io::{Read, Seek}; use tokio::sync::mpsc::channel; -use tokio::task::JoinHandle; use tokio::{ sync::mpsc::{Receiver, Sender}, task, @@ -92,79 +87,6 @@ impl FlightService for BallistaFlightService { decode_protobuf(&ticket.ticket).map_err(|e| from_ballista_err(&e))?; match &action { - BallistaAction::ExecutePartition(partition) => { - info!( - "ExecutePartition: job={}, stage={}, partition={:?}\n{}", - partition.job_id, - partition.stage_id, - partition.partition_id, - displayable(partition.plan.as_ref()).indent().to_string() - ); - - let mut tasks: Vec>> = vec![]; - for &part in &partition.partition_id { - let partition = partition.clone(); - let executor = self.executor.clone(); - tasks.push(tokio::spawn(async move { - let results = executor - .execute_partition( - partition.job_id.clone(), - partition.stage_id, - part, - partition.plan.clone(), - ) - .await?; - let results = vec![results]; - - let mut flights: Vec> = vec![]; - let options = arrow::ipc::writer::IpcWriteOptions::default(); - - let mut batches: Vec> = results - .iter() - .flat_map(|batch| create_flight_iter(batch, &options)) - .collect(); - - // 
append batch vector to schema vector, so that the first message sent is the schema - flights.append(&mut batches); - - Ok(flights) - })); - } - - // wait for all partitions to complete - let results = futures::future::join_all(tasks).await; - - // get results - let mut flights: Vec> = vec![]; - - // add an initial FlightData message that sends schema - let options = arrow::ipc::writer::IpcWriteOptions::default(); - let stats = PartitionStats::default(); - let schema = Arc::new(Schema::new(vec![ - Field::new("path", DataType::Utf8, false), - stats.arrow_struct_repr(), - ])); - let schema_flight_data = - arrow_flight::utils::flight_data_from_arrow_schema( - schema.as_ref(), - &options, - ); - flights.push(Ok(schema_flight_data)); - - // collect statistics from each executed partition - for result in results { - let result = result.map_err(|e| { - Status::internal(format!("Ballista Error: {:?}", e)) - })?; - let batches = result.map_err(|e| { - Status::internal(format!("Ballista Error: {:?}", e)) - })?; - flights.extend_from_slice(&batches); - } - - let output = futures::stream::iter(flights); - Ok(Response::new(Box::pin(output) as Self::DoGetStream)) - } BallistaAction::FetchPartition(partition_id) => { // fetch a partition that was previously executed by this executor info!("FetchPartition {:?}", partition_id); From d5427d8dd1fd8bd6fb2541c1c9386802da7783de Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Jul 2021 15:09:57 -0400 Subject: [PATCH 268/329] Derive PartialEq for datasource enums (#734) --- datafusion/src/datasource/datasource.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/src/datasource/datasource.rs b/datafusion/src/datasource/datasource.rs index b83aa4b1ab56b..e173d6e0d771e 100644 --- a/datafusion/src/datasource/datasource.rs +++ b/datafusion/src/datasource/datasource.rs @@ -27,7 +27,7 @@ use crate::{arrow::datatypes::SchemaRef, scalar::ScalarValue}; /// This table statistics are estimates. /// It can not be used directly in the precise compute -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, PartialEq)] pub struct Statistics { /// The number of table rows pub num_rows: Option, @@ -51,7 +51,7 @@ pub struct ColumnStatistics { /// Indicates whether and how a filter expression can be handled by a /// TableProvider for table scans. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum TableProviderFilterPushDown { /// The expression cannot be used by the provider. Unsupported, @@ -67,7 +67,7 @@ pub enum TableProviderFilterPushDown { } /// Indicates the type of this table for metadata/catalog purposes. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum TableType { /// An ordinary physical table. 
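    /// With `PartialEq` now derived, a provider's reported type can be checked
    /// directly against variants such as `TableType::Base`.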
Base, From 5c1f8eb32c5f423d6c0e1f50350f9fe140135fdb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Jul 2021 09:15:32 -0400 Subject: [PATCH 269/329] Update to use arrow 5.0 (#721) * Update to arrow 5.0 * Update to new interfaces --- ballista/rust/core/Cargo.toml | 2 +- ballista/rust/executor/Cargo.toml | 4 ++-- ballista/rust/executor/src/flight_service.rs | 6 ++---- datafusion-cli/Cargo.toml | 2 +- datafusion-examples/Cargo.toml | 2 +- datafusion-examples/examples/flight_server.rs | 11 +++-------- datafusion/Cargo.toml | 6 +++--- datafusion/src/physical_plan/hash_aggregate.rs | 4 ++-- datafusion/src/physical_plan/mod.rs | 5 +++-- datafusion/src/scalar.rs | 2 +- 10 files changed, 19 insertions(+), 25 deletions(-) diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 3a89c75a5cd72..ce72d2fda92d4 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -42,7 +42,7 @@ tokio = "1.0" tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow-flight = { version = "4.0" } +arrow-flight = { version = "5.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 68e4920f3b40b..428a5bb0f01f5 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -43,8 +43,8 @@ tokio-stream = { version = "0.1", features = ["net"] } tonic = "0.4" uuid = { version = "0.8", features = ["v4"] } -arrow = { version = "4.0" } -arrow-flight = { version = "4.0" } +arrow = { version = "5.0" } +arrow-flight = { version = "5.0" } datafusion = { path = "../../../datafusion" } diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 7325287f074f5..9a3f2d872d521 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -23,6 +23,7 @@ use std::pin::Pin; use std::sync::Arc; use crate::executor::Executor; +use arrow_flight::SchemaAsIpc; use ballista_core::error::BallistaError; use ballista_core::serde::decode_protobuf; use ballista_core::serde::scheduler::Action as BallistaAction; @@ -218,10 +219,7 @@ where T: Read + Seek, { let options = arrow::ipc::writer::IpcWriteOptions::default(); - let schema_flight_data = arrow_flight::utils::flight_data_from_arrow_schema( - reader.schema().as_ref(), - &options, - ); + let schema_flight_data = SchemaAsIpc::new(reader.schema().as_ref(), &options).into(); send_response(&tx, Ok(schema_flight_data)).await?; for batch in reader { diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index cd17b61984d5e..fda9271876aa8 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -31,4 +31,4 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } -arrow = { version = "4.0" } +arrow = { version = "5.0" } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 886f8f5e74f68..35aa3764d6dc4 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,7 +29,7 @@ publish = false [dev-dependencies] -arrow-flight = { version = "4.0" } +arrow-flight = { version = "5.0" } datafusion = { path = "../datafusion" } prost = "0.7" tonic = "0.4" diff --git a/datafusion-examples/examples/flight_server.rs b/datafusion-examples/examples/flight_server.rs index 8496bcb18914f..138434ea2482f 100644 --- a/datafusion-examples/examples/flight_server.rs +++ 
b/datafusion-examples/examples/flight_server.rs @@ -17,6 +17,7 @@ use std::pin::Pin; +use arrow_flight::SchemaAsIpc; use futures::Stream; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -67,10 +68,7 @@ impl FlightService for FlightServiceImpl { let table = ParquetTable::try_new(&request.path[0], num_cpus::get()).unwrap(); let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); - let schema_result = arrow_flight::utils::flight_schema_from_arrow_schema( - table.schema().as_ref(), - &options, - ); + let schema_result = SchemaAsIpc::new(table.schema().as_ref(), &options).into(); Ok(Response::new(schema_result)) } @@ -108,10 +106,7 @@ impl FlightService for FlightServiceImpl { // add an initial FlightData message that sends schema let options = datafusion::arrow::ipc::writer::IpcWriteOptions::default(); let schema_flight_data = - arrow_flight::utils::flight_data_from_arrow_schema( - &df.schema().clone().into(), - &options, - ); + SchemaAsIpc::new(&df.schema().clone().into(), &options).into(); let mut flights: Vec> = vec![Ok(schema_flight_data)]; diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 845de6213f4d3..2f1e997c3596f 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { version = "4.4", features = ["prettyprint"] } -parquet = { version = "4.4", features = ["arrow"] } +arrow = { version = "5.0", features = ["prettyprint"] } +parquet = { version = "5.0", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" @@ -98,4 +98,4 @@ harness = false [[bench]] name = "physical_plan" -harness = false \ No newline at end of file +harness = false diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index b4b7c224024d3..ae513831bef46 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -508,7 +508,7 @@ fn dictionary_create_key_for_col( )) })?; - create_key_for_col(&dict_col.values(), values_index, vec) + create_key_for_col(dict_col.values(), values_index, vec) } /// Appends a sequence of [u8] bytes for the value in `col[row]` to @@ -1104,7 +1104,7 @@ fn dictionary_create_group_by_value( )) })?; - create_group_by_value(&dict_col.values(), values_index) + create_group_by_value(dict_col.values(), values_index) } /// Extract the value in `col[row]` as a GroupByScalar diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index d89eb11885041..b3c0dd63e9eda 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -505,8 +505,9 @@ pub trait WindowExpr: Send + Sync + Debug { end: num_rows, }]) } else { - lexicographical_partition_ranges(partition_columns) - .map_err(DataFusionError::ArrowError) + Ok(lexicographical_partition_ranges(partition_columns) + .map_err(DataFusionError::ArrowError)? 
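                // arrow 5.0 hands back an iterator of partition ranges here, so
                // the ranges are collected into a Vec to preserve this method's
                // existing return type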
+ .collect::>()) } } diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index f94a090a538a1..e7354f8e62ec1 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -778,7 +778,7 @@ impl ScalarValue { keys_col.data_type() )) })?; - Self::try_from_array(&dict_array.values(), values_index) + Self::try_from_array(dict_array.values(), values_index) } } From 666360b1c192c9896dec8f80a95a5325f971f568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 18 Jul 2021 13:07:55 +0200 Subject: [PATCH 270/329] Support table columns alias (#735) * Support column alias in FROM * Remove q13 * Test on error * Don't convert empty list * Don't convert empty list * Fix test * Improve error message * Improve error message --- datafusion/src/sql/planner.rs | 115 +++++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 41b4e20f15f3d..1437346fccd3a 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -424,46 +424,80 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { relation: &TableFactor, ctes: &mut HashMap, ) -> Result { - match relation { + let (plan, columns_alias) = match relation { TableFactor::Table { name, alias, .. } => { let table_name = name.to_string(); let cte = ctes.get(&table_name); - match ( - cte, - self.schema_provider.get_table_provider(name.try_into()?), - ) { - (Some(cte_plan), _) => Ok(cte_plan.clone()), - (_, Some(provider)) => LogicalPlanBuilder::scan( - // take alias into account to support `JOIN table1 as table2` - alias - .as_ref() - .map(|a| a.name.value.as_str()) - .unwrap_or(&table_name), - provider, - None, - )? - .build(), - (None, None) => Err(DataFusionError::Plan(format!( - "Table or CTE with name '{}' not found", - name - ))), - } + let columns_alias = alias.clone().map(|x| x.columns); + ( + match ( + cte, + self.schema_provider.get_table_provider(name.try_into()?), + ) { + (Some(cte_plan), _) => Ok(cte_plan.clone()), + (_, Some(provider)) => LogicalPlanBuilder::scan( + // take alias into account to support `JOIN table1 as table2` + alias + .as_ref() + .map(|a| a.name.value.as_str()) + .unwrap_or(&table_name), + provider, + None, + )? + .build(), + (None, None) => Err(DataFusionError::Plan(format!( + "Table or CTE with name '{}' not found", + name + ))), + }?, + columns_alias, + ) } TableFactor::Derived { subquery, alias, .. - } => self.query_to_plan_with_alias( - subquery, - alias.as_ref().map(|a| a.name.value.to_string()), - ctes, + } => ( + self.query_to_plan_with_alias( + subquery, + alias.as_ref().map(|a| a.name.value.to_string()), + ctes, + )?, + alias.clone().map(|x| x.columns), ), TableFactor::NestedJoin(table_with_joins) => { - self.plan_table_with_joins(table_with_joins, ctes) + (self.plan_table_with_joins(table_with_joins, ctes)?, None) } // @todo Support TableFactory::TableFunction? 
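            // the alias column list captured above is applied after this match:
            // an empty list is ignored, a length mismatch is an error, and
            // otherwise each source column is projected under its alias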
- _ => Err(DataFusionError::NotImplemented(format!( - "Unsupported ast node {:?} in create_relation", - relation - ))), + _ => { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported ast node {:?} in create_relation", + relation + ))) + } + }; + + if let Some(columns_alias) = columns_alias { + if columns_alias.is_empty() { + // sqlparser-rs encodes AS t as an empty list of column alias + Ok(plan) + } else if columns_alias.len() != plan.schema().fields().len() { + return Err(DataFusionError::Plan(format!( + "Source table contains {} columns but only {} names given as column alias", + plan.schema().fields().len(), + columns_alias.len(), + ))); + } else { + let fields = plan.schema().fields().clone(); + LogicalPlanBuilder::from(plan) + .project( + fields + .iter() + .zip(columns_alias.iter()) + .map(|(field, ident)| col(field.name()).alias(&ident.value)), + )? + .build() + } + } else { + Ok(plan) } } @@ -1884,6 +1918,27 @@ mod tests { quick_test(sql, expected); } + #[test] + fn table_with_column_alias() { + let sql = "SELECT a, b, c + FROM lineitem l (a, b, c)"; + let expected = "Projection: #a, #b, #c\ + \n Projection: #l.l_item_id AS a, #l.l_description AS b, #l.price AS c\ + \n TableScan: l projection=None"; + quick_test(sql, expected); + } + + #[test] + fn table_with_column_alias_number_cols() { + let sql = "SELECT a, b, c + FROM lineitem l (a, b)"; + let err = logical_plan(sql).expect_err("query should have failed"); + assert_eq!( + "Plan(\"Source table contains 3 columns but only 2 names given as column alias\")", + format!("{:?}", err) + ); + } + #[test] fn select_with_having() { let sql = "SELECT id, age From afe29bd3c6322db88936d5936db58ad1fb5c9ae2 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sun, 18 Jul 2021 19:28:34 +0800 Subject: [PATCH 271/329] update `python` crate dependencies (#740) * update dependencies * rename macros --- datafusion/src/physical_plan/functions.rs | 2 - python/Cargo.toml | 4 +- python/requirements.txt | 88 ++++++++++++----------- 3 files changed, 48 insertions(+), 46 deletions(-) diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index d856ca4bd6062..7bb3cb456e9fd 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -3723,7 +3723,6 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); let ctx_state = ExecutionContextState::new(); - // concat(value, value) let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); let pattern = lit(ScalarValue::Utf8(Some(r".*-(\d*)".to_string()))); let columns: Vec = vec![col_value]; @@ -3763,7 +3762,6 @@ mod tests { let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); let ctx_state = ExecutionContextState::new(); - // concat(value, value) let col_value = lit(ScalarValue::Utf8(Some("aaa-555".to_string()))); let pattern = lit(ScalarValue::Utf8(Some(r".*-(\d*)".to_string()))); let columns: Vec = vec![Arc::new(Int32Array::from(vec![1]))]; diff --git a/python/Cargo.toml b/python/Cargo.toml index 777e42745de58..eab8a8bcf5555 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -30,8 +30,8 @@ edition = "2018" libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" -pyo3 = { version = "0.13.2", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "fddab22aa562750f67385a961497dc020b18c4b2" } +pyo3 = { version = "0.14.1", features = 
["extension-module"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "5c1f8eb32c5f423d6c0e1f50350f9fe140135fdb" } [lib] name = "datafusion" diff --git a/python/requirements.txt b/python/requirements.txt index f7ede1ebd58e2..cbd86cdc0e254 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile +# This file is autogenerated by pip-compile with python 3.9 # To update, run: # -# pip-compile --generate-hashes requirements.in +# pip-compile --generate-hashes # attrs==21.2.0 \ --hash=sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1 \ @@ -12,48 +12,52 @@ iniconfig==1.1.1 \ --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 # via pytest -maturin==0.10.6 \ - --hash=sha256:0e81496f70a4805e6ea7dda7b0425246c111ccb119a2e22c64abeff131f4dd21 \ - --hash=sha256:3b5d5429bc05a816824420d99973f0cab39d8e274f6c3647bfd9afd95a030304 \ - --hash=sha256:4177a223727a0ad57bc3f69ca4c3bc04bb3cc4da787cc59a8e25808c85685c67 \ - --hash=sha256:4eb4481b6c7d6cac043b969d2eb993c982523e91bb2709f0b09e231cf4846731 \ - --hash=sha256:532625f312185b06ec196fdb0fc79efafc0e98768153d226fb9417c0ca85e410 \ - --hash=sha256:53ef64a147f8a5241a3e932f2db22b5ae7dc5892dae994da319446c5db89dc94 \ - --hash=sha256:a04589da42f62b1d515f35c81274a56fe0d29216894525e8a37fd1e3c69d87b1 \ - --hash=sha256:b58e9e2ba5a3f651d8885c41370a00bb1d3e4d7313cbb63354077153be7650f4 \ - --hash=sha256:bd39f7e08eb9908d4fe1cd9b3c953fad5b1fb4fec9c82d14c2973a65751e1899 \ - --hash=sha256:d63f2a15f0b8db4e70d9a59766ca240b2c2ee2146ed5e4385a6118d941d68b25 \ - --hash=sha256:fa7e1cea2a768257a33aeb556fdec5fc36011bfe82d96730117433c635629dd8 +maturin==0.11.1 \ + --hash=sha256:1d8a276b4c4ac74ecf9624ebc718982cdd0f86581d6338c877d7eb2833b89a13 \ + --hash=sha256:56b1dc8651a40d024a0ac59720ffeb61a41059fcd836f1742ad828b78650fc1a \ + --hash=sha256:70b35e77e60772002e279e87e936dd5467a7952b7ccc37054a1e478d3b25c279 \ + --hash=sha256:7b9f66a5425cf9f04276effbe31f1fea331a1bd742e1726effd72e48a98da0e1 \ + --hash=sha256:8921ab6dccde53625075b5d24d2e817bf36abaeef4387237fedb4e298e73e77d \ + --hash=sha256:a2a22ea9c8448796ce8078c7706623b212f2940ac79adaebe17000ada3d6647a \ + --hash=sha256:b322f36ee7ff67870fe6d0b5fcd41226a7eca05e6819ae812875ed5f2116038d \ + --hash=sha256:ba23ac9ca8d4a23ad794b9d966f09959dc5d511afd23992afc93b720dbe0f676 \ + --hash=sha256:c7dca1e2d8eabeb3dbd9b08a182be85621b2519a9968c728b9db73023bbdd823 \ + --hash=sha256:e1598a844fdc7b5093749feb0b373fb2f7545033bb1f00779cfbf173906e374a \ + --hash=sha256:e60308dd43eb5f763126d0651827683141b12878541c6ede008f77ef655d1343 # via -r requirements.in -numpy==1.20.3 \ - --hash=sha256:1676b0a292dd3c99e49305a16d7a9f42a4ab60ec522eac0d3dd20cdf362ac010 \ - --hash=sha256:16f221035e8bd19b9dc9a57159e38d2dd060b48e93e1d843c49cb370b0f415fd \ - --hash=sha256:43909c8bb289c382170e0282158a38cf306a8ad2ff6dfadc447e90f9961bef43 \ - --hash=sha256:4e465afc3b96dbc80cf4a5273e5e2b1e3451286361b4af70ce1adb2984d392f9 \ - --hash=sha256:55b745fca0a5ab738647d0e4db099bd0a23279c32b31a783ad2ccea729e632df \ - --hash=sha256:5d050e1e4bc9ddb8656d7b4f414557720ddcca23a5b88dd7cff65e847864c400 \ - --hash=sha256:637d827248f447e63585ca3f4a7d2dfaa882e094df6cfa177cc9cf9cd6cdf6d2 \ - --hash=sha256:6690080810f77485667bfbff4f69d717c3be25e5b11bb2073e76bb3f578d99b4 \ - --hash=sha256:66fbc6fed94a13b9801fb70b96ff30605ab0a123e775a5e7a26938b717c5d71a \ - 
--hash=sha256:67d44acb72c31a97a3d5d33d103ab06d8ac20770e1c5ad81bdb3f0c086a56cf6 \ - --hash=sha256:6ca2b85a5997dabc38301a22ee43c82adcb53ff660b89ee88dded6b33687e1d8 \ - --hash=sha256:6e51534e78d14b4a009a062641f465cfaba4fdcb046c3ac0b1f61dd97c861b1b \ - --hash=sha256:70eb5808127284c4e5c9e836208e09d685a7978b6a216db85960b1a112eeace8 \ - --hash=sha256:830b044f4e64a76ba71448fce6e604c0fc47a0e54d8f6467be23749ac2cbd2fb \ - --hash=sha256:8b7bb4b9280da3b2856cb1fc425932f46fba609819ee1c62256f61799e6a51d2 \ - --hash=sha256:a9c65473ebc342715cb2d7926ff1e202c26376c0dcaaee85a1fd4b8d8c1d3b2f \ - --hash=sha256:c1c09247ccea742525bdb5f4b5ceeacb34f95731647fe55774aa36557dbb5fa4 \ - --hash=sha256:c5bf0e132acf7557fc9bb8ded8b53bbbbea8892f3c9a1738205878ca9434206a \ - --hash=sha256:db250fd3e90117e0312b611574cd1b3f78bec046783195075cbd7ba9c3d73f16 \ - --hash=sha256:e515c9a93aebe27166ec9593411c58494fa98e5fcc219e47260d9ab8a1cc7f9f \ - --hash=sha256:e55185e51b18d788e49fe8305fd73ef4470596b33fc2c1ceb304566b99c71a69 \ - --hash=sha256:ea9cff01e75a956dbee133fa8e5b68f2f92175233de2f88de3a682dd94deda65 \ - --hash=sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17 \ - --hash=sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48 +numpy==1.21.0 \ + --hash=sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e \ + --hash=sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e \ + --hash=sha256:3537b967b350ad17633b35c2f4b1a1bbd258c018910b518c30b48c8e41272717 \ + --hash=sha256:3c40e6b860220ed862e8097b8f81c9af6d7405b723f4a7af24a267b46f90e461 \ + --hash=sha256:598fe100b2948465cf3ed64b1a326424b5e4be2670552066e17dfaa67246011d \ + --hash=sha256:620732f42259eb2c4642761bd324462a01cdd13dd111740ce3d344992dd8492f \ + --hash=sha256:709884863def34d72b183d074d8ba5cfe042bc3ff8898f1ffad0209161caaa99 \ + --hash=sha256:75579acbadbf74e3afd1153da6177f846212ea2a0cc77de53523ae02c9256513 \ + --hash=sha256:7c55407f739f0bfcec67d0df49103f9333edc870061358ac8a8c9e37ea02fcd2 \ + --hash=sha256:a1f2fb2da242568af0271455b89aee0f71e4e032086ee2b4c5098945d0e11cf6 \ + --hash=sha256:a290989cd671cd0605e9c91a70e6df660f73ae87484218e8285c6522d29f6e38 \ + --hash=sha256:ac4fd578322842dbda8d968e3962e9f22e862b6ec6e3378e7415625915e2da4d \ + --hash=sha256:ad09f55cc95ed8d80d8ab2052f78cc21cb231764de73e229140d81ff49d8145e \ + --hash=sha256:b9205711e5440954f861ceeea8f1b415d7dd15214add2e878b4d1cf2bcb1a914 \ + --hash=sha256:bba474a87496d96e61461f7306fba2ebba127bed7836212c360f144d1e72ac54 \ + --hash=sha256:bebab3eaf0641bba26039fb0b2c5bf9b99407924b53b1ea86e03c32c64ef5aef \ + --hash=sha256:cc367c86eb87e5b7c9592935620f22d13b090c609f1b27e49600cd033b529f54 \ + --hash=sha256:ccc6c650f8700ce1e3a77668bb7c43e45c20ac06ae00d22bdf6760b38958c883 \ + --hash=sha256:cf680682ad0a3bef56dae200dbcbac2d57294a73e5b0f9864955e7dd7c2c2491 \ + --hash=sha256:d2910d0a075caed95de1a605df00ee03b599de5419d0b95d55342e9a33ad1fb3 \ + --hash=sha256:d5caa946a9f55511e76446e170bdad1d12d6b54e17a2afe7b189112ed4412bb8 \ + --hash=sha256:d89b0dc7f005090e32bb4f9bf796e1dcca6b52243caf1803fdd2b748d8561f63 \ + --hash=sha256:d95d16204cd51ff1a1c8d5f9958ce90ae190be81d348b514f9be39f878b8044a \ + --hash=sha256:e4d5a86a5257843a18fb1220c5f1c199532bc5d24e849ed4b0289fb59fbd4d8f \ + --hash=sha256:e58ddb53a7b4959932f5582ac455ff90dcb05fac3f8dcc8079498d43afbbde6c \ + --hash=sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce \ + --hash=sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd \ + 
--hash=sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9 # via pyarrow -packaging==20.9 \ - --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 \ - --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a +packaging==21.0 \ + --hash=sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7 \ + --hash=sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14 # via pytest pluggy==0.13.1 \ --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 \ From bd3ee23520a3e6f135891ec32d96fcea7ee2bb55 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Mon, 19 Jul 2021 13:13:22 +0100 Subject: [PATCH 272/329] perf: improve performance of `SortPreservingMergeExec` operator (#722) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: re-use Array comparators This commit stores built Arrow comparators for two arrays on each of the sort key cursors, resulting in a significant reduction in the cost associated with merging record batches using the `SortPreservingMerge` operator. Benchmarks improved as follows: ``` ⇒ critcmp master pr group master pr ----- ------ -- interleave_batches 1.83 623.8±12.41µs ? ?/sec 1.00 341.2±6.98µs ? ?/sec merge_batches_no_overlap_large 1.56 400.6±4.94µs ? ?/sec 1.00 256.3±6.57µs ? ?/sec merge_batches_no_overlap_small 1.63 425.1±24.88µs ? ?/sec 1.00 261.1±7.46µs ? ?/sec merge_batches_small_into_large 1.18 228.0±3.95µs ? ?/sec 1.00 193.6±2.86µs ? ?/sec merge_batches_some_overlap_large 1.68 505.4±10.27µs ? ?/sec 1.00 301.3±6.63µs ? ?/sec merge_batches_some_overlap_small 1.64 515.7±5.21µs ? ?/sec 1.00 314.6±12.66µs ? ?/sec ``` * test: test more than two partitions --- .../physical_plan/sort_preserving_merge.rs | 182 ++++++++++++++---- 1 file changed, 145 insertions(+), 37 deletions(-) diff --git a/datafusion/src/physical_plan/sort_preserving_merge.rs b/datafusion/src/physical_plan/sort_preserving_merge.rs index 0949c3c6a8cf6..b4bcc2935e4f4 100644 --- a/datafusion/src/physical_plan/sort_preserving_merge.rs +++ b/datafusion/src/physical_plan/sort_preserving_merge.rs @@ -24,6 +24,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use arrow::array::DynComparator; use arrow::{ array::{make_array as make_arrow_array, ArrayRef, MutableArrayData}, compute::SortOptions, @@ -35,6 +36,7 @@ use async_trait::async_trait; use futures::channel::mpsc; use futures::stream::FusedStream; use futures::{Stream, StreamExt}; +use hashbrown::HashMap; use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ @@ -176,34 +178,60 @@ impl ExecutionPlan for SortPreservingMergeExec { } } -/// A `SortKeyCursor` is created from a `RecordBatch`, and a set of `PhysicalExpr` that when -/// evaluated on the `RecordBatch` yield the sort keys. +/// A `SortKeyCursor` is created from a `RecordBatch`, and a set of +/// `PhysicalExpr` that when evaluated on the `RecordBatch` yield the sort keys. /// /// Additionally it maintains a row cursor that can be advanced through the rows /// of the provided `RecordBatch` /// -/// `SortKeyCursor::compare` can then be used to compare the sort key pointed to by this -/// row cursor, with that of another `SortKeyCursor` -#[derive(Debug, Clone)] +/// `SortKeyCursor::compare` can then be used to compare the sort key pointed to +/// by this row cursor, with that of another `SortKeyCursor`. A cursor stores +/// a row comparator for each other cursor that it is compared to. 
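/// Comparators are built lazily via arrow's `build_compare` the first time two
/// cursors are compared and are then cached, keyed by the other cursor's
/// `batch_idx`, so repeated comparisons against the same batch reuse them.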
struct SortKeyCursor { columns: Vec, - batch: RecordBatch, cur_row: usize, num_rows: usize, + + // An index uniquely identifying the record batch scanned by this cursor. + batch_idx: usize, + batch: RecordBatch, + + // A collection of comparators that compare rows in this cursor's batch to + // the cursors in other batches. Other batches are uniquely identified by + // their batch_idx. + batch_comparators: HashMap>, +} + +impl<'a> std::fmt::Debug for SortKeyCursor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SortKeyCursor") + .field("columns", &self.columns) + .field("cur_row", &self.cur_row) + .field("num_rows", &self.num_rows) + .field("batch_idx", &self.batch_idx) + .field("batch", &self.batch) + .field("batch_comparators", &"") + .finish() + } } impl SortKeyCursor { - fn new(batch: RecordBatch, sort_key: &[Arc]) -> Result { + fn new( + batch_idx: usize, + batch: RecordBatch, + sort_key: &[Arc], + ) -> Result { let columns = sort_key .iter() .map(|expr| Ok(expr.evaluate(&batch)?.into_array(batch.num_rows()))) .collect::>()?; - Ok(Self { cur_row: 0, num_rows: batch.num_rows(), columns, batch, + batch_idx, + batch_comparators: HashMap::new(), }) } @@ -220,7 +248,7 @@ impl SortKeyCursor { /// Compares the sort key pointed to by this instance's row cursor with that of another fn compare( - &self, + &mut self, other: &SortKeyCursor, options: &[SortOptions], ) -> Result { @@ -246,7 +274,19 @@ impl SortKeyCursor { .zip(other.columns.iter()) .zip(options.iter()); - for ((l, r), sort_options) in zipped { + // Recall or initialise a collection of comparators for comparing + // columnar arrays of this cursor and "other". + let cmp = self + .batch_comparators + .entry(other.batch_idx) + .or_insert_with(|| Vec::with_capacity(other.columns.len())); + + for (i, ((l, r), sort_options)) in zipped.enumerate() { + if i >= cmp.len() { + // initialise comparators as potentially needed + cmp.push(arrow::array::build_compare(l.as_ref(), r.as_ref())?); + } + match (l.is_valid(self.cur_row), r.is_valid(other.cur_row)) { (false, true) if sort_options.nulls_first => return Ok(Ordering::Less), (false, true) => return Ok(Ordering::Greater), @@ -255,15 +295,11 @@ impl SortKeyCursor { } (true, false) => return Ok(Ordering::Less), (false, false) => {} - (true, true) => { - // TODO: Building the predicate each time is sub-optimal - let c = arrow::array::build_compare(l.as_ref(), r.as_ref())?; - match c(self.cur_row, other.cur_row) { - Ordering::Equal => {} - o if sort_options.descending => return Ok(o.reverse()), - o => return Ok(o), - } - } + (true, true) => match cmp[i](self.cur_row, other.cur_row) { + Ordering::Equal => {} + o if sort_options.descending => return Ok(o.reverse()), + o => return Ok(o), + }, } } @@ -304,6 +340,9 @@ struct SortPreservingMergeStream { target_batch_size: usize, /// If the stream has encountered an error aborted: bool, + + /// An index to uniquely identify the input stream batch + next_batch_index: usize, } impl SortPreservingMergeStream { @@ -313,15 +352,21 @@ impl SortPreservingMergeStream { expressions: &[PhysicalSortExpr], target_batch_size: usize, ) -> Self { + let cursors = (0..streams.len()) + .into_iter() + .map(|_| VecDeque::new()) + .collect(); + Self { schema, - cursors: vec![Default::default(); streams.len()], + cursors, streams, column_expressions: expressions.iter().map(|x| x.expr.clone()).collect(), sort_options: expressions.iter().map(|x| x.options).collect(), target_batch_size, aborted: false, in_progress: vec![], + 
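            // monotonically increasing id assigned to each incoming batch; a
            // cursor keys its cached comparators by this index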
next_batch_index: 0, } } @@ -352,12 +397,17 @@ impl SortPreservingMergeStream { return Poll::Ready(Err(e)); } Some(Ok(batch)) => { - let cursor = match SortKeyCursor::new(batch, &self.column_expressions) { + let cursor = match SortKeyCursor::new( + self.next_batch_index, // assign this batch an ID + batch, + &self.column_expressions, + ) { Ok(cursor) => cursor, Err(e) => { return Poll::Ready(Err(ArrowError::ExternalError(Box::new(e)))); } }; + self.next_batch_index += 1; self.cursors[idx].push_back(cursor) } } @@ -367,17 +417,17 @@ impl SortPreservingMergeStream { /// Returns the index of the next stream to pull a row from, or None /// if all cursors for all streams are exhausted - fn next_stream_idx(&self) -> Result> { - let mut min_cursor: Option<(usize, &SortKeyCursor)> = None; - for (idx, candidate) in self.cursors.iter().enumerate() { - if let Some(candidate) = candidate.back() { + fn next_stream_idx(&mut self) -> Result> { + let mut min_cursor: Option<(usize, &mut SortKeyCursor)> = None; + for (idx, candidate) in self.cursors.iter_mut().enumerate() { + if let Some(candidate) = candidate.back_mut() { if candidate.is_finished() { continue; } match min_cursor { None => min_cursor = Some((idx, candidate)), - Some((_, min)) => { + Some((_, ref mut min)) => { if min.compare(candidate, &self.sort_options)? == Ordering::Greater { @@ -599,8 +649,7 @@ mod tests { let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); _test_merge( - b1, - b2, + &[vec![b1], vec![b2]], &[ "+----+---+-------------------------------+", "| a | b | c |", @@ -646,8 +695,7 @@ mod tests { let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); _test_merge( - b1, - b2, + &[vec![b1], vec![b2]], &[ "+-----+---+-------------------------------+", "| a | b | c |", @@ -693,8 +741,7 @@ mod tests { let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); _test_merge( - b1, - b2, + &[vec![b1], vec![b2]], &[ "+----+---+-------------------------------+", "| a | b | c |", @@ -715,8 +762,71 @@ mod tests { .await; } - async fn _test_merge(b1: RecordBatch, b2: RecordBatch, exp: &[&str]) { - let schema = b1.schema(); + #[tokio::test] + async fn test_merge_three_partitions() { + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("f"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("e"), + Some("g"), + Some("h"), + Some("i"), + Some("j"), + ])); + let c: ArrayRef = + Arc::new(TimestampNanosecondArray::from(vec![40, 60, 20, 20, 60])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![100, 200, 700, 900, 300])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("f"), + Some("g"), + Some("h"), + Some("i"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b3 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + _test_merge( + &[vec![b1], vec![b2], vec![b3]], + &[ + "+-----+---+-------------------------------+", + "| a | b | c |", + "+-----+---+-------------------------------+", + "| 1 | a 
| 1970-01-01 00:00:00.000000008 |", + "| 2 | b | 1970-01-01 00:00:00.000000007 |", + "| 7 | c | 1970-01-01 00:00:00.000000006 |", + "| 9 | d | 1970-01-01 00:00:00.000000005 |", + "| 10 | e | 1970-01-01 00:00:00.000000040 |", + "| 100 | f | 1970-01-01 00:00:00.000000004 |", + "| 3 | f | 1970-01-01 00:00:00.000000008 |", + "| 200 | g | 1970-01-01 00:00:00.000000006 |", + "| 20 | g | 1970-01-01 00:00:00.000000060 |", + "| 700 | h | 1970-01-01 00:00:00.000000002 |", + "| 70 | h | 1970-01-01 00:00:00.000000020 |", + "| 900 | i | 1970-01-01 00:00:00.000000002 |", + "| 90 | i | 1970-01-01 00:00:00.000000020 |", + "| 300 | j | 1970-01-01 00:00:00.000000006 |", + "| 30 | j | 1970-01-01 00:00:00.000000060 |", + "+-----+---+-------------------------------+", + ], + ) + .await; + } + + async fn _test_merge(partitions: &[Vec], exp: &[&str]) { + let schema = partitions[0][0].schema(); let sort = vec![ PhysicalSortExpr { expr: col("b", &schema).unwrap(), @@ -727,12 +837,10 @@ mod tests { options: Default::default(), }, ]; - let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let exec = MemoryExec::try_new(partitions, schema, None).unwrap(); let merge = Arc::new(SortPreservingMergeExec::new(sort, Arc::new(exec), 1024)); let collected = collect(merge).await.unwrap(); - assert_eq!(collected.len(), 1); - assert_batches_eq!(exp, collected.as_slice()); } From 93670f58a9e9de41d40badee554dbbb50911237e Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 19 Jul 2021 21:35:13 +0800 Subject: [PATCH 273/329] impl fmt::Display for PlanType (#752) --- datafusion/src/logical_plan/plan.rs | 12 ++++++------ datafusion/src/physical_plan/explain.rs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 4749840ed4c11..42eaf8e559c92 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -819,14 +819,14 @@ pub enum PlanType { PhysicalPlan, } -impl From<&PlanType> for String { - fn from(t: &PlanType) -> Self { - match t { - PlanType::LogicalPlan => "logical_plan".into(), +impl fmt::Display for PlanType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + PlanType::LogicalPlan => write!(f, "logical_plan"), PlanType::OptimizedLogicalPlan { optimizer_name } => { - format!("logical_plan after {}", optimizer_name) + write!(f, "logical_plan after {}", optimizer_name) } - PlanType::PhysicalPlan => "physical_plan".into(), + PlanType::PhysicalPlan => write!(f, "physical_plan"), } } } diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index 3c5ef1af32366..c838ce4a94d46 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -104,7 +104,7 @@ impl ExecutionPlan for ExplainExec { let mut plan_builder = StringBuilder::new(self.stringified_plans.len()); for p in &self.stringified_plans { - type_builder.append_value(&String::from(&p.plan_type))?; + type_builder.append_value(&p.plan_type.to_string())?; plan_builder.append_value(&*p.plan)?; } From 6f9681d5a5657e330812404b5fb98a2df9205659 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 19 Jul 2021 21:39:36 +0800 Subject: [PATCH 274/329] update planner (#751) --- datafusion/src/optimizer/limit_push_down.rs | 57 ++++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs index 21b82a6c9698a..37c95a4436922 
100644 --- a/datafusion/src/optimizer/limit_push_down.rs +++ b/datafusion/src/optimizer/limit_push_down.rs @@ -17,13 +17,13 @@ //! Optimizer rule to push down LIMIT in the query plan //! It will push down through projection, limits (taking the smaller limit) -use std::sync::Arc; - use super::utils; use crate::error::Result; use crate::execution::context::ExecutionProps; use crate::logical_plan::LogicalPlan; use crate::optimizer::optimizer::OptimizerRule; +use std::sync::Arc; +use utils::optimize_explain; /// Optimization rule that tries pushes down LIMIT n /// where applicable to reduce the amount of scanned / processed data @@ -37,16 +37,42 @@ impl LimitPushDown { } fn limit_push_down( + optimizer: &LimitPushDown, upper_limit: Option, plan: &LogicalPlan, + execution_props: &ExecutionProps, ) -> Result { match (plan, upper_limit) { + ( + LogicalPlan::Explain { + verbose, + schema, + plan, + stringified_plans, + }, + _, + ) => { + let schema = schema.as_ref().to_owned().into(); + optimize_explain( + optimizer, + *verbose, + plan, + stringified_plans, + &schema, + execution_props, + ) + } (LogicalPlan::Limit { n, input }, upper_limit) => { let smallest = upper_limit.map(|x| std::cmp::min(x, *n)).unwrap_or(*n); Ok(LogicalPlan::Limit { n: smallest, // push down limit to plan (minimum of upper limit and current limit) - input: Arc::new(limit_push_down(Some(smallest), input.as_ref())?), + input: Arc::new(limit_push_down( + optimizer, + Some(smallest), + input.as_ref(), + execution_props, + )?), }) } ( @@ -80,7 +106,12 @@ fn limit_push_down( // Push down limit directly (projection doesn't change number of rows) Ok(LogicalPlan::Projection { expr: expr.clone(), - input: Arc::new(limit_push_down(upper_limit, input.as_ref())?), + input: Arc::new(limit_push_down( + optimizer, + upper_limit, + input.as_ref(), + execution_props, + )?), schema: schema.clone(), }) } @@ -98,7 +129,12 @@ fn limit_push_down( .map(|x| { Ok(LogicalPlan::Limit { n: upper_limit, - input: Arc::new(limit_push_down(Some(upper_limit), x)?), + input: Arc::new(limit_push_down( + optimizer, + Some(upper_limit), + x, + execution_props, + )?), }) }) .collect::>()?; @@ -117,7 +153,7 @@ fn limit_push_down( let inputs = plan.inputs(); let new_inputs = inputs .iter() - .map(|plan| limit_push_down(None, plan)) + .map(|plan| limit_push_down(optimizer, None, plan, execution_props)) .collect::>>()?; utils::from_plan(plan, &expr, &new_inputs) @@ -126,14 +162,19 @@ fn limit_push_down( } impl OptimizerRule for LimitPushDown { - fn optimize(&self, plan: &LogicalPlan, _: &ExecutionProps) -> Result { - limit_push_down(None, plan) + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { + limit_push_down(self, None, plan, execution_props) } fn name(&self) -> &str { "limit_push_down" } } + #[cfg(test)] mod test { use super::*; From 7019b0f3b6de025ed95f88492466663284ada8e7 Mon Sep 17 00:00:00 2001 From: Cui Wenzheng Date: Mon, 19 Jul 2021 23:43:57 +0800 Subject: [PATCH 275/329] step1 add option in ExecutionConfig to enable/disable parquet pruning (#749) --- datafusion/src/datasource/parquet.rs | 22 +++++++++++++++++++- datafusion/src/execution/context.rs | 18 ++++++++++++---- datafusion/src/physical_optimizer/pruning.rs | 4 ++-- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/datafusion/src/datasource/parquet.rs b/datafusion/src/datasource/parquet.rs index e53fbbdefd2f2..28f79a6ae8ddb 100644 --- a/datafusion/src/datasource/parquet.rs +++ b/datafusion/src/datasource/parquet.rs @@ -38,6 +38,7 @@ pub 
struct ParquetTable { schema: SchemaRef, statistics: Statistics, max_concurrency: usize, + enable_pruning: bool, } impl ParquetTable { @@ -51,6 +52,7 @@ impl ParquetTable { schema, statistics: parquet_exec.statistics().to_owned(), max_concurrency, + enable_pruning: true, }) } @@ -58,6 +60,17 @@ impl ParquetTable { pub fn path(&self) -> &str { &self.path } + + /// Get parquet pruning option + pub fn get_enable_pruning(&self) -> bool { + self.enable_pruning + } + + /// Set parquet pruning option + pub fn with_enable_pruning(mut self, enable_pruning: bool) -> Self { + self.enable_pruning = enable_pruning; + self + } } impl TableProvider for ParquetTable { @@ -86,7 +99,14 @@ impl TableProvider for ParquetTable { filters: &[Expr], limit: Option, ) -> Result> { - let predicate = combine_filters(filters); + // If enable pruning then combine the filters to build the predicate. + // If disable pruning then set the predicate to None, thus readers + // will not prune data based on the statistics. + let predicate = if self.enable_pruning { + combine_filters(filters) + } else { + None + }; Ok(Arc::new(ParquetExec::try_from_path( &self.path, projection.clone(), diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index d4d3a8a14ac69..bd939cef7035b 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -322,10 +322,11 @@ impl ExecutionContext { /// Registers a Parquet data source so that it can be referenced from SQL statements /// executed against this context. pub fn register_parquet(&mut self, name: &str, filename: &str) -> Result<()> { - let table = ParquetTable::try_new( - filename, - self.state.lock().unwrap().config.concurrency, - )?; + let table = { + let m = self.state.lock().unwrap(); + ParquetTable::try_new(filename, m.config.concurrency)? + .with_enable_pruning(m.config.parquet_pruning) + }; self.register_table(name, Arc::new(table))?; Ok(()) } @@ -633,6 +634,8 @@ pub struct ExecutionConfig { /// Should DataFusion repartition data using the partition keys to execute window functions in /// parallel using the provided `concurrency` level pub repartition_windows: bool, + /// Should Datafusion parquet reader using the predicate to prune data + parquet_pruning: bool, } impl Default for ExecutionConfig { @@ -663,6 +666,7 @@ impl Default for ExecutionConfig { repartition_joins: true, repartition_aggregations: true, repartition_windows: true, + parquet_pruning: true, } } } @@ -765,6 +769,12 @@ impl ExecutionConfig { self.repartition_windows = enabled; self } + + /// Enables or disables the use of pruning predicate for parquet readers to skip row groups + pub fn with_parquet_pruning(mut self, enabled: bool) -> Self { + self.parquet_pruning = enabled; + self + } } /// Holds per-execution properties and data (such as starting timestamps, etc). 
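[editor's note, not part of the patch] For readers following the new `parquet_pruning` option above, a minimal usage sketch of the setter this commit adds; `ExecutionConfig::new`, `ExecutionContext::with_config` and `register_parquet` are existing DataFusion APIs shown elsewhere in this series, and the table name and file path are illustrative only:

    // Hedged sketch: disable parquet row-group pruning for one context.
    // `with_parquet_pruning` is the builder method introduced by this patch.
    let config = ExecutionConfig::new().with_parquet_pruning(false);
    let mut ctx = ExecutionContext::with_config(config);
    // Parquet tables registered through this context are scanned without
    // pruning row groups from filter statistics (predicate is set to None).
    ctx.register_parquet("example_table", "/path/to/example.parquet")?;
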
diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 5585c4d08140a..36253815414ab 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -547,7 +547,7 @@ fn build_predicate_expression( // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND Err(_) => { - return Ok(logical_plan::lit(true)); + return Ok(unhandled); } }; let corrected_op = expr_builder.correct_operator(op); @@ -596,7 +596,7 @@ fn build_predicate_expression( .lt_eq(expr_builder.scalar_expr().clone()) } // other expressions are not supported - _ => logical_plan::lit(true), + _ => unhandled, }; Ok(statistics_expr) } From a4f6cdd64997617d3d32fac537615d19fa7cbe36 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 19 Jul 2021 23:47:23 +0800 Subject: [PATCH 276/329] fix arrow type id mapping (#742) --- python/src/dataframe.rs | 7 +--- python/src/to_rust.rs | 1 + python/src/types.rs | 19 ++------- python/tests/test_pa_types.py | 51 +++++++++++++++++++++++++ python/tests/test_string_functions.py | 55 +++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 python/tests/test_pa_types.py create mode 100644 python/tests/test_string_functions.py diff --git a/python/src/dataframe.rs b/python/src/dataframe.rs index 89c85f958c797..4a50262ec3292 100644 --- a/python/src/dataframe.rs +++ b/python/src/dataframe.rs @@ -159,12 +159,7 @@ impl DataFrame { } }; - let builder = errors::wrap(builder.join( - &right.plan, - join_type, - on.clone(), - on, - ))?; + let builder = errors::wrap(builder.join(&right.plan, join_type, on.clone(), on))?; let plan = errors::wrap(builder.build())?; diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs index 2e3f7f05ec588..e7957ec42d92f 100644 --- a/python/src/to_rust.rs +++ b/python/src/to_rust.rs @@ -48,6 +48,7 @@ pub fn to_rust(ob: &PyAny) -> PyResult { Ok(array) } +/// converts a pyarrow batch into a RecordBatch pub fn to_rust_batch(batch: &PyAny) -> PyResult { let schema = batch.getattr("schema")?; let names = schema.getattr("names")?.extract::>()?; diff --git a/python/src/types.rs b/python/src/types.rs index ffa822e073a89..bd6ef0d376e63 100644 --- a/python/src/types.rs +++ b/python/src/types.rs @@ -48,24 +48,13 @@ fn data_type_id(id: &i32) -> Result { 7 => DataType::Int32, 8 => DataType::UInt64, 9 => DataType::Int64, - 10 => DataType::Float16, 11 => DataType::Float32, 12 => DataType::Float64, - - //13 => DataType::Decimal, - - // 14 => DataType::Date32(), - // 15 => DataType::Date64(), - // 16 => DataType::Timestamp(), - // 17 => DataType::Time32(), - // 18 => DataType::Time64(), - // 19 => DataType::Duration() - 20 => DataType::Binary, - 21 => DataType::Utf8, - 22 => DataType::LargeBinary, - 23 => DataType::LargeUtf8, - + 13 => DataType::Utf8, + 14 => DataType::Binary, + 34 => DataType::LargeUtf8, + 35 => DataType::LargeBinary, other => { return Err(errors::DataFusionError::Common(format!( "The type {} is not valid", diff --git a/python/tests/test_pa_types.py b/python/tests/test_pa_types.py new file mode 100644 index 0000000000000..069343f8a45f4 --- /dev/null +++ b/python/tests/test_pa_types.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa + + +def test_type_ids(): + """having this fixed is very important because internally we rely on this id to parse from + python""" + for idx, arrow_type in [ + (0, pa.null()), + (1, pa.bool_()), + (2, pa.uint8()), + (3, pa.int8()), + (4, pa.uint16()), + (5, pa.int16()), + (6, pa.uint32()), + (7, pa.int32()), + (8, pa.uint64()), + (9, pa.int64()), + (10, pa.float16()), + (11, pa.float32()), + (12, pa.float64()), + (13, pa.string()), + (13, pa.utf8()), + (14, pa.binary()), + (16, pa.date32()), + (17, pa.date64()), + (18, pa.timestamp("us")), + (19, pa.time32("s")), + (20, pa.time64("us")), + (23, pa.decimal128(8, 1)), + (34, pa.large_utf8()), + (35, pa.large_binary()), + ]: + + assert idx == arrow_type.id diff --git a/python/tests/test_string_functions.py b/python/tests/test_string_functions.py new file mode 100644 index 0000000000000..f8e15578320b1 --- /dev/null +++ b/python/tests/test_string_functions.py @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import pyarrow as pa +import pytest +from datafusion import ExecutionContext +from datafusion import functions as f + + +@pytest.fixture +def df(): + ctx = ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array(["Hello", "World", "!"]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + + return ctx.create_dataframe([[batch]]) + + +def test_string_functions(df): + df = df.select(f.md5(f.col("a")), f.lower(f.col("a"))) + result = df.collect() + assert len(result) == 1 + result = result[0] + assert result.column(0) == pa.array( + [ + "8b1a9953c4611296a827abf8c47804d7", + "f5a7924e621e84c9280a9a27e1bcb7f6", + "9033e0e305f247c0c3c80d0c7848c8b3", + ] + ) + assert result.column(1) == pa.array( + [ + "hello", + "world", + "!", + ] + ) From b5e034b1a4f47a47de68d176b98042a1e4df7d58 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Tue, 20 Jul 2021 00:29:27 +0800 Subject: [PATCH 277/329] Remove unnecessary projection in logical plan optimization phase (#747) * eliminate super-set project Signed-off-by: Ruihang Xia * keep projection right before table scan Signed-off-by: Ruihang Xia * tidy Signed-off-by: Ruihang Xia --- .../src/optimizer/projection_push_down.rs | 69 ++++++++++++++++++- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 0272b9f7872cf..089dca2318c98 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -173,7 +173,17 @@ fn optimize_plan( true, execution_props, )?; - if new_fields.is_empty() { + + let new_required_columns_optimized = new_input + .schema() + .fields() + .iter() + .map(|f| f.qualified_column()) + .collect::>(); + + if new_fields.is_empty() + || (has_projection && &new_required_columns_optimized == required_columns) + { // no need for an expression at all Ok(new_input) } else { @@ -496,6 +506,60 @@ mod tests { Ok(()) } + #[test] + fn redundunt_project() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("a"), col("b"), col("c")])? + .project(vec![col("a"), col("c"), col("b")])? + .build()?; + let expected = "Projection: #test.a, #test.c, #test.b\ + \n TableScan: test projection=Some([0, 1, 2])"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + + #[test] + fn reorder_projection() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("c"), col("b"), col("a")])? + .build()?; + let expected = "Projection: #test.c, #test.b, #test.a\ + \n TableScan: test projection=Some([0, 1, 2])"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + + #[test] + fn noncontiguous_redundunt_projection() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .project(vec![col("c"), col("b"), col("a")])? + .filter(col("c").gt(lit(1)))? + .project(vec![col("c"), col("a"), col("b")])? + .filter(col("b").gt(lit(1)))? + .filter(col("a").gt(lit(1)))? + .project(vec![col("a"), col("c"), col("b")])? 
+ .build()?; + let expected = "Projection: #test.a, #test.c, #test.b\ + \n Filter: #test.a Gt Int32(1)\ + \n Filter: #test.b Gt Int32(1)\ + \n Filter: #test.c Gt Int32(1)\ + \n TableScan: test projection=Some([0, 1, 2])"; + + assert_optimized_plan_eq(&plan, expected); + + Ok(()) + } + #[test] fn join_schema_trim_full_join_column_projection() -> Result<()> { let table_scan = test_table_scan()?; @@ -812,8 +876,7 @@ mod tests { assert_fields_eq(&plan, vec!["c", "a", "MAX(test.b)"]); - let expected = "\ - Projection: #test.c, #test.a, #MAX(test.b)\ + let expected = "Projection: #test.c, #test.a, #MAX(test.b)\ \n Filter: #test.c Gt Int32(1)\ \n Aggregate: groupBy=[[#test.a, #test.c]], aggr=[[MAX(#test.b)]]\ \n TableScan: test projection=Some([0, 1, 2])"; From 3fb600df48ab1e53903b1a9bb12ebde33ad0856b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jul 2021 12:45:36 -0400 Subject: [PATCH 278/329] Show optimized physical and logical plans in EXPLAIN (#744) * Show optimized physical and logical plans in EXPLAIN * rewrite tests * reformat --- datafusion/src/logical_plan/builder.rs | 18 +++-- datafusion/src/logical_plan/plan.rs | 24 +++++-- datafusion/src/optimizer/utils.rs | 4 +- datafusion/src/physical_plan/explain.rs | 18 ++++- datafusion/src/physical_plan/planner.rs | 94 +++++++++++++++++-------- datafusion/src/sql/planner.rs | 4 +- datafusion/tests/sql.rs | 45 ++++++++++-- datafusion/tests/user_defined_plan.rs | 6 +- 8 files changed, 158 insertions(+), 55 deletions(-) diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 85c4aea99ff5f..0335e29127ab9 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -399,7 +399,7 @@ impl LogicalPlanBuilder { /// Create an expression to represent the explanation of the plan pub fn explain(&self, verbose: bool) -> Result { let stringified_plans = vec![StringifiedPlan::new( - PlanType::LogicalPlan, + PlanType::InitialLogicalPlan, format!("{:#?}", self.plan.clone()), )]; @@ -740,14 +740,24 @@ mod tests { #[test] fn stringified_plan() { let stringified_plan = - StringifiedPlan::new(PlanType::LogicalPlan, "...the plan..."); + StringifiedPlan::new(PlanType::InitialLogicalPlan, "...the plan..."); + assert!(stringified_plan.should_display(true)); + assert!(!stringified_plan.should_display(false)); // not in non verbose mode + + let stringified_plan = + StringifiedPlan::new(PlanType::FinalLogicalPlan, "...the plan..."); assert!(stringified_plan.should_display(true)); assert!(stringified_plan.should_display(false)); // display in non verbose mode too let stringified_plan = - StringifiedPlan::new(PlanType::PhysicalPlan, "...the plan..."); + StringifiedPlan::new(PlanType::InitialPhysicalPlan, "...the plan..."); assert!(stringified_plan.should_display(true)); - assert!(!stringified_plan.should_display(false)); + assert!(!stringified_plan.should_display(false)); // not in non verbose mode + + let stringified_plan = + StringifiedPlan::new(PlanType::FinalPhysicalPlan, "...the plan..."); + assert!(stringified_plan.should_display(true)); + assert!(stringified_plan.should_display(false)); // display in non verbose mode let stringified_plan = StringifiedPlan::new( PlanType::OptimizedLogicalPlan { diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 42eaf8e559c92..9a4daae27ff56 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -805,28 +805,35 @@ impl fmt::Debug for LogicalPlan { } } -/// Represents 
which type of plan +/// Represents which type of plan, when storing multiple +/// for use in EXPLAIN plans #[derive(Debug, Clone, PartialEq)] pub enum PlanType { /// The initial LogicalPlan provided to DataFusion - LogicalPlan, + InitialLogicalPlan, /// The LogicalPlan which results from applying an optimizer pass OptimizedLogicalPlan { /// The name of the optimizer which produced this plan optimizer_name: String, }, - /// The physical plan, prepared for execution - PhysicalPlan, + /// The final, fully optimized LogicalPlan that was converted to a physical plan + FinalLogicalPlan, + /// The initial physical plan, prepared for execution + InitialPhysicalPlan, + /// The final, fully optimized physical which would be executed + FinalPhysicalPlan, } impl fmt::Display for PlanType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - PlanType::LogicalPlan => write!(f, "logical_plan"), + PlanType::InitialLogicalPlan => write!(f, "initial_logical_plan"), PlanType::OptimizedLogicalPlan { optimizer_name } => { write!(f, "logical_plan after {}", optimizer_name) } - PlanType::PhysicalPlan => write!(f, "physical_plan"), + PlanType::FinalLogicalPlan => write!(f, "logical_plan"), + PlanType::InitialPhysicalPlan => write!(f, "initial_physical_plan"), + PlanType::FinalPhysicalPlan => write!(f, "physical_plan"), } } } @@ -854,7 +861,10 @@ impl StringifiedPlan { /// returns true if this plan should be displayed. Generally /// `verbose_mode = true` will display all available plans pub fn should_display(&self, verbose_mode: bool) -> bool { - self.plan_type == PlanType::LogicalPlan || verbose_mode + match self.plan_type { + PlanType::FinalLogicalPlan | PlanType::FinalPhysicalPlan => true, + _ => verbose_mode, + } } } diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 1d19f0681b350..88380ea17c875 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -542,7 +542,7 @@ mod tests { &optimizer, true, &empty_plan, - &[StringifiedPlan::new(PlanType::LogicalPlan, "...")], + &[StringifiedPlan::new(PlanType::InitialLogicalPlan, "...")], schema.as_ref(), &ExecutionProps::new(), )?; @@ -556,7 +556,7 @@ mod tests { assert!(*verbose); let expected_stringified_plans = vec![ - StringifiedPlan::new(PlanType::LogicalPlan, "..."), + StringifiedPlan::new(PlanType::InitialLogicalPlan, "..."), StringifiedPlan::new( PlanType::OptimizedLogicalPlan { optimizer_name: "test_optimizer".into(), diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index c838ce4a94d46..195a7a518370a 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -40,14 +40,21 @@ pub struct ExplainExec { schema: SchemaRef, /// The strings to be printed stringified_plans: Vec, + /// control which plans to print + verbose: bool, } impl ExplainExec { /// Create a new ExplainExec - pub fn new(schema: SchemaRef, stringified_plans: Vec) -> Self { + pub fn new( + schema: SchemaRef, + stringified_plans: Vec, + verbose: bool, + ) -> Self { ExplainExec { schema, stringified_plans, + verbose, } } @@ -103,8 +110,13 @@ impl ExecutionPlan for ExplainExec { let mut type_builder = StringBuilder::new(self.stringified_plans.len()); let mut plan_builder = StringBuilder::new(self.stringified_plans.len()); - for p in &self.stringified_plans { - type_builder.append_value(&p.plan_type.to_string())?; + let plans_to_print = self + .stringified_plans + .iter() + .filter(|s| s.should_display(self.verbose)); + + 
for p in plans_to_print { + type_builder.append_value(p.plan_type.to_string())?; plan_builder.append_value(&*p.plan)?; } diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index df4168370003a..5163e4b425b4f 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -240,8 +240,13 @@ impl PhysicalPlanner for DefaultPhysicalPlanner { logical_plan: &LogicalPlan, ctx_state: &ExecutionContextState, ) -> Result> { - let plan = self.create_initial_plan(logical_plan, ctx_state)?; - self.optimize_plan(plan, ctx_state) + match self.handle_explain(logical_plan, ctx_state)? { + Some(plan) => Ok(plan), + None => { + let plan = self.create_initial_plan(logical_plan, ctx_state)?; + self.optimize_plan(plan, ctx_state) + } + } } /// Create a physical expression from a logical expression @@ -280,7 +285,7 @@ impl DefaultPhysicalPlanner { Self { extension_planners } } - /// Optimize a physical plan + /// Optimize a physical plan by applying each physical optimizer fn optimize_plan( &self, plan: Arc, @@ -749,32 +754,9 @@ impl DefaultPhysicalPlanner { "Unsupported logical plan: CreateExternalTable".to_string(), )) } - LogicalPlan::Explain { - verbose, - plan, - stringified_plans, - schema, - } => { - let input = self.create_initial_plan(plan, ctx_state)?; - - let mut stringified_plans = stringified_plans - .iter() - .filter(|s| s.should_display(*verbose)) - .cloned() - .collect::>(); - - // add in the physical plan if requested - if *verbose { - stringified_plans.push(StringifiedPlan::new( - PlanType::PhysicalPlan, - displayable(input.as_ref()).indent().to_string(), - )); - } - Ok(Arc::new(ExplainExec::new( - SchemaRef::new(schema.as_ref().to_owned().into()), - stringified_plans, - ))) - } + LogicalPlan::Explain { .. 
} => Err(DataFusionError::Internal( + "Unsupported logical plan: Explain must be root of the plan".to_string(), + )), LogicalPlan::Extension { node } => { let physical_inputs = node .inputs() @@ -1315,6 +1297,60 @@ impl DefaultPhysicalPlanner { options, }) } + + /// Handles capturing the various plans for EXPLAIN queries + /// + /// Returns + /// Some(plan) if optimized, and None if logical_plan was not an + /// explain (and thus needs to be optimized as normal) + fn handle_explain( + &self, + logical_plan: &LogicalPlan, + ctx_state: &ExecutionContextState, + ) -> Result>> { + if let LogicalPlan::Explain { + verbose, + plan, + stringified_plans, + schema, + } = logical_plan + { + let final_logical_plan = StringifiedPlan::new( + PlanType::FinalLogicalPlan, + plan.display_indent().to_string(), + ); + + let input = self.create_initial_plan(plan, ctx_state)?; + + let initial_physical_plan = StringifiedPlan::new( + PlanType::InitialPhysicalPlan, + displayable(input.as_ref()).indent().to_string(), + ); + + let input = self.optimize_plan(input, ctx_state)?; + + let final_physical_plan = StringifiedPlan::new( + PlanType::FinalPhysicalPlan, + displayable(input.as_ref()).indent().to_string(), + ); + + let stringified_plans = stringified_plans + .iter() + .cloned() + .chain(std::iter::once(final_logical_plan)) + .chain(std::iter::once(initial_physical_plan)) + .chain(std::iter::once(final_physical_plan)) + .collect::>(); + + Ok(Some(Arc::new(ExplainExec::new( + SchemaRef::new(schema.as_ref().to_owned().into()), + stringified_plans, + *verbose, + )))) + } else { + Ok(None) + } + } } fn tuple_err(value: (Result, Result)) -> Result<(T, R)> { diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 1437346fccd3a..a4bb02cf0f9a9 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -234,8 +234,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let plan = self.sql_statement_to_plan(statement)?; let stringified_plans = vec![StringifiedPlan::new( - PlanType::LogicalPlan, - format!("{:#?}", plan), + PlanType::InitialLogicalPlan, + plan.display_indent().to_string(), )]; let schema = LogicalPlan::explain_schema(); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 9c7d0795edb91..875e982551185 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1972,17 +1972,28 @@ async fn csv_explain() { register_aggregate_csv_by_sql(&mut ctx).await; let sql = "EXPLAIN SELECT c1 FROM aggregate_test_100 where c2 > 10"; let actual = execute(&mut ctx, sql).await; - let expected = vec![vec![ - "logical_plan", - "Projection: #aggregate_test_100.c1\ - \n Filter: #aggregate_test_100.c2 Gt Int64(10)\ - \n TableScan: aggregate_test_100 projection=None", + let actual = normalize_vec_for_explain(actual); + let expected = vec![ + vec![ + "logical_plan", + "Projection: #aggregate_test_100.c1\ + \n Filter: #aggregate_test_100.c2 Gt Int64(10)\ + \n TableScan: aggregate_test_100 projection=Some([0, 1])" + ], + vec!["physical_plan", + "ProjectionExec: expr=[c1@0 as c1]\ + \n CoalesceBatchesExec: target_batch_size=4096\ + \n FilterExec: CAST(c2@1 AS Int64) > 10\ + \n RepartitionExec: partitioning=RoundRobinBatch(NUM_CORES)\ + \n CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true\ + \n" ]]; assert_eq!(expected, actual); // Also, expect same result with lowercase explain let sql = "explain SELECT c1 FROM aggregate_test_100 where c2 > 10"; let actual = execute(&mut ctx, sql).await; + 
let actual = normalize_vec_for_explain(actual); assert_eq!(expected, actual); } @@ -3921,3 +3932,27 @@ async fn test_aggregation_with_bad_arguments() -> Result<()> { assert_eq!(err.to_string(), "Error during planning: Invalid or wrong number of arguments passed to aggregate: 'COUNT(DISTINCT )'"); Ok(()) } + +// Normalizes parts of an explain plan that vary from run to run (such as path) +fn normalize_for_explain(s: &str) -> String { + // Convert things like /Users/alamb/Software/arrow/testing/data/csv/aggregate_test_100.csv + // to ARROW_TEST_DATA/csv/aggregate_test_100.csv + let data_path = datafusion::test_util::arrow_test_data(); + let s = s.replace(&data_path, "ARROW_TEST_DATA"); + + // convert things like partitioning=RoundRobinBatch(16) + // to partitioning=RoundRobinBatch(NUM_CORES) + let needle = format!("RoundRobinBatch({})", num_cpus::get()); + s.replace(&needle, "RoundRobinBatch(NUM_CORES)") +} + +/// Applies normalize_for_explain to every line +fn normalize_vec_for_explain(v: Vec>) -> Vec> { + v.into_iter() + .map(|l| { + l.into_iter() + .map(|s| normalize_for_explain(&s)) + .collect::>() + }) + .collect::>() +} diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index 21b49638d23a1..e1f8c767bd8d5 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -163,9 +163,9 @@ async fn topk_plan() -> Result<()> { let mut ctx = setup_table(make_topk_context()).await?; let expected = vec![ - "| logical_plan after topk | TopK: k=3 |", - "| | Projection: #sales.customer_id, #sales.revenue |", - "| | TableScan: sales projection=Some([0, 1]) |", + "| logical_plan after topk | TopK: k=3 |", + "| | Projection: #sales.customer_id, #sales.revenue |", + "| | TableScan: sales projection=Some([0, 1]) |", ].join("\n"); let explain_query = format!("EXPLAIN VERBOSE {}", QUERY); From 8a17c183e044451ff9e58a1be10536098680951d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jul 2021 15:12:40 -0400 Subject: [PATCH 279/329] Move assert_batches_eq! macros to test_utils.rs (#746) * Move assert_batches_eq! macros to test_utils.rs * port test --- datafusion/src/test/mod.rs | 69 ----------------------------------- datafusion/src/test_util.rs | 71 ++++++++++++++++++++++++++++++++++++- datafusion/tests/sql.rs | 34 +++++++++++------- 3 files changed, 92 insertions(+), 82 deletions(-) diff --git a/datafusion/src/test/mod.rs b/datafusion/src/test/mod.rs index b791551133e7e..e9a33745eeeb8 100644 --- a/datafusion/src/test/mod.rs +++ b/datafusion/src/test/mod.rs @@ -280,72 +280,3 @@ pub fn make_timestamps() -> RecordBatch { pub mod exec; pub mod user_defined; pub mod variable; - -/// Compares formatted output of a record batch with an expected -/// vector of strings, with the result of pretty formatting record -/// batches. This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules! 
assert_batches_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let expected_lines: Vec = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); - - let actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} - -/// Compares formatted output of a record batch with an expected -/// vector of strings in a way that order does not matter. -/// This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules! assert_batches_sorted_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let mut expected_lines: Vec = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - // sort except for header + footer - let num_lines = expected_lines.len(); - if num_lines > 3 { - expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); - // fix for windows: \r\n --> - - let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - // sort except for header + footer - let num_lines = actual_lines.len(); - if num_lines > 3 { - actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} diff --git a/datafusion/src/test_util.rs b/datafusion/src/test_util.rs index e96e8e0c209f7..0c9498acf9207 100644 --- a/datafusion/src/test_util.rs +++ b/datafusion/src/test_util.rs @@ -15,10 +15,79 @@ // specific language governing permissions and limitations // under the License. -//! Utils to make testing easier +//! Utility functions to make testing DataFusion based crates easier use std::{env, error::Error, path::PathBuf}; +/// Compares formatted output of a record batch with an expected +/// vector of strings, with the result of pretty formatting record +/// batches. This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// +/// `assert_batch_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! assert_batches_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let expected_lines: Vec = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + + let actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Compares formatted output of a record batch with an expected +/// vector of strings in a way that order does not matter. +/// This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// +/// `assert_batch_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! 
assert_batches_sorted_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let mut expected_lines: Vec = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + // sort except for header + footer + let num_lines = expected_lines.len(); + if num_lines > 3 { + expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + let formatted = arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap(); + // fix for windows: \r\n --> + + let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + /// Returns the arrow test data directory, which is by default stored /// in a git submodule rooted at `testing/data`. /// diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 875e982551185..95b5596eb9f17 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -35,6 +35,7 @@ use arrow::{ util::display::array_value_to_string, }; +use datafusion::assert_batches_eq; use datafusion::logical_plan::LogicalPlan; use datafusion::prelude::*; use datafusion::{ @@ -112,19 +113,23 @@ async fn parquet_query() { // NOTE that string_col is actually a binary column and does not have the UTF8 logical type // so we need an explicit cast let sql = "SELECT id, CAST(string_col AS varchar) FROM alltypes_plain"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["4", "0"], - vec!["5", "1"], - vec!["6", "0"], - vec!["7", "1"], - vec!["2", "0"], - vec!["3", "1"], - vec!["0", "0"], - vec!["1", "1"], + "+----+--------------------------+", + "| id | CAST(string_col AS Utf8) |", + "+----+--------------------------+", + "| 4 | 0 |", + "| 5 | 1 |", + "| 6 | 0 |", + "| 7 | 1 |", + "| 2 | 0 |", + "| 3 | 1 |", + "| 0 | 0 |", + "| 1 | 1 |", + "+----+--------------------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); } #[tokio::test] @@ -2487,7 +2492,7 @@ fn register_alltypes_parquet(ctx: &mut ExecutionContext) { /// Execute query and return result set as 2-d table of Vecs /// `result[row][column]` -async fn execute(ctx: &mut ExecutionContext, sql: &str) -> Vec> { +async fn execute_to_batches(ctx: &mut ExecutionContext, sql: &str) -> Vec { let msg = format!("Creating logical plan for '{}'", sql); let plan = ctx.create_logical_plan(sql).expect(&msg); let logical_schema = plan.schema(); @@ -2503,8 +2508,13 @@ async fn execute(ctx: &mut ExecutionContext, sql: &str) -> Vec> { let results = collect(plan).await.expect(&msg); assert_eq!(logical_schema.as_ref(), optimized_logical_schema.as_ref()); + results +} - result_vec(&results) +/// Execute query and return result set as 2-d table of Vecs +/// `result[row][column]` +async fn execute(ctx: &mut ExecutionContext, sql: &str) -> Vec> { + result_vec(&execute_to_batches(ctx, sql).await) } /// Specialised String representation From c51e9ece1893c08cb0a605740356d356e9168052 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 20 Jul 2021 03:37:43 +0800 Subject: [PATCH 280/329] update `python` crate to support latest pyo3 syntax and gil sematics (#741) * update dependencies * rename macros * update pyo3 deprecate --- python/src/functions.rs | 4 +- python/src/to_py.rs | 27 ++++---- python/src/udaf.rs | 136 
+++++++++++++++++++--------------------- python/src/udf.rs | 52 ++++++++------- 4 files changed, 103 insertions(+), 116 deletions(-) diff --git a/python/src/functions.rs b/python/src/functions.rs index b03004fae4312..9b60fdb73e0c2 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -25,7 +25,7 @@ use std::sync::Arc; /// Expression representing a column on the existing plan. #[pyfunction] -#[text_signature = "(name)"] +#[pyo3(text_signature = "(name)")] fn col(name: &str) -> expression::Expression { expression::Expression { expr: logical_plan::col(name), @@ -34,7 +34,7 @@ fn col(name: &str) -> expression::Expression { /// Expression representing a constant value #[pyfunction] -#[text_signature = "(value)"] +#[pyo3(text_signature = "(value)")] fn lit(value: i32) -> expression::Expression { expression::Expression { expr: logical_plan::lit(value), diff --git a/python/src/to_py.rs b/python/src/to_py.rs index ff03e03325258..6bc0581c8c70a 100644 --- a/python/src/to_py.rs +++ b/python/src/to_py.rs @@ -15,15 +15,14 @@ // specific language governing permissions and limitations // under the License. +use datafusion::arrow::array::ArrayRef; +use datafusion::arrow::record_batch::RecordBatch; use libc::uintptr_t; use pyo3::prelude::*; +use pyo3::types::PyList; use pyo3::PyErr; - use std::convert::From; -use datafusion::arrow::array::ArrayRef; -use datafusion::arrow::record_batch::RecordBatch; - use crate::errors; pub fn to_py_array(array: &ArrayRef, py: Python) -> PyResult { @@ -64,15 +63,13 @@ fn to_py_batch<'a>( /// Converts a &[RecordBatch] into a Vec represented in PyArrow pub fn to_py(batches: &[RecordBatch]) -> PyResult { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - let pyarrow = PyModule::import(py, "pyarrow")?; - let builtins = PyModule::import(py, "builtins")?; - - let mut py_batches = vec![]; - for batch in batches { - py_batches.push(to_py_batch(batch, py, pyarrow)?); - } - let result = builtins.call1("list", (py_batches,))?; - Ok(PyObject::from(result)) + Python::with_gil(|py| { + let pyarrow = PyModule::import(py, "pyarrow")?; + let mut py_batches = vec![]; + for batch in batches { + py_batches.push(to_py_batch(batch, py, pyarrow)?); + } + let list = PyList::new(py, py_batches); + Ok(PyObject::from(list)) + }) } diff --git a/python/src/udaf.rs b/python/src/udaf.rs index 3ce223df9a491..83e8be05db603 100644 --- a/python/src/udaf.rs +++ b/python/src/udaf.rs @@ -44,18 +44,17 @@ impl PyAccumulator { impl Accumulator for PyAccumulator { fn state(&self) -> Result> { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let state = self - .accum - .as_ref(py) - .call_method0("to_scalars") - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))? - .extract::>() - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(state.into_iter().map(|v| v.scalar).collect::>()) + Python::with_gil(|py| { + let state = self + .accum + .as_ref(py) + .call_method0("to_scalars") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))? 
+ .extract::>() + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(state.into_iter().map(|v| v.scalar).collect::>()) + }) } fn update(&mut self, _values: &[ScalarValue]) -> Result<()> { @@ -69,66 +68,60 @@ impl Accumulator for PyAccumulator { } fn evaluate(&self) -> Result { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let value = self - .accum - .as_ref(py) - .call_method0("evaluate") - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - to_rust_scalar(value) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) + Python::with_gil(|py| { + let value = self + .accum + .as_ref(py) + .call_method0("evaluate") + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + to_rust_scalar(value) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) + }) } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast args to Pyarrow array - // 2. call function - - // 1. - let py_args = values - .iter() - .map(|arg| { - // remove unwrap - to_py_array(arg, py).unwrap() - }) - .collect::>(); - let py_args = PyTuple::new(py, py_args); - - // update accumulator - self.accum - .as_ref(py) - .call_method1("update", py_args) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(()) + Python::with_gil(|py| { + // 1. cast args to Pyarrow array + // 2. call function + + // 1. + let py_args = values + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // update accumulator + self.accum + .as_ref(py) + .call_method1("update", py_args) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + }) } fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast states to Pyarrow array - // 2. merge - let state = &states[0]; - - let state = to_py_array(state, py) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - // 2. - self.accum - .as_ref(py) - .call_method1("merge", (state,)) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; - - Ok(()) + Python::with_gil(|py| { + // 1. cast states to Pyarrow array + // 2. merge + let state = &states[0]; + + let state = to_py_array(state, py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + // 2. 
+ self.accum + .as_ref(py) + .call_method1("merge", (state,)) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + + Ok(()) + }) } } @@ -136,12 +129,11 @@ pub fn array_udaf( accumulator: PyObject, ) -> Arc Result> + Send + Sync> { Arc::new(move || -> Result> { - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - let accumulator = accumulator - .call0(py) - .map_err(|e| InnerDataFusionError::Execution(format!("{}", e)))?; + let accumulator = Python::with_gil(|py| { + accumulator + .call0(py) + .map_err(|e| InnerDataFusionError::Execution(format!("{}", e))) + })?; Ok(Box::new(PyAccumulator::new(accumulator))) }) } diff --git a/python/src/udf.rs b/python/src/udf.rs index 7fee71008ef2f..49a18d9932412 100644 --- a/python/src/udf.rs +++ b/python/src/udf.rs @@ -30,33 +30,31 @@ use crate::to_rust::to_rust; pub fn array_udf(func: PyObject) -> ScalarFunctionImplementation { make_scalar_function( move |args: &[array::ArrayRef]| -> Result { - // get GIL - let gil = pyo3::Python::acquire_gil(); - let py = gil.python(); - - // 1. cast args to Pyarrow arrays - // 2. call function - // 3. cast to arrow::array::Array - - // 1. - let py_args = args - .iter() - .map(|arg| { - // remove unwrap - to_py_array(arg, py).unwrap() - }) - .collect::>(); - let py_args = PyTuple::new(py, py_args); - - // 2. - let value = func.as_ref(py).call(py_args, None); - let value = match value { - Ok(n) => Ok(n), - Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), - }?; - - let array = to_rust(value).unwrap(); - Ok(array) + Python::with_gil(|py| { + // 1. cast args to Pyarrow arrays + // 2. call function + // 3. cast to arrow::array::Array + + // 1. + let py_args = args + .iter() + .map(|arg| { + // remove unwrap + to_py_array(arg, py).unwrap() + }) + .collect::>(); + let py_args = PyTuple::new(py, py_args); + + // 2. 
+ let value = func.as_ref(py).call(py_args, None); + let value = match value { + Ok(n) => Ok(n), + Err(error) => Err(DataFusionError::Execution(format!("{:?}", error))), + }?; + + let array = to_rust(value).unwrap(); + Ok(array) + }) }, ) } From e4df37a4001423909964348289360da66acdd0a3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 19 Jul 2021 17:58:14 -0600 Subject: [PATCH 281/329] Ballista: Prep for fixing shuffle mechansim, part 1 (#738) --- ballista/rust/core/proto/ballista.proto | 12 + .../src/execution_plans/shuffle_writer.rs | 252 ++++++++++-------- .../core/src/serde/physical_plan/to_proto.rs | 38 +-- .../core/src/serde/scheduler/from_proto.rs | 1 + ballista/rust/core/src/serde/scheduler/mod.rs | 7 +- .../rust/core/src/serde/scheduler/to_proto.rs | 1 + ballista/rust/executor/src/execution_loop.rs | 12 +- ballista/rust/executor/src/executor.rs | 21 +- ballista/rust/scheduler/src/planner.rs | 44 ++- ballista/rust/scheduler/src/state/mod.rs | 117 +++++--- 10 files changed, 316 insertions(+), 189 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 0575460cfca35..50bd901f145d2 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -721,6 +721,7 @@ message PartitionLocation { PartitionId partition_id = 1; ExecutorMetadata executor_meta = 2; PartitionStats partition_stats = 3; + string path = 4; } // Unique identifier for a materialized partition of data @@ -776,6 +777,17 @@ message FailedTask { message CompletedTask { string executor_id = 1; + // TODO tasks are currently always shuffle writes but this will not always be the case + // so we might want to think about some refactoring of the task definitions + repeated ShuffleWritePartition partitions = 2; +} + +message ShuffleWritePartition { + uint64 partition_id = 1; + string path = 2; + uint64 num_batches = 3; + uint64 num_rows = 4; + uint64 num_bytes = 5; } message TaskStatus { diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index d5c7d8f284496..47bf2a25f6365 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -31,7 +31,8 @@ use crate::error::BallistaError; use crate::memory_stream::MemoryStream; use crate::utils; -use crate::serde::scheduler::PartitionStats; +use crate::serde::protobuf::ShuffleWritePartition; +use crate::serde::scheduler::{PartitionLocation, PartitionStats}; use async_trait::async_trait; use datafusion::arrow::array::{ Array, ArrayBuilder, ArrayRef, StringBuilder, StructBuilder, UInt32Builder, @@ -39,16 +40,19 @@ use datafusion::arrow::array::{ }; use datafusion::arrow::compute::take; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::ipc::reader::FileReader; use datafusion::arrow::ipc::writer::FileWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; use datafusion::physical_plan::hash_join::create_hashes; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::Partitioning::RoundRobinBatch; use datafusion::physical_plan::{ DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, SQLMetric, }; use futures::StreamExt; use hashbrown::HashMap; -use log::info; +use log::{debug, info}; use uuid::Uuid; /// ShuffleWriterExec represents a section of a query plan that has consistent partitioning and @@ 
-75,12 +79,16 @@ pub struct ShuffleWriterExec { struct ShuffleWriteMetrics { /// Time spend writing batches to shuffle files write_time: Arc, + input_rows: Arc, + output_rows: Arc, } impl ShuffleWriteMetrics { fn new() -> Self { Self { write_time: SQLMetric::time_nanos(), + input_rows: SQLMetric::counter(), + output_rows: SQLMetric::counter(), } } } @@ -113,50 +121,19 @@ impl ShuffleWriterExec { pub fn stage_id(&self) -> usize { self.stage_id } -} - -#[async_trait] -impl ExecutionPlan for ShuffleWriterExec { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.plan.schema() - } - - fn output_partitioning(&self) -> Partitioning { - match &self.shuffle_output_partitioning { - Some(p) => p.clone(), - _ => Partitioning::UnknownPartitioning(1), - } - } - fn children(&self) -> Vec> { - vec![self.plan.clone()] + /// Get the true output partitioning + pub fn shuffle_output_partitioning(&self) -> Option<&Partitioning> { + self.shuffle_output_partitioning.as_ref() } - fn with_new_children( + pub async fn execute_shuffle_write( &self, - children: Vec>, - ) -> Result> { - assert!(children.len() == 1); - Ok(Arc::new(ShuffleWriterExec::try_new( - self.job_id.clone(), - self.stage_id, - children[0].clone(), - self.work_dir.clone(), - self.shuffle_output_partitioning.clone(), - )?)) - } - - async fn execute( - &self, - partition: usize, - ) -> Result>> { + input_partition: usize, + ) -> Result> { let now = Instant::now(); - let mut stream = self.plan.execute(partition).await?; + let mut stream = self.plan.execute(input_partition).await?; let mut path = PathBuf::from(&self.work_dir); path.push(&self.job_id); @@ -164,7 +141,7 @@ impl ExecutionPlan for ShuffleWriterExec { match &self.shuffle_output_partitioning { None => { - path.push(&format!("{}", partition)); + path.push(&format!("{}", input_partition)); std::fs::create_dir_all(&path)?; path.push("data.arrow"); let path = path.to_str().unwrap(); @@ -181,29 +158,18 @@ impl ExecutionPlan for ShuffleWriterExec { info!( "Executed partition {} in {} seconds. 
Statistics: {}", - partition, + input_partition, now.elapsed().as_secs(), stats ); - let schema = result_schema(); - - // build result set with summary of the partition execution status - let mut part_builder = UInt32Builder::new(1); - part_builder.append_value(partition as u32)?; - let part: ArrayRef = Arc::new(part_builder.finish()); - - let mut path_builder = StringBuilder::new(1); - path_builder.append_value(&path)?; - let path: ArrayRef = Arc::new(path_builder.finish()); - - let stats: ArrayRef = stats - .to_arrow_arrayref() - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let batch = RecordBatch::try_new(schema.clone(), vec![part, path, stats]) - .map_err(DataFusionError::ArrowError)?; - - Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) + Ok(vec![ShuffleWritePartition { + partition_id: input_partition as u64, + path: path.to_owned(), + num_batches: stats.num_batches.unwrap_or(0), + num_rows: stats.num_rows.unwrap_or(0), + num_bytes: stats.num_bytes.unwrap_or(0), + }]) } Some(Partitioning::Hash(exprs, n)) => { @@ -218,8 +184,12 @@ impl ExecutionPlan for ShuffleWriterExec { let hashes_buf = &mut vec![]; let random_state = ahash::RandomState::with_seeds(0, 0, 0, 0); + while let Some(result) = stream.next().await { let input_batch = result?; + + self.metrics.input_rows.add(input_batch.num_rows()); + let arrays = exprs .iter() .map(|expr| { @@ -241,6 +211,7 @@ impl ExecutionPlan for ShuffleWriterExec { indices.into_iter().enumerate() { let indices = partition_indices.into(); + // Produce batches based on indices let columns = input_batch .columns() @@ -255,7 +226,8 @@ impl ExecutionPlan for ShuffleWriterExec { let output_batch = RecordBatch::try_new(input_batch.schema(), columns)?; - // write batch out + // write non-empty batch out + //if output_batch.num_rows() > 0 { let start = Instant::now(); match &mut writers[output_partition] { Some(w) => { @@ -266,7 +238,7 @@ impl ExecutionPlan for ShuffleWriterExec { path.push(&format!("{}", output_partition)); std::fs::create_dir_all(&path)?; - path.push("data.arrow"); + path.push(format!("data-{}.arrow", input_partition)); let path = path.to_str().unwrap(); info!("Writing results to {}", path); @@ -277,58 +249,39 @@ impl ExecutionPlan for ShuffleWriterExec { writers[output_partition] = Some(writer); } } + self.metrics.output_rows.add(output_batch.num_rows()); self.metrics.write_time.add_elapsed(start); + //} } } - // build metadata result batch - let num_writers = writers.iter().filter(|w| w.is_some()).count(); - let mut partition_builder = UInt32Builder::new(num_writers); - let mut path_builder = StringBuilder::new(num_writers); - let mut num_rows_builder = UInt64Builder::new(num_writers); - let mut num_batches_builder = UInt64Builder::new(num_writers); - let mut num_bytes_builder = UInt64Builder::new(num_writers); + let mut part_locs = vec![]; for (i, w) in writers.iter_mut().enumerate() { match w { Some(w) => { w.finish()?; - path_builder.append_value(w.path())?; - partition_builder.append_value(i as u32)?; - num_rows_builder.append_value(w.num_rows)?; - num_batches_builder.append_value(w.num_batches)?; - num_bytes_builder.append_value(w.num_bytes)?; + info!( + "Finished writing shuffle partition {} at {}. Batches: {}. Rows: {}. 
Bytes: {}.", + i, + w.path(), + w.num_batches, + w.num_rows, + w.num_bytes + ); + + part_locs.push(ShuffleWritePartition { + partition_id: i as u64, + path: w.path().to_owned(), + num_batches: w.num_batches, + num_rows: w.num_rows, + num_bytes: w.num_bytes, + }); } None => {} } } - - // build arrays - let partition_num: ArrayRef = Arc::new(partition_builder.finish()); - let path: ArrayRef = Arc::new(path_builder.finish()); - let field_builders: Vec> = vec![ - Box::new(num_rows_builder), - Box::new(num_batches_builder), - Box::new(num_bytes_builder), - ]; - let mut stats_builder = StructBuilder::new( - PartitionStats::default().arrow_struct_fields(), - field_builders, - ); - for _ in 0..num_writers { - stats_builder.append(true)?; - } - let stats = Arc::new(stats_builder.finish()); - - // build result batch containing metadata - let schema = result_schema(); - let batch = RecordBatch::try_new( - schema.clone(), - vec![partition_num, path, stats], - ) - .map_err(DataFusionError::ArrowError)?; - - Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) + Ok(part_locs) } _ => Err(DataFusionError::Execution( @@ -336,9 +289,98 @@ impl ExecutionPlan for ShuffleWriterExec { )), } } +} + +#[async_trait] +impl ExecutionPlan for ShuffleWriterExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.plan.schema() + } + + fn output_partitioning(&self) -> Partitioning { + // This operator needs to be executed once for each *input* partition and there + // isn't really a mechanism yet in DataFusion to support this use case so we report + // the input partitioning as the output partitioning here. The executor reports + // output partition meta data back to the scheduler. + self.plan.output_partitioning() + } + + fn children(&self) -> Vec> { + vec![self.plan.clone()] + } + + fn with_new_children( + &self, + children: Vec>, + ) -> Result> { + assert!(children.len() == 1); + Ok(Arc::new(ShuffleWriterExec::try_new( + self.job_id.clone(), + self.stage_id, + children[0].clone(), + self.work_dir.clone(), + self.shuffle_output_partitioning.clone(), + )?)) + } + + async fn execute( + &self, + input_partition: usize, + ) -> Result>> { + let part_loc = self.execute_shuffle_write(input_partition).await?; + + // build metadata result batch + let num_writers = part_loc.len(); + let mut partition_builder = UInt32Builder::new(num_writers); + let mut path_builder = StringBuilder::new(num_writers); + let mut num_rows_builder = UInt64Builder::new(num_writers); + let mut num_batches_builder = UInt64Builder::new(num_writers); + let mut num_bytes_builder = UInt64Builder::new(num_writers); + + for loc in &part_loc { + path_builder.append_value(loc.path.clone())?; + partition_builder.append_value(loc.partition_id as u32)?; + num_rows_builder.append_value(loc.num_rows)?; + num_batches_builder.append_value(loc.num_batches)?; + num_bytes_builder.append_value(loc.num_bytes)?; + } + + // build arrays + let partition_num: ArrayRef = Arc::new(partition_builder.finish()); + let path: ArrayRef = Arc::new(path_builder.finish()); + let field_builders: Vec> = vec![ + Box::new(num_rows_builder), + Box::new(num_batches_builder), + Box::new(num_bytes_builder), + ]; + let mut stats_builder = StructBuilder::new( + PartitionStats::default().arrow_struct_fields(), + field_builders, + ); + for _ in 0..num_writers { + stats_builder.append(true)?; + } + let stats = Arc::new(stats_builder.finish()); + + // build result batch containing metadata + let schema = result_schema(); + let batch = + 
RecordBatch::try_new(schema.clone(), vec![partition_num, path, stats]) + .map_err(DataFusionError::ArrowError)?; + + debug!("RESULTS METADATA:\n{:?}", batch); + + Ok(Box::pin(MemoryStream::try_new(vec![batch], schema, None)?)) + } fn metrics(&self) -> HashMap { let mut metrics = HashMap::new(); + metrics.insert("inputRows".to_owned(), (*self.metrics.input_rows).clone()); + metrics.insert("outputRows".to_owned(), (*self.metrics.output_rows).clone()); metrics.insert("writeTime".to_owned(), (*self.metrics.write_time).clone()); metrics } @@ -454,13 +496,13 @@ mod tests { let file0 = path.value(0); assert!( - file0.ends_with("/jobOne/1/0/data.arrow") - || file0.ends_with("\\jobOne\\1\\0\\data.arrow") + file0.ends_with("/jobOne/1/0/data-0.arrow") + || file0.ends_with("\\jobOne\\1\\0\\data-0.arrow") ); let file1 = path.value(1); assert!( - file1.ends_with("/jobOne/1/1/data.arrow") - || file1.ends_with("\\jobOne\\1\\1\\data.arrow") + file1.ends_with("/jobOne/1/1/data-0.arrow") + || file1.ends_with("\\jobOne\\1\\1\\data-0.arrow") ); let stats = batch.columns()[2] diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index 0429efb7c0174..fa35eb48d4fa4 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -361,29 +361,33 @@ impl TryInto for Arc { } else if let Some(exec) = plan.downcast_ref::() { let input: protobuf::PhysicalPlanNode = exec.children()[0].to_owned().try_into()?; + // note that we use shuffle_output_partitioning() rather than output_partitioning() + // to get the true output partitioning + let output_partitioning = match exec.shuffle_output_partitioning() { + Some(Partitioning::Hash(exprs, partition_count)) => { + Some(protobuf::PhysicalHashRepartition { + hash_expr: exprs + .iter() + .map(|expr| expr.clone().try_into()) + .collect::, BallistaError>>()?, + partition_count: *partition_count as u64, + }) + } + None => None, + other => { + return Err(BallistaError::General(format!( + "physical_plan::to_proto() invalid partitioning for ShuffleWriterExec: {:?}", + other + ))) + } + }; Ok(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ShuffleWriter(Box::new( protobuf::ShuffleWriterExecNode { job_id: exec.job_id().to_string(), stage_id: exec.stage_id() as u32, input: Some(Box::new(input)), - output_partitioning: match exec.output_partitioning() { - Partitioning::Hash(exprs, partition_count) => { - Some(protobuf::PhysicalHashRepartition { - hash_expr: exprs - .iter() - .map(|expr| expr.clone().try_into()) - .collect::, BallistaError>>()?, - partition_count: partition_count as u64, - }) - } - other => { - return Err(BallistaError::General(format!( - "physical_plan::to_proto() invalid partitioning for ShuffleWriterExec: {:?}", - other - ))) - } - }, + output_partitioning, }, ))), }) diff --git a/ballista/rust/core/src/serde/scheduler/from_proto.rs b/ballista/rust/core/src/serde/scheduler/from_proto.rs index 73f8f53956de8..4f9c9bc8877e8 100644 --- a/ballista/rust/core/src/serde/scheduler/from_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/from_proto.rs @@ -102,6 +102,7 @@ impl TryInto for protobuf::PartitionLocation { ) })? 
.into(), + path: self.path, }) } } diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index fa2c1b890e844..eeddfbbb41f39 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -62,6 +62,7 @@ pub struct PartitionLocation { pub partition_id: PartitionId, pub executor_meta: ExecutorMeta, pub partition_stats: PartitionStats, + pub path: String, } /// Meta-data for an executor, used when fetching shuffle partitions from other executors @@ -96,9 +97,9 @@ impl From for ExecutorMeta { /// Summary of executed partition #[derive(Debug, Copy, Clone)] pub struct PartitionStats { - num_rows: Option, - num_batches: Option, - num_bytes: Option, + pub(crate) num_rows: Option, + pub(crate) num_batches: Option, + pub(crate) num_bytes: Option, } impl Default for PartitionStats { diff --git a/ballista/rust/core/src/serde/scheduler/to_proto.rs b/ballista/rust/core/src/serde/scheduler/to_proto.rs index c3f2046305cf9..57d4f615c5f84 100644 --- a/ballista/rust/core/src/serde/scheduler/to_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/to_proto.rs @@ -70,6 +70,7 @@ impl TryInto for PartitionLocation { partition_id: Some(self.partition_id.into()), executor_meta: Some(self.executor_meta.into()), partition_stats: Some(self.partition_stats.into()), + path: self.path, }) } } diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index 17f6e4dd5d359..b65b83bbaf484 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -27,7 +27,8 @@ use tonic::transport::Channel; use ballista_core::serde::protobuf::ExecutorRegistration; use ballista_core::serde::protobuf::{ self, scheduler_grpc_client::SchedulerGrpcClient, task_status, FailedTask, - PartitionId, PollWorkParams, PollWorkResult, TaskDefinition, TaskStatus, + PartitionId, PollWorkParams, PollWorkResult, ShuffleWritePartition, TaskDefinition, + TaskStatus, }; use protobuf::CompletedTask; @@ -110,7 +111,7 @@ async fn run_received_tasks( tokio::spawn(async move { let execution_result = executor - .execute_partition( + .execute_shuffle_write( task_id.job_id.clone(), task_id.stage_id as usize, task_id.partition_id as usize, @@ -121,7 +122,7 @@ async fn run_received_tasks( debug!("Statistics: {:?}", execution_result); available_tasks_slots.fetch_add(1, Ordering::SeqCst); let _ = task_status_sender.send(as_task_status( - execution_result.map(|_| ()), + execution_result, executor_id, task_id, )); @@ -129,18 +130,19 @@ async fn run_received_tasks( } fn as_task_status( - execution_result: ballista_core::error::Result<()>, + execution_result: ballista_core::error::Result>, executor_id: String, task_id: PartitionId, ) -> TaskStatus { match execution_result { - Ok(_) => { + Ok(partitions) => { info!("Task {:?} finished", task_id); TaskStatus { partition_id: Some(task_id), status: Some(task_status::Status::Completed(CompletedTask { executor_id, + partitions, })), } } diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs index 4a75448b5f06b..cbf3eb040ff6b 100644 --- a/ballista/rust/executor/src/executor.rs +++ b/ballista/rust/executor/src/executor.rs @@ -21,8 +21,7 @@ use std::sync::Arc; use ballista_core::error::BallistaError; use ballista_core::execution_plans::ShuffleWriterExec; -use ballista_core::utils; -use datafusion::arrow::record_batch::RecordBatch; +use ballista_core::serde::protobuf; use 
datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::ExecutionPlan; @@ -45,22 +44,26 @@ impl Executor { /// Execute one partition of a query stage and persist the result to disk in IPC format. On /// success, return a RecordBatch containing metadata about the results, including path /// and statistics. - pub async fn execute_partition( + pub async fn execute_shuffle_write( &self, job_id: String, stage_id: usize, part: usize, plan: Arc, - ) -> Result { + ) -> Result, BallistaError> { + // TODO to enable shuffling we need to specify the output partitioning here and + // until we do that there is always a single output partition + // see https://github.com/apache/arrow-datafusion/issues/707 + let shuffle_output_partitioning = None; + let exec = ShuffleWriterExec::try_new( job_id, stage_id, plan, self.work_dir.clone(), - None, + shuffle_output_partitioning, )?; - let mut stream = exec.execute(part).await?; - let batches = utils::collect_stream(&mut stream).await?; + let partitions = exec.execute_shuffle_write(part).await?; println!( "=== Physical plan with metrics ===\n{}\n", @@ -69,9 +72,7 @@ impl Executor { .to_string() ); - // the output should be a single batch containing metadata (path and statistics) - assert!(batches.len() == 1); - Ok(batches[0].clone()) + Ok(partitions) } pub fn work_dir(&self) -> &str { diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 3f90da238b7fe..11f5c994fd520 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -108,6 +108,10 @@ impl DistributedPlanner { let query_stage = create_shuffle_writer( job_id, self.next_stage_id(), + //TODO should be children[0].clone() so that we replace this + // with an UnresolvedShuffleExec instead of just executing this + // part of the plan again + // see https://github.com/apache/arrow-datafusion/issues/707 coalesce.children()[0].clone(), None, )?; @@ -127,6 +131,10 @@ impl DistributedPlanner { let query_stage = create_shuffle_writer( job_id, self.next_stage_id(), + //TODO should be children[0].clone() so that we replace this + // with an UnresolvedShuffleExec instead of just executing this + // part of the plan again + // see https://github.com/apache/arrow-datafusion/issues/707 repart.children()[0].clone(), Some(repart.partitioning().to_owned()), )?; @@ -158,7 +166,7 @@ impl DistributedPlanner { pub fn remove_unresolved_shuffles( stage: &dyn ExecutionPlan, - partition_locations: &HashMap>>, + partition_locations: &HashMap>>, ) -> Result> { let mut new_children: Vec> = vec![]; for child in stage.children() { @@ -166,16 +174,30 @@ pub fn remove_unresolved_shuffles( child.as_any().downcast_ref::() { let mut relevant_locations = vec![]; - relevant_locations.append( - &mut partition_locations - .get(&unresolved_shuffle.stage_id) - .ok_or_else(|| { - BallistaError::General( - "Missing partition location. Could not remove unresolved shuffles" - .to_owned(), - ) - })? - .clone(), + let p = partition_locations + .get(&unresolved_shuffle.stage_id) + .ok_or_else(|| { + BallistaError::General( + "Missing partition location. Could not remove unresolved shuffles" + .to_owned(), + ) + })? 
+ .clone(); + + for i in 0..unresolved_shuffle.partition_count { + if let Some(x) = p.get(&i) { + relevant_locations.push(x.to_owned()); + } else { + relevant_locations.push(vec![]); + } + } + println!( + "create shuffle reader with {:?}", + relevant_locations + .iter() + .map(|c| format!("{:?}", c)) + .collect::>() + .join("\n") ); new_children.push(Arc::new(ShuffleReaderExec::try_new( relevant_locations, diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index 3ddbced226849..a4ae59e1dfda1 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -27,16 +27,13 @@ use prost::Message; use tokio::sync::OwnedMutexGuard; use ballista_core::serde::protobuf::{ - job_status, task_status, CompletedJob, CompletedTask, ExecutorHeartbeat, + self, job_status, task_status, CompletedJob, CompletedTask, ExecutorHeartbeat, ExecutorMetadata, FailedJob, FailedTask, JobStatus, PhysicalPlanNode, RunningJob, RunningTask, TaskStatus, }; use ballista_core::serde::scheduler::PartitionStats; use ballista_core::{error::BallistaError, serde::scheduler::ExecutorMeta}; -use ballista_core::{ - error::Result, execution_plans::UnresolvedShuffleExec, - serde::protobuf::PartitionLocation, -}; +use ballista_core::{error::Result, execution_plans::UnresolvedShuffleExec}; use super::planner::remove_unresolved_shuffles; @@ -254,9 +251,9 @@ impl SchedulerState { executors: &[ExecutorMeta], ) -> Result { let executor_id: &str = match &task_status.status { - Some(task_status::Status::Completed(CompletedTask { executor_id })) => { - executor_id - } + Some(task_status::Status::Completed(CompletedTask { + executor_id, .. + })) => executor_id, Some(task_status::Status::Running(RunningTask { executor_id })) => { executor_id } @@ -298,8 +295,11 @@ impl SchedulerState { // Let's try to resolve any unresolved shuffles we find let unresolved_shuffles = find_unresolved_shuffles(&plan)?; let mut partition_locations: HashMap< - usize, - Vec>, + usize, // stage id + HashMap< + usize, // shuffle input partition id + Vec, // shuffle output partitions + >, > = HashMap::new(); for unresolved_shuffle in unresolved_shuffles { for partition_id in 0..unresolved_shuffle.partition_count { @@ -317,30 +317,49 @@ impl SchedulerState { if task_is_dead { continue 'tasks; } else if let Some(task_status::Status::Completed( - CompletedTask { executor_id }, + CompletedTask { + executor_id, + partitions, + }, )) = &referenced_task.status { - let empty = vec![]; let locations = partition_locations .entry(unresolved_shuffle.stage_id) - .or_insert(empty); + .or_insert_with(HashMap::new); let executor_meta = executors .iter() .find(|exec| exec.id == *executor_id) .unwrap() .clone(); - locations.push(vec![ - ballista_core::serde::scheduler::PartitionLocation { - partition_id: - ballista_core::serde::scheduler::PartitionId { - job_id: partition.job_id.clone(), - stage_id: unresolved_shuffle.stage_id, - partition_id, - }, - executor_meta, - partition_stats: PartitionStats::default(), - }, - ]); + + let temp = + locations.entry(partition_id).or_insert_with(Vec::new); + for p in partitions { + let executor_meta = executor_meta.clone(); + let partition_location = + ballista_core::serde::scheduler::PartitionLocation { + partition_id: + ballista_core::serde::scheduler::PartitionId { + job_id: partition.job_id.clone(), + stage_id: unresolved_shuffle.stage_id, + partition_id, + }, + executor_meta, + partition_stats: PartitionStats::new( + Some(p.num_rows), + Some(p.num_batches), + 
Some(p.num_bytes), + ), + path: p.path.clone(), + }; + info!( + "Scheduler storing stage {} partition {} path: {}", + unresolved_shuffle.stage_id, + partition_id, + partition_location.path + ); + temp.push(partition_location); + } } else { continue 'tasks; } @@ -452,24 +471,39 @@ impl SchedulerState { let mut job_status = statuses .iter() .map(|status| match &status.status { - Some(task_status::Status::Completed(CompletedTask { executor_id })) => { - Ok((status, executor_id)) - } + Some(task_status::Status::Completed(CompletedTask { + executor_id, + partitions, + })) => Ok((status, executor_id, partitions)), _ => Err(BallistaError::General("Task not completed".to_string())), }) .collect::>>() .ok() .map(|info| { - let partition_location = info - .into_iter() - .map(|(status, execution_id)| PartitionLocation { - partition_id: status.partition_id.to_owned(), - executor_meta: executors - .get(execution_id) - .map(|e| e.clone().into()), - partition_stats: None, - }) - .collect(); + let mut partition_location = vec![]; + for (status, executor_id, partitions) in info { + let input_partition_id = status.partition_id.as_ref().unwrap(); //TODO unwrap + let executor_meta = + executors.get(executor_id).map(|e| e.clone().into()); + for shuffle_write_partition in partitions { + let shuffle_input_partition_id = Some(protobuf::PartitionId { + job_id: input_partition_id.job_id.clone(), + stage_id: input_partition_id.stage_id, + partition_id: input_partition_id.partition_id, + }); + partition_location.push(protobuf::PartitionLocation { + partition_id: shuffle_input_partition_id.clone(), + executor_meta: executor_meta.clone(), + partition_stats: Some(protobuf::PartitionStats { + num_batches: shuffle_write_partition.num_batches as i64, + num_rows: shuffle_write_partition.num_rows as i64, + num_bytes: shuffle_write_partition.num_bytes as i64, + column_stats: vec![], + }), + path: shuffle_write_partition.path.clone(), + }); + } + } job_status::Status::Completed(CompletedJob { partition_location }) }); @@ -745,6 +779,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -784,6 +819,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -821,6 +857,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -832,6 +869,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -863,6 +901,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -874,6 +913,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), @@ -905,6 +945,7 @@ mod test { let meta = TaskStatus { status: Some(task_status::Status::Completed(CompletedTask { executor_id: "".to_owned(), + partitions: 
vec![], })), partition_id: Some(PartitionId { job_id: job_id.to_owned(), From c7a5a936902402c5637064cb3ee0b3ca25b2ee9a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Jul 2021 07:25:25 -0400 Subject: [PATCH 282/329] Implement test for parquet pruning disabling (#754) --- datafusion/tests/parquet_pruning.rs | 58 +++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs index 8ad7974280f09..0838211f14f09 100644 --- a/datafusion/tests/parquet_pruning.rs +++ b/datafusion/tests/parquet_pruning.rs @@ -30,10 +30,10 @@ use arrow::{ }; use chrono::{Datelike, Duration}; use datafusion::{ - datasource::{parquet::ParquetTable, TableProvider}, + datasource::TableProvider, logical_plan::{col, lit, Expr, LogicalPlan, LogicalPlanBuilder}, physical_plan::{plan_metrics, SQLMetric}, - prelude::ExecutionContext, + prelude::{ExecutionConfig, ExecutionContext}, scalar::ScalarValue, }; use hashbrown::HashMap; @@ -136,6 +136,47 @@ async fn prune_date64() { assert_eq!(output.result_rows, 1, "{}", output.description()); } +#[tokio::test] +async fn prune_disabled() { + let query = "SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')"; + let expected_rows = 10; + + // with pruning + let output = ContextWithParquet::new(Scenario::Timestamps) + .await + .query(query) + .await; + + // This should prune one without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(1)); + assert_eq!( + output.result_rows, + expected_rows, + "{}", + output.description() + ); + + // same query, without pruning + let config = ExecutionConfig::new().with_parquet_pruning(false); + + let output = ContextWithParquet::with_config(Scenario::Timestamps, config) + .await + .query(query) + .await; + println!("{}", output.description()); + + // This should not prune any + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!( + output.result_rows, + expected_rows, + "{}", + output.description() + ); +} + // ---------------------- // Begin test fixture // ---------------------- @@ -207,15 +248,18 @@ impl TestOutput { /// and the appropriate scenario impl ContextWithParquet { async fn new(scenario: Scenario) -> Self { - let file = make_test_file(scenario).await; + Self::with_config(scenario, ExecutionConfig::new()).await + } - // now, setup a the file as a data source and run a query against it - let mut ctx = ExecutionContext::new(); + async fn with_config(scenario: Scenario, config: ExecutionConfig) -> Self { + let file = make_test_file(scenario).await; let parquet_path = file.path().to_string_lossy(); - let table = ParquetTable::try_new(parquet_path, 4).unwrap(); + // now, setup a the file as a data source and run a query against it + let mut ctx = ExecutionContext::with_config(config); - let provider = Arc::new(table); + ctx.register_parquet("t", &parquet_path).unwrap(); + let provider = ctx.deregister_table("t").unwrap().unwrap(); ctx.register_table("t", provider.clone()).unwrap(); Self { From 6cfaa29491bdd75523e8ab5cabe33fddfe7b13fb Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 20 Jul 2021 20:29:36 +0800 Subject: [PATCH 283/329] add more math functions and unit tests to `python` crate (#748) * rename functions * add test math functions * add python math functions --- python/Cargo.toml | 2 +- python/README.md | 2 +- python/requirements.in | 5 + python/requirements.txt | 142 
+++++++++++++++++++++++++- python/src/functions.rs | 132 ++++++++++++++++-------- python/tests/test_math_functions.py | 60 +++++++++++ python/tests/test_string_functions.py | 8 +- python/tests/test_udaf.py | 1 + 8 files changed, 302 insertions(+), 50 deletions(-) create mode 100644 python/tests/test_math_functions.py diff --git a/python/Cargo.toml b/python/Cargo.toml index eab8a8bcf5555..ee99359a82f05 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,7 +31,7 @@ libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.14.1", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "5c1f8eb32c5f423d6c0e1f50350f9fe140135fdb" } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "e4df37a4001423909964348289360da66acdd0a3" } [lib] name = "datafusion" diff --git a/python/README.md b/python/README.md index 05561f712cae4..e13ebb83fa857 100644 --- a/python/README.md +++ b/python/README.md @@ -145,7 +145,7 @@ Whenever rust code changes (your changes or via `git pull`): ```bash # make sure you activate the venv using "source venv/bin/activate" first maturin develop -python -m unittest discover tests +python -m pytest ``` ## How to update dependencies diff --git a/python/requirements.in b/python/requirements.in index 4ff7f4ee618ba..5f145dc3b9276 100644 --- a/python/requirements.in +++ b/python/requirements.in @@ -18,3 +18,8 @@ maturin toml pyarrow pytest +black +isort +mypy +numpy +pandas diff --git a/python/requirements.txt b/python/requirements.txt index cbd86cdc0e254..b7f0080f7296b 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -4,14 +4,30 @@ # # pip-compile --generate-hashes # +appdirs==1.4.4 \ + --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \ + --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 + # via black attrs==21.2.0 \ --hash=sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1 \ --hash=sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb # via pytest +black==21.7b0 \ + --hash=sha256:1c7aa6ada8ee864db745b22790a32f94b2795c253a75d6d9b5e439ff10d23116 \ + --hash=sha256:c8373c6491de9362e39271630b65b964607bc5c79c83783547d76c839b3aa219 + # via -r requirements.in +click==8.0.1 \ + --hash=sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a \ + --hash=sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6 + # via black iniconfig==1.1.1 \ --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 # via pytest +isort==5.9.2 \ + --hash=sha256:eed17b53c3e7912425579853d078a0832820f023191561fcee9d7cae424e0813 \ + --hash=sha256:f65ce5bd4cbc6abdfbe29afc2f0245538ab358c14590912df638033f157d555e + # via -r requirements.in maturin==0.11.1 \ --hash=sha256:1d8a276b4c4ac74ecf9624ebc718982cdd0f86581d6338c877d7eb2833b89a13 \ --hash=sha256:56b1dc8651a40d024a0ac59720ffeb61a41059fcd836f1742ad828b78650fc1a \ @@ -25,6 +41,37 @@ maturin==0.11.1 \ --hash=sha256:e1598a844fdc7b5093749feb0b373fb2f7545033bb1f00779cfbf173906e374a \ --hash=sha256:e60308dd43eb5f763126d0651827683141b12878541c6ede008f77ef655d1343 # via -r requirements.in +mypy==0.910 \ + --hash=sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9 \ + --hash=sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a \ 
+ --hash=sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9 \ + --hash=sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e \ + --hash=sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2 \ + --hash=sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212 \ + --hash=sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b \ + --hash=sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885 \ + --hash=sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150 \ + --hash=sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703 \ + --hash=sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072 \ + --hash=sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457 \ + --hash=sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e \ + --hash=sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0 \ + --hash=sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb \ + --hash=sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97 \ + --hash=sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8 \ + --hash=sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811 \ + --hash=sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6 \ + --hash=sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de \ + --hash=sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504 \ + --hash=sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921 \ + --hash=sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d + # via -r requirements.in +mypy-extensions==0.4.3 \ + --hash=sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d \ + --hash=sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8 + # via + # black + # mypy numpy==1.21.0 \ --hash=sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e \ --hash=sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e \ @@ -54,11 +101,39 @@ numpy==1.21.0 \ --hash=sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce \ --hash=sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd \ --hash=sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9 - # via pyarrow + # via + # -r requirements.in + # pandas + # pyarrow packaging==21.0 \ --hash=sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7 \ --hash=sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14 # via pytest +pandas==1.3.0 \ + --hash=sha256:08eeff3da6a188e24db7f292b39a8ca9e073bf841fbbeadb946b3ad5c19d843e \ + --hash=sha256:1ff13eed501e07e7fb26a4ea18a846b6e5d7de549b497025601fd9ccb7c1d123 \ + --hash=sha256:522bfea92f3ef6207cadc7428bda1e7605dae0383b8065030e7b5d0266717b48 \ + --hash=sha256:7897326cae660eee69d501cbfa950281a193fcf407393965e1bc07448e1cc35a \ + --hash=sha256:798675317d0e4863a92a9a6bc5bd2490b5f6fef8c17b95f29e2e33f28bef9eca \ + --hash=sha256:7d3cd2c99faa94d717ca00ea489264a291ad7209453dffbf059bfb7971fd3a61 \ + --hash=sha256:823737830364d0e2af8c3912a28ba971296181a07950873492ed94e12d28c405 \ + --hash=sha256:872aa91e0f9ca913046ab639d4181a899f5e592030d954d28c2529b88756a736 \ + --hash=sha256:88864c1e28353b958b1f30e4193818519624ad9a1776921622a6a2a016d5d807 \ + --hash=sha256:92835113a67cbd34747c198d41f09f4b63f6fe11ca5643baebc7ab1e30e89e95 \ + 
--hash=sha256:98efc2d4983d5bb47662fe2d97b2c81b91566cb08b266490918b9c7d74a5ef64 \ + --hash=sha256:b10d7910ae9d7920a5ff7816d794d99acbc361f7b16a0f017d4fa83ced8cb55e \ + --hash=sha256:c554e6c9cf2d5ea1aba5979cc837b3649539ced0e18ece186f055450c86622e2 \ + --hash=sha256:c746876cdd8380be0c3e70966d4566855901ac9aaa5e4b9ccaa5ca5311457d11 \ + --hash=sha256:c81b8d91e9ae861eb4406b4e0f8d4dabbc105b9c479b3d1e921fba1d35b5b62a \ + --hash=sha256:e6b75091fa54a53db3927b4d1bc997c23c5ba6f87acdfe1ee5a92c38c6b2ed6a \ + --hash=sha256:ed4fc66f23fe17c93a5d439230ca2d6b5f8eac7154198d327dbe8a16d98f3f10 \ + --hash=sha256:f058c786e7b0a9e7fa5e0b9f4422e0ccdd3bf3aa3053c18d77ed2a459bd9a45a \ + --hash=sha256:fe7a549d10ca534797095586883a5c17d140d606747591258869c56e14d1b457 + # via -r requirements.in +pathspec==0.9.0 \ + --hash=sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a \ + --hash=sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1 + # via black pluggy==0.13.1 \ --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 \ --hash=sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d @@ -102,10 +177,75 @@ pytest==6.2.4 \ --hash=sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b \ --hash=sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890 # via -r requirements.in +python-dateutil==2.8.2 \ + --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \ + --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9 + # via pandas +pytz==2021.1 \ + --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da \ + --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 + # via pandas +regex==2021.7.6 \ + --hash=sha256:0eb2c6e0fcec5e0f1d3bcc1133556563222a2ffd2211945d7b1480c1b1a42a6f \ + --hash=sha256:15dddb19823f5147e7517bb12635b3c82e6f2a3a6b696cc3e321522e8b9308ad \ + --hash=sha256:173bc44ff95bc1e96398c38f3629d86fa72e539c79900283afa895694229fe6a \ + --hash=sha256:1c78780bf46d620ff4fff40728f98b8afd8b8e35c3efd638c7df67be2d5cddbf \ + --hash=sha256:2366fe0479ca0e9afa534174faa2beae87847d208d457d200183f28c74eaea59 \ + --hash=sha256:2bceeb491b38225b1fee4517107b8491ba54fba77cf22a12e996d96a3c55613d \ + --hash=sha256:2ddeabc7652024803666ea09f32dd1ed40a0579b6fbb2a213eba590683025895 \ + --hash=sha256:2fe5e71e11a54e3355fa272137d521a40aace5d937d08b494bed4529964c19c4 \ + --hash=sha256:319eb2a8d0888fa6f1d9177705f341bc9455a2c8aca130016e52c7fe8d6c37a3 \ + --hash=sha256:3f5716923d3d0bfb27048242a6e0f14eecdb2e2a7fac47eda1d055288595f222 \ + --hash=sha256:422dec1e7cbb2efbbe50e3f1de36b82906def93ed48da12d1714cabcd993d7f0 \ + --hash=sha256:4c9c3155fe74269f61e27617529b7f09552fbb12e44b1189cebbdb24294e6e1c \ + --hash=sha256:4f64fc59fd5b10557f6cd0937e1597af022ad9b27d454e182485f1db3008f417 \ + --hash=sha256:564a4c8a29435d1f2256ba247a0315325ea63335508ad8ed938a4f14c4116a5d \ + --hash=sha256:59506c6e8bd9306cd8a41511e32d16d5d1194110b8cfe5a11d102d8b63cf945d \ + --hash=sha256:598c0a79b4b851b922f504f9f39a863d83ebdfff787261a5ed061c21e67dd761 \ + --hash=sha256:59c00bb8dd8775473cbfb967925ad2c3ecc8886b3b2d0c90a8e2707e06c743f0 \ + --hash=sha256:6110bab7eab6566492618540c70edd4d2a18f40ca1d51d704f1d81c52d245026 \ + --hash=sha256:6afe6a627888c9a6cfbb603d1d017ce204cebd589d66e0703309b8048c3b0854 \ + --hash=sha256:791aa1b300e5b6e5d597c37c346fb4d66422178566bbb426dd87eaae475053fb \ + --hash=sha256:8394e266005f2d8c6f0bc6780001f7afa3ef81a7a2111fa35058ded6fce79e4d \ + 
--hash=sha256:875c355360d0f8d3d827e462b29ea7682bf52327d500a4f837e934e9e4656068 \ + --hash=sha256:89e5528803566af4df368df2d6f503c84fbfb8249e6631c7b025fe23e6bd0cde \ + --hash=sha256:99d8ab206a5270c1002bfcf25c51bf329ca951e5a169f3b43214fdda1f0b5f0d \ + --hash=sha256:9a854b916806c7e3b40e6616ac9e85d3cdb7649d9e6590653deb5b341a736cec \ + --hash=sha256:b85ac458354165405c8a84725de7bbd07b00d9f72c31a60ffbf96bb38d3e25fa \ + --hash=sha256:bc84fb254a875a9f66616ed4538542fb7965db6356f3df571d783f7c8d256edd \ + --hash=sha256:c92831dac113a6e0ab28bc98f33781383fe294df1a2c3dfd1e850114da35fd5b \ + --hash=sha256:cbe23b323988a04c3e5b0c387fe3f8f363bf06c0680daf775875d979e376bd26 \ + --hash=sha256:ccb3d2190476d00414aab36cca453e4596e8f70a206e2aa8db3d495a109153d2 \ + --hash=sha256:d8bbce0c96462dbceaa7ac4a7dfbbee92745b801b24bce10a98d2f2b1ea9432f \ + --hash=sha256:db2b7df831c3187a37f3bb80ec095f249fa276dbe09abd3d35297fc250385694 \ + --hash=sha256:e586f448df2bbc37dfadccdb7ccd125c62b4348cb90c10840d695592aa1b29e0 \ + --hash=sha256:e5983c19d0beb6af88cb4d47afb92d96751fb3fa1784d8785b1cdf14c6519407 \ + --hash=sha256:e6a1e5ca97d411a461041d057348e578dc344ecd2add3555aedba3b408c9f874 \ + --hash=sha256:eaf58b9e30e0e546cdc3ac06cf9165a1ca5b3de8221e9df679416ca667972035 \ + --hash=sha256:ed693137a9187052fc46eedfafdcb74e09917166362af4cc4fddc3b31560e93d \ + --hash=sha256:edd1a68f79b89b0c57339bce297ad5d5ffcc6ae7e1afdb10f1947706ed066c9c \ + --hash=sha256:f080248b3e029d052bf74a897b9d74cfb7643537fbde97fe8225a6467fb559b5 \ + --hash=sha256:f9392a4555f3e4cb45310a65b403d86b589adc773898c25a39184b1ba4db8985 \ + --hash=sha256:f98dc35ab9a749276f1a4a38ab3e0e2ba1662ce710f6530f5b0a6656f1c32b58 + # via black +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via python-dateutil toml==0.10.2 \ --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f # via # -r requirements.in # maturin + # mypy # pytest +tomli==1.0.4 \ + --hash=sha256:0713b16ff91df8638a6a694e295c8159ab35ba93e3424a626dd5226d386057be \ + --hash=sha256:be670d0d8d7570fd0ea0113bd7bb1ba3ac6706b4de062cc4c952769355c9c268 + # via black +typing-extensions==3.10.0.0 \ + --hash=sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497 \ + --hash=sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342 \ + --hash=sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84 + # via mypy diff --git a/python/src/functions.rs b/python/src/functions.rs index 9b60fdb73e0c2..415490743185a 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -63,7 +63,25 @@ fn in_list( } } -macro_rules! define_function { +/// Current date and time +#[pyfunction] +fn now() -> expression::Expression { + expression::Expression { + // here lit(0) is a stub for conform to arity + expr: logical_plan::now(logical_plan::lit(0)), + } +} + +/// Returns a random value in the range 0.0 <= x < 1.0 +#[pyfunction] +fn random() -> expression::Expression { + expression::Expression { + // here lit(0) is a stub for conform to arity + expr: logical_plan::random(logical_plan::lit(0)), + } +} + +macro_rules! define_unary_function { ($NAME: ident) => { #[doc = "This function is not documented yet"] #[pyfunction] @@ -84,66 +102,82 @@ macro_rules! define_function { }; } -define_function!(ascii, "Returns the numeric code of the first character of the argument. 
In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); -define_function!(sum); -define_function!( +define_unary_function!(sqrt, "sqrt"); +define_unary_function!(sin, "sin"); +define_unary_function!(cos, "cos"); +define_unary_function!(tan, "tan"); +define_unary_function!(asin, "asin"); +define_unary_function!(acos, "acos"); +define_unary_function!(atan, "atan"); +define_unary_function!(floor, "floor"); +define_unary_function!(ceil, "ceil"); +define_unary_function!(round, "round"); +define_unary_function!(trunc, "trunc"); +define_unary_function!(abs, "abs"); +define_unary_function!(signum, "signum"); +define_unary_function!(exp, "exp"); +define_unary_function!(ln, "ln"); +define_unary_function!(log2, "log2"); +define_unary_function!(log10, "log10"); + +define_unary_function!(ascii, "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); +define_unary_function!(sum); +define_unary_function!( bit_length, "Returns number of bits in the string (8 times the octet_length)." ); -define_function!(btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); -define_function!( +define_unary_function!(btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); +define_unary_function!( character_length, "Returns number of characters in the string." ); -define_function!(chr, "Returns the character with the given code."); -define_function!(concat_ws, "Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored."); -define_function!(initcap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); -define_function!(left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); -define_function!(lower, "Converts the string to all lower case"); -define_function!(lpad, "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)."); -define_function!(ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); -define_function!( +define_unary_function!(chr, "Returns the character with the given code."); +define_unary_function!(concat_ws, "Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored."); +define_unary_function!(initcap, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); +define_unary_function!(left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); +define_unary_function!(lower, "Converts the string to all lower case"); +define_unary_function!(lpad, "Extends the string to length length by prepending the characters fill (a space by default). 
If the string is already longer than length then it is truncated (on the right)."); +define_unary_function!(ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); +define_unary_function!( md5, "Computes the MD5 hash of the argument, with the result written in hexadecimal." ); -define_function!(now); -define_function!(octet_length, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); -define_function!(random, "Returns a random value in the range 0.0 <= x < 1.0"); -define_function!( +define_unary_function!(octet_length, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); +define_unary_function!( replace, "Replaces all occurrences in string of substring from with substring to." ); -define_function!(repeat, "Repeats string the specified number of times."); -define_function!( +define_unary_function!(repeat, "Repeats string the specified number of times."); +define_unary_function!( regexp_replace, "Replaces substring(s) matching a POSIX regular expression" ); -define_function!( +define_unary_function!( reverse, "Reverses the order of the characters in the string." ); -define_function!(right, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); -define_function!(rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); -define_function!(rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); -define_function!(sha224); -define_function!(sha256); -define_function!(sha384); -define_function!(sha512); -define_function!(split_part, "Splits string at occurrences of delimiter and returns the n'th field (counting from one)."); -define_function!(starts_with, "Returns true if string starts with prefix."); -define_function!(strpos,"Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)"); -define_function!(substr); -define_function!( +define_unary_function!(right, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); +define_unary_function!(rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); +define_unary_function!(rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); +define_unary_function!(sha224); +define_unary_function!(sha256); +define_unary_function!(sha384); +define_unary_function!(sha512); +define_unary_function!(split_part, "Splits string at occurrences of delimiter and returns the n'th field (counting from one)."); +define_unary_function!(starts_with, "Returns true if string starts with prefix."); +define_unary_function!(strpos,"Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)"); +define_unary_function!(substr); +define_unary_function!( to_hex, "Converts the number to its equivalent hexadecimal representation." 
); -define_function!(translate, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted."); -define_function!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); -define_function!(upper, "Converts the string to all upper case."); -define_function!(avg); -define_function!(min); -define_function!(max); -define_function!(count); +define_unary_function!(translate, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted."); +define_unary_function!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); +define_unary_function!(upper, "Converts the string to all upper case."); +define_unary_function!(avg); +define_unary_function!(min); +define_unary_function!(max); +define_unary_function!(count); /* #[pyfunction] @@ -263,5 +297,23 @@ pub fn init(module: &PyModule) -> PyResult<()> { module.add_function(wrap_pyfunction!(max, module)?)?; module.add_function(wrap_pyfunction!(avg, module)?)?; module.add_function(wrap_pyfunction!(udaf, module)?)?; + module.add_function(wrap_pyfunction!(sqrt, module)?)?; + module.add_function(wrap_pyfunction!(sin, module)?)?; + module.add_function(wrap_pyfunction!(cos, module)?)?; + module.add_function(wrap_pyfunction!(tan, module)?)?; + module.add_function(wrap_pyfunction!(asin, module)?)?; + module.add_function(wrap_pyfunction!(acos, module)?)?; + module.add_function(wrap_pyfunction!(atan, module)?)?; + module.add_function(wrap_pyfunction!(floor, module)?)?; + module.add_function(wrap_pyfunction!(ceil, module)?)?; + module.add_function(wrap_pyfunction!(round, module)?)?; + module.add_function(wrap_pyfunction!(trunc, module)?)?; + module.add_function(wrap_pyfunction!(abs, module)?)?; + module.add_function(wrap_pyfunction!(signum, module)?)?; + module.add_function(wrap_pyfunction!(exp, module)?)?; + module.add_function(wrap_pyfunction!(ln, module)?)?; + module.add_function(wrap_pyfunction!(log2, module)?)?; + module.add_function(wrap_pyfunction!(log10, module)?)?; + Ok(()) } diff --git a/python/tests/test_math_functions.py b/python/tests/test_math_functions.py new file mode 100644 index 0000000000000..56d4824aeb9d5 --- /dev/null +++ b/python/tests/test_math_functions.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import numpy as np +import pyarrow as pa +import pytest +from datafusion import ExecutionContext +from datafusion import functions as f + + +@pytest.fixture +def df(): + ctx = ExecutionContext() + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays([pa.array([0.1, -0.7, 0.55])], names=["value"]) + return ctx.create_dataframe([[batch]]) + + +def test_math_functions(df): + values = np.array([0.1, -0.7, 0.55]) + col_v = f.col("value") + df = df.select( + f.abs(col_v), + f.sin(col_v), + f.cos(col_v), + f.tan(col_v), + f.asin(col_v), + f.acos(col_v), + f.exp(col_v), + f.ln(col_v + f.lit(1)), + f.log2(col_v + f.lit(1)), + f.log10(col_v + f.lit(1)), + ) + result = df.collect() + assert len(result) == 1 + result = result[0] + np.testing.assert_array_almost_equal(result.column(0), np.abs(values)) + np.testing.assert_array_almost_equal(result.column(1), np.sin(values)) + np.testing.assert_array_almost_equal(result.column(2), np.cos(values)) + np.testing.assert_array_almost_equal(result.column(3), np.tan(values)) + np.testing.assert_array_almost_equal(result.column(4), np.arcsin(values)) + np.testing.assert_array_almost_equal(result.column(5), np.arccos(values)) + np.testing.assert_array_almost_equal(result.column(6), np.exp(values)) + np.testing.assert_array_almost_equal(result.column(7), np.log(values + 1.0)) + np.testing.assert_array_almost_equal(result.column(8), np.log2(values + 1.0)) + np.testing.assert_array_almost_equal(result.column(9), np.log10(values + 1.0)) diff --git a/python/tests/test_string_functions.py b/python/tests/test_string_functions.py index f8e15578320b1..ea064a6b2e9f6 100644 --- a/python/tests/test_string_functions.py +++ b/python/tests/test_string_functions.py @@ -46,10 +46,4 @@ def test_string_functions(df): "9033e0e305f247c0c3c80d0c7848c8b3", ] ) - assert result.column(1) == pa.array( - [ - "hello", - "world", - "!", - ] - ) + assert result.column(1) == pa.array(["hello", "world", "!"]) diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 103d967663c46..e7044d6119e38 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -16,6 +16,7 @@ # under the License. from typing import List + import pyarrow as pa import pyarrow.compute as pc import pytest From 3a24113569802a5366e184dfac28aaa81cf6b418 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 20 Jul 2021 20:29:59 +0800 Subject: [PATCH 284/329] fix maturin version in pyproject.toml (#756) --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 27480690e06cc..1482129897fae 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,5 +16,5 @@ # under the License. 
[build-system] -requires = ["maturin"] +requires = ["maturin>=0.11,<0.12"] build-backend = "maturin" From 30693df8961dca300306dfd0c8fca130375b50b3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Jul 2021 16:43:51 -0400 Subject: [PATCH 285/329] Show the result of all optimizer passes in EXPLAIN VERBOSE (#759) --- datafusion/src/execution/context.rs | 106 ++++++++++++++-- datafusion/src/logical_plan/builder.rs | 15 +-- datafusion/src/logical_plan/mod.rs | 2 +- datafusion/src/logical_plan/plan.rs | 20 +++ datafusion/src/optimizer/limit_push_down.rs | 20 --- .../src/optimizer/projection_push_down.rs | 22 +--- .../src/optimizer/simplify_expressions.rs | 23 +--- datafusion/src/optimizer/utils.rs | 111 +---------------- datafusion/src/physical_plan/display.rs | 11 ++ datafusion/src/physical_plan/explain.rs | 24 +++- datafusion/src/physical_plan/planner.rs | 115 ++++++++++++------ datafusion/src/sql/planner.rs | 9 +- datafusion/tests/sql.rs | 7 ++ datafusion/tests/user_defined_plan.rs | 6 +- 14 files changed, 252 insertions(+), 239 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index bd939cef7035b..0cf8b3b6c2765 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -21,6 +21,7 @@ use crate::{ catalog::{CatalogList, MemoryCatalogList}, information_schema::CatalogWithInformationSchema, }, + logical_plan::{PlanType, ToStringifiedPlan}, optimizer::{ aggregate_statistics::AggregateStatistics, eliminate_limit::EliminateLimit, hash_build_probe_order::HashBuildProbeOrder, @@ -446,19 +447,31 @@ impl ExecutionContext { /// Optimizes the logical plan by applying optimizer rules. pub fn optimize(&self, plan: &LogicalPlan) -> Result { - let state = &mut self.state.lock().unwrap(); - let execution_props = &mut state.execution_props.clone(); - let optimizers = &state.config.optimizers; - - let execution_props = execution_props.start_execution(); - - let mut new_plan = plan.clone(); - debug!("Logical plan:\n {:?}", plan); - for optimizer in optimizers { - new_plan = optimizer.optimize(&new_plan, execution_props)?; + if let LogicalPlan::Explain { + verbose, + plan, + stringified_plans, + schema, + } = plan + { + let mut stringified_plans = stringified_plans.clone(); + + // optimize the child plan, capturing the output of each optimizer + let plan = self.optimize_internal(plan, |optimized_plan, optimizer| { + let optimizer_name = optimizer.name().to_string(); + let plan_type = PlanType::OptimizedLogicalPlan { optimizer_name }; + stringified_plans.push(optimized_plan.to_stringified(plan_type)); + })?; + + Ok(LogicalPlan::Explain { + verbose: *verbose, + plan: Arc::new(plan), + stringified_plans, + schema: schema.clone(), + }) + } else { + self.optimize_internal(plan, |_, _| {}) } - debug!("Optimized logical plan:\n {:?}", new_plan); - Ok(new_plan) } /// Creates a physical plan from a logical plan. 
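For orientation, an illustrative sketch (not verbatim output from this patch; the optimizer names and plan text are examples and depend on which rules are registered in the ExecutionContext): with the change above, EXPLAIN VERBOSE is expected to surface one additional stringified logical plan per optimizer pass, alongside the initial and final plans, roughly of the form:

    plan_type                                 | plan
    ------------------------------------------+------------------------
    ...                                       | ...
    logical_plan after projection_push_down   | Projection: ...
    logical_plan after simplify_expressions   | Projection: ...
    ...                                       | ...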
@@ -556,6 +569,32 @@ impl ExecutionContext { ))), } } + + /// Optimizes the logical plan by applying optimizer rules, and + /// invoking observer function after each call + fn optimize_internal( + &self, + plan: &LogicalPlan, + mut observer: F, + ) -> Result + where + F: FnMut(&LogicalPlan, &dyn OptimizerRule), + { + let state = &mut self.state.lock().unwrap(); + let execution_props = &mut state.execution_props.clone(); + let optimizers = &state.config.optimizers; + + let execution_props = execution_props.start_execution(); + + let mut new_plan = plan.clone(); + debug!("Logical plan:\n {:?}", plan); + for optimizer in optimizers { + new_plan = optimizer.optimize(&new_plan, execution_props)?; + observer(&new_plan, optimizer.as_ref()); + } + debug!("Optimized logical plan:\n {:?}", new_plan); + Ok(new_plan) + } } impl From>> for ExecutionContext { @@ -941,6 +980,49 @@ mod tests { use tempfile::TempDir; use test::*; + #[test] + fn optimize_explain() { + let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + + let plan = LogicalPlanBuilder::scan_empty(Some("employee"), &schema, None) + .unwrap() + .explain(true) + .unwrap() + .build() + .unwrap(); + + if let LogicalPlan::Explain { + stringified_plans, .. + } = &plan + { + assert_eq!(stringified_plans.len(), 1); + } else { + panic!("plan was not an explain: {:?}", plan); + } + + // now optimize the plan and expect to see more plans + let optimized_plan = ExecutionContext::new().optimize(&plan).unwrap(); + if let LogicalPlan::Explain { + stringified_plans, .. + } = &optimized_plan + { + // should have more than one plan + assert!( + stringified_plans.len() > 1, + "plans: {:#?}", + stringified_plans + ); + // should have at least one optimized plan + let opt = stringified_plans + .iter() + .any(|p| matches!(p.plan_type, PlanType::OptimizedLogicalPlan { .. 
})); + + assert!(opt, "plans: {:#?}", stringified_plans); + } else { + panic!("plan was not an explain: {:?}", plan); + } + } + #[tokio::test] async fn parallel_projection() -> Result<()> { let partition_count = 4; diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 0335e29127ab9..60e0ed3c09883 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -27,18 +27,15 @@ use arrow::{ record_batch::RecordBatch, }; -use crate::datasource::TableProvider; use crate::error::{DataFusionError, Result}; +use crate::{datasource::TableProvider, logical_plan::plan::ToStringifiedPlan}; use crate::{ datasource::{empty::EmptyTable, parquet::ParquetTable, CsvFile, MemTable}, prelude::CsvReadOptions, }; use super::dfschema::ToDFSchema; -use super::{ - exprlist_to_fields, Expr, JoinConstraint, JoinType, LogicalPlan, PlanType, - StringifiedPlan, -}; +use super::{exprlist_to_fields, Expr, JoinConstraint, JoinType, LogicalPlan, PlanType}; use crate::logical_plan::{ columnize_expr, normalize_col, normalize_cols, Column, DFField, DFSchema, DFSchemaRef, Partitioning, @@ -398,10 +395,8 @@ impl LogicalPlanBuilder { /// Create an expression to represent the explanation of the plan pub fn explain(&self, verbose: bool) -> Result { - let stringified_plans = vec![StringifiedPlan::new( - PlanType::InitialLogicalPlan, - format!("{:#?}", self.plan.clone()), - )]; + let stringified_plans = + vec![self.plan.to_stringified(PlanType::InitialLogicalPlan)]; let schema = LogicalPlan::explain_schema(); @@ -553,6 +548,8 @@ pub(crate) fn expand_wildcard( mod tests { use arrow::datatypes::{DataType, Field}; + use crate::logical_plan::StringifiedPlan; + use super::super::{col, lit, sum}; use super::*; diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index f381e316669e4..a021d06f09502 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -50,6 +50,6 @@ pub use extension::UserDefinedLogicalNode; pub use operators::Operator; pub use plan::{ JoinConstraint, JoinType, LogicalPlan, Partitioning, PlanType, PlanVisitor, - StringifiedPlan, }; +pub(crate) use plan::{StringifiedPlan, ToStringifiedPlan}; pub use registry::FunctionRegistry; diff --git a/datafusion/src/logical_plan/plan.rs b/datafusion/src/logical_plan/plan.rs index 9a4daae27ff56..28405fb6dfba0 100644 --- a/datafusion/src/logical_plan/plan.rs +++ b/datafusion/src/logical_plan/plan.rs @@ -820,6 +820,11 @@ pub enum PlanType { FinalLogicalPlan, /// The initial physical plan, prepared for execution InitialPhysicalPlan, + /// The ExecutionPlan which results from applying an optimizer pass + OptimizedPhysicalPlan { + /// The name of the optimizer which produced this plan + optimizer_name: String, + }, /// The final, fully optimized physical which would be executed FinalPhysicalPlan, } @@ -833,6 +838,9 @@ impl fmt::Display for PlanType { } PlanType::FinalLogicalPlan => write!(f, "logical_plan"), PlanType::InitialPhysicalPlan => write!(f, "initial_physical_plan"), + PlanType::OptimizedPhysicalPlan { optimizer_name } => { + write!(f, "physical_plan after {}", optimizer_name) + } PlanType::FinalPhysicalPlan => write!(f, "physical_plan"), } } @@ -868,6 +876,18 @@ impl StringifiedPlan { } } +/// Trait for something that can be formatted as a stringified plan +pub trait ToStringifiedPlan { + /// Create a stringified plan with the specified type + fn to_stringified(&self, plan_type: PlanType) -> StringifiedPlan; +} + +impl 
ToStringifiedPlan for LogicalPlan { + fn to_stringified(&self, plan_type: PlanType) -> StringifiedPlan { + StringifiedPlan::new(plan_type, self.display_indent().to_string()) + } +} + #[cfg(test)] mod tests { use super::super::{col, lit, LogicalPlanBuilder}; diff --git a/datafusion/src/optimizer/limit_push_down.rs b/datafusion/src/optimizer/limit_push_down.rs index 37c95a4436922..46738c557c631 100644 --- a/datafusion/src/optimizer/limit_push_down.rs +++ b/datafusion/src/optimizer/limit_push_down.rs @@ -23,7 +23,6 @@ use crate::execution::context::ExecutionProps; use crate::logical_plan::LogicalPlan; use crate::optimizer::optimizer::OptimizerRule; use std::sync::Arc; -use utils::optimize_explain; /// Optimization rule that tries pushes down LIMIT n /// where applicable to reduce the amount of scanned / processed data @@ -43,25 +42,6 @@ fn limit_push_down( execution_props: &ExecutionProps, ) -> Result { match (plan, upper_limit) { - ( - LogicalPlan::Explain { - verbose, - schema, - plan, - stringified_plans, - }, - _, - ) => { - let schema = schema.as_ref().to_owned().into(); - optimize_explain( - optimizer, - *verbose, - plan, - stringified_plans, - &schema, - execution_props, - ) - } (LogicalPlan::Limit { n, input }, upper_limit) => { let smallest = upper_limit.map(|x| std::cmp::min(x, *n)).unwrap_or(*n); Ok(LogicalPlan::Limit { diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 089dca2318c98..0de36f354206f 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -18,7 +18,7 @@ //! Projection Push Down optimizer rule ensures that only referenced columns are //! loaded into memory -use crate::error::Result; +use crate::error::{DataFusionError, Result}; use crate::execution::context::ExecutionProps; use crate::logical_plan::{ build_join_schema, Column, DFField, DFSchema, DFSchemaRef, LogicalPlan, @@ -33,7 +33,6 @@ use std::{ collections::{BTreeSet, HashSet}, sync::Arc, }; -use utils::optimize_explain; /// Optimizer that removes unused projections and aggregations from plans /// This reduces both scans and @@ -354,22 +353,9 @@ fn optimize_plan( limit: *limit, }) } - LogicalPlan::Explain { - verbose, - plan, - stringified_plans, - schema, - } => { - let schema = schema.as_ref().to_owned().into(); - optimize_explain( - optimizer, - *verbose, - &*plan, - stringified_plans, - &schema, - execution_props, - ) - } + LogicalPlan::Explain { .. 
} => Err(DataFusionError::Internal( + "Unsupported logical plan: Explain must be root of the plan".to_string(), + )), LogicalPlan::Union { inputs, schema, diff --git a/datafusion/src/optimizer/simplify_expressions.rs b/datafusion/src/optimizer/simplify_expressions.rs index 0e65de07305ff..f629eaf95b5f4 100644 --- a/datafusion/src/optimizer/simplify_expressions.rs +++ b/datafusion/src/optimizer/simplify_expressions.rs @@ -22,7 +22,6 @@ use crate::logical_plan::LogicalPlan; use crate::logical_plan::{lit, Expr}; use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; -use crate::optimizer::utils::optimize_explain; use crate::scalar::ScalarValue; use crate::{error::Result, logical_plan::Operator}; @@ -278,27 +277,9 @@ impl OptimizerRule for SimplifyExpressions { fn optimize( &self, plan: &LogicalPlan, - execution_props: &ExecutionProps, + _execution_props: &ExecutionProps, ) -> Result { - match plan { - LogicalPlan::Explain { - verbose, - plan, - stringified_plans, - schema, - } => { - let schema = schema.as_ref().to_owned().into(); - optimize_explain( - self, - *verbose, - &*plan, - stringified_plans, - &schema, - execution_props, - ) - } - _ => optimize(plan), - } + optimize(plan) } } diff --git a/datafusion/src/optimizer/utils.rs b/datafusion/src/optimizer/utils.rs index 88380ea17c875..615f0ccfceaf5 100644 --- a/datafusion/src/optimizer/utils.rs +++ b/datafusion/src/optimizer/utils.rs @@ -17,15 +17,11 @@ //! Collection of utility functions that are leveraged by the query optimizer rules -use std::{collections::HashSet, sync::Arc}; - -use arrow::datatypes::Schema; - use super::optimizer::OptimizerRule; use crate::execution::context::ExecutionProps; use crate::logical_plan::{ build_join_schema, Column, DFSchemaRef, Expr, LogicalPlan, LogicalPlanBuilder, - Operator, Partitioning, PlanType, Recursion, StringifiedPlan, ToDFSchema, + Operator, Partitioning, Recursion, }; use crate::prelude::lit; use crate::scalar::ScalarValue; @@ -33,6 +29,7 @@ use crate::{ error::{DataFusionError, Result}, logical_plan::ExpressionVisitor, }; +use std::{collections::HashSet, sync::Arc}; const CASE_EXPR_MARKER: &str = "__DATAFUSION_CASE_EXPR__"; const CASE_ELSE_MARKER: &str = "__DATAFUSION_CASE_ELSE__"; @@ -94,34 +91,6 @@ pub fn expr_to_columns(expr: &Expr, accum: &mut HashSet) -> Result<()> { Ok(()) } -/// Create a `LogicalPlan::Explain` node by running `optimizer` on the -/// input plan and capturing the resulting plan string -pub fn optimize_explain( - optimizer: &impl OptimizerRule, - verbose: bool, - plan: &LogicalPlan, - stringified_plans: &[StringifiedPlan], - schema: &Schema, - execution_props: &ExecutionProps, -) -> Result { - // These are the fields of LogicalPlan::Explain It might be nice - // to transform that enum Variant into its own struct and avoid - // passing the fields individually - let plan = Arc::new(optimizer.optimize(plan, execution_props)?); - let mut stringified_plans = stringified_plans.to_vec(); - let optimizer_name = optimizer.name().into(); - stringified_plans.push(StringifiedPlan::new( - PlanType::OptimizedLogicalPlan { optimizer_name }, - format!("{:#?}", plan), - )); - Ok(LogicalPlan::Explain { - verbose, - plan, - stringified_plans, - schema: schema.clone().to_dfschema_ref()?, - }) -} - /// Convenience rule for writing optimizers: recursively invoke /// optimize on plan's children and then return a node of the same /// type. 
Useful for optimizer rules which want to leave the type @@ -132,23 +101,6 @@ pub fn optimize_children( plan: &LogicalPlan, execution_props: &ExecutionProps, ) -> Result { - if let LogicalPlan::Explain { - verbose, - plan, - stringified_plans, - schema, - } = plan - { - return optimize_explain( - optimizer, - *verbose, - &*plan, - stringified_plans, - &schema.as_ref().to_owned().into(), - execution_props, - ); - } - let new_exprs = plan.expressions(); let new_inputs = plan .inputs() @@ -489,7 +441,7 @@ pub fn rewrite_expression(expr: &Expr, expressions: &[Expr]) -> Result { #[cfg(test)] mod tests { use super::*; - use crate::logical_plan::{col, LogicalPlanBuilder}; + use crate::logical_plan::col; use arrow::datatypes::DataType; use std::collections::HashSet; @@ -514,61 +466,4 @@ mod tests { assert!(accum.contains(&Column::from_name("a"))); Ok(()) } - - struct TestOptimizer {} - - impl OptimizerRule for TestOptimizer { - fn optimize( - &self, - plan: &LogicalPlan, - _: &ExecutionProps, - ) -> Result { - Ok(plan.clone()) - } - - fn name(&self) -> &str { - "test_optimizer" - } - } - - #[test] - fn test_optimize_explain() -> Result<()> { - let optimizer = TestOptimizer {}; - - let empty_plan = LogicalPlanBuilder::empty(false).build()?; - let schema = LogicalPlan::explain_schema(); - - let optimized_explain = optimize_explain( - &optimizer, - true, - &empty_plan, - &[StringifiedPlan::new(PlanType::InitialLogicalPlan, "...")], - schema.as_ref(), - &ExecutionProps::new(), - )?; - - match &optimized_explain { - LogicalPlan::Explain { - verbose, - stringified_plans, - .. - } => { - assert!(*verbose); - - let expected_stringified_plans = vec![ - StringifiedPlan::new(PlanType::InitialLogicalPlan, "..."), - StringifiedPlan::new( - PlanType::OptimizedLogicalPlan { - optimizer_name: "test_optimizer".into(), - }, - "EmptyRelation", - ), - ]; - assert_eq!(*stringified_plans, expected_stringified_plans); - } - _ => panic!("Expected explain plan but got {:?}", optimized_explain), - } - - Ok(()) - } } diff --git a/datafusion/src/physical_plan/display.rs b/datafusion/src/physical_plan/display.rs index 8498e02d50c88..e251e4ea53db1 100644 --- a/datafusion/src/physical_plan/display.rs +++ b/datafusion/src/physical_plan/display.rs @@ -21,6 +21,8 @@ use std::fmt; +use crate::logical_plan::{StringifiedPlan, ToStringifiedPlan}; + use super::{accept, ExecutionPlan, ExecutionPlanVisitor}; /// Options for controlling how each [`ExecutionPlan`] should format itself @@ -131,3 +133,12 @@ impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> { Ok(true) } } + +impl<'a> ToStringifiedPlan for DisplayableExecutionPlan<'a> { + fn to_stringified( + &self, + plan_type: crate::logical_plan::PlanType, + ) -> StringifiedPlan { + StringifiedPlan::new(plan_type, self.indent().to_string()) + } +} diff --git a/datafusion/src/physical_plan/explain.rs b/datafusion/src/physical_plan/explain.rs index 195a7a518370a..a6a34f5d0b0cf 100644 --- a/datafusion/src/physical_plan/explain.rs +++ b/datafusion/src/physical_plan/explain.rs @@ -115,9 +115,20 @@ impl ExecutionPlan for ExplainExec { .iter() .filter(|s| s.should_display(self.verbose)); + // Identify plans that are not changed + let mut prev: Option<&StringifiedPlan> = None; + for p in plans_to_print { type_builder.append_value(p.plan_type.to_string())?; - plan_builder.append_value(&*p.plan)?; + match prev { + Some(prev) if !should_show(prev, p) => { + plan_builder.append_value("SAME TEXT AS ABOVE")?; + } + Some(_) | None => { + plan_builder.append_value(&*p.plan)?; + } + } + prev = 
Some(p); } let record_batch = RecordBatch::try_new( @@ -146,3 +157,14 @@ impl ExecutionPlan for ExplainExec { } } } + +/// If this plan should be shown, given the previous plan that was +/// displayed. +/// +/// This is meant to avoid repeating the same plan over and over again +/// in explain plans to make clear what is changing +fn should_show(previous_plan: &StringifiedPlan, this_plan: &StringifiedPlan) -> bool { + // if the plans are different, or if they would have been + // displayed in the normal explain (aka non verbose) plan + (previous_plan.plan != this_plan.plan) || this_plan.should_display(false) +} diff --git a/datafusion/src/physical_plan/planner.rs b/datafusion/src/physical_plan/planner.rs index 5163e4b425b4f..e662821e4539f 100644 --- a/datafusion/src/physical_plan/planner.rs +++ b/datafusion/src/physical_plan/planner.rs @@ -24,9 +24,10 @@ use super::{ use crate::execution::context::ExecutionContextState; use crate::logical_plan::{ unnormalize_cols, DFSchema, Expr, LogicalPlan, Operator, - Partitioning as LogicalPartitioning, PlanType, StringifiedPlan, + Partitioning as LogicalPartitioning, PlanType, ToStringifiedPlan, UserDefinedLogicalNode, }; +use crate::physical_optimizer::optimizer::PhysicalOptimizerRule; use crate::physical_plan::explain::ExplainExec; use crate::physical_plan::expressions; use crate::physical_plan::expressions::{CaseExpr, Column, Literal, PhysicalSortExpr}; @@ -244,7 +245,7 @@ impl PhysicalPlanner for DefaultPhysicalPlanner { Some(plan) => Ok(plan), None => { let plan = self.create_initial_plan(logical_plan, ctx_state)?; - self.optimize_plan(plan, ctx_state) + self.optimize_internal(plan, ctx_state, |_, _| {}) } } } @@ -285,23 +286,6 @@ impl DefaultPhysicalPlanner { Self { extension_planners } } - /// Optimize a physical plan by applying each physical optimizer - fn optimize_plan( - &self, - plan: Arc, - ctx_state: &ExecutionContextState, - ) -> Result> { - let optimizers = &ctx_state.config.physical_optimizers; - debug!("Physical plan:\n{:?}", plan); - - let mut new_plan = plan; - for optimizer in optimizers { - new_plan = optimizer.optimize(new_plan, &ctx_state.config)?; - } - debug!("Optimized physical plan:\n{:?}", new_plan); - Ok(new_plan) - } - /// Create a physical plan from a logical plan fn create_initial_plan( &self, @@ -1315,32 +1299,24 @@ impl DefaultPhysicalPlanner { schema, } = logical_plan { - let final_logical_plan = StringifiedPlan::new( - PlanType::FinalLogicalPlan, - plan.display_indent().to_string(), - ); + use PlanType::*; + let mut stringified_plans = stringified_plans.clone(); - let input = self.create_initial_plan(plan, ctx_state)?; + stringified_plans.push(plan.to_stringified(FinalLogicalPlan)); - let initial_physical_plan = StringifiedPlan::new( - PlanType::InitialPhysicalPlan, - displayable(input.as_ref()).indent().to_string(), - ); + let input = self.create_initial_plan(plan, ctx_state)?; - let input = self.optimize_plan(input, ctx_state)?; + stringified_plans + .push(displayable(input.as_ref()).to_stringified(InitialPhysicalPlan)); - let final_physical_plan = StringifiedPlan::new( - PlanType::FinalPhysicalPlan, - displayable(input.as_ref()).indent().to_string(), - ); + let input = self.optimize_internal(input, ctx_state, |plan, optimizer| { + let optimizer_name = optimizer.name().to_string(); + let plan_type = OptimizedPhysicalPlan { optimizer_name }; + stringified_plans.push(displayable(plan).to_stringified(plan_type)); + })?; - let stringified_plans = stringified_plans - .iter() - .cloned() - 
.chain(std::iter::once(final_logical_plan)) - .chain(std::iter::once(initial_physical_plan)) - .chain(std::iter::once(final_physical_plan)) - .collect::>(); + stringified_plans + .push(displayable(input.as_ref()).to_stringified(FinalPhysicalPlan)); Ok(Some(Arc::new(ExplainExec::new( SchemaRef::new(schema.as_ref().to_owned().into()), @@ -1351,6 +1327,29 @@ impl DefaultPhysicalPlanner { Ok(None) } } + + /// Optimize a physical plan by applying each physical optimizer, + /// calling observer(plan, optimizer after each one) + fn optimize_internal( + &self, + plan: Arc, + ctx_state: &ExecutionContextState, + mut observer: F, + ) -> Result> + where + F: FnMut(&dyn ExecutionPlan, &dyn PhysicalOptimizerRule), + { + let optimizers = &ctx_state.config.physical_optimizers; + debug!("Physical plan:\n{:?}", plan); + + let mut new_plan = plan; + for optimizer in optimizers { + new_plan = optimizer.optimize(new_plan, &ctx_state.config)?; + observer(new_plan.as_ref(), optimizer.as_ref()) + } + debug!("Optimized physical plan:\n{:?}", new_plan); + Ok(new_plan) + } } fn tuple_err(value: (Result, Result)) -> Result<(T, R)> { @@ -1645,6 +1644,42 @@ mod tests { Ok(()) } + #[test] + fn test_explain() { + let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + + let logical_plan = + LogicalPlanBuilder::scan_empty(Some("employee"), &schema, None) + .unwrap() + .explain(true) + .unwrap() + .build() + .unwrap(); + + let plan = plan(&logical_plan).unwrap(); + if let Some(plan) = plan.as_any().downcast_ref::() { + let stringified_plans = plan.stringified_plans(); + assert!(stringified_plans.len() >= 4); + assert!(stringified_plans + .iter() + .any(|p| matches!(p.plan_type, PlanType::FinalLogicalPlan))); + assert!(stringified_plans + .iter() + .any(|p| matches!(p.plan_type, PlanType::InitialPhysicalPlan))); + assert!(stringified_plans + .iter() + .any(|p| matches!(p.plan_type, PlanType::OptimizedPhysicalPlan { .. 
}))); + assert!(stringified_plans + .iter() + .any(|p| matches!(p.plan_type, PlanType::FinalPhysicalPlan))); + } else { + panic!( + "Plan was not an explain plan: {}", + displayable(plan.as_ref()).indent() + ); + } + } + /// An example extension node that doesn't do anything struct NoOpExtensionNode { schema: DFSchemaRef, diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index a4bb02cf0f9a9..fa2b035162a60 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -28,8 +28,8 @@ use crate::logical_plan::window_frames::{WindowFrame, WindowFrameUnits}; use crate::logical_plan::Expr::Alias; use crate::logical_plan::{ and, builder::expand_wildcard, col, lit, normalize_col, union_with_alias, Column, - DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, StringifiedPlan, - ToDFSchema, + DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, ToDFSchema, + ToStringifiedPlan, }; use crate::prelude::JoinType; use crate::scalar::ScalarValue; @@ -233,10 +233,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) -> Result { let plan = self.sql_statement_to_plan(statement)?; - let stringified_plans = vec![StringifiedPlan::new( - PlanType::InitialLogicalPlan, - plan.display_indent().to_string(), - )]; + let stringified_plans = vec![plan.to_stringified(PlanType::InitialLogicalPlan)]; let schema = LogicalPlan::explain_schema(); let plan = Arc::new(plan); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 95b5596eb9f17..0ef8b4ca30243 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -2205,6 +2205,13 @@ async fn csv_explain_verbose() { "Actual: '{}'", actual ); + + // ensure the "same text as above" optimization is working + assert!( + actual.contains("SAME TEXT AS ABOVE"), + "Actual 2: '{}'", + actual + ); } #[tokio::test] diff --git a/datafusion/tests/user_defined_plan.rs b/datafusion/tests/user_defined_plan.rs index e1f8c767bd8d5..c1269d9a217ff 100644 --- a/datafusion/tests/user_defined_plan.rs +++ b/datafusion/tests/user_defined_plan.rs @@ -163,9 +163,9 @@ async fn topk_plan() -> Result<()> { let mut ctx = setup_table(make_topk_context()).await?; let expected = vec![ - "| logical_plan after topk | TopK: k=3 |", - "| | Projection: #sales.customer_id, #sales.revenue |", - "| | TableScan: sales projection=Some([0, 1]) |", + "| logical_plan after topk | TopK: k=3 |", + "| | Projection: #sales.customer_id, #sales.revenue |", + "| | TableScan: sales projection=Some([0, 1]) |", ].join("\n"); let explain_query = format!("EXPLAIN VERBOSE {}", QUERY); From ed5746de89d89c69c9e2d1baff9b2f91dbef995c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 20 Jul 2021 18:25:06 -0600 Subject: [PATCH 286/329] Ballista shuffle is finally working as intended, providing scalable distributed joins (#750) --- ballista/rust/client/src/context.rs | 1 + ballista/rust/core/proto/ballista.proto | 16 +- ballista/rust/core/src/client.rs | 9 +- .../src/execution_plans/shuffle_reader.rs | 35 ++- .../src/execution_plans/shuffle_writer.rs | 11 + .../src/execution_plans/unresolved_shuffle.rs | 19 +- .../src/serde/physical_plan/from_proto.rs | 43 +-- .../core/src/serde/physical_plan/to_proto.rs | 3 +- .../core/src/serde/scheduler/from_proto.rs | 9 +- ballista/rust/core/src/serde/scheduler/mod.rs | 12 +- .../rust/core/src/serde/scheduler/to_proto.rs | 41 ++- ballista/rust/executor/src/execution_loop.rs | 23 +- ballista/rust/executor/src/executor.rs | 37 ++- ballista/rust/executor/src/flight_service.rs | 21 +- 
ballista/rust/scheduler/src/lib.rs | 49 +++- ballista/rust/scheduler/src/planner.rs | 269 +++++++++++++++--- ballista/rust/scheduler/src/state/mod.rs | 50 ++-- 17 files changed, 507 insertions(+), 141 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index b8210cbc26266..26087f8e6693c 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -223,6 +223,7 @@ impl BallistaContext { &partition_id.job_id, partition_id.stage_id as usize, partition_id.partition_id as usize, + &location.path, ) .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 50bd901f145d2..9dbce81c21f1f 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -544,7 +544,8 @@ message PhysicalNegativeNode { message UnresolvedShuffleExecNode { uint32 stage_id = 1; Schema schema = 2; - uint32 partition_count = 3; + uint32 input_partition_count = 3; + uint32 output_partition_count = 4; } message FilterExecNode { @@ -700,7 +701,7 @@ message Action { oneof ActionType { // Fetch a partition from an executor - PartitionId fetch_partition = 3; + FetchPartition fetch_partition = 3; } // configuration settings @@ -714,6 +715,15 @@ message ExecutePartition { PhysicalPlanNode plan = 4; // The task could need to read partitions from other executors repeated PartitionLocation partition_location = 5; + // Output partition for shuffle writer + PhysicalHashRepartition output_partitioning = 6; +} + +message FetchPartition { + string job_id = 1; + uint32 stage_id = 2; + uint32 partition_id = 3; + string path = 4; } // Mapping from partition id to executor id @@ -809,6 +819,8 @@ message PollWorkParams { message TaskDefinition { PartitionId task_id = 1; PhysicalPlanNode plan = 2; + // Output partition for shuffle writer + PhysicalHashRepartition output_partitioning = 3; } message PollWorkResult { diff --git a/ballista/rust/core/src/client.rs b/ballista/rust/core/src/client.rs index 2df4145783553..26c8d22b405d5 100644 --- a/ballista/rust/core/src/client.rs +++ b/ballista/rust/core/src/client.rs @@ -81,9 +81,14 @@ impl BallistaClient { job_id: &str, stage_id: usize, partition_id: usize, + path: &str, ) -> Result { - let action = - Action::FetchPartition(PartitionId::new(job_id, stage_id, partition_id)); + let action = Action::FetchPartition { + job_id: job_id.to_string(), + stage_id, + partition_id, + path: path.to_owned(), + }; self.execute_action(&action).await } diff --git a/ballista/rust/core/src/execution_plans/shuffle_reader.rs b/ballista/rust/core/src/execution_plans/shuffle_reader.rs index db03d3ddf0800..0447ca9953135 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_reader.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_reader.rs @@ -76,6 +76,8 @@ impl ExecutionPlan for ShuffleReaderExec { } fn output_partitioning(&self) -> Partitioning { + // TODO partitioning may be known and could be populated here + // see https://github.com/apache/arrow-datafusion/issues/758 Partitioning::UnknownPartitioning(self.partition.len()) } @@ -123,24 +125,26 @@ impl ExecutionPlan for ShuffleReaderExec { let loc_str = self .partition .iter() - .map(|x| { - x.iter() - .map(|l| { - format!( - "[executor={} part={}:{}:{} stats={}]", - l.executor_meta.id, - l.partition_id.job_id, - l.partition_id.stage_id, - l.partition_id.partition_id, - l.partition_stats - ) - }) - .collect::>() - .join(",") + 
.enumerate() + .map(|(partition_id, locations)| { + format!( + "[partition={} paths={}]", + partition_id, + locations + .iter() + .map(|l| l.path.clone()) + .collect::>() + .join(",") + ) }) .collect::>() .join(", "); - write!(f, "ShuffleReaderExec: partition_locations={}", loc_str) + write!( + f, + "ShuffleReaderExec: partition_locations({})={}", + self.partition.len(), + loc_str + ) } } } @@ -166,6 +170,7 @@ async fn fetch_partition( &partition_id.job_id, partition_id.stage_id as usize, partition_id.partition_id as usize, + &location.path, ) .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 47bf2a25f6365..8081dab36ab5f 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -141,6 +141,7 @@ impl ShuffleWriterExec { match &self.shuffle_output_partitioning { None => { + let start = Instant::now(); path.push(&format!("{}", input_partition)); std::fs::create_dir_all(&path)?; path.push("data.arrow"); @@ -156,6 +157,14 @@ impl ShuffleWriterExec { .await .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + self.metrics + .input_rows + .add(stats.num_rows.unwrap_or(0) as usize); + self.metrics + .output_rows + .add(stats.num_rows.unwrap_or(0) as usize); + self.metrics.write_time.add_elapsed(start); + info!( "Executed partition {} in {} seconds. Statistics: {}", input_partition, @@ -227,6 +236,8 @@ impl ShuffleWriterExec { RecordBatch::try_new(input_batch.schema(), columns)?; // write non-empty batch out + + //TODO optimize so we don't write or fetch empty partitions //if output_batch.num_rows() > 0 { let start = Instant::now(); match &mut writers[output_partition] { diff --git a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs index cb351eec561a1..3111b5a41be3f 100644 --- a/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs +++ b/ballista/rust/core/src/execution_plans/unresolved_shuffle.rs @@ -43,17 +43,26 @@ pub struct UnresolvedShuffleExec { // The schema this node will have once it is replaced with a ShuffleReaderExec pub schema: SchemaRef, + // The number of shuffle writer partition tasks that will produce the partitions + pub input_partition_count: usize, + // The partition count this node will have once it is replaced with a ShuffleReaderExec - pub partition_count: usize, + pub output_partition_count: usize, } impl UnresolvedShuffleExec { /// Create a new UnresolvedShuffleExec - pub fn new(stage_id: usize, schema: SchemaRef, partition_count: usize) -> Self { + pub fn new( + stage_id: usize, + schema: SchemaRef, + input_partition_count: usize, + output_partition_count: usize, + ) -> Self { Self { stage_id, schema, - partition_count, + input_partition_count, + output_partition_count, } } } @@ -69,7 +78,9 @@ impl ExecutionPlan for UnresolvedShuffleExec { } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.partition_count) + //TODO the output partition is known and should be populated here! 
+ // see https://github.com/apache/arrow-datafusion/issues/758 + Partitioning::UnknownPartitioning(self.output_partition_count) } fn children(&self) -> Vec> { diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 4b0a9844773ca..509044b3d1bac 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -376,21 +376,9 @@ impl TryInto> for &protobuf::PhysicalPlanNode { let input: Arc = convert_box_required!(shuffle_writer.input)?; - let output_partitioning = match &shuffle_writer.output_partitioning { - Some(hash_part) => { - let expr = hash_part - .hash_expr - .iter() - .map(|e| e.try_into()) - .collect::>, _>>()?; - - Some(Partitioning::Hash( - expr, - hash_part.partition_count.try_into().unwrap(), - )) - } - None => None, - }; + let output_partitioning = parse_protobuf_hash_partitioning( + shuffle_writer.output_partitioning.as_ref(), + )?; Ok(Arc::new(ShuffleWriterExec::try_new( shuffle_writer.job_id.clone(), @@ -466,7 +454,10 @@ impl TryInto> for &protobuf::PhysicalPlanNode { Ok(Arc::new(UnresolvedShuffleExec { stage_id: unresolved_shuffle.stage_id as usize, schema, - partition_count: unresolved_shuffle.partition_count as usize, + input_partition_count: unresolved_shuffle.input_partition_count + as usize, + output_partition_count: unresolved_shuffle.output_partition_count + as usize, })) } } @@ -680,3 +671,23 @@ impl TryFrom<&protobuf::physical_window_expr_node::WindowFunction> for WindowFun } } } + +pub fn parse_protobuf_hash_partitioning( + partitioning: Option<&protobuf::PhysicalHashRepartition>, +) -> Result, BallistaError> { + match partitioning { + Some(hash_part) => { + let expr = hash_part + .hash_expr + .iter() + .map(|e| e.try_into()) + .collect::>, _>>()?; + + Ok(Some(Partitioning::Hash( + expr, + hash_part.partition_count.try_into().unwrap(), + ))) + } + None => Ok(None), + } +} diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index fa35eb48d4fa4..ec5ec7cb7affa 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -397,7 +397,8 @@ impl TryInto for Arc { protobuf::UnresolvedShuffleExecNode { stage_id: exec.stage_id as u32, schema: Some(exec.schema().as_ref().into()), - partition_count: exec.partition_count as u32, + input_partition_count: exec.input_partition_count as u32, + output_partition_count: exec.output_partition_count as u32, }, )), }) diff --git a/ballista/rust/core/src/serde/scheduler/from_proto.rs b/ballista/rust/core/src/serde/scheduler/from_proto.rs index 4f9c9bc8877e8..8d4e279395fa8 100644 --- a/ballista/rust/core/src/serde/scheduler/from_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/from_proto.rs @@ -32,9 +32,12 @@ impl TryInto for protobuf::Action { fn try_into(self) -> Result { match self.action_type { - Some(ActionType::FetchPartition(partition)) => { - Ok(Action::FetchPartition(partition.try_into()?)) - } + Some(ActionType::FetchPartition(fetch)) => Ok(Action::FetchPartition { + job_id: fetch.job_id, + stage_id: fetch.stage_id as usize, + partition_id: fetch.partition_id as usize, + path: fetch.path, + }), _ => Err(BallistaError::General( "scheduler::from_proto(Action) invalid or missing action".to_owned(), )), diff --git a/ballista/rust/core/src/serde/scheduler/mod.rs b/ballista/rust/core/src/serde/scheduler/mod.rs index 
eeddfbbb41f39..a20d955f28b22 100644 --- a/ballista/rust/core/src/serde/scheduler/mod.rs +++ b/ballista/rust/core/src/serde/scheduler/mod.rs @@ -23,6 +23,7 @@ use datafusion::arrow::array::{ use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::Partitioning; use serde::Serialize; use uuid::Uuid; @@ -36,7 +37,12 @@ pub mod to_proto; #[derive(Debug, Clone)] pub enum Action { /// Collect a shuffle partition - FetchPartition(PartitionId), + FetchPartition { + job_id: String, + stage_id: usize, + partition_id: usize, + path: String, + }, } /// Unique identifier for the output partition of an operator. @@ -223,6 +229,8 @@ pub struct ExecutePartition { pub plan: Arc, /// Location of shuffle partitions that this query stage may depend on pub shuffle_locations: HashMap, + /// Output partitioning for shuffle writes + pub output_partitioning: Option, } impl ExecutePartition { @@ -232,6 +240,7 @@ impl ExecutePartition { partition_id: Vec, plan: Arc, shuffle_locations: HashMap, + output_partitioning: Option, ) -> Self { Self { job_id, @@ -239,6 +248,7 @@ impl ExecutePartition { partition_id, plan, shuffle_locations, + output_partitioning, } } diff --git a/ballista/rust/core/src/serde/scheduler/to_proto.rs b/ballista/rust/core/src/serde/scheduler/to_proto.rs index 57d4f615c5f84..bdc88d0b99a2e 100644 --- a/ballista/rust/core/src/serde/scheduler/to_proto.rs +++ b/ballista/rust/core/src/serde/scheduler/to_proto.rs @@ -23,14 +23,25 @@ use crate::serde::protobuf::action::ActionType; use crate::serde::scheduler::{ Action, ExecutePartition, PartitionId, PartitionLocation, PartitionStats, }; +use datafusion::physical_plan::Partitioning; impl TryInto for Action { type Error = BallistaError; fn try_into(self) -> Result { match self { - Action::FetchPartition(partition_id) => Ok(protobuf::Action { - action_type: Some(ActionType::FetchPartition(partition_id.into())), + Action::FetchPartition { + job_id, + stage_id, + partition_id, + path, + } => Ok(protobuf::Action { + action_type: Some(ActionType::FetchPartition(protobuf::FetchPartition { + job_id, + stage_id: stage_id as u32, + partition_id: partition_id as u32, + path, + })), settings: vec![], }), } @@ -47,6 +58,9 @@ impl TryInto for ExecutePartition { partition_id: self.partition_id.iter().map(|n| *n as u32).collect(), plan: Some(self.plan.try_into()?), partition_location: vec![], + output_partitioning: hash_partitioning_to_proto( + self.output_partitioning.as_ref(), + )?, }) } } @@ -87,3 +101,26 @@ impl Into for PartitionStats { } } } + +pub fn hash_partitioning_to_proto( + output_partitioning: Option<&Partitioning>, +) -> Result, BallistaError> { + match output_partitioning { + Some(Partitioning::Hash(exprs, partition_count)) => { + Ok(Some(protobuf::PhysicalHashRepartition { + hash_expr: exprs + .iter() + .map(|expr| expr.clone().try_into()) + .collect::, BallistaError>>()?, + partition_count: *partition_count as u64, + })) + } + None => Ok(None), + other => { + return Err(BallistaError::General(format!( + "scheduler::to_proto() invalid partitioning for ExecutePartition: {:?}", + other + ))) + } + } +} diff --git a/ballista/rust/executor/src/execution_loop.rs b/ballista/rust/executor/src/execution_loop.rs index b65b83bbaf484..4d12dfc1c7551 100644 --- a/ballista/rust/executor/src/execution_loop.rs +++ b/ballista/rust/executor/src/execution_loop.rs @@ -33,6 +33,8 @@ use ballista_core::serde::protobuf::{ use 
protobuf::CompletedTask; use crate::executor::Executor; +use ballista_core::error::BallistaError; +use ballista_core::serde::physical_plan::from_proto::parse_protobuf_hash_partitioning; pub async fn poll_loop( mut scheduler: SchedulerGrpcClient, @@ -70,15 +72,23 @@ pub async fn poll_loop( match poll_work_result { Ok(result) => { if let Some(task) = result.into_inner().task { - run_received_tasks( + match run_received_tasks( executor.clone(), executor_meta.id.clone(), available_tasks_slots.clone(), task_status_sender, task, ) - .await; - active_job = true; + .await + { + Ok(_) => { + active_job = true; + } + Err(e) => { + warn!("Failed to run task: {:?}", e); + active_job = false; + } + } } else { active_job = false; } @@ -99,7 +109,7 @@ async fn run_received_tasks( available_tasks_slots: Arc, task_status_sender: Sender, task: TaskDefinition, -) { +) -> Result<(), BallistaError> { let task_id = task.task_id.unwrap(); let task_id_log = format!( "{}/{}/{}", @@ -108,6 +118,8 @@ async fn run_received_tasks( info!("Received task {}", task_id_log); available_tasks_slots.fetch_sub(1, Ordering::SeqCst); let plan: Arc = (&task.plan.unwrap()).try_into().unwrap(); + let shuffle_output_partitioning = + parse_protobuf_hash_partitioning(task.output_partitioning.as_ref())?; tokio::spawn(async move { let execution_result = executor @@ -116,6 +128,7 @@ async fn run_received_tasks( task_id.stage_id as usize, task_id.partition_id as usize, plan, + shuffle_output_partitioning, ) .await; info!("Done with task {}", task_id_log); @@ -127,6 +140,8 @@ async fn run_received_tasks( task_id, )); }); + + Ok(()) } fn as_task_status( diff --git a/ballista/rust/executor/src/executor.rs b/ballista/rust/executor/src/executor.rs index cbf3eb040ff6b..398ebca2b8e66 100644 --- a/ballista/rust/executor/src/executor.rs +++ b/ballista/rust/executor/src/executor.rs @@ -22,8 +22,9 @@ use std::sync::Arc; use ballista_core::error::BallistaError; use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::serde::protobuf; +use datafusion::error::DataFusionError; use datafusion::physical_plan::display::DisplayableExecutionPlan; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::{ExecutionPlan, Partitioning}; /// Ballista executor pub struct Executor { @@ -50,23 +51,33 @@ impl Executor { stage_id: usize, part: usize, plan: Arc, + _shuffle_output_partitioning: Option, ) -> Result, BallistaError> { - // TODO to enable shuffling we need to specify the output partitioning here and - // until we do that there is always a single output partition - // see https://github.com/apache/arrow-datafusion/issues/707 - let shuffle_output_partitioning = None; + let exec = if let Some(shuffle_writer) = + plan.as_any().downcast_ref::() + { + // recreate the shuffle writer with the correct working directory + ShuffleWriterExec::try_new( + job_id.clone(), + stage_id, + plan.children()[0].clone(), + self.work_dir.clone(), + shuffle_writer.shuffle_output_partitioning().cloned(), + ) + } else { + Err(DataFusionError::Internal( + "Plan passed to execute_shuffle_write is not a ShuffleWriterExec" + .to_string(), + )) + }?; - let exec = ShuffleWriterExec::try_new( - job_id, - stage_id, - plan, - self.work_dir.clone(), - shuffle_output_partitioning, - )?; let partitions = exec.execute_shuffle_write(part).await?; println!( - "=== Physical plan with metrics ===\n{}\n", + "=== [{}/{}/{}] Physical plan with metrics ===\n{}\n", + job_id, + stage_id, + part, DisplayableExecutionPlan::with_metrics(&exec) .indent() .to_string() diff 
--git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 9a3f2d872d521..73dd1a946d554 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -18,7 +18,6 @@ //! Implementation of the Apache Arrow Flight protocol that wraps an executor. use std::fs::File; -use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; @@ -82,24 +81,13 @@ impl FlightService for BallistaFlightService { request: Request, ) -> Result, Status> { let ticket = request.into_inner(); - info!("Received do_get request"); let action = decode_protobuf(&ticket.ticket).map_err(|e| from_ballista_err(&e))?; match &action { - BallistaAction::FetchPartition(partition_id) => { - // fetch a partition that was previously executed by this executor - info!("FetchPartition {:?}", partition_id); - - let mut path = PathBuf::from(self.executor.work_dir()); - path.push(&partition_id.job_id); - path.push(&format!("{}", partition_id.stage_id)); - path.push(&format!("{}", partition_id.partition_id)); - path.push("data.arrow"); - let path = path.to_str().unwrap(); - - info!("FetchPartition {:?} reading {}", partition_id, path); + BallistaAction::FetchPartition { path, .. } => { + info!("FetchPartition reading {}", &path); let file = File::open(&path) .map_err(|e| { BallistaError::General(format!( @@ -222,7 +210,11 @@ where let schema_flight_data = SchemaAsIpc::new(reader.schema().as_ref(), &options).into(); send_response(&tx, Ok(schema_flight_data)).await?; + let mut row_count = 0; for batch in reader { + if let Ok(x) = &batch { + row_count += x.num_rows(); + } let batch_flight_data: Vec<_> = batch .map(|b| create_flight_iter(&b, &options).collect()) .map_err(|e| from_arrow_err(&e))?; @@ -230,6 +222,7 @@ where send_response(&tx, batch.clone()).await?; } } + info!("FetchPartition streamed {} rows", row_count); Ok(()) } diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 905437d4d980f..f5e2dc1dfd807 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -76,10 +76,12 @@ use crate::planner::DistributedPlanner; use log::{debug, error, info, warn}; use rand::{distributions::Alphanumeric, thread_rng, Rng}; -use tonic::{Request, Response}; +use tonic::{Request, Response, Status}; use self::state::{ConfigBackendClient, SchedulerState}; use ballista_core::config::BallistaConfig; +use ballista_core::execution_plans::ShuffleWriterExec; +use ballista_core::serde::scheduler::to_proto::hash_partitioning_to_proto; use ballista_core::utils::create_datafusion_context; use datafusion::physical_plan::parquet::ParquetExec; use std::time::{Instant, SystemTime, UNIX_EPOCH}; @@ -209,7 +211,7 @@ impl SchedulerGrpc for SchedulerServer { tonic::Status::internal(msg) })?; } - let task = if can_accept_task { + let task: Result, Status> = if can_accept_task { let plan = self .state .assign_next_schedulable_task(&metadata.id) @@ -229,15 +231,35 @@ impl SchedulerGrpc for SchedulerServer { partition_id.partition_id ); } - plan.map(|(status, plan)| TaskDefinition { - plan: Some(plan.try_into().unwrap()), - task_id: status.partition_id, - }) + match plan { + Some((status, plan)) => { + let plan_clone = plan.clone(); + let output_partitioning = if let Some(shuffle_writer) = + plan_clone.as_any().downcast_ref::() + { + shuffle_writer.shuffle_output_partitioning() + } else { + return Err(Status::invalid_argument(format!( + "Task root plan was not a ShuffleWriterExec: {:?}", + plan_clone + ))); + }; 
+ Ok(Some(TaskDefinition { + plan: Some(plan.try_into().unwrap()), + task_id: status.partition_id, + output_partitioning: hash_partitioning_to_proto( + output_partitioning, + ) + .map_err(|_| Status::internal("TBD".to_string()))?, + })) + } + None => Ok(None), + } } else { - None + Ok(None) }; lock.unlock().await; - Ok(Response::new(PollWorkResult { task })) + Ok(Response::new(PollWorkResult { task: task? })) } else { warn!("Received invalid executor poll_work request"); Err(tonic::Status::invalid_argument( @@ -431,12 +453,12 @@ impl SchedulerGrpc for SchedulerServer { })); // save stages into state - for stage in stages { + for shuffle_writer in stages { fail_job!(state .save_stage_plan( &job_id_spawn, - stage.stage_id(), - stage.children()[0].clone() + shuffle_writer.stage_id(), + shuffle_writer.clone() ) .await .map_err(|e| { @@ -444,12 +466,13 @@ impl SchedulerGrpc for SchedulerServer { error!("{}", msg); tonic::Status::internal(msg) })); - let num_partitions = stage.output_partitioning().partition_count(); + let num_partitions = + shuffle_writer.output_partitioning().partition_count(); for partition_id in 0..num_partitions { let pending_status = TaskStatus { partition_id: Some(PartitionId { job_id: job_id_spawn.clone(), - stage_id: stage.stage_id() as u32, + stage_id: shuffle_writer.stage_id() as u32, partition_id: partition_id as u32, }), status: None, diff --git a/ballista/rust/scheduler/src/planner.rs b/ballista/rust/scheduler/src/planner.rs index 11f5c994fd520..05025f2824775 100644 --- a/ballista/rust/scheduler/src/planner.rs +++ b/ballista/rust/scheduler/src/planner.rs @@ -105,22 +105,24 @@ impl DistributedPlanner { .as_any() .downcast_ref::() { - let query_stage = create_shuffle_writer( + let shuffle_writer = create_shuffle_writer( job_id, self.next_stage_id(), - //TODO should be children[0].clone() so that we replace this - // with an UnresolvedShuffleExec instead of just executing this - // part of the plan again - // see https://github.com/apache/arrow-datafusion/issues/707 - coalesce.children()[0].clone(), + children[0].clone(), None, )?; let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( - query_stage.stage_id(), - query_stage.schema(), - query_stage.output_partitioning().partition_count(), + shuffle_writer.stage_id(), + shuffle_writer.schema(), + shuffle_writer.output_partitioning().partition_count(), + shuffle_writer + .shuffle_output_partitioning() + .map(|p| p.partition_count()) + .unwrap_or_else(|| { + shuffle_writer.output_partitioning().partition_count() + }), )); - stages.push(query_stage); + stages.push(shuffle_writer); Ok(( coalesce.with_new_children(vec![unresolved_shuffle])?, stages, @@ -128,23 +130,33 @@ impl DistributedPlanner { } else if let Some(repart) = execution_plan.as_any().downcast_ref::() { - let query_stage = create_shuffle_writer( - job_id, - self.next_stage_id(), - //TODO should be children[0].clone() so that we replace this - // with an UnresolvedShuffleExec instead of just executing this - // part of the plan again - // see https://github.com/apache/arrow-datafusion/issues/707 - repart.children()[0].clone(), - Some(repart.partitioning().to_owned()), - )?; - let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( - query_stage.stage_id(), - query_stage.schema(), - query_stage.output_partitioning().partition_count(), - )); - stages.push(query_stage); - Ok((unresolved_shuffle, stages)) + match repart.output_partitioning() { + Partitioning::Hash(_, _) => { + let shuffle_writer = create_shuffle_writer( + job_id, + 
self.next_stage_id(), + children[0].clone(), + Some(repart.partitioning().to_owned()), + )?; + let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new( + shuffle_writer.stage_id(), + shuffle_writer.schema(), + shuffle_writer.output_partitioning().partition_count(), + shuffle_writer + .shuffle_output_partitioning() + .map(|p| p.partition_count()) + .unwrap_or_else(|| { + shuffle_writer.output_partitioning().partition_count() + }), + )); + stages.push(shuffle_writer); + Ok((unresolved_shuffle, stages)) + } + _ => { + // remove any non-hash repartition from the distributed plan + Ok((children[0].clone(), stages)) + } + } } else if let Some(window) = execution_plan.as_any().downcast_ref::() { @@ -184,18 +196,22 @@ pub fn remove_unresolved_shuffles( })? .clone(); - for i in 0..unresolved_shuffle.partition_count { + for i in 0..unresolved_shuffle.output_partition_count { if let Some(x) = p.get(&i) { relevant_locations.push(x.to_owned()); } else { relevant_locations.push(vec![]); } } - println!( - "create shuffle reader with {:?}", + info!( + "Creating shuffle reader: {}", relevant_locations .iter() - .map(|c| format!("{:?}", c)) + .map(|c| c + .iter() + .map(|l| l.path.clone()) + .collect::>() + .join(", ")) .collect::>() .join("\n") ); @@ -235,7 +251,9 @@ mod test { use ballista_core::error::BallistaError; use ballista_core::execution_plans::UnresolvedShuffleExec; use ballista_core::serde::protobuf; + use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::hash_aggregate::{AggregateMode, HashAggregateExec}; + use datafusion::physical_plan::hash_join::HashJoinExec; use datafusion::physical_plan::sort::SortExec; use datafusion::physical_plan::{ coalesce_partitions::CoalescePartitionsExec, projection::ProjectionExec, @@ -284,9 +302,7 @@ mod test { ProjectionExec: expr=[l_returnflag@0 as l_returnflag, SUM(lineitem.l_extendedprice Multiply Int64(1))@1 as sum_disc_price] HashAggregateExec: mode=FinalPartitioned, gby=[l_returnflag@0 as l_returnflag], aggr=[SUM(l_extendedprice Multiply Int64(1))] CoalesceBatchesExec: target_batch_size=4096 - RepartitionExec: partitioning=Hash([Column { name: "l_returnflag", index: 0 }], 2) - HashAggregateExec: mode=Partial, gby=[l_returnflag@1 as l_returnflag], aggr=[SUM(l_extendedprice Multiply Int64(1))] - CsvExec: source=Path(testdata/lineitem: [testdata/lineitem/partition0.tbl,testdata/lineitem/partition1.tbl]), has_header=false + UnresolvedShuffleExec ShuffleWriterExec: None SortExec: [l_returnflag@0 ASC] @@ -307,6 +323,14 @@ mod test { let final_hash = projection.children()[0].clone(); let final_hash = downcast_exec!(final_hash, HashAggregateExec); assert!(*final_hash.mode() == AggregateMode::FinalPartitioned); + let coalesce = final_hash.children()[0].clone(); + let coalesce = downcast_exec!(coalesce, CoalesceBatchesExec); + let unresolved_shuffle = coalesce.children()[0].clone(); + let unresolved_shuffle = + downcast_exec!(unresolved_shuffle, UnresolvedShuffleExec); + assert_eq!(unresolved_shuffle.stage_id, 1); + assert_eq!(unresolved_shuffle.input_partition_count, 2); + assert_eq!(unresolved_shuffle.output_partition_count, 2); // verify stage 2 let stage2 = stages[2].children()[0].clone(); @@ -314,10 +338,187 @@ mod test { let coalesce_partitions = sort.children()[0].clone(); let coalesce_partitions = downcast_exec!(coalesce_partitions, CoalescePartitionsExec); + assert_eq!( + coalesce_partitions.output_partitioning().partition_count(), + 1 + ); let unresolved_shuffle = 
coalesce_partitions.children()[0].clone(); let unresolved_shuffle = downcast_exec!(unresolved_shuffle, UnresolvedShuffleExec); assert_eq!(unresolved_shuffle.stage_id, 2); + assert_eq!(unresolved_shuffle.input_partition_count, 2); + assert_eq!(unresolved_shuffle.output_partition_count, 2); + + Ok(()) + } + + #[test] + fn distributed_join_plan() -> Result<(), BallistaError> { + let mut ctx = datafusion_test_context("testdata")?; + + // simplified form of TPC-H query 12 + let df = ctx.sql( + "select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + lineitem + join + orders + on + l_orderkey = o_orderkey +where + l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1995-01-01' +group by + l_shipmode +order by + l_shipmode; +", + )?; + + let plan = df.to_logical_plan(); + let plan = ctx.optimize(&plan)?; + let plan = ctx.create_physical_plan(&plan)?; + + let mut planner = DistributedPlanner::new(); + let job_uuid = Uuid::new_v4(); + let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?; + for stage in &stages { + println!("{}", displayable(stage.as_ref()).indent().to_string()); + } + + /* Expected result: + + ShuffleWriterExec: Some(Hash([Column { name: "l_orderkey", index: 0 }], 2)) + CoalesceBatchesExec: target_batch_size=4096 + FilterExec: l_shipmode@4 IN ([Literal { value: Utf8("MAIL") }, Literal { value: Utf8("SHIP") }]) AND l_commitdate@2 < l_receiptdate@3 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 8766 AND l_receiptdate@3 < 9131 + CsvExec: source=Path(testdata/lineitem: [testdata/lineitem/partition0.tbl,testdata/lineitem/partition1.tbl]), has_header=false + + ShuffleWriterExec: Some(Hash([Column { name: "o_orderkey", index: 0 }], 2)) + CsvExec: source=Path(testdata/orders: [testdata/orders/orders.tbl]), has_header=false + + ShuffleWriterExec: Some(Hash([Column { name: "l_shipmode", index: 0 }], 2)) + HashAggregateExec: mode=Partial, gby=[l_shipmode@4 as l_shipmode], aggr=[SUM(CASE WHEN #orders.o_orderpriority Eq Utf8("1-URGENT") Or #orders.o_orderpriority Eq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), SUM(CASE WHEN #orders.o_orderpriority NotEq Utf8("1-URGENT") And #orders.o_orderpriority NotEq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] + CoalesceBatchesExec: target_batch_size=4096 + HashJoinExec: mode=Partitioned, join_type=Inner, on=[(Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderkey", index: 0 })] + CoalesceBatchesExec: target_batch_size=4096 + UnresolvedShuffleExec + CoalesceBatchesExec: target_batch_size=4096 + UnresolvedShuffleExec + + ShuffleWriterExec: None + ProjectionExec: expr=[l_shipmode@0 as l_shipmode, SUM(CASE WHEN #orders.o_orderpriority Eq Utf8("1-URGENT") Or #orders.o_orderpriority Eq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, SUM(CASE WHEN #orders.o_orderpriority NotEq Utf8("1-URGENT") And #orders.o_orderpriority NotEq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count] + HashAggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[SUM(CASE WHEN #orders.o_orderpriority Eq Utf8("1-URGENT") Or #orders.o_orderpriority Eq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), SUM(CASE WHEN #orders.o_orderpriority NotEq 
Utf8("1-URGENT") And #orders.o_orderpriority NotEq Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] + CoalesceBatchesExec: target_batch_size=4096 + UnresolvedShuffleExec + + ShuffleWriterExec: None + SortExec: [l_shipmode@0 ASC] + CoalescePartitionsExec + UnresolvedShuffleExec + */ + + assert_eq!(5, stages.len()); + + // verify partitioning for each stage + + // csv "lineitem" (2 files) + assert_eq!( + 2, + stages[0].children()[0] + .output_partitioning() + .partition_count() + ); + assert_eq!( + 2, + stages[0] + .shuffle_output_partitioning() + .unwrap() + .partition_count() + ); + + // csv "orders" (1 file) + assert_eq!( + 1, + stages[1].children()[0] + .output_partitioning() + .partition_count() + ); + assert_eq!( + 2, + stages[1] + .shuffle_output_partitioning() + .unwrap() + .partition_count() + ); + + // join and partial hash aggregate + let input = stages[2].children()[0].clone(); + assert_eq!(2, input.output_partitioning().partition_count()); + assert_eq!( + 2, + stages[2] + .shuffle_output_partitioning() + .unwrap() + .partition_count() + ); + + let hash_agg = downcast_exec!(input, HashAggregateExec); + + let coalesce_batches = hash_agg.children()[0].clone(); + let coalesce_batches = downcast_exec!(coalesce_batches, CoalesceBatchesExec); + + let join = coalesce_batches.children()[0].clone(); + let join = downcast_exec!(join, HashJoinExec); + + let join_input_1 = join.children()[0].clone(); + // skip CoalesceBatches + let join_input_1 = join_input_1.children()[0].clone(); + let unresolved_shuffle_reader_1 = + downcast_exec!(join_input_1, UnresolvedShuffleExec); + assert_eq!(unresolved_shuffle_reader_1.input_partition_count, 2); // lineitem + assert_eq!(unresolved_shuffle_reader_1.output_partition_count, 2); + + let join_input_2 = join.children()[1].clone(); + // skip CoalesceBatches + let join_input_2 = join_input_2.children()[0].clone(); + let unresolved_shuffle_reader_2 = + downcast_exec!(join_input_2, UnresolvedShuffleExec); + assert_eq!(unresolved_shuffle_reader_2.input_partition_count, 1); //orders + assert_eq!(unresolved_shuffle_reader_2.output_partition_count, 2); + + // final partitioned hash aggregate + assert_eq!( + 2, + stages[3].children()[0] + .output_partitioning() + .partition_count() + ); + assert!(stages[3].shuffle_output_partitioning().is_none()); + + // coalesce partitions and sort + assert_eq!( + 1, + stages[4].children()[0] + .output_partitioning() + .partition_count() + ); + assert!(stages[4].shuffle_output_partitioning().is_none()); Ok(()) } diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index a4ae59e1dfda1..0bbab8cebf896 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -297,18 +297,22 @@ impl SchedulerState { let mut partition_locations: HashMap< usize, // stage id HashMap< - usize, // shuffle input partition id - Vec, // shuffle output partitions + usize, // shuffle output partition id + Vec, // shuffle partitions >, > = HashMap::new(); for unresolved_shuffle in unresolved_shuffles { - for partition_id in 0..unresolved_shuffle.partition_count { + // we schedule one task per *input* partition and each input partition + // can produce multiple output partitions + for shuffle_input_partition_id in + 0..unresolved_shuffle.input_partition_count + { let referenced_task = tasks .get(&get_task_status_key( &self.namespace, &partition.job_id, unresolved_shuffle.stage_id, - partition_id, + shuffle_input_partition_id, )) .unwrap(); let task_is_dead = self @@ -323,7 
+327,11 @@ impl SchedulerState { }, )) = &referenced_task.status { - let locations = partition_locations + debug!("Task for unresolved shuffle input partition {} completed and produced these shuffle partitions:\n\t{}", + shuffle_input_partition_id, + partitions.iter().map(|p| format!("{}={}", p.partition_id, &p.path)).collect::>().join("\n\t") + ); + let stage_shuffle_partition_locations = partition_locations .entry(unresolved_shuffle.stage_id) .or_insert_with(HashMap::new); let executor_meta = executors @@ -332,9 +340,10 @@ impl SchedulerState { .unwrap() .clone(); - let temp = - locations.entry(partition_id).or_insert_with(Vec::new); - for p in partitions { + for shuffle_write_partition in partitions { + let temp = stage_shuffle_partition_locations + .entry(shuffle_write_partition.partition_id as usize) + .or_insert_with(Vec::new); let executor_meta = executor_meta.clone(); let partition_location = ballista_core::serde::scheduler::PartitionLocation { @@ -342,29 +351,36 @@ impl SchedulerState { ballista_core::serde::scheduler::PartitionId { job_id: partition.job_id.clone(), stage_id: unresolved_shuffle.stage_id, - partition_id, + partition_id: shuffle_write_partition + .partition_id + as usize, }, executor_meta, partition_stats: PartitionStats::new( - Some(p.num_rows), - Some(p.num_batches), - Some(p.num_bytes), + Some(shuffle_write_partition.num_rows), + Some(shuffle_write_partition.num_batches), + Some(shuffle_write_partition.num_bytes), ), - path: p.path.clone(), + path: shuffle_write_partition.path.clone(), }; - info!( - "Scheduler storing stage {} partition {} path: {}", + debug!( + "Scheduler storing stage {} output partition {} path: {}", unresolved_shuffle.stage_id, - partition_id, + partition_location.partition_id.partition_id, partition_location.path - ); + ); temp.push(partition_location); } } else { + debug!( + "Stage {} input partition {} has not completed yet", + unresolved_shuffle.stage_id, shuffle_input_partition_id, + ); continue 'tasks; } } } + let plan = remove_unresolved_shuffles(plan.as_ref(), &partition_locations)?; From eccd074827e7625960e56313420168755ee4a564 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Jul 2021 07:16:16 -0400 Subject: [PATCH 287/329] Start update tests in sql.rs to use `assert_batches_eq!` (#760) * Start update tests in sql.rs to use `assert_batches_eq!` * fix whitespace --- datafusion/tests/sql.rs | 741 +++++++++++++++++++++------------------- 1 file changed, 389 insertions(+), 352 deletions(-) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 0ef8b4ca30243..d9f7c6ea41211 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +//! This module contains end to end tests of running SQL queries using +//! 
DataFusion + use std::convert::TryFrom; use std::sync::Arc; @@ -36,6 +39,7 @@ use arrow::{ }; use datafusion::assert_batches_eq; +use datafusion::assert_batches_sorted_eq; use datafusion::logical_plan::LogicalPlan; use datafusion::prelude::*; use datafusion::{ @@ -262,17 +266,21 @@ async fn csv_select_nested() -> Result<()> { ORDER BY c2 ASC, c3 ASC ) )"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "5", "-101"], - vec!["a", "5", "-54"], - vec!["a", "5", "-38"], - vec!["a", "5", "65"], - vec!["a", "6", "-101"], - vec!["a", "6", "-31"], - vec!["a", "6", "36"], + "+----+----+------+", + "| o1 | o2 | c3 |", + "+----+----+------+", + "| a | 5 | -101 |", + "| a | 5 | -54 |", + "| a | 5 | -38 |", + "| a | 5 | 65 |", + "| a | 6 | -101 |", + "| a | 6 | -31 |", + "| a | 6 | 36 |", + "+----+----+------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -281,9 +289,15 @@ async fn csv_count_star() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT COUNT(*), COUNT(1) AS c, COUNT(c1) FROM aggregate_test_100"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["100", "100", "100"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+-----------------+-----+-----------+", + "| COUNT(UInt8(1)) | c | COUNT(c1) |", + "+-----------------+-----+-----------+", + "| 100 | 100 | 100 |", + "+-----------------+-----+-----------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -292,12 +306,16 @@ async fn csv_query_with_predicate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, c12 FROM aggregate_test_100 WHERE c12 > 0.376 AND c12 < 0.4"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["e", "0.39144436569161134"], - vec!["d", "0.38870280983958583"], + "+----+---------------------+", + "| c1 | c12 |", + "+----+---------------------+", + "| e | 0.39144436569161134 |", + "| d | 0.38870280983958583 |", + "+----+---------------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -306,9 +324,16 @@ async fn csv_query_with_negative_predicate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, c4 FROM aggregate_test_100 WHERE c3 < -55 AND -c4 > 30000"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["e", "-31500"], vec!["c", "-30187"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+----+--------+", + "| c1 | c4 |", + "+----+--------+", + "| e | -31500 |", + "| c | -30187 |", + "+----+--------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -317,9 +342,15 @@ async fn csv_query_with_negated_predicate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE NOT(c1 != 'a')"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["21"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+-----------------+", + "| COUNT(UInt8(1)) |", + "+-----------------+", + "| 21 |", + "+-----------------+", + ]; + assert_batches_eq!(expected, 
&actual); Ok(()) } @@ -328,9 +359,15 @@ async fn csv_query_with_is_not_null_predicate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NOT NULL"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["100"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+-----------------+", + "| COUNT(UInt8(1)) |", + "+-----------------+", + "| 100 |", + "+-----------------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -339,9 +376,15 @@ async fn csv_query_with_is_null_predicate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT COUNT(1) FROM aggregate_test_100 WHERE c1 IS NULL"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["0"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+-----------------+", + "| COUNT(UInt8(1)) |", + "+-----------------+", + "| 0 |", + "+-----------------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -350,16 +393,19 @@ async fn csv_query_group_by_int_min_max() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c2, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c2"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["1", "0.05636955101974106", "0.9965400387585364"], - vec!["2", "0.16301110515739792", "0.991517828651004"], - vec!["3", "0.047343434291126085", "0.9293883502480845"], - vec!["4", "0.02182578039211991", "0.9237877978193884"], - vec!["5", "0.01479305307777301", "0.9723580396501548"], + "+----+----------------------+--------------------+", + "| c2 | MIN(c12) | MAX(c12) |", + "+----+----------------------+--------------------+", + "| 1 | 0.05636955101974106 | 0.9965400387585364 |", + "| 2 | 0.16301110515739792 | 0.991517828651004 |", + "| 3 | 0.047343434291126085 | 0.9293883502480845 |", + "| 4 | 0.02182578039211991 | 0.9237877978193884 |", + "| 5 | 0.01479305307777301 | 0.9723580396501548 |", + "+----+----------------------+--------------------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -370,16 +416,20 @@ async fn csv_query_group_by_float32() -> Result<()> { let sql = "SELECT COUNT(*) as cnt, c1 FROM aggregate_simple GROUP BY c1 ORDER BY cnt DESC"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["5", "0.00005"], - vec!["4", "0.00004"], - vec!["3", "0.00003"], - vec!["2", "0.00002"], - vec!["1", "0.00001"], + "+-----+---------+", + "| cnt | c1 |", + "+-----+---------+", + "| 5 | 0.00005 |", + "| 4 | 0.00004 |", + "| 3 | 0.00003 |", + "| 2 | 0.00002 |", + "| 1 | 0.00001 |", + "+-----+---------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -418,45 +468,88 @@ async fn select_distinct() -> Result<()> { } #[tokio::test] -async fn select_distinct_simple() -> Result<()> { +async fn select_distinct_simple_1() { let mut ctx = ExecutionContext::new(); - register_aggregate_simple_csv(&mut ctx)?; + register_aggregate_simple_csv(&mut ctx).unwrap(); let sql = "SELECT DISTINCT c1 FROM aggregate_simple order by c1"; - let actual = execute(&mut ctx, sql).await; + let actual = 
execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["0.00001"], - vec!["0.00002"], - vec!["0.00003"], - vec!["0.00004"], - vec!["0.00005"], + "+---------+", + "| c1 |", + "+---------+", + "| 0.00001 |", + "| 0.00002 |", + "| 0.00003 |", + "| 0.00004 |", + "| 0.00005 |", + "+---------+", ]; - assert_eq!(actual, expected); + assert_batches_eq!(expected, &actual); +} + +#[tokio::test] +async fn select_distinct_simple_2() { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx).unwrap(); let sql = "SELECT DISTINCT c1, c2 FROM aggregate_simple order by c1"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["0.00001", "0.000000000001"], - vec!["0.00002", "0.000000000002"], - vec!["0.00003", "0.000000000003"], - vec!["0.00004", "0.000000000004"], - vec!["0.00005", "0.000000000005"], + "+---------+----------------+", + "| c1 | c2 |", + "+---------+----------------+", + "| 0.00001 | 0.000000000001 |", + "| 0.00002 | 0.000000000002 |", + "| 0.00003 | 0.000000000003 |", + "| 0.00004 | 0.000000000004 |", + "| 0.00005 | 0.000000000005 |", + "+---------+----------------+", ]; - assert_eq!(actual, expected); + assert_batches_eq!(expected, &actual); +} + +#[tokio::test] +async fn select_distinct_simple_3() { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx).unwrap(); let sql = "SELECT distinct c3 FROM aggregate_simple order by c3"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; - let expected = vec![vec!["false"], vec!["true"]]; - assert_eq!(actual, expected); + let expected = vec![ + "+-------+", + "| c3 |", + "+-------+", + "| false |", + "| true |", + "+-------+", + ]; + assert_batches_eq!(expected, &actual); +} + +#[tokio::test] +async fn select_distinct_simple_4() { + let mut ctx = ExecutionContext::new(); + register_aggregate_simple_csv(&mut ctx).unwrap(); let sql = "SELECT distinct c1+c2 as a FROM aggregate_simple"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; - assert_eq!(actual.len(), 5); - Ok(()) + let expected = vec![ + "+-------------------------+", + "| a |", + "+-------------------------+", + "| 0.000030000002242136256 |", + "| 0.000040000002989515004 |", + "| 0.000010000000747378751 |", + "| 0.00005000000373689376 |", + "| 0.000020000001494757502 |", + "+-------------------------+", + ]; + assert_batches_sorted_eq!(expected, &actual); } #[tokio::test] @@ -464,10 +557,10 @@ async fn projection_same_fields() -> Result<()> { let mut ctx = ExecutionContext::new(); let sql = "select (1+1) as a from (select 1 as a);"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; - let expected = vec![vec!["2"]]; - assert_eq!(actual, expected); + let expected = vec!["+---+", "| a |", "+---+", "| 2 |", "+---+"]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -479,16 +572,20 @@ async fn csv_query_group_by_float64() -> Result<()> { let sql = "SELECT COUNT(*) as cnt, c2 FROM aggregate_simple GROUP BY c2 ORDER BY cnt DESC"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["5", "0.000000000005"], - vec!["4", "0.000000000004"], - vec!["3", "0.000000000003"], - vec!["2", "0.000000000002"], - vec!["1", "0.000000000001"], + "+-----+----------------+", + "| cnt | c2 |", + "+-----+----------------+", + "| 5 | 0.000000000005 
|", + "| 4 | 0.000000000004 |", + "| 3 | 0.000000000003 |", + "| 2 | 0.000000000002 |", + "| 1 | 0.000000000001 |", + "+-----+----------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -500,10 +597,17 @@ async fn csv_query_group_by_boolean() -> Result<()> { let sql = "SELECT COUNT(*) as cnt, c3 FROM aggregate_simple GROUP BY c3 ORDER BY cnt DESC"; - let actual = execute(&mut ctx, sql).await; + let actual = execute_to_batches(&mut ctx, sql).await; - let expected = vec![vec!["9", "true"], vec!["6", "false"]]; - assert_eq!(expected, actual); + let expected = vec![ + "+-----+-------+", + "| cnt | c3 |", + "+-----+-------+", + "| 9 | true |", + "| 6 | false |", + "+-----+-------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -513,36 +617,39 @@ async fn csv_query_group_by_two_columns() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, c2, MIN(c3) FROM aggregate_test_100 GROUP BY c1, c2"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "1", "-85"], - vec!["a", "2", "-48"], - vec!["a", "3", "-72"], - vec!["a", "4", "-101"], - vec!["a", "5", "-101"], - vec!["b", "1", "12"], - vec!["b", "2", "-60"], - vec!["b", "3", "-101"], - vec!["b", "4", "-117"], - vec!["b", "5", "-82"], - vec!["c", "1", "-24"], - vec!["c", "2", "-117"], - vec!["c", "3", "-2"], - vec!["c", "4", "-90"], - vec!["c", "5", "-94"], - vec!["d", "1", "-99"], - vec!["d", "2", "93"], - vec!["d", "3", "-76"], - vec!["d", "4", "5"], - vec!["d", "5", "-59"], - vec!["e", "1", "36"], - vec!["e", "2", "-61"], - vec!["e", "3", "-95"], - vec!["e", "4", "-56"], - vec!["e", "5", "-86"], + "+----+----+---------+", + "| c1 | c2 | MIN(c3) |", + "+----+----+---------+", + "| a | 1 | -85 |", + "| a | 2 | -48 |", + "| a | 3 | -72 |", + "| a | 4 | -101 |", + "| a | 5 | -101 |", + "| b | 1 | 12 |", + "| b | 2 | -60 |", + "| b | 3 | -101 |", + "| b | 4 | -117 |", + "| b | 5 | -82 |", + "| c | 1 | -24 |", + "| c | 2 | -117 |", + "| c | 3 | -2 |", + "| c | 4 | -90 |", + "| c | 5 | -94 |", + "| d | 1 | -99 |", + "| d | 2 | 93 |", + "| d | 3 | -76 |", + "| d | 4 | 5 |", + "| d | 5 | -59 |", + "| e | 1 | 36 |", + "| e | 2 | -61 |", + "| e | 3 | -95 |", + "| e | 4 | -56 |", + "| e | 5 | -86 |", + "+----+----+---------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -551,10 +658,16 @@ async fn csv_query_group_by_and_having() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, MIN(c3) AS m FROM aggregate_test_100 GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); - let expected = vec![vec!["a", "-101"], vec!["c", "-117"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec![ + "+----+------+", + "| c1 | m |", + "+----+------+", + "| a | -101 |", + "| c | -117 |", + "+----+------+", + ]; + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -567,10 +680,15 @@ async fn csv_query_group_by_and_having_and_where() -> Result<()> { WHERE c1 IN ('a', 'b') GROUP BY c1 HAVING m < -100 AND MAX(c3) > 70"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); - let expected = vec![vec!["a", "-101"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = 
vec![ + "+----+------+", + "| c1 | m |", + "+----+------+", + "| a | -101 |", + "+----+------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -581,10 +699,9 @@ async fn all_where_empty() -> Result<()> { let sql = "SELECT * FROM aggregate_test_100 WHERE 1=2"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); - let expected: Vec> = vec![]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let expected = vec!["++", "++"]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -593,16 +710,19 @@ async fn csv_query_having_without_group_by() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, c2, c3 FROM aggregate_test_100 HAVING c2 >= 4 AND c3 > 90"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["c", "4", "123"], - vec!["c", "5", "118"], - vec!["d", "4", "102"], - vec!["e", "4", "96"], - vec!["e", "4", "97"], + "+----+----+-----+", + "| c1 | c2 | c3 |", + "+----+----+-----+", + "| c | 4 | 123 |", + "| c | 5 | 118 |", + "| d | 4 | 102 |", + "| e | 4 | 96 |", + "| e | 4 | 97 |", + "+----+----+-----+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -721,16 +841,19 @@ async fn csv_query_group_by_avg() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, avg(c12) FROM aggregate_test_100 GROUP BY c1"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "0.48754517466109415"], - vec!["b", "0.41040709263815384"], - vec!["c", "0.6600456536439784"], - vec!["d", "0.48855379387549824"], - vec!["e", "0.48600669271341534"], + "+----+---------------------+", + "| c1 | AVG(c12) |", + "+----+---------------------+", + "| a | 0.48754517466109415 |", + "| b | 0.41040709263815384 |", + "| c | 0.6600456536439784 |", + "| d | 0.48855379387549824 |", + "| e | 0.48600669271341534 |", + "+----+---------------------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -739,16 +862,19 @@ async fn csv_query_group_by_avg_with_projection() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT avg(c12), c1 FROM aggregate_test_100 GROUP BY c1"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["0.41040709263815384", "b"], - vec!["0.48600669271341534", "e"], - vec!["0.48754517466109415", "a"], - vec!["0.48855379387549824", "d"], - vec!["0.6600456536439784", "c"], + "+---------------------+----+", + "| AVG(c12) | c1 |", + "+---------------------+----+", + "| 0.41040709263815384 | b |", + "| 0.48600669271341534 | e |", + "| 0.48754517466109415 | a |", + "| 0.48855379387549824 | d |", + "| 0.6600456536439784 | c |", + "+---------------------+----+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -800,9 +926,15 @@ async fn csv_query_count() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT count(c12) FROM aggregate_test_100"; - let actual = execute(&mut ctx, sql).await; - let expected = vec![vec!["100"]]; - assert_eq!(expected, actual); + let actual = execute_to_batches(&mut ctx, sql).await; + let 
expected = vec![ + "+------------+", + "| COUNT(c12) |", + "+------------+", + "| 100 |", + "+------------+", + ]; + assert_batches_eq!(expected, &actual); Ok(()) } @@ -811,65 +943,29 @@ async fn csv_query_window_with_empty_over() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "select \ - c9, \ - count(c5) over (), \ - max(c5) over (), \ - min(c5) over (), \ - first_value(c5) over (), \ - last_value(c5) over (), \ - nth_value(c5, 2) over () \ - from aggregate_test_100 \ - order by c9 \ - limit 5"; - let actual = execute(&mut ctx, sql).await; + c9, \ + count(c5) over (), \ + max(c5) over (), \ + min(c5) over (), \ + first_value(c5) over (), \ + last_value(c5) over (), \ + nth_value(c5, 2) over () \ + from aggregate_test_100 \ + order by c9 \ + limit 5"; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec![ - "28774375", - "100", - "2143473091", - "-2141999138", - "2033001162", - "61035129", - "706441268", - ], - vec![ - "63044568", - "100", - "2143473091", - "-2141999138", - "2033001162", - "61035129", - "706441268", - ], - vec![ - "141047417", - "100", - "2143473091", - "-2141999138", - "2033001162", - "61035129", - "706441268", - ], - vec![ - "141680161", - "100", - "2143473091", - "-2141999138", - "2033001162", - "61035129", - "706441268", - ], - vec![ - "145294611", - "100", - "2143473091", - "-2141999138", - "2033001162", - "61035129", - "706441268", - ], + "+-----------+-----------+------------+-------------+-----------------+----------------+------------------------+", + "| c9 | COUNT(c5) | MAX(c5) | MIN(c5) | FIRST_VALUE(c5) | LAST_VALUE(c5) | NTH_VALUE(c5,Int64(2)) |", + "+-----------+-----------+------------+-------------+-----------------+----------------+------------------------+", + "| 28774375 | 100 | 2143473091 | -2141999138 | 2033001162 | 61035129 | 706441268 |", + "| 63044568 | 100 | 2143473091 | -2141999138 | 2033001162 | 61035129 | 706441268 |", + "| 141047417 | 100 | 2143473091 | -2141999138 | 2033001162 | 61035129 | 706441268 |", + "| 141680161 | 100 | 2143473091 | -2141999138 | 2033001162 | 61035129 | 706441268 |", + "| 145294611 | 100 | 2143473091 | -2141999138 | 2033001162 | 61035129 | 706441268 |", + "+-----------+-----------+------------+-------------+-----------------+----------------+------------------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -878,62 +974,31 @@ async fn csv_query_window_with_partition_by() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "select \ - c9, \ - sum(cast(c4 as Int)) over (partition by c3), \ - avg(cast(c4 as Int)) over (partition by c3), \ - count(cast(c4 as Int)) over (partition by c3), \ - max(cast(c4 as Int)) over (partition by c3), \ - min(cast(c4 as Int)) over (partition by c3), \ - first_value(cast(c4 as Int)) over (partition by c3), \ - last_value(cast(c4 as Int)) over (partition by c3), \ - nth_value(cast(c4 as Int), 2) over (partition by c3) \ - from aggregate_test_100 \ - order by c9 \ - limit 5"; - let actual = execute(&mut ctx, sql).await; + c9, \ + sum(cast(c4 as Int)) over (partition by c3), \ + avg(cast(c4 as Int)) over (partition by c3), \ + count(cast(c4 as Int)) over (partition by c3), \ + max(cast(c4 as Int)) over (partition by c3), \ + min(cast(c4 as Int)) over (partition by c3), \ + first_value(cast(c4 as Int)) over (partition by c3), \ + last_value(cast(c4 as Int)) over (partition by c3), \ + nth_value(cast(c4 as Int), 2) 
over (partition by c3) \ + from aggregate_test_100 \ + order by c9 \ + limit 5"; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec![ - "28774375", "-16110", "-16110", "1", "-16110", "-16110", "-16110", "-16110", - "NULL", - ], - vec![ - "63044568", "3917", "3917", "1", "3917", "3917", "3917", "3917", "NULL", - ], - vec![ - "141047417", - "-38455", - "-19227.5", - "2", - "-16974", - "-21481", - "-16974", - "-21481", - "NULL", - ], - vec![ - "141680161", - "-1114", - "-1114", - "1", - "-1114", - "-1114", - "-1114", - "-1114", - "NULL", - ], - vec![ - "145294611", - "15673", - "15673", - "1", - "15673", - "15673", - "15673", - "15673", - "NULL", - ], + "+-----------+------------------------+------------------------+--------------------------+------------------------+------------------------+--------------------------------+-------------------------------+---------------------------------------+", + "| c9 | SUM(CAST(c4 AS Int32)) | AVG(CAST(c4 AS Int32)) | COUNT(CAST(c4 AS Int32)) | MAX(CAST(c4 AS Int32)) | MIN(CAST(c4 AS Int32)) | FIRST_VALUE(CAST(c4 AS Int32)) | LAST_VALUE(CAST(c4 AS Int32)) | NTH_VALUE(CAST(c4 AS Int32),Int64(2)) |", + "+-----------+------------------------+------------------------+--------------------------+------------------------+------------------------+--------------------------------+-------------------------------+---------------------------------------+", + "| 28774375 | -16110 | -16110 | 1 | -16110 | -16110 | -16110 | -16110 | |", + "| 63044568 | 3917 | 3917 | 1 | 3917 | 3917 | 3917 | 3917 | |", + "| 141047417 | -38455 | -19227.5 | 2 | -16974 | -21481 | -16974 | -21481 | |", + "| 141680161 | -1114 | -1114 | 1 | -1114 | -1114 | -1114 | -1114 | |", + "| 145294611 | 15673 | 15673 | 1 | 15673 | 15673 | 15673 | 15673 | |", + "+-----------+------------------------+------------------------+--------------------------+------------------------+------------------------+--------------------------------+-------------------------------+---------------------------------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -942,70 +1007,31 @@ async fn csv_query_window_with_order_by() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "select \ - c9, \ - sum(c5) over (order by c9), \ - avg(c5) over (order by c9), \ - count(c5) over (order by c9), \ - max(c5) over (order by c9), \ - min(c5) over (order by c9), \ - first_value(c5) over (order by c9), \ - last_value(c5) over (order by c9), \ - nth_value(c5, 2) over (order by c9) \ - from aggregate_test_100 \ - order by c9 \ - limit 5"; - let actual = execute(&mut ctx, sql).await; + c9, \ + sum(c5) over (order by c9), \ + avg(c5) over (order by c9), \ + count(c5) over (order by c9), \ + max(c5) over (order by c9), \ + min(c5) over (order by c9), \ + first_value(c5) over (order by c9), \ + last_value(c5) over (order by c9), \ + nth_value(c5, 2) over (order by c9) \ + from aggregate_test_100 \ + order by c9 \ + limit 5"; + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec![ - "28774375", "61035129", "61035129", "1", "61035129", "61035129", "61035129", - "61035129", "NULL", - ], - vec![ - "63044568", - "-47938237", - "-23969118.5", - "2", - "61035129", - "-108973366", - "61035129", - "-108973366", - "-108973366", - ], - vec![ - "141047417", - "575165281", - "191721760.33333334", - "3", - "623103518", - "-108973366", - "61035129", - "623103518", - "-108973366", - ], - vec![ - 
"141680161", - "-1352462829", - "-338115707.25", - "4", - "623103518", - "-1927628110", - "61035129", - "-1927628110", - "-108973366", - ], - vec![ - "145294611", - "-3251637940", - "-650327588", - "5", - "623103518", - "-1927628110", - "61035129", - "-1899175111", - "-108973366", - ], + "+-----------+-------------+--------------------+-----------+-----------+-------------+-----------------+----------------+------------------------+", + "| c9 | SUM(c5) | AVG(c5) | COUNT(c5) | MAX(c5) | MIN(c5) | FIRST_VALUE(c5) | LAST_VALUE(c5) | NTH_VALUE(c5,Int64(2)) |", + "+-----------+-------------+--------------------+-----------+-----------+-------------+-----------------+----------------+------------------------+", + "| 28774375 | 61035129 | 61035129 | 1 | 61035129 | 61035129 | 61035129 | 61035129 | |", + "| 63044568 | -47938237 | -23969118.5 | 2 | 61035129 | -108973366 | 61035129 | -108973366 | -108973366 |", + "| 141047417 | 575165281 | 191721760.33333334 | 3 | 623103518 | -108973366 | 61035129 | 623103518 | -108973366 |", + "| 141680161 | -1352462829 | -338115707.25 | 4 | 623103518 | -1927628110 | 61035129 | -1927628110 | -108973366 |", + "| 145294611 | -3251637940 | -650327588 | 5 | 623103518 | -1927628110 | 61035129 | -1899175111 | -108973366 |", + "+-----------+-------------+--------------------+-----------+-----------+-------------+-----------------+----------------+------------------------+", ]; - assert_eq!(expected, actual); + assert_batches_eq!(expected, &actual); Ok(()) } @@ -1014,16 +1040,19 @@ async fn csv_query_group_by_int_count() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, count(c12) FROM aggregate_test_100 GROUP BY c1"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "21"], - vec!["b", "19"], - vec!["c", "21"], - vec!["d", "18"], - vec!["e", "21"], + "+----+------------+", + "| c1 | COUNT(c12) |", + "+----+------------+", + "| a | 21 |", + "| b | 19 |", + "| c | 21 |", + "| d | 18 |", + "| e | 21 |", + "+----+------------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -1032,16 +1061,19 @@ async fn csv_query_group_with_aliased_aggregate() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, count(c12) AS count FROM aggregate_test_100 GROUP BY c1"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "21"], - vec!["b", "19"], - vec!["c", "21"], - vec!["d", "18"], - vec!["e", "21"], + "+----+-------+", + "| c1 | count |", + "+----+-------+", + "| a | 21 |", + "| b | 19 |", + "| c | 21 |", + "| d | 18 |", + "| e | 21 |", + "+----+-------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } @@ -1050,19 +1082,24 @@ async fn csv_query_group_by_string_min_max() -> Result<()> { let mut ctx = ExecutionContext::new(); register_aggregate_csv(&mut ctx)?; let sql = "SELECT c1, MIN(c12), MAX(c12) FROM aggregate_test_100 GROUP BY c1"; - let mut actual = execute(&mut ctx, sql).await; - actual.sort(); + let actual = execute_to_batches(&mut ctx, sql).await; let expected = vec![ - vec!["a", "0.02182578039211991", "0.9800193410444061"], - vec!["b", "0.04893135681998029", "0.9185813970744787"], - vec!["c", "0.0494924465469434", "0.991517828651004"], - vec!["d", 
"0.061029375346466685", "0.9748360509016578"], - vec!["e", "0.01479305307777301", "0.9965400387585364"], + "+----+----------------------+--------------------+", + "| c1 | MIN(c12) | MAX(c12) |", + "+----+----------------------+--------------------+", + "| a | 0.02182578039211991 | 0.9800193410444061 |", + "| b | 0.04893135681998029 | 0.9185813970744787 |", + "| c | 0.0494924465469434 | 0.991517828651004 |", + "| d | 0.061029375346466685 | 0.9748360509016578 |", + "| e | 0.01479305307777301 | 0.9965400387585364 |", + "+----+----------------------+--------------------+", ]; - assert_eq!(expected, actual); + assert_batches_sorted_eq!(expected, &actual); Ok(()) } +// --- End Test Porting --- + #[tokio::test] async fn csv_query_cast() -> Result<()> { let mut ctx = ExecutionContext::new(); @@ -3850,11 +3887,11 @@ async fn test_physical_plan_display_indent() { let mut ctx = ExecutionContext::with_config(config); register_aggregate_csv(&mut ctx).unwrap(); let sql = "SELECT c1, MAX(c12), MIN(c12) as the_min \ - FROM aggregate_test_100 \ - WHERE c12 < 10 \ - GROUP BY c1 \ - ORDER BY the_min DESC \ - LIMIT 10"; + FROM aggregate_test_100 \ + WHERE c12 < 10 \ + GROUP BY c1 \ + ORDER BY the_min DESC \ + LIMIT 10"; let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); @@ -3872,7 +3909,7 @@ async fn test_physical_plan_display_indent() { " FilterExec: c12@1 < CAST(10 AS Float64)", " RepartitionExec: partitioning=RoundRobinBatch(3)", " CsvExec: source=Path(ARROW_TEST_DATA/csv/aggregate_test_100.csv: [ARROW_TEST_DATA/csv/aggregate_test_100.csv]), has_header=true", - ]; + ]; let data_path = datafusion::test_util::arrow_test_data(); let actual = format!("{}", displayable(physical_plan.as_ref()).indent()) @@ -3898,10 +3935,10 @@ async fn test_physical_plan_display_indent_multi_children() { register_aggregate_csv(&mut ctx).unwrap(); let sql = "SELECT c1 \ FROM (select c1 from aggregate_test_100)\ - JOIN\ - (select c1 as c2 from aggregate_test_100)\ - ON c1=c2\ - "; + JOIN\ + (select c1 as c2 from aggregate_test_100)\ + ON c1=c2\ + "; let plan = ctx.create_logical_plan(sql).unwrap(); let plan = ctx.optimize(&plan).unwrap(); From a11a5d4dbd594c3e71f286a4b6b5faf8e8c1f9de Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 22 Jul 2021 04:27:37 +0800 Subject: [PATCH 288/329] impl from str for column and scalar (#762) --- datafusion/src/logical_plan/expr.rs | 10 ++++++++++ datafusion/src/scalar.rs | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 2eee140f47fe5..c3126980f4c35 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -30,7 +30,9 @@ use aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}; use arrow::{compute::can_cast_types, datatypes::DataType}; use functions::{ReturnTypeFunction, ScalarFunctionImplementation, Signature}; use std::collections::{HashMap, HashSet}; +use std::convert::Infallible; use std::fmt; +use std::str::FromStr; use std::sync::Arc; /// A named reference to a qualified field in a schema. 
@@ -153,6 +155,14 @@ impl From<&str> for Column { } } +impl FromStr for Column { + type Err = Infallible; + + fn from_str(s: &str) -> std::result::Result { + Ok(s.into()) + } +} + impl fmt::Display for Column { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match &self.relation { diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index e7354f8e62ec1..ab08364242428 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -27,6 +27,8 @@ use arrow::{ TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }, }; +use std::convert::Infallible; +use std::str::FromStr; use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; /// Represents a dynamically typed, nullable single value. @@ -854,6 +856,14 @@ impl From<&str> for ScalarValue { } } +impl FromStr for ScalarValue { + type Err = Infallible; + + fn from_str(s: &str) -> std::result::Result { + Ok(s.into()) + } +} + macro_rules! impl_try_from { ($SCALAR:ident, $NATIVE:ident) => { impl TryFrom for $NATIVE { From 4d61196dee8526998aee7e7bb10ea88422e5f9e1 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 22 Jul 2021 14:11:58 +0800 Subject: [PATCH 289/329] fix 226 (#761) - fix concat - fix concat_ws - fix random - add unit tests --- datafusion/src/logical_plan/expr.rs | 126 ++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 17 deletions(-) diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index c3126980f4c35..8b0e647261da8 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1353,8 +1353,8 @@ impl Literal for ScalarValue { } macro_rules! make_literal { - ($TYPE:ty, $SCALAR:ident) => { - #[allow(missing_docs)] + ($TYPE:ty, $SCALAR:ident, $DOC: expr) => { + #[doc = $DOC] impl Literal for $TYPE { fn lit(&self) -> Expr { Expr::Literal(ScalarValue::$SCALAR(Some(self.clone()))) @@ -1363,27 +1363,55 @@ macro_rules! make_literal { }; } -make_literal!(bool, Boolean); -make_literal!(f32, Float32); -make_literal!(f64, Float64); -make_literal!(i8, Int8); -make_literal!(i16, Int16); -make_literal!(i32, Int32); -make_literal!(i64, Int64); -make_literal!(u8, UInt8); -make_literal!(u16, UInt16); -make_literal!(u32, UInt32); -make_literal!(u64, UInt64); +make_literal!(bool, Boolean, "literal expression containing a bool"); +make_literal!(f32, Float32, "literal expression containing an f32"); +make_literal!(f64, Float64, "literal expression containing an f64"); +make_literal!(i8, Int8, "literal expression containing an i8"); +make_literal!(i16, Int16, "literal expression containing an i16"); +make_literal!(i32, Int32, "literal expression containing an i32"); +make_literal!(i64, Int64, "literal expression containing an i64"); +make_literal!(u8, UInt8, "literal expression containing a u8"); +make_literal!(u16, UInt16, "literal expression containing a u16"); +make_literal!(u32, UInt32, "literal expression containing a u32"); +make_literal!(u64, UInt64, "literal expression containing a u64"); /// Create a literal expression pub fn lit(n: T) -> Expr { n.lit() } +/// Concatenates the text representations of all the arguments. NULL arguments are ignored. +pub fn concat(args: &[Expr]) -> Expr { + Expr::ScalarFunction { + fun: functions::BuiltinScalarFunction::Concat, + args: args.to_vec(), + } +} + +/// Concatenates all but the first argument, with separators. +/// The first argument is used as the separator string, and should not be NULL. +/// Other NULL arguments are ignored. 
+pub fn concat_ws(sep: impl Into, values: &[Expr]) -> Expr { + let mut args = vec![lit(sep.into())]; + args.extend_from_slice(values); + Expr::ScalarFunction { + fun: functions::BuiltinScalarFunction::ConcatWithSeparator, + args, + } +} + +/// Returns a random value in the range 0.0 <= x < 1.0 +pub fn random() -> Expr { + Expr::ScalarFunction { + fun: functions::BuiltinScalarFunction::Random, + args: vec![], + } +} + /// Create an convenience function representing a unary scalar function macro_rules! unary_scalar_expr { ($ENUM:ident, $FUNC:ident) => { - #[allow(missing_docs)] + #[doc = "this scalar function is not documented yet"] pub fn $FUNC(e: Expr) -> Expr { Expr::ScalarFunction { fun: functions::BuiltinScalarFunction::$ENUM, @@ -1407,7 +1435,6 @@ unary_scalar_expr!(Floor, floor); unary_scalar_expr!(Ceil, ceil); unary_scalar_expr!(Now, now); unary_scalar_expr!(Round, round); -unary_scalar_expr!(Random, random); unary_scalar_expr!(Trunc, trunc); unary_scalar_expr!(Abs, abs); unary_scalar_expr!(Signum, signum); @@ -1423,8 +1450,6 @@ unary_scalar_expr!(Btrim, btrim); unary_scalar_expr!(CharacterLength, character_length); unary_scalar_expr!(CharacterLength, length); unary_scalar_expr!(Chr, chr); -unary_scalar_expr!(Concat, concat); -unary_scalar_expr!(ConcatWithSeparator, concat_ws); unary_scalar_expr!(InitCap, initcap); unary_scalar_expr!(Left, left); unary_scalar_expr!(Lower, lower); @@ -1951,4 +1976,71 @@ mod tests { fn make_field(relation: &str, column: &str) -> DFField { DFField::new(Some(relation), column, DataType::Int8, false) } + + macro_rules! test_unary_scalar_expr { + ($ENUM:ident, $FUNC:ident) => {{ + if let Expr::ScalarFunction { fun, args } = $FUNC(col("tableA.a")) { + let name = functions::BuiltinScalarFunction::$ENUM; + assert_eq!(name, fun); + assert_eq!(1, args.len()); + } else { + assert!(false, "unexpected"); + } + }}; + } + + #[test] + fn scalar_function_definitions() { + test_unary_scalar_expr!(Sqrt, sqrt); + test_unary_scalar_expr!(Sin, sin); + test_unary_scalar_expr!(Cos, cos); + test_unary_scalar_expr!(Tan, tan); + test_unary_scalar_expr!(Asin, asin); + test_unary_scalar_expr!(Acos, acos); + test_unary_scalar_expr!(Atan, atan); + test_unary_scalar_expr!(Floor, floor); + test_unary_scalar_expr!(Ceil, ceil); + test_unary_scalar_expr!(Now, now); + test_unary_scalar_expr!(Round, round); + test_unary_scalar_expr!(Trunc, trunc); + test_unary_scalar_expr!(Abs, abs); + test_unary_scalar_expr!(Signum, signum); + test_unary_scalar_expr!(Exp, exp); + test_unary_scalar_expr!(Log2, log2); + test_unary_scalar_expr!(Log10, log10); + test_unary_scalar_expr!(Ln, ln); + test_unary_scalar_expr!(Ascii, ascii); + test_unary_scalar_expr!(BitLength, bit_length); + test_unary_scalar_expr!(Btrim, btrim); + test_unary_scalar_expr!(CharacterLength, character_length); + test_unary_scalar_expr!(CharacterLength, length); + test_unary_scalar_expr!(Chr, chr); + test_unary_scalar_expr!(InitCap, initcap); + test_unary_scalar_expr!(Left, left); + test_unary_scalar_expr!(Lower, lower); + test_unary_scalar_expr!(Lpad, lpad); + test_unary_scalar_expr!(Ltrim, ltrim); + test_unary_scalar_expr!(MD5, md5); + test_unary_scalar_expr!(OctetLength, octet_length); + test_unary_scalar_expr!(RegexpMatch, regexp_match); + test_unary_scalar_expr!(RegexpReplace, regexp_replace); + test_unary_scalar_expr!(Replace, replace); + test_unary_scalar_expr!(Repeat, repeat); + test_unary_scalar_expr!(Reverse, reverse); + test_unary_scalar_expr!(Right, right); + test_unary_scalar_expr!(Rpad, rpad); + 
test_unary_scalar_expr!(Rtrim, rtrim); + test_unary_scalar_expr!(SHA224, sha224); + test_unary_scalar_expr!(SHA256, sha256); + test_unary_scalar_expr!(SHA384, sha384); + test_unary_scalar_expr!(SHA512, sha512); + test_unary_scalar_expr!(SplitPart, split_part); + test_unary_scalar_expr!(StartsWith, starts_with); + test_unary_scalar_expr!(Strpos, strpos); + test_unary_scalar_expr!(Substr, substr); + test_unary_scalar_expr!(ToHex, to_hex); + test_unary_scalar_expr!(Translate, translate); + test_unary_scalar_expr!(Trim, trim); + test_unary_scalar_expr!(Upper, upper); + } } From 31d16d265006e11d286d2874e8c5182091741be6 Mon Sep 17 00:00:00 2001 From: Cui Wenzheng Date: Sat, 24 Jul 2021 04:47:23 +0800 Subject: [PATCH 290/329] #723 limit pruning rule to simple expression (#764) * limit pruning rule to simple expression * add Not(bool col) support --- datafusion/src/physical_optimizer/pruning.rs | 347 +++++++++++++++---- datafusion/tests/parquet_pruning.rs | 269 +++++++++++++- 2 files changed, 542 insertions(+), 74 deletions(-) diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs index 36253815414ab..c6f7647b70cfa 100644 --- a/datafusion/src/physical_optimizer/pruning.rs +++ b/datafusion/src/physical_optimizer/pruning.rs @@ -355,17 +355,18 @@ fn build_statistics_record_batch( struct PruningExpressionBuilder<'a> { column: Column, - column_expr: &'a Expr, - scalar_expr: &'a Expr, + column_expr: Expr, + op: Operator, + scalar_expr: Expr, field: &'a Field, required_columns: &'a mut RequiredStatColumns, - reverse_operator: bool, } impl<'a> PruningExpressionBuilder<'a> { fn try_new( left: &'a Expr, right: &'a Expr, + op: Operator, schema: &'a Schema, required_columns: &'a mut RequiredStatColumns, ) -> Result { @@ -374,10 +375,10 @@ impl<'a> PruningExpressionBuilder<'a> { utils::expr_to_columns(left, &mut left_columns)?; let mut right_columns = HashSet::::new(); utils::expr_to_columns(right, &mut right_columns)?; - let (column_expr, scalar_expr, columns, reverse_operator) = + let (column_expr, scalar_expr, columns, correct_operator) = match (left_columns.len(), right_columns.len()) { - (1, 0) => (left, right, left_columns, false), - (0, 1) => (right, left, right_columns, true), + (1, 0) => (left, right, left_columns, op), + (0, 1) => (right, left, right_columns, reverse_operator(op)), _ => { // if more than one column used in expression - not supported return Err(DataFusionError::Plan( @@ -386,6 +387,12 @@ impl<'a> PruningExpressionBuilder<'a> { )); } }; + + let (column_expr, correct_operator, scalar_expr) = + match rewrite_expr_to_prunable(column_expr, correct_operator, scalar_expr) { + Ok(ret) => ret, + Err(e) => return Err(e), + }; let column = columns.iter().next().unwrap().clone(); let field = match schema.column_with_name(&column.flat_name()) { Some((_, f)) => f, @@ -399,40 +406,111 @@ impl<'a> PruningExpressionBuilder<'a> { Ok(Self { column, column_expr, + op: correct_operator, scalar_expr, field, required_columns, - reverse_operator, }) } - fn correct_operator(&self, op: Operator) -> Operator { - if !self.reverse_operator { - return op; - } - - match op { - Operator::Lt => Operator::Gt, - Operator::Gt => Operator::Lt, - Operator::LtEq => Operator::GtEq, - Operator::GtEq => Operator::LtEq, - _ => op, - } + fn op(&self) -> Operator { + self.op } fn scalar_expr(&self) -> &Expr { - self.scalar_expr + &self.scalar_expr } fn min_column_expr(&mut self) -> Result { self.required_columns - .min_column_expr(&self.column, self.column_expr, self.field) + 
.min_column_expr(&self.column, &self.column_expr, self.field) } fn max_column_expr(&mut self) -> Result { self.required_columns - .max_column_expr(&self.column, self.column_expr, self.field) + .max_column_expr(&self.column, &self.column_expr, self.field) + } +} + +/// This function is designed to rewrite the column_expr to +/// ensure the column_expr is monotonically increasing. +/// +/// For example, +/// 1. `col > 10` +/// 2. `-col > 10` should be rewritten to `col < -10` +/// 3. `!col = true` would be rewritten to `col = !true` +/// 4. `abs(a - 10) > 0` not supported +/// +/// More rewrite rules are still in progress. +fn rewrite_expr_to_prunable( + column_expr: &Expr, + op: Operator, + scalar_expr: &Expr, +) -> Result<(Expr, Operator, Expr)> { + if !is_compare_op(op) { + return Err(DataFusionError::Plan( + "rewrite_expr_to_prunable only support compare expression".to_string(), + )); + } + + match column_expr { + // `col > lit()` + Expr::Column(_) => Ok((column_expr.clone(), op, scalar_expr.clone())), + + // `-col > lit()` --> `col < -lit()` + Expr::Negative(c) => match c.as_ref() { + Expr::Column(_) => Ok(( + c.as_ref().clone(), + reverse_operator(op), + Expr::Negative(Box::new(scalar_expr.clone())), + )), + _ => Err(DataFusionError::Plan(format!( + "negative with complex expression {:?} is not supported", + column_expr + ))), + }, + + // `!col = true` --> `col = !true` + Expr::Not(c) => { + if op != Operator::Eq && op != Operator::NotEq { + return Err(DataFusionError::Plan( + "Not with operator other than Eq / NotEq is not supported" + .to_string(), + )); + } + return match c.as_ref() { + Expr::Column(_) => Ok(( + c.as_ref().clone(), + reverse_operator(op), + Expr::Not(Box::new(scalar_expr.clone())), + )), + _ => Err(DataFusionError::Plan(format!( + "Not with complex expression {:?} is not supported", + column_expr + ))), + }; + } + + _ => { + return Err(DataFusionError::Plan(format!( + "column expression {:?} is not supported", + column_expr + ))) + } } + // Ok((column_expr.clone(), op, scalar_expr.clone())) +} + +fn is_compare_op(op: Operator) -> bool { + matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) } /// replaces a column with an old name with a new name in an expression @@ -455,6 +533,16 @@ fn rewrite_column_expr( utils::rewrite_expression(expr, &expressions) } +fn reverse_operator(op: Operator) -> Operator { + match op { + Operator::Lt => Operator::Gt, + Operator::Gt => Operator::Lt, + Operator::LtEq => Operator::GtEq, + Operator::GtEq => Operator::LtEq, + _ => op, + } +} + /// Given a column reference to `column`, returns a pruning /// expression in terms of the min and max that will evaluate to true /// if the column may contain values, and false if definitely does not @@ -541,7 +629,7 @@ fn build_predicate_expression( } let expr_builder = - PruningExpressionBuilder::try_new(left, right, schema, required_columns); + PruningExpressionBuilder::try_new(left, right, op, schema, required_columns); let mut expr_builder = match expr_builder { Ok(builder) => builder, // allow partial failure in predicate expression generation @@ -550,54 +638,63 @@ fn build_predicate_expression( return Ok(unhandled); } }; - let corrected_op = expr_builder.correct_operator(op); - let statistics_expr = match corrected_op { - Operator::NotEq => { - // column != literal => (min, max) = literal => - // !(min != literal && max != literal) ==> - // min != literal || literal != max - let min_column_expr = 
expr_builder.min_column_expr()?; - let max_column_expr = expr_builder.max_column_expr()?; - min_column_expr - .not_eq(expr_builder.scalar_expr().clone()) - .or(expr_builder.scalar_expr().clone().not_eq(max_column_expr)) - } - Operator::Eq => { - // column = literal => (min, max) = literal => min <= literal && literal <= max - // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) - let min_column_expr = expr_builder.min_column_expr()?; - let max_column_expr = expr_builder.max_column_expr()?; - min_column_expr - .lt_eq(expr_builder.scalar_expr().clone()) - .and(expr_builder.scalar_expr().clone().lt_eq(max_column_expr)) - } - Operator::Gt => { - // column > literal => (min, max) > literal => max > literal - expr_builder - .max_column_expr()? - .gt(expr_builder.scalar_expr().clone()) - } - Operator::GtEq => { - // column >= literal => (min, max) >= literal => max >= literal - expr_builder - .max_column_expr()? - .gt_eq(expr_builder.scalar_expr().clone()) - } - Operator::Lt => { - // column < literal => (min, max) < literal => min < literal - expr_builder - .min_column_expr()? - .lt(expr_builder.scalar_expr().clone()) - } - Operator::LtEq => { - // column <= literal => (min, max) <= literal => min <= literal - expr_builder - .min_column_expr()? - .lt_eq(expr_builder.scalar_expr().clone()) - } - // other expressions are not supported - _ => unhandled, - }; + + let statistics_expr = build_statistics_expr(&mut expr_builder).unwrap_or(unhandled); + Ok(statistics_expr) +} + +fn build_statistics_expr(expr_builder: &mut PruningExpressionBuilder) -> Result { + let statistics_expr = + match expr_builder.op() { + Operator::NotEq => { + // column != literal => (min, max) = literal => + // !(min != literal && max != literal) ==> + // min != literal || literal != max + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .not_eq(expr_builder.scalar_expr().clone()) + .or(expr_builder.scalar_expr().clone().not_eq(max_column_expr)) + } + Operator::Eq => { + // column = literal => (min, max) = literal => min <= literal && literal <= max + // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + min_column_expr + .lt_eq(expr_builder.scalar_expr().clone()) + .and(expr_builder.scalar_expr().clone().lt_eq(max_column_expr)) + } + Operator::Gt => { + // column > literal => (min, max) > literal => max > literal + expr_builder + .max_column_expr()? + .gt(expr_builder.scalar_expr().clone()) + } + Operator::GtEq => { + // column >= literal => (min, max) >= literal => max >= literal + expr_builder + .max_column_expr()? + .gt_eq(expr_builder.scalar_expr().clone()) + } + Operator::Lt => { + // column < literal => (min, max) < literal => min < literal + expr_builder + .min_column_expr()? + .lt(expr_builder.scalar_expr().clone()) + } + Operator::LtEq => { + // column <= literal => (min, max) <= literal => min <= literal + expr_builder + .min_column_expr()? 
+ .lt_eq(expr_builder.scalar_expr().clone()) + } + // other expressions are not supported + _ => return Err(DataFusionError::Plan( + "expressions other than (neq, eq, gt, gteq, lt, lteq) are not superted" + .to_string(), + )), + }; Ok(statistics_expr) } @@ -1308,4 +1405,112 @@ mod tests { result ) } + + /// Creates setup for int32 chunk pruning + fn int32_setup() -> (SchemaRef, TestStatistics) { + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(-5), Some(1), Some(-11), None, Some(1)], // min + vec![Some(5), Some(11), Some(-1), None, None], // max + ), + ); + (schema, statistics) + } + + #[test] + fn prune_int32_col_gt_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i > 0" and "-i < 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> all rows must pass (must keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> unknown (must keep) + let expected_ret = vec![true, true, false, true, true]; + + // i > 0 + let expr = col("i").gt(lit(0)); + let p = PruningPredicate::try_new(&expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // -i < 0 + let expr = Expr::Negative(Box::new(col("i"))).lt(lit(0)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + + #[test] + fn prune_int32_col_lte_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i <= 0" and "-i >= 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> all rows must pass (must keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = vec![true, false, true, true, false]; + + // i <= 0 + let expr = col("i").lt_eq(lit(0)); + let p = PruningPredicate::try_new(&expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // -i >= 0 + let expr = Expr::Negative(Box::new(col("i"))).gt_eq(lit(0)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + + #[test] + fn prune_int32_col_eq_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i = 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = vec![true, false, false, true, false]; + + // i = 0 + let expr = col("i").eq(lit(0)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } + + #[test] + fn prune_int32_col_lt_neg_one() { + let (schema, statistics) = int32_setup(); + + // Expression "i > -1" and "-i < 1" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> all rows must pass (must keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> all rows must pass (must keep) + let expected_ret = vec![true, true, false, true, true]; + + // i > -1 + let expr = col("i").gt(lit(-1)); + let p = 
PruningPredicate::try_new(&expr, schema.clone()).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + + // -i < 1 + let expr = Expr::Negative(Box::new(col("i"))).lt(lit(1)); + let p = PruningPredicate::try_new(&expr, schema).unwrap(); + let result = p.prune(&statistics).unwrap(); + assert_eq!(result, expected_ret); + } } diff --git a/datafusion/tests/parquet_pruning.rs b/datafusion/tests/parquet_pruning.rs index 0838211f14f09..789f0810c983a 100644 --- a/datafusion/tests/parquet_pruning.rs +++ b/datafusion/tests/parquet_pruning.rs @@ -21,10 +21,11 @@ use std::sync::Arc; use arrow::{ array::{ - Array, Date32Array, Date64Array, StringArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, + Array, ArrayRef, Date32Array, Date64Array, Float64Array, Int32Array, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, }, - datatypes::{Field, Schema}, + datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, util::pretty::pretty_format_batches, }; @@ -177,6 +178,229 @@ async fn prune_disabled() { ); } +#[tokio::test] +async fn prune_int32_lt() { + let (expected_errors, expected_row_group_pruned, expected_results) = + (Some(0), Some(1), 11); + + // resulrt of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where i < 1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), expected_errors); + assert_eq!(output.row_groups_pruned(), expected_row_group_pruned); + assert_eq!( + output.result_rows, + expected_results, + "{}", + output.description() + ); + + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where -i > -1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), expected_errors); + assert_eq!(output.row_groups_pruned(), expected_row_group_pruned); + assert_eq!( + output.result_rows, + expected_results, + "{}", + output.description() + ); +} + +#[tokio::test] +async fn prune_int32_eq() { + // resulrt of sql "SELECT * FROM t where i = 1" + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where i = 1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(3)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + +#[tokio::test] +async fn prune_int32_scalar_fun_and_eq() { + // resulrt of sql "SELECT * FROM t where abs(i) = 1 and i = 1" + // only use "i = 1" to prune + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where abs(i) = 1 and i = 1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(3)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + +#[tokio::test] +async fn prune_int32_scalar_fun() { + // resulrt of sql "SELECT * FROM t where abs(i) = 1" is not supported + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t 
where abs(i) = 1") + .await; + + println!("{}", output.description()); + // This should prune out groups with error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 3, "{}", output.description()); +} + +#[tokio::test] +async fn prune_int32_complex_expr() { + // resulrt of sql "SELECT * FROM t where i+1 = 1" is not supported + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where i+1 = 1") + .await; + + println!("{}", output.description()); + // This should prune out groups with error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 2, "{}", output.description()); +} + +#[tokio::test] +async fn prune_int32_complex_expr_subtract() { + // resulrt of sql "SELECT * FROM t where 1-i > 1" is not supported + let output = ContextWithParquet::new(Scenario::Int32) + .await + .query("SELECT * FROM t where 1-i > 1") + .await; + + println!("{}", output.description()); + // This should prune out groups with error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 9, "{}", output.description()); +} + +#[tokio::test] +async fn prune_f64_lt() { + let (expected_errors, expected_row_group_pruned, expected_results) = + (Some(0), Some(1), 11); + + // resulrt of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where f < 1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), expected_errors); + assert_eq!(output.row_groups_pruned(), expected_row_group_pruned); + assert_eq!( + output.result_rows, + expected_results, + "{}", + output.description() + ); + + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where -f > -1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), expected_errors); + assert_eq!(output.row_groups_pruned(), expected_row_group_pruned); + assert_eq!( + output.result_rows, + expected_results, + "{}", + output.description() + ); +} + +#[tokio::test] +async fn prune_f64_scalar_fun_and_gt() { + // resulrt of sql "SELECT * FROM t where abs(f - 1) <= 0.000001 and f >= 0.1" + // only use "f >= 0" to prune + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where abs(f - 1) <= 0.000001 and f >= 0.1") + .await; + + println!("{}", output.description()); + // This should prune out groups without error + assert_eq!(output.predicate_evaluation_errors(), Some(0)); + assert_eq!(output.row_groups_pruned(), Some(2)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + +#[tokio::test] +async fn prune_f64_scalar_fun() { + // resulrt of sql "SELECT * FROM t where abs(f-1) <= 0.000001" is not supported + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where abs(f-1) <= 0.000001") + .await; + + println!("{}", output.description()); + // This should prune out groups with 
error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 1, "{}", output.description()); +} + +#[tokio::test] +async fn prune_f64_complex_expr() { + // resulrt of sql "SELECT * FROM t where f+1 > 1.1"" is not supported + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where f+1 > 1.1") + .await; + + println!("{}", output.description()); + // This should prune out groups with error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 9, "{}", output.description()); +} + +#[tokio::test] +async fn prune_f64_complex_expr_subtract() { + // resulrt of sql "SELECT * FROM t where 1-f > 1" is not supported + let output = ContextWithParquet::new(Scenario::Float64) + .await + .query("SELECT * FROM t where 1-f > 1") + .await; + + println!("{}", output.description()); + // This should prune out groups with error, because there is not col to + // prune the row groups. + assert_eq!(output.predicate_evaluation_errors(), Some(1)); + assert_eq!(output.row_groups_pruned(), Some(0)); + assert_eq!(output.result_rows, 9, "{}", output.description()); +} + // ---------------------- // Begin test fixture // ---------------------- @@ -185,6 +409,8 @@ async fn prune_disabled() { enum Scenario { Timestamps, Dates, + Int32, + Float64, } /// Test fixture that has an execution context that has an external @@ -370,6 +596,22 @@ async fn make_test_file(scenario: Scenario) -> NamedTempFile { make_date_batch(Duration::days(3600)), ] } + Scenario::Int32 => { + vec![ + make_int32_batch(-5, 0), + make_int32_batch(-4, 1), + make_int32_batch(0, 5), + make_int32_batch(5, 10), + ] + } + Scenario::Float64 => { + vec![ + make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]), + make_f64_batch(vec![-4.0, -3.0, -2.0, -1.0, 0.0]), + make_f64_batch(vec![0.0, 1.0, 2.0, 3.0, 4.0]), + make_f64_batch(vec![5.0, 6.0, 7.0, 8.0, 9.0]), + ] + } }; let schema = batches[0].schema(); @@ -475,6 +717,27 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { .unwrap() } +/// Return record batch with i32 sequence +/// +/// Columns are named +/// "i" -> Int32Array +fn make_int32_batch(start: i32, end: i32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let v: Vec = (start..end).collect(); + let array = Arc::new(Int32Array::from(v)) as ArrayRef; + RecordBatch::try_new(schema, vec![array.clone()]).unwrap() +} + +/// Return record batch with f64 vector +/// +/// Columns are named +/// "f" -> Float64Array +fn make_f64_batch(v: Vec) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("f", DataType::Float64, true)])); + let array = Arc::new(Float64Array::from(v)) as ArrayRef; + RecordBatch::try_new(schema, vec![array.clone()]).unwrap() +} + /// Return record batch with a few rows of data for all of the supported date /// types with the specified offset (in days) /// From 2a4f94e622a9c3db88eeb5c5ec7dd12efb98b546 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Sat, 24 Jul 2021 04:50:12 +0800 Subject: [PATCH 291/329] update python crate (#768) --- python/Cargo.toml | 2 +- python/src/functions.rs | 97 ++++++++++++++++------------- python/tests/test_math_functions.py | 2 + 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/python/Cargo.toml 
b/python/Cargo.toml index ee99359a82f05..fe84e5234c333 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,7 +31,7 @@ libc = "0.2" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" pyo3 = { version = "0.14.1", features = ["extension-module"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "e4df37a4001423909964348289360da66acdd0a3" } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "4d61196dee8526998aee7e7bb10ea88422e5f9e1" } [lib] name = "datafusion" diff --git a/python/src/functions.rs b/python/src/functions.rs index 415490743185a..23f010a6ae45c 100644 --- a/python/src/functions.rs +++ b/python/src/functions.rs @@ -20,7 +20,7 @@ use crate::udf; use crate::{expression, types::PyDataType}; use datafusion::arrow::datatypes::DataType; use datafusion::logical_plan; -use pyo3::{prelude::*, wrap_pyfunction}; +use pyo3::{prelude::*, types::PyTuple, wrap_pyfunction}; use std::sync::Arc; /// Expression representing a column on the existing plan. @@ -76,11 +76,33 @@ fn now() -> expression::Expression { #[pyfunction] fn random() -> expression::Expression { expression::Expression { - // here lit(0) is a stub for conform to arity - expr: logical_plan::random(logical_plan::lit(0)), + expr: logical_plan::random(), } } +/// Concatenates the text representations of all the arguments. +/// NULL arguments are ignored. +#[pyfunction(args = "*")] +fn concat(args: &PyTuple) -> PyResult { + let expressions = expression::from_tuple(args)?; + let args = expressions.into_iter().map(|e| e.expr).collect::>(); + Ok(expression::Expression { + expr: logical_plan::concat(&args), + }) +} + +/// Concatenates all but the first argument, with separators. +/// The first argument is used as the separator string, and should not be NULL. +/// Other NULL arguments are ignored. +#[pyfunction(sep, args = "*")] +fn concat_ws(sep: String, args: &PyTuple) -> PyResult { + let expressions = expression::from_tuple(args)?; + let args = expressions.into_iter().map(|e| e.expr).collect::>(); + Ok(expression::Expression { + expr: logical_plan::concat_ws(sep, &args), + }) +} + macro_rules! define_unary_function { ($NAME: ident) => { #[doc = "This function is not documented yet"] @@ -132,7 +154,6 @@ define_unary_function!( "Returns number of characters in the string." ); define_unary_function!(chr, "Returns the character with the given code."); -define_unary_function!(concat_ws, "Concatenates all but the first argument, with separators. The first argument is used as the separator string, and should not be NULL. Other NULL arguments are ignored."); define_unary_function!(initcap, "Converts the first letter of each word to upper case and the rest to lower case. 
Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); define_unary_function!(left, "Returns first n characters in the string, or when n is negative, returns all but last |n| characters."); define_unary_function!(lower, "Converts the string to all lower case"); @@ -179,15 +200,6 @@ define_unary_function!(min); define_unary_function!(max); define_unary_function!(count); -/* -#[pyfunction] -fn concat(value: Vec) -> expression::Expression { - expression::Expression { - expr: logical_plan::concat(value.into_iter().map(|e| e.expr)), - } -} - */ - pub(crate) fn create_udf( fun: PyObject, input_types: Vec, @@ -250,26 +262,39 @@ fn udaf( } pub fn init(module: &PyModule) -> PyResult<()> { - module.add_function(wrap_pyfunction!(col, module)?)?; - module.add_function(wrap_pyfunction!(lit, module)?)?; - // see https://github.com/apache/arrow-datafusion/issues/226 - //module.add_function(wrap_pyfunction!(concat, module)?)?; - module.add_function(wrap_pyfunction!(udf, module)?)?; + module.add_function(wrap_pyfunction!(abs, module)?)?; + module.add_function(wrap_pyfunction!(acos, module)?)?; module.add_function(wrap_pyfunction!(array, module)?)?; module.add_function(wrap_pyfunction!(ascii, module)?)?; + module.add_function(wrap_pyfunction!(asin, module)?)?; + module.add_function(wrap_pyfunction!(atan, module)?)?; + module.add_function(wrap_pyfunction!(avg, module)?)?; module.add_function(wrap_pyfunction!(bit_length, module)?)?; + module.add_function(wrap_pyfunction!(btrim, module)?)?; + module.add_function(wrap_pyfunction!(ceil, module)?)?; module.add_function(wrap_pyfunction!(character_length, module)?)?; module.add_function(wrap_pyfunction!(chr, module)?)?; - module.add_function(wrap_pyfunction!(btrim, module)?)?; + module.add_function(wrap_pyfunction!(col, module)?)?; module.add_function(wrap_pyfunction!(concat_ws, module)?)?; + module.add_function(wrap_pyfunction!(concat, module)?)?; + module.add_function(wrap_pyfunction!(cos, module)?)?; + module.add_function(wrap_pyfunction!(count, module)?)?; + module.add_function(wrap_pyfunction!(exp, module)?)?; + module.add_function(wrap_pyfunction!(floor, module)?)?; module.add_function(wrap_pyfunction!(in_list, module)?)?; module.add_function(wrap_pyfunction!(initcap, module)?)?; module.add_function(wrap_pyfunction!(left, module)?)?; + module.add_function(wrap_pyfunction!(lit, module)?)?; + module.add_function(wrap_pyfunction!(ln, module)?)?; + module.add_function(wrap_pyfunction!(log10, module)?)?; + module.add_function(wrap_pyfunction!(log2, module)?)?; module.add_function(wrap_pyfunction!(lower, module)?)?; module.add_function(wrap_pyfunction!(lpad, module)?)?; + module.add_function(wrap_pyfunction!(ltrim, module)?)?; + module.add_function(wrap_pyfunction!(max, module)?)?; module.add_function(wrap_pyfunction!(md5, module)?)?; + module.add_function(wrap_pyfunction!(min, module)?)?; module.add_function(wrap_pyfunction!(now, module)?)?; - module.add_function(wrap_pyfunction!(ltrim, module)?)?; module.add_function(wrap_pyfunction!(octet_length, module)?)?; module.add_function(wrap_pyfunction!(random, module)?)?; module.add_function(wrap_pyfunction!(regexp_replace, module)?)?; @@ -277,43 +302,29 @@ pub fn init(module: &PyModule) -> PyResult<()> { module.add_function(wrap_pyfunction!(replace, module)?)?; module.add_function(wrap_pyfunction!(reverse, module)?)?; module.add_function(wrap_pyfunction!(right, module)?)?; + module.add_function(wrap_pyfunction!(round, module)?)?; module.add_function(wrap_pyfunction!(rpad, 
module)?)?; module.add_function(wrap_pyfunction!(rtrim, module)?)?; module.add_function(wrap_pyfunction!(sha224, module)?)?; module.add_function(wrap_pyfunction!(sha256, module)?)?; module.add_function(wrap_pyfunction!(sha384, module)?)?; module.add_function(wrap_pyfunction!(sha512, module)?)?; + module.add_function(wrap_pyfunction!(signum, module)?)?; + module.add_function(wrap_pyfunction!(sin, module)?)?; module.add_function(wrap_pyfunction!(split_part, module)?)?; + module.add_function(wrap_pyfunction!(sqrt, module)?)?; module.add_function(wrap_pyfunction!(starts_with, module)?)?; module.add_function(wrap_pyfunction!(strpos, module)?)?; module.add_function(wrap_pyfunction!(substr, module)?)?; + module.add_function(wrap_pyfunction!(sum, module)?)?; + module.add_function(wrap_pyfunction!(tan, module)?)?; module.add_function(wrap_pyfunction!(to_hex, module)?)?; module.add_function(wrap_pyfunction!(translate, module)?)?; module.add_function(wrap_pyfunction!(trim, module)?)?; - module.add_function(wrap_pyfunction!(upper, module)?)?; - module.add_function(wrap_pyfunction!(sum, module)?)?; - module.add_function(wrap_pyfunction!(count, module)?)?; - module.add_function(wrap_pyfunction!(min, module)?)?; - module.add_function(wrap_pyfunction!(max, module)?)?; - module.add_function(wrap_pyfunction!(avg, module)?)?; - module.add_function(wrap_pyfunction!(udaf, module)?)?; - module.add_function(wrap_pyfunction!(sqrt, module)?)?; - module.add_function(wrap_pyfunction!(sin, module)?)?; - module.add_function(wrap_pyfunction!(cos, module)?)?; - module.add_function(wrap_pyfunction!(tan, module)?)?; - module.add_function(wrap_pyfunction!(asin, module)?)?; - module.add_function(wrap_pyfunction!(acos, module)?)?; - module.add_function(wrap_pyfunction!(atan, module)?)?; - module.add_function(wrap_pyfunction!(floor, module)?)?; - module.add_function(wrap_pyfunction!(ceil, module)?)?; - module.add_function(wrap_pyfunction!(round, module)?)?; module.add_function(wrap_pyfunction!(trunc, module)?)?; - module.add_function(wrap_pyfunction!(abs, module)?)?; - module.add_function(wrap_pyfunction!(signum, module)?)?; - module.add_function(wrap_pyfunction!(exp, module)?)?; - module.add_function(wrap_pyfunction!(ln, module)?)?; - module.add_function(wrap_pyfunction!(log2, module)?)?; - module.add_function(wrap_pyfunction!(log10, module)?)?; + module.add_function(wrap_pyfunction!(udaf, module)?)?; + module.add_function(wrap_pyfunction!(udf, module)?)?; + module.add_function(wrap_pyfunction!(upper, module)?)?; Ok(()) } diff --git a/python/tests/test_math_functions.py b/python/tests/test_math_functions.py index 56d4824aeb9d5..cb03753121fa0 100644 --- a/python/tests/test_math_functions.py +++ b/python/tests/test_math_functions.py @@ -44,6 +44,7 @@ def test_math_functions(df): f.ln(col_v + f.lit(1)), f.log2(col_v + f.lit(1)), f.log10(col_v + f.lit(1)), + f.random(), ) result = df.collect() assert len(result) == 1 @@ -58,3 +59,4 @@ def test_math_functions(df): np.testing.assert_array_almost_equal(result.column(7), np.log(values + 1.0)) np.testing.assert_array_almost_equal(result.column(8), np.log2(values + 1.0)) np.testing.assert_array_almost_equal(result.column(9), np.log10(values + 1.0)) + np.testing.assert_array_less(result.column(10), np.ones_like(values)) From 5151135fa298f42e8b77407ccd4f0e412adc36fe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 24 Jul 2021 06:57:34 -0400 Subject: [PATCH 292/329] Use consistent version of string_to_timestamp_nanos in DataFusion (#767) * Use consistent version of 
string_to_timestamp_nanos in DataFusion * fixup comments * Use upstream string_to_timestamp_nanos and remove copy in DataFusion --- datafusion/src/optimizer/constant_folding.rs | 2 +- .../src/physical_plan/datetime_expressions.rs | 160 ++---------------- 2 files changed, 14 insertions(+), 148 deletions(-) diff --git a/datafusion/src/optimizer/constant_folding.rs b/datafusion/src/optimizer/constant_folding.rs index 79833df66129d..b4c4a96de4b5b 100644 --- a/datafusion/src/optimizer/constant_folding.rs +++ b/datafusion/src/optimizer/constant_folding.rs @@ -20,6 +20,7 @@ use std::sync::Arc; +use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow::datatypes::DataType; use crate::error::Result; @@ -29,7 +30,6 @@ use crate::optimizer::optimizer::OptimizerRule; use crate::optimizer::utils; use crate::physical_plan::functions::BuiltinScalarFunction; use crate::scalar::ScalarValue; -use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos; use arrow::compute::{kernels, DEFAULT_CAST_OPTIONS}; /// Optimizer that simplifies comparison expressions involving boolean literals. diff --git a/datafusion/src/physical_plan/datetime_expressions.rs b/datafusion/src/physical_plan/datetime_expressions.rs index e17ded29749ea..39ae70d1b5d02 100644 --- a/datafusion/src/physical_plan/datetime_expressions.rs +++ b/datafusion/src/physical_plan/datetime_expressions.rs @@ -25,6 +25,7 @@ use crate::{ }; use arrow::{ array::{Array, ArrayRef, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait}, + compute::kernels::cast_utils::string_to_timestamp_nanos, datatypes::{ ArrowPrimitiveType, DataType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, @@ -41,150 +42,10 @@ use arrow::{ }; use chrono::prelude::*; use chrono::Duration; -use chrono::LocalResult; -#[inline] -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. -/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// DataFusion uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// -/// ## Timezone / Offset Handling -/// -/// By using the Arrow format, DataFusion inherits Arrow’s handling of -/// timestamp values. Specifically, the stored numerical values of -/// timestamps are stored compared to offset UTC. 
-/// -/// This function intertprets strings without an explicit time zone as -/// timestamps with offsets of the local time on the machine that ran -/// the datafusion query -/// -/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as -/// it has an explicit timezone specifier (“Z” for Zulu/UTC) -/// -/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in -/// the timezone of the machine that ran DataFusion. For example, if -/// the system timezone is set to Americas/New_York (UTC-5) the -/// timestamp will be interpreted as though it were -/// `1997-01-31T09:26:56.123-05:00` -fn string_to_timestamp_nanos(s: &str) -> Result { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.timestamp_nanos()); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return Ok(ts.timestamp_nanos()); - } - - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.timestamp_nanos()); - } - - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. - - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(DataFusionError::Execution(format!( - "Error parsing '{}' as timestamp", - s - ))) -} - -/// Converts the naive datetime (which has no specific timezone) to a -/// nanosecond epoch timestamp relative to UTC. 
-fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result { - let l = Local {}; - - match l.from_local_datetime(&datetime) { - LocalResult::None => Err(DataFusionError::Execution(format!( - "Error parsing '{}' as timestamp: local time representation is invalid", - s - ))), - LocalResult::Single(local_datetime) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - // Ambiguous times can happen if the timestamp is exactly when - // a daylight savings time transition occurs, for example, and - // so the datetime could validly be said to be in two - // potential offsets. However, since we are about to convert - // to UTC anyways, we can pick one arbitrarily - LocalResult::Ambiguous(local_datetime, _) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - } -} - -// given a function `op` that maps a `&str` to a Result of an arrow native type, -// returns a `PrimitiveArray` after the application -// of the function to `args[0]`. +/// given a function `op` that maps a `&str` to a Result of an arrow native type, +/// returns a `PrimitiveArray` after the application +/// of the function to `args[0]`. /// # Errors /// This function errors iff: /// * the number of arguments is not 1 or @@ -262,11 +123,16 @@ where } } +/// Calls string_to_timestamp_nanos and converts the error type +fn string_to_timestamp_nanos_shim(s: &str) -> Result { + string_to_timestamp_nanos(s).map_err(|e| e.into()) +} + /// to_timestamp SQL function pub fn to_timestamp(args: &[ColumnarValue]) -> Result { handle::( args, - string_to_timestamp_nanos, + string_to_timestamp_nanos_shim, "to_timestamp", ) } @@ -275,7 +141,7 @@ pub fn to_timestamp(args: &[ColumnarValue]) -> Result { pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { handle::( args, - |s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000), + |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000), "to_timestamp_millis", ) } @@ -284,7 +150,7 @@ pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result { pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { handle::( args, - |s| string_to_timestamp_nanos(s).map(|n| n / 1_000), + |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000), "to_timestamp_micros", ) } @@ -293,7 +159,7 @@ pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result { pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result { handle::( args, - |s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000_000), + |s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000_000), "to_timestamp_seconds", ) } From 18dbbb41a0b63507524e736e048e5bf65de7d866 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 24 Jul 2021 06:59:37 -0400 Subject: [PATCH 293/329] Update docs to use vendored version of arrow (#772) --- README.md | 8 ++++---- datafusion/src/lib.rs | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d3160608bd565..4157130ec38b0 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,8 @@ Run a SQL query against data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use datafusion::arrow::util::pretty::print_batches; +use datafusion::arrow::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { @@ -92,8 +92,8 @@ Use the DataFrame API to process data stored in a CSV: ```rust use datafusion::prelude::*; -use arrow::util::pretty::print_batches; -use arrow::record_batch::RecordBatch; +use 
datafusion::arrow::util::pretty::print_batches; +use datafusion::arrow::record_batch::RecordBatch; #[tokio::main] async fn main() -> datafusion::error::Result<()> { diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 5f07c171ad7ca..d8be372dc8f08 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -39,7 +39,7 @@ //! ```rust //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! # use datafusion::arrow::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { @@ -77,7 +77,7 @@ //! ``` //! # use datafusion::prelude::*; //! # use datafusion::error::Result; -//! # use arrow::record_batch::RecordBatch; +//! # use datafusion::arrow::record_batch::RecordBatch; //! //! # #[tokio::main] //! # async fn main() -> Result<()> { From 6f5878dc89b6f11a27b521dd5771e0f049cf2d30 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 26 Jul 2021 17:47:18 -0600 Subject: [PATCH 294/329] Remove separate ballista build from CI (#776) --- .github/workflows/rust.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 2454d10fe03c4..89c56b5fcedad 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -64,15 +64,6 @@ jobs: env: CARGO_HOME: "/github/home/.cargo" CARGO_TARGET_DIR: "/github/home/target" - # Ballista is currently not part of the main workspace so requires a separate build step - - name: Build Ballista - run: | - cd ballista/rust - # snmalloc requires cmake so build without default features - cargo build --no-default-features --features sled - env: - CARGO_HOME: "/github/home/.cargo" - CARGO_TARGET_DIR: "/github/home/target" # test the crate linux-test: From c74136d5649ad4446c98c15052ad4fffb6bb5dfc Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 27 Jul 2021 11:31:20 -0700 Subject: [PATCH 295/329] Add Ballista examples (#775) --- Cargo.toml | 1 + ballista-examples/Cargo.toml | 38 +++++++++++ ballista-examples/README.md | 58 +++++++++++++++++ .../src/bin/ballista-dataframe.rs | 56 +++++++++++++++++ ballista-examples/src/bin/ballista-sql.rs | 63 +++++++++++++++++++ ballista/README.md | 42 ++++++++----- 6 files changed, 242 insertions(+), 16 deletions(-) create mode 100644 ballista-examples/Cargo.toml create mode 100644 ballista-examples/README.md create mode 100644 ballista-examples/src/bin/ballista-dataframe.rs create mode 100644 ballista-examples/src/bin/ballista-sql.rs diff --git a/Cargo.toml b/Cargo.toml index 351523d74c36a..d6da8c14cd964 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ members = [ "ballista/rust/core", "ballista/rust/executor", "ballista/rust/scheduler", + "ballista-examples", ] exclude = ["python"] diff --git a/ballista-examples/Cargo.toml b/ballista-examples/Cargo.toml new file mode 100644 index 0000000000000..b7d40223c4693 --- /dev/null +++ b/ballista-examples/Cargo.toml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "ballista-examples" +description = "Ballista usage examples" +version = "0.5.0-SNAPSHOT" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = [ "arrow", "distributed", "query", "sql" ] +edition = "2018" +publish = false + +[dependencies] +arrow-flight = { version = "5.0" } +datafusion = { path = "../datafusion" } +ballista = { path = "../ballista/rust/client" } +prost = "0.7" +tonic = "0.4" +tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } +futures = "0.3" +num_cpus = "1.13.0" diff --git a/ballista-examples/README.md b/ballista-examples/README.md new file mode 100644 index 0000000000000..1364ad47598be --- /dev/null +++ b/ballista-examples/README.md @@ -0,0 +1,58 @@ + + +# Ballista Examples + +This directory contains examples for executing distributed queries with Ballista. + +For background information on the Ballista architecture, refer to +the [Ballista README](../ballista/README.md). + +## Start a standalone cluster + +From the root of the arrow-datafusion project, build release binaries. + +```bash +cargo build --release +``` + +Start a Ballista scheduler process in a new terminal session. + +```bash +RUST_LOG=info ./target/release/ballista-scheduler +``` + +Start one or more Ballista executor processes in new terminal sessions. When starting more than one +executor, a unique port number must be specified for each executor. + +```bash +RUST_LOG=info ./target/release/ballista-executor -c 4 +``` + +## Running the examples + +Refer to the instructions in [DEVELOPERS.md](../DEVELOPERS.md) to define the `ARROW_TEST_DATA` and +`PARQUET_TEST_DATA` environment variables so that the examples can find the test data files. + +The examples can be run using the `cargo run --bin` syntax. + +```bash +cargo run --release --bin ballista-dataframe +``` + diff --git a/ballista-examples/src/bin/ballista-dataframe.rs b/ballista-examples/src/bin/ballista-dataframe.rs new file mode 100644 index 0000000000000..da7d99db1cf02 --- /dev/null +++ b/ballista-examples/src/bin/ballista-dataframe.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use ballista::prelude::*; +use datafusion::arrow::util::pretty; +use datafusion::prelude::{col, lit}; + +/// This example demonstrates executing a simple query against an Arrow data source (Parquet) and +/// fetching results, using the DataFrame trait +#[tokio::main] +async fn main() -> Result<()> { + let config = BallistaConfig::builder() + .set("ballista.shuffle.partitions", "4") + .build()?; + let ctx = BallistaContext::remote("localhost", 50050, &config); + + let testdata = datafusion::arrow::util::test_util::parquet_test_data(); + + let filename = &format!("{}/alltypes_plain.parquet", testdata); + + // define the query using the DataFrame trait + let df = ctx + .read_parquet(filename)? + .select_columns(&["id", "bool_col", "timestamp_col"])? + .filter(col("id").gt(lit(1)))?; + + // execute the query - note that calling collect on the DataFrame + // trait will execute the query with DataFusion so we have to call + // collect on the BallistaContext instead and pass it the DataFusion + // logical plan + let mut stream = ctx.collect(&df.to_logical_plan()).await?; + + // print the results + let mut results = vec![]; + while let Some(batch) = stream.next().await { + let batch = batch?; + results.push(batch); + } + pretty::print_batches(&results)?; + + Ok(()) +} diff --git a/ballista-examples/src/bin/ballista-sql.rs b/ballista-examples/src/bin/ballista-sql.rs new file mode 100644 index 0000000000000..f9e7d180af45f --- /dev/null +++ b/ballista-examples/src/bin/ballista-sql.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use ballista::prelude::*; +use datafusion::arrow::util::pretty; +use datafusion::prelude::CsvReadOptions; + +/// This example demonstrates executing a simple query against an Arrow data source (CSV) and +/// fetching results, using SQL +#[tokio::main] +async fn main() -> Result<()> { + let config = BallistaConfig::builder() + .set("ballista.shuffle.partitions", "4") + .build()?; + let ctx = BallistaContext::remote("localhost", 50050, &config); + + let testdata = datafusion::arrow::util::test_util::arrow_test_data(); + + // register csv file with the execution context + ctx.register_csv( + "aggregate_test_100", + &format!("{}/csv/aggregate_test_100.csv", testdata), + CsvReadOptions::new(), + )?; + + // execute the query + let df = ctx.sql( + "SELECT c1, MIN(c12), MAX(c12) \ + FROM aggregate_test_100 \ + WHERE c11 > 0.1 AND c11 < 0.9 \ + GROUP BY c1", + )?; + + // execute the query - note that calling collect on the DataFrame + // trait will execute the query with DataFusion so we have to call + // collect on the BallistaContext instead and pass it the DataFusion + // logical plan + let mut stream = ctx.collect(&df.to_logical_plan()).await?; + + // print the results + let mut results = vec![]; + while let Some(batch) = stream.next().await { + let batch = batch?; + results.push(batch); + } + pretty::print_batches(&results)?; + + Ok(()) +} diff --git a/ballista/README.md b/ballista/README.md index 038146a639ed8..0a8db63a1a6cc 100644 --- a/ballista/README.md +++ b/ballista/README.md @@ -17,11 +17,11 @@ under the License. --> -# Ballista: Distributed Compute with Apache Arrow +# Ballista: Distributed Compute with Apache Arrow and DataFusion -Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow. It is built -on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as -first-class citizens without paying a penalty for serialization costs. +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow and +DataFusion. It is built on an architecture that allows other programming languages (such as Python, C++, and +Java) to be supported as first-class citizens without paying a penalty for serialization costs. The foundational technologies in Ballista are: @@ -35,9 +35,30 @@ Ballista can be deployed as a standalone cluster and also supports [Kubernetes]( case, the scheduler can be configured to use [etcd](https://etcd.io/) as a backing store to (eventually) provide redundancy in the case of a scheduler failing. +# Getting Started + +Fully working examples are available. Refer to the [Ballista Examples README](../ballista-examples/README.md) for +more information. + +## Distributed Scheduler Overview + +Ballista uses the DataFusion query execution framework to create a physical plan and then transforms it into a +distributed physical plan by breaking the query down into stages whenever the partitioning scheme changes. + +Specifically, any `RepartitionExec` operator is replaced with an `UnresolvedShuffleExec` and the child operator +of the repartition operator is wrapped in a `ShuffleWriterExec` operator and scheduled for execution. + +Each executor polls the scheduler for the next task to run. Tasks are currently always `ShuffleWriterExec` operators +and each task represents one *input* partition that will be executed. 
The resulting batches are repartitioned +according to the shuffle partitioning scheme and each *output* partition is streamed to disk in Arrow IPC format. + +The scheduler will replace `UnresolvedShuffleExec` operators with `ShuffleReaderExec` operators once all shuffle +tasks have completed. The `ShuffleReaderExec` operator connects to other executors as required using the Flight +interface, and streams the shuffle IPC files. + # How does this compare to Apache Spark? -Although Ballista is largely inspired by Apache Spark, there are some key differences. +Ballista implements a similar design to Apache Spark, but there are some key differences. - The choice of Rust as the main execution language means that memory usage is deterministic and avoids the overhead of GC pauses. @@ -49,14 +70,3 @@ Although Ballista is largely inspired by Apache Spark, there are some key differ distributed compute. - The use of Apache Arrow as the memory model and network protocol means that data can be exchanged between executors in any programming language with minimal serialization overhead. - -## Status - -Ballista was [donated](https://arrow.apache.org/blog/2021/04/12/ballista-donation/) to the Apache Arrow project in -April 2021 and should be considered experimental. - -## Getting Started - -The [Ballista Developer Documentation](docs/README.md) and the -[DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) are currently the -best sources of information for getting started with Ballista. From f036f185b16ea83fa87329019156c2834381d04e Mon Sep 17 00:00:00 2001 From: QP Hou Date: Tue, 27 Jul 2021 14:29:54 -0700 Subject: [PATCH 296/329] port release automations from arrow-rs (#780) * port release automations from arrow-rs * remove cherry-pick-pr --- .github_changelog_generator | 30 ++++++++ dev/release/create-tarball.sh | 116 +++++++++++++++++++++++++++++++ dev/release/release-tarball.sh | 74 ++++++++++++++++++++ dev/release/update_change_log.sh | 47 +++++++++++++ 4 files changed, 267 insertions(+) create mode 100644 .github_changelog_generator create mode 100755 dev/release/create-tarball.sh create mode 100644 dev/release/release-tarball.sh create mode 100755 dev/release/update_change_log.sh diff --git a/.github_changelog_generator b/.github_changelog_generator new file mode 100644 index 0000000000000..49d20dcd9e5ce --- /dev/null +++ b/.github_changelog_generator @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# point to the old changelog in apache/arrow +front-matter=For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md)\n +# some issues are just documentation +add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]}} +# uncomment to not show PRs. TBD if we shown them or not. +#pull-requests=false +# so that the component is shown associated with the issue +issue-line-labels=ballista,datafusion,python +exclude-labels=development-process,invalid +breaking_labels=api-change diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh new file mode 100755 index 0000000000000..9e411997b0933 --- /dev/null +++ b/dev/release/create-tarball.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/create-tarball.sh + +# This script creates a signed tarball in +# dev/dist/apache-arrow-datafusion--.tar.gz and uploads it to +# the "dev" area of the dist.apache.arrow repository and prepares an +# email for sending to the dev@arrow.apache.org list for a formal +# vote. +# +# See release/README.md for full release instructions +# +# Requirements: +# +# 1. gpg setup for signing and have uploaded your public +# signature to https://pgp.mit.edu/ +# +# 2. Logged into the apache svn server with the appropriate +# credentials +# +# +# Based in part on 02-source.sh from apache/arrow +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 4.1.0 2" + exit +fi + +tag=$1 +rc=$2 + +release_hash=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag}) + +release=apache-arrow-datafusion-${tag} +distdir=${SOURCE_TOP_DIR}/dev/dist/${release}-rc${rc} +tarname=${release}.tar.gz +tarball=${distdir}/${tarname} +url="https://dist.apache.org/repos/dist/dev/arrow/${release}-rc${rc}" + +echo "Attempting to create ${tarball} from tag ${tag}" + + +if [ -z "$release_hash" ]; then + echo "Cannot continue: unknown git tag: $tag" +fi + +echo "Draft email for dev@arrow.apache.org mailing list" +echo "" +echo "---------------------------------------------------------" +cat < containing the files in git at $release_hash +# the files in the tarball are prefixed with {tag} (e.g. 
4.0.1) +mkdir -p ${distdir} +(cd "${SOURCE_TOP_DIR}" && git archive ${release_hash} --prefix ${release}/ | gzip > ${tarball}) + +echo "Running rat license checker on ${tarball}" +${SOURCE_DIR}/run-rat.sh ${tarball} + +echo "Signing tarball and creating checksums" +gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +# create signing with relative path of tarball +# so that they can be verified with a command such as +# shasum --check apache-arrow-datafusion-4.1.0-rc2.tar.gz.sha512 +(cd ${distdir} && shasum -a 256 ${tarname}) > ${tarball}.sha256 +(cd ${distdir} && shasum -a 512 ${tarname}) > ${tarball}.sha512 + +echo "Uploading to apache dist/dev to ${url}" +svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow ${SOURCE_TOP_DIR}/dev/dist +svn add ${distdir} +svn ci -m "Apache Arrow Datafusion ${tag} ${rc}" ${distdir} + diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh new file mode 100644 index 0000000000000..557790c39705d --- /dev/null +++ b/dev/release/release-tarball.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/release-tarball.sh + +# This script copies a tarball from the "dev" area of the +# dist.apache.arrow repository to the "release" area +# +# This script should only be run after the release has been approved +# by the arrow PMC committee. +# +# See release/README.md for full release instructions +# +# Based in part on post-01-upload.sh from apache/arrow + + +set -e +set -u + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 4.1.0 2" + exit +fi + +version=$1 +rc=$2 + +tmp_dir=tmp-apache-arrow-datafusion-dist + +echo "Recreate temporary directory: ${tmp_dir}" +rm -rf ${tmp_dir} +mkdir -p ${tmp_dir} + +echo "Clone dev dist repository" +svn \ + co \ + https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-datafusion-${version}-rc${rc} \ + ${tmp_dir}/dev + +echo "Clone release dist repository" +svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release + +echo "Copy ${version}-rc${rc} to release working copy" +release_version=arrow-datafusion-${version} +mkdir -p ${tmp_dir}/release/${release_version} +cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ +svn add ${tmp_dir}/release/${release_version} + +echo "Commit release" +svn ci -m "Apache Arrow Datafusion ${version}" ${tmp_dir}/release + +echo "Clean up" +rm -rf ${tmp_dir} + +echo "Success! 
The release is available here:" +echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh new file mode 100755 index 0000000000000..4ee9e2eb1e498 --- /dev/null +++ b/dev/release/update_change_log.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/update_change_log.sh + +# invokes the changelog generator from +# https://github.com/github-changelog-generator/github-changelog-generator +# +# With the config located in +# arrow-datafusion/.github_changelog_generator +# +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +pushd ${SOURCE_TOP_DIR} +docker run -it --rm \ + -e CHANGELOG_GITHUB_TOKEN=$CHANGELOG_GITHUB_TOKEN \ + -v "$(pwd)":/usr/local/src/your-app \ + githubchangeloggenerator/github-changelog-generator \ + --user apache \ + --project arrow-datafusion \ + --since-tag 4.0.0 \ + --future-release 5.0.0 + +sed -i "s/\\\n/\n\n/" CHANGELOG.md From 54163410da05e8e6c68af55d699bf6a89e229bb6 Mon Sep 17 00:00:00 2001 From: Mike Seddon Date: Wed, 28 Jul 2021 17:16:52 +1000 Subject: [PATCH 297/329] JOIN conditions are order dependent (#778) * allow either order joins * refactor to individual condition level * change join signature to 'join_keys' tuple --- .../core/src/serde/logical_plan/from_proto.rs | 3 +- .../rust/core/src/serde/logical_plan/mod.rs | 2 +- datafusion/src/execution/dataframe_impl.rs | 3 +- datafusion/src/logical_plan/builder.rs | 36 +++++--- datafusion/src/optimizer/filter_push_down.rs | 9 +- .../src/optimizer/projection_push_down.rs | 4 +- datafusion/src/sql/planner.rs | 9 +- datafusion/tests/sql.rs | 90 ++++++++++++++----- 8 files changed, 108 insertions(+), 48 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index cad0543923081..38b5257e32e11 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -272,8 +272,7 @@ impl TryInto for &protobuf::LogicalPlanNode { JoinConstraint::On => builder.join( &convert_box_required!(join.right)?, join_type.into(), - left_keys, - right_keys, + (left_keys, right_keys), )?, JoinConstraint::Using => builder.join_using( &convert_box_required!(join.right)?, diff --git a/ballista/rust/core/src/serde/logical_plan/mod.rs b/ballista/rust/core/src/serde/logical_plan/mod.rs index 0d27c58ac2925..f6dbeaf6a1517 100644 --- a/ballista/rust/core/src/serde/logical_plan/mod.rs +++ 
b/ballista/rust/core/src/serde/logical_plan/mod.rs @@ -701,7 +701,7 @@ mod roundtrip_tests { CsvReadOptions::new().schema(&schema).has_header(true), Some(vec![0, 3, 4]), ) - .and_then(|plan| plan.join(&scan_plan, JoinType::Inner, vec!["id"], vec!["id"])) + .and_then(|plan| plan.join(&scan_plan, JoinType::Inner, (vec!["id"], vec!["id"]))) .and_then(|plan| plan.build()) .map_err(BallistaError::DataFusionError)?; diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 4edd01c2c0a99..451c4c7ba5023 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -117,8 +117,7 @@ impl DataFrame for DataFrameImpl { .join( &right.to_logical_plan(), join_type, - left_cols.to_vec(), - right_cols.to_vec(), + (left_cols.to_vec(), right_cols.to_vec()), )? .build()?; Ok(Arc::new(DataFrameImpl::new(self.ctx_state.clone(), &plan))) diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index 60e0ed3c09883..a742f346207ad 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -273,23 +273,37 @@ impl LogicalPlanBuilder { &self, right: &LogicalPlan, join_type: JoinType, - left_keys: Vec>, - right_keys: Vec>, + join_keys: (Vec>, Vec>), ) -> Result { - if left_keys.len() != right_keys.len() { + if join_keys.0.len() != join_keys.1.len() { return Err(DataFusionError::Plan( "left_keys and right_keys were not the same length".to_string(), )); } - let left_keys: Vec = left_keys - .into_iter() - .map(|c| c.into().normalize(&self.plan)) - .collect::>()?; - let right_keys: Vec = right_keys - .into_iter() - .map(|c| c.into().normalize(right)) - .collect::>()?; + let (left_keys, right_keys): (Vec>, Vec>) = + join_keys + .0 + .into_iter() + .zip(join_keys.1.into_iter()) + .map(|(l, r)| { + let mut swap = false; + let l = l.into(); + let left_key = l.clone().normalize(&self.plan).or_else(|_| { + swap = true; + l.normalize(right) + }); + if swap { + (r.into().normalize(&self.plan), left_key) + } else { + (left_key, r.into().normalize(right)) + } + }) + .unzip(); + + let left_keys = left_keys.into_iter().collect::>>()?; + let right_keys = right_keys.into_iter().collect::>>()?; + let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); let join_schema = build_join_schema(self.plan.schema(), right.schema(), &join_type)?; diff --git a/datafusion/src/optimizer/filter_push_down.rs b/datafusion/src/optimizer/filter_push_down.rs index 399923e87218b..039e92d1c1285 100644 --- a/datafusion/src/optimizer/filter_push_down.rs +++ b/datafusion/src/optimizer/filter_push_down.rs @@ -973,8 +973,7 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a")], - vec![Column::from_name("a")], + (vec![Column::from_name("a")], vec![Column::from_name("a")]), )? .filter(col("a").lt_eq(lit(1i64)))? .build()?; @@ -1058,8 +1057,7 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a")], - vec![Column::from_name("a")], + (vec![Column::from_name("a")], vec![Column::from_name("a")]), )? // "b" and "c" are not shared by either side: they are only available together after the join .filter(col("c").lt_eq(col("b")))? @@ -1099,8 +1097,7 @@ mod tests { .join( &right, JoinType::Inner, - vec![Column::from_name("a")], - vec![Column::from_name("a")], + (vec![Column::from_name("a")], vec![Column::from_name("a")]), )? .filter(col("b").lt_eq(lit(1i64)))? 
.build()?; diff --git a/datafusion/src/optimizer/projection_push_down.rs b/datafusion/src/optimizer/projection_push_down.rs index 0de36f354206f..96c5094711ba9 100644 --- a/datafusion/src/optimizer/projection_push_down.rs +++ b/datafusion/src/optimizer/projection_push_down.rs @@ -555,7 +555,7 @@ mod tests { LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; let plan = LogicalPlanBuilder::from(table_scan) - .join(&table2_scan, JoinType::Left, vec!["a"], vec!["c1"])? + .join(&table2_scan, JoinType::Left, (vec!["a"], vec!["c1"]))? .project(vec![col("a"), col("b"), col("c1")])? .build()?; @@ -594,7 +594,7 @@ mod tests { LogicalPlanBuilder::scan_empty(Some("test2"), &schema, None)?.build()?; let plan = LogicalPlanBuilder::from(table_scan) - .join(&table2_scan, JoinType::Left, vec!["a"], vec!["c1"])? + .join(&table2_scan, JoinType::Left, (vec!["a"], vec!["c1"]))? // projecting joined column `a` should push the right side column `c1` projection as // well into test2 table even though `c1` is not referenced in projection. .project(vec![col("a"), col("b")])? diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index fa2b035162a60..6d9484be102ff 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -375,8 +375,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let (left_keys, right_keys): (Vec, Vec) = keys.into_iter().unzip(); // return the logical plan representing the join - let join = LogicalPlanBuilder::from(left) - .join(right, join_type, left_keys, right_keys)?; + let join = LogicalPlanBuilder::from(left).join( + right, + join_type, + (left_keys, right_keys), + )?; if filter.is_empty() { join.build() @@ -548,7 +551,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { join_keys.iter().map(|(_, r)| r.clone()).collect(); let builder = LogicalPlanBuilder::from(left); left = builder - .join(right, JoinType::Inner, left_keys, right_keys)? + .join(right, JoinType::Inner, (left_keys, right_keys))? 
.build()?; } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index d9f7c6ea41211..bfe2f2fc49138 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1717,15 +1717,40 @@ fn create_case_context() -> Result { #[tokio::test] async fn equijoin() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; - let sql = - "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, sql).await; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id ORDER BY t1_id", + ]; let expected = vec![ vec!["11", "a", "z"], vec!["22", "b", "y"], vec!["44", "d", "x"], ]; - assert_eq!(expected, actual); + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } + Ok(()) +} + +#[tokio::test] +async fn equijoin_multiple_condition_ordering() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id AND t1_name <> t2_name ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t1_id = t2_id AND t2_name <> t1_name ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id AND t1_name <> t2_name ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 JOIN t2 ON t2_id = t1_id AND t2_name <> t1_name ORDER BY t1_id", + ]; + let expected = vec![ + vec!["11", "a", "z"], + vec!["22", "b", "y"], + vec!["44", "d", "x"], + ]; + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } @@ -1754,39 +1779,50 @@ async fn equijoin_and_unsupported_condition() -> Result<()> { #[tokio::test] async fn left_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; - let sql = "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, sql).await; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t2_id = t1_id ORDER BY t1_id", + ]; let expected = vec![ vec!["11", "a", "z"], vec!["22", "b", "y"], vec!["33", "c", "NULL"], vec!["44", "d", "x"], ]; - assert_eq!(expected, actual); + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } #[tokio::test] async fn right_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; - let sql = - "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, sql).await; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t2_id = t1_id ORDER BY t1_id" + ]; let expected = vec![ vec!["NULL", "NULL", "w"], vec!["11", "a", "z"], vec!["22", "b", "y"], vec!["44", "d", "x"], ]; - assert_eq!(expected, actual); + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } #[tokio::test] async fn full_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; - let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, 
sql).await; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 FULL JOIN t2 ON t2_id = t1_id ORDER BY t1_id", + ]; let expected = vec![ vec!["NULL", "NULL", "w"], vec!["11", "a", "z"], @@ -1794,11 +1830,19 @@ async fn full_join() -> Result<()> { vec!["33", "c", "NULL"], vec!["44", "d", "x"], ]; - assert_eq!(expected, actual); + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } - let sql = "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, sql).await; - assert_eq!(expected, actual); + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1 FULL OUTER JOIN t2 ON t2_id = t1_id ORDER BY t1_id", + ]; + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } @@ -1821,15 +1865,19 @@ async fn left_join_using() -> Result<()> { #[tokio::test] async fn equijoin_implicit_syntax() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; - let sql = - "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t1_id = t2_id ORDER BY t1_id"; - let actual = execute(&mut ctx, sql).await; + let equivalent_sql = [ + "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t1_id = t2_id ORDER BY t1_id", + "SELECT t1_id, t1_name, t2_name FROM t1, t2 WHERE t2_id = t1_id ORDER BY t1_id", + ]; let expected = vec![ vec!["11", "a", "z"], vec!["22", "b", "y"], vec!["44", "d", "x"], ]; - assert_eq!(expected, actual); + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } From ea1d767b271cc27c59bb34ad1bf1ae69cd72f8d6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 28 Jul 2021 07:24:40 -0700 Subject: [PATCH 298/329] Support DataFrame.collect for Ballista DataFrames (#785) --- .../src/bin/ballista-dataframe.rs | 13 +- ballista-examples/src/bin/ballista-sql.rs | 13 +- ballista/rust/client/src/context.rs | 163 ++------------ .../src/execution_plans/distributed_query.rs | 213 ++++++++++++++++++ ballista/rust/core/src/execution_plans/mod.rs | 2 + ballista/rust/core/src/utils.rs | 54 ++++- ballista/rust/scheduler/src/lib.rs | 9 +- benchmarks/src/bin/tpch.rs | 11 +- 8 files changed, 297 insertions(+), 181 deletions(-) create mode 100644 ballista/rust/core/src/execution_plans/distributed_query.rs diff --git a/ballista-examples/src/bin/ballista-dataframe.rs b/ballista-examples/src/bin/ballista-dataframe.rs index da7d99db1cf02..693e67682edad 100644 --- a/ballista-examples/src/bin/ballista-dataframe.rs +++ b/ballista-examples/src/bin/ballista-dataframe.rs @@ -38,18 +38,7 @@ async fn main() -> Result<()> { .select_columns(&["id", "bool_col", "timestamp_col"])? 
.filter(col("id").gt(lit(1)))?; - // execute the query - note that calling collect on the DataFrame - // trait will execute the query with DataFusion so we have to call - // collect on the BallistaContext instead and pass it the DataFusion - // logical plan - let mut stream = ctx.collect(&df.to_logical_plan()).await?; - - // print the results - let mut results = vec![]; - while let Some(batch) = stream.next().await { - let batch = batch?; - results.push(batch); - } + let results = df.collect().await?; pretty::print_batches(&results)?; Ok(()) diff --git a/ballista-examples/src/bin/ballista-sql.rs b/ballista-examples/src/bin/ballista-sql.rs index f9e7d180af45f..590ab7bcf7a48 100644 --- a/ballista-examples/src/bin/ballista-sql.rs +++ b/ballista-examples/src/bin/ballista-sql.rs @@ -45,18 +45,7 @@ async fn main() -> Result<()> { GROUP BY c1", )?; - // execute the query - note that calling collect on the DataFrame - // trait will execute the query with DataFusion so we have to call - // collect on the BallistaContext instead and pass it the DataFusion - // logical plan - let mut stream = ctx.collect(&df.to_logical_plan()).await?; - - // print the results - let mut results = vec![]; - while let Some(batch) = stream.next().await { - let batch = batch?; - results.push(batch); - } + let results = df.collect().await?; pretty::print_batches(&results)?; Ok(()) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 26087f8e6693c..16a90b2e27340 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -17,32 +17,19 @@ //! Distributed execution context. +use std::collections::HashMap; +use std::fs; use std::path::PathBuf; -use std::pin::Pin; use std::sync::{Arc, Mutex}; -use std::{collections::HashMap, convert::TryInto}; -use std::{fs, time::Duration}; use ballista_core::config::BallistaConfig; -use ballista_core::serde::protobuf::{ - execute_query_params::Query, job_status, scheduler_grpc_client::SchedulerGrpcClient, - ExecuteQueryParams, GetJobStatusParams, GetJobStatusResult, KeyValuePair, - PartitionLocation, -}; -use ballista_core::{ - client::BallistaClient, datasource::DfTableAdapter, utils::create_datafusion_context, - utils::WrappedStream, -}; +use ballista_core::{datasource::DfTableAdapter, utils::create_datafusion_context}; -use datafusion::arrow::datatypes::Schema; use datafusion::catalog::TableReference; -use datafusion::error::{DataFusionError, Result}; +use datafusion::dataframe::DataFrame; +use datafusion::error::Result; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::csv::CsvReadOptions; -use datafusion::{dataframe::DataFrame, physical_plan::RecordBatchStream}; -use futures::future; -use futures::StreamExt; -use log::{error, info}; struct BallistaContextState { /// Ballista configuration @@ -142,7 +129,12 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let mut ctx = create_datafusion_context(&self.state.lock().unwrap().config()); + let guard = self.state.lock().unwrap(); + let mut ctx = create_datafusion_context( + &guard.scheduler_host, + guard.scheduler_port, + &guard.config(), + ); let df = ctx.read_parquet(path.to_str().unwrap())?; Ok(df) } @@ -159,7 +151,12 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let mut ctx = create_datafusion_context(&self.state.lock().unwrap().config()); + let guard = 
self.state.lock().unwrap(); + let mut ctx = create_datafusion_context( + &guard.scheduler_host, + guard.scheduler_port, + &guard.config(), + ); let df = ctx.read_csv(path.to_str().unwrap(), options)?; Ok(df) } @@ -193,7 +190,11 @@ impl BallistaContext { // use local DataFusion context for now but later this might call the scheduler // register tables let state = self.state.lock().unwrap(); - let mut ctx = create_datafusion_context(&state.config()); + let mut ctx = create_datafusion_context( + &state.scheduler_host, + state.scheduler_port, + state.config(), + ); for (name, plan) in &state.tables { let plan = ctx.optimize(plan)?; let execution_plan = ctx.create_physical_plan(&plan)?; @@ -204,126 +205,6 @@ impl BallistaContext { } ctx.sql(sql) } - - async fn fetch_partition( - location: PartitionLocation, - ) -> Result>> { - let metadata = location.executor_meta.ok_or_else(|| { - DataFusionError::Internal("Received empty executor metadata".to_owned()) - })?; - let partition_id = location.partition_id.ok_or_else(|| { - DataFusionError::Internal("Received empty partition id".to_owned()) - })?; - let mut ballista_client = - BallistaClient::try_new(metadata.host.as_str(), metadata.port as u16) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - Ok(ballista_client - .fetch_partition( - &partition_id.job_id, - partition_id.stage_id as usize, - partition_id.partition_id as usize, - &location.path, - ) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) - } - - pub async fn collect( - &self, - plan: &LogicalPlan, - ) -> Result>> { - let (scheduler_url, config) = { - let state = self.state.lock().unwrap(); - let scheduler_url = - format!("http://{}:{}", state.scheduler_host, state.scheduler_port); - (scheduler_url, state.config.clone()) - }; - - info!("Connecting to Ballista scheduler at {}", scheduler_url); - - let mut scheduler = SchedulerGrpcClient::connect(scheduler_url) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - - let schema: Schema = plan.schema().as_ref().clone().into(); - - let job_id = scheduler - .execute_query(ExecuteQueryParams { - query: Some(Query::LogicalPlan( - (plan) - .try_into() - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?, - )), - settings: config - .settings() - .iter() - .map(|(k, v)| KeyValuePair { - key: k.to_owned(), - value: v.to_owned(), - }) - .collect::>(), - }) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? - .into_inner() - .job_id; - - let mut prev_status: Option = None; - - loop { - let GetJobStatusResult { status } = scheduler - .get_job_status(GetJobStatusParams { - job_id: job_id.clone(), - }) - .await - .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? 
- .into_inner(); - let status = status.and_then(|s| s.status).ok_or_else(|| { - DataFusionError::Internal("Received empty status message".to_owned()) - })?; - let wait_future = tokio::time::sleep(Duration::from_millis(100)); - let has_status_change = prev_status.map(|x| x != status).unwrap_or(true); - match status { - job_status::Status::Queued(_) => { - if has_status_change { - info!("Job {} still queued...", job_id); - } - wait_future.await; - prev_status = Some(status); - } - job_status::Status::Running(_) => { - if has_status_change { - info!("Job {} is running...", job_id); - } - wait_future.await; - prev_status = Some(status); - } - job_status::Status::Failed(err) => { - let msg = format!("Job {} failed: {}", job_id, err.error); - error!("{}", msg); - break Err(DataFusionError::Execution(msg)); - } - job_status::Status::Completed(completed) => { - let result = future::join_all( - completed - .partition_location - .into_iter() - .map(BallistaContext::fetch_partition), - ) - .await - .into_iter() - .collect::>>()?; - - let result = WrappedStream::new( - Box::pin(futures::stream::iter(result).flatten()), - Arc::new(schema), - ); - break Ok(Box::pin(result)); - } - }; - } - } } #[cfg(test)] diff --git a/ballista/rust/core/src/execution_plans/distributed_query.rs b/ballista/rust/core/src/execution_plans/distributed_query.rs new file mode 100644 index 0000000000000..8abfe6678893b --- /dev/null +++ b/ballista/rust/core/src/execution_plans/distributed_query.rs @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::convert::TryInto; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use crate::client::BallistaClient; +use crate::config::BallistaConfig; +use crate::serde::protobuf::{ + execute_query_params::Query, job_status, scheduler_grpc_client::SchedulerGrpcClient, + ExecuteQueryParams, GetJobStatusParams, GetJobStatusResult, KeyValuePair, + PartitionLocation, +}; +use crate::utils::WrappedStream; + +use datafusion::arrow::datatypes::{Schema, SchemaRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_plan::LogicalPlan; +use datafusion::physical_plan::{ + ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, +}; + +use async_trait::async_trait; +use futures::future; +use futures::StreamExt; +use log::{error, info}; + +/// This operator sends a logical plan to a Ballista scheduler for execution and +/// polls the scheduler until the query is complete and then fetches the resulting +/// batches directly from the executors that hold the results from the final +/// query stage.
+#[derive(Debug, Clone)] +pub struct DistributedQueryExec { + /// Ballista scheduler URL + scheduler_url: String, + /// Ballista configuration + config: BallistaConfig, + /// Logical plan to execute + plan: LogicalPlan, +} + +impl DistributedQueryExec { + pub fn new(scheduler_url: String, config: BallistaConfig, plan: LogicalPlan) -> Self { + Self { + scheduler_url, + config, + plan, + } + } +} + +#[async_trait] +impl ExecutionPlan for DistributedQueryExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.plan.schema().as_ref().clone().into() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + &self, + _children: Vec>, + ) -> datafusion::error::Result> { + Ok(Arc::new(DistributedQueryExec::new( + self.scheduler_url.clone(), + self.config.clone(), + self.plan.clone(), + ))) + } + + async fn execute( + &self, + partition: usize, + ) -> datafusion::error::Result { + assert_eq!(0, partition); + + info!("Connecting to Ballista scheduler at {}", self.scheduler_url); + + let mut scheduler = SchedulerGrpcClient::connect(self.scheduler_url.clone()) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + + let schema: Schema = self.plan.schema().as_ref().clone().into(); + + let job_id = scheduler + .execute_query(ExecuteQueryParams { + query: Some(Query::LogicalPlan( + (&self.plan) + .try_into() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?, + )), + settings: self + .config + .settings() + .iter() + .map(|(k, v)| KeyValuePair { + key: k.to_owned(), + value: v.to_owned(), + }) + .collect::>(), + }) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? + .into_inner() + .job_id; + + let mut prev_status: Option = None; + + loop { + let GetJobStatusResult { status } = scheduler + .get_job_status(GetJobStatusParams { + job_id: job_id.clone(), + }) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))? 
+ .into_inner(); + let status = status.and_then(|s| s.status).ok_or_else(|| { + DataFusionError::Internal("Received empty status message".to_owned()) + })?; + let wait_future = tokio::time::sleep(Duration::from_millis(100)); + let has_status_change = prev_status.map(|x| x != status).unwrap_or(true); + match status { + job_status::Status::Queued(_) => { + if has_status_change { + info!("Job {} still queued...", job_id); + } + wait_future.await; + prev_status = Some(status); + } + job_status::Status::Running(_) => { + if has_status_change { + info!("Job {} is running...", job_id); + } + wait_future.await; + prev_status = Some(status); + } + job_status::Status::Failed(err) => { + let msg = format!("Job {} failed: {}", job_id, err.error); + error!("{}", msg); + break Err(DataFusionError::Execution(msg)); + } + job_status::Status::Completed(completed) => { + let result = future::join_all( + completed + .partition_location + .into_iter() + .map(fetch_partition), + ) + .await + .into_iter() + .collect::>>()?; + + let result = WrappedStream::new( + Box::pin(futures::stream::iter(result).flatten()), + Arc::new(schema), + ); + break Ok(Box::pin(result)); + } + }; + } + } +} + +async fn fetch_partition( + location: PartitionLocation, +) -> Result { + let metadata = location.executor_meta.ok_or_else(|| { + DataFusionError::Internal("Received empty executor metadata".to_owned()) + })?; + let partition_id = location.partition_id.ok_or_else(|| { + DataFusionError::Internal("Received empty partition id".to_owned()) + })?; + let mut ballista_client = + BallistaClient::try_new(metadata.host.as_str(), metadata.port as u16) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + Ok(ballista_client + .fetch_partition( + &partition_id.job_id, + partition_id.stage_id as usize, + partition_id.partition_id as usize, + &location.path, + ) + .await + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?) +} diff --git a/ballista/rust/core/src/execution_plans/mod.rs b/ballista/rust/core/src/execution_plans/mod.rs index ca4e60023ce8c..b10ff341e9032 100644 --- a/ballista/rust/core/src/execution_plans/mod.rs +++ b/ballista/rust/core/src/execution_plans/mod.rs @@ -18,10 +18,12 @@ //! This module contains execution plans that are needed to distribute Datafusion's execution plans into //! several Ballista executors. 
+mod distributed_query; mod shuffle_reader; mod shuffle_writer; mod unresolved_shuffle; +pub use distributed_query::DistributedQueryExec; pub use shuffle_reader::ShuffleReaderExec; pub use shuffle_writer::ShuffleWriterExec; pub use unresolved_shuffle::UnresolvedShuffleExec; diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 8b1cf61a55ee6..4187faa6645aa 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -23,7 +23,9 @@ use std::sync::Arc; use std::{fs::File, pin::Pin}; use crate::error::{BallistaError, Result}; -use crate::execution_plans::{ShuffleWriterExec, UnresolvedShuffleExec}; +use crate::execution_plans::{ + DistributedQueryExec, ShuffleWriterExec, UnresolvedShuffleExec, +}; use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionStats; @@ -38,8 +40,11 @@ use datafusion::arrow::{ ipc::writer::FileWriter, record_batch::RecordBatch, }; -use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use datafusion::logical_plan::Operator; +use datafusion::error::DataFusionError; +use datafusion::execution::context::{ + ExecutionConfig, ExecutionContext, ExecutionContextState, QueryPlanner, +}; +use datafusion::logical_plan::{LogicalPlan, Operator}; use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; use datafusion::physical_optimizer::merge_exec::AddCoalescePartitionsExec; use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; @@ -232,12 +237,49 @@ fn build_exec_plan_diagram( } /// Create a DataFusion context that is compatible with Ballista -pub fn create_datafusion_context(config: &BallistaConfig) -> ExecutionContext { - let config = - ExecutionConfig::new().with_concurrency(config.default_shuffle_partitions()); +pub fn create_datafusion_context( + scheduler_host: &str, + scheduler_port: u16, + config: &BallistaConfig, +) -> ExecutionContext { + let scheduler_url = format!("http://{}:{}", scheduler_host, scheduler_port); + let config = ExecutionConfig::new() + .with_query_planner(Arc::new(BallistaQueryPlanner::new( + scheduler_url, + config.clone(), + ))) + .with_concurrency(config.default_shuffle_partitions()); ExecutionContext::with_config(config) } +pub struct BallistaQueryPlanner { + scheduler_url: String, + config: BallistaConfig, +} + +impl BallistaQueryPlanner { + pub fn new(scheduler_url: String, config: BallistaConfig) -> Self { + Self { + scheduler_url, + config, + } + } +} + +impl QueryPlanner for BallistaQueryPlanner { + fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + _ctx_state: &ExecutionContextState, + ) -> std::result::Result, DataFusionError> { + Ok(Arc::new(DistributedQueryExec::new( + self.scheduler_url.clone(), + self.config.clone(), + logical_plan.clone(), + ))) + } +} + pub struct WrappedStream { stream: Pin> + Send + Sync>>, schema: SchemaRef, diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index f5e2dc1dfd807..3e4e73586d539 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -82,8 +82,8 @@ use self::state::{ConfigBackendClient, SchedulerState}; use ballista_core::config::BallistaConfig; use ballista_core::execution_plans::ShuffleWriterExec; use ballista_core::serde::scheduler::to_proto::hash_partitioning_to_proto; -use ballista_core::utils::create_datafusion_context; use datafusion::physical_plan::parquet::ParquetExec; +use datafusion::prelude::{ExecutionConfig, ExecutionContext}; use std::time::{Instant, SystemTime, UNIX_EPOCH}; 
#[derive(Clone)] @@ -511,6 +511,13 @@ impl SchedulerGrpc for SchedulerServer { } } +/// Create a DataFusion context that is compatible with Ballista +pub fn create_datafusion_context(config: &BallistaConfig) -> ExecutionContext { + let config = + ExecutionConfig::new().with_concurrency(config.default_shuffle_partitions()); + ExecutionContext::with_config(config) +} + #[cfg(all(test, feature = "sled"))] mod test { use std::{ diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 169319d30beef..08b8864acd1b9 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -25,8 +25,6 @@ use std::{ time::Instant, }; -use futures::StreamExt; - use ballista::context::BallistaContext; use ballista::prelude::{BallistaConfig, BALLISTA_DEFAULT_SHUFFLE_PARTITIONS}; @@ -312,15 +310,10 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> { let df = ctx .sql(&sql) .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; - let mut batches = vec![]; - let mut stream = ctx - .collect(&df.to_logical_plan()) + let batches = df + .collect() .await .map_err(|e| DataFusionError::Plan(format!("{:?}", e)))?; - while let Some(result) = stream.next().await { - let batch = result?; - batches.push(batch); - } let elapsed = start.elapsed().as_secs_f64() * 1000.0; millis.push(elapsed as f64); println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed); From 4929590eea506608e1d8d425a8801fc21d8c8f45 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 28 Jul 2021 14:44:02 -0400 Subject: [PATCH 299/329] Box ScalarValue:Lists, reduce size by half size (#788) * Box ScalarValue:Lists DataType, reduce to half size * Fixup ballista --- .../core/src/serde/logical_plan/from_proto.rs | 16 +-- .../rust/core/src/serde/logical_plan/mod.rs | 76 ++++++++------ .../core/src/serde/logical_plan/to_proto.rs | 33 ++++--- .../src/physical_plan/distinct_expressions.rs | 8 +- datafusion/src/scalar.rs | 98 +++++++++++-------- 5 files changed, 140 insertions(+), 91 deletions(-) diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 38b5257e32e11..2665e33137b5d 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -536,7 +536,7 @@ impl TryInto for &protobuf::scalar_value::Value } protobuf::scalar_value::Value::ListValue(v) => v.try_into()?, protobuf::scalar_value::Value::NullListValue(v) => { - ScalarValue::List(None, v.try_into()?) 
+ ScalarValue::List(None, Box::new(v.try_into()?)) } protobuf::scalar_value::Value::NullValue(null_enum) => { PrimitiveScalarType::from_i32(*null_enum) @@ -580,8 +580,8 @@ impl TryInto for &protobuf::ScalarListValue { }) .collect::, _>>()?; datafusion::scalar::ScalarValue::List( - Some(typechecked_values), - leaf_scalar_type.into(), + Some(Box::new(typechecked_values)), + Box::new(leaf_scalar_type.into()), ) } Datatype::List(list_type) => { @@ -625,9 +625,9 @@ impl TryInto for &protobuf::ScalarListValue { datafusion::scalar::ScalarValue::List( match typechecked_values.len() { 0 => None, - _ => Some(typechecked_values), + _ => Some(Box::new(typechecked_values)), }, - list_type.try_into()?, + Box::new(list_type.try_into()?), ) } }; @@ -765,14 +765,16 @@ impl TryInto for &protobuf::ScalarValue { .map(|val| val.try_into()) .collect::, _>>()?; let scalar_type: DataType = pb_scalar_type.try_into()?; - ScalarValue::List(Some(typechecked_values), scalar_type) + let scalar_type = Box::new(scalar_type); + ScalarValue::List(Some(Box::new(typechecked_values)), scalar_type) } protobuf::scalar_value::Value::NullListValue(v) => { let pb_datatype = v .datatype .as_ref() .ok_or_else(|| proto_error("Protobuf deserialization error: NullListValue message missing required field 'datatyp'"))?; - ScalarValue::List(None, pb_datatype.try_into()?) + let pb_datatype = Box::new(pb_datatype.try_into()?); + ScalarValue::List(None, pb_datatype) } protobuf::scalar_value::Value::NullValue(v) => { let null_type_enum = protobuf::PrimitiveScalarType::from_i32(*v) diff --git a/ballista/rust/core/src/serde/logical_plan/mod.rs b/ballista/rust/core/src/serde/logical_plan/mod.rs index f6dbeaf6a1517..e4e438335efda 100644 --- a/ballista/rust/core/src/serde/logical_plan/mod.rs +++ b/ballista/rust/core/src/serde/logical_plan/mod.rs @@ -126,49 +126,57 @@ mod roundtrip_tests { let should_fail_on_seralize: Vec = vec![ //Should fail due to inconsistent types ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::Int16(None), ScalarValue::Float32(Some(32.0)), - ]), - DataType::List(new_box_field("item", DataType::Int16, true)), + ])), + Box::new(DataType::List(new_box_field("item", DataType::Int16, true))), ), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::Float32(None), ScalarValue::Float32(Some(32.0)), - ]), - DataType::List(new_box_field("item", DataType::Int16, true)), + ])), + Box::new(DataType::List(new_box_field("item", DataType::Int16, true))), ), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::List( None, - DataType::List(new_box_field("level2", DataType::Float32, true)), + Box::new(DataType::List(new_box_field( + "level2", + DataType::Float32, + true, + ))), ), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::Float32(Some(-213.1)), ScalarValue::Float32(None), ScalarValue::Float32(Some(5.5)), ScalarValue::Float32(Some(2.0)), ScalarValue::Float32(Some(1.0)), - ]), - DataType::List(new_box_field("level2", DataType::Float32, true)), + ])), + Box::new(DataType::List(new_box_field( + "level2", + DataType::Float32, + true, + ))), ), ScalarValue::List( None, - DataType::List(new_box_field( + Box::new(DataType::List(new_box_field( "lists are typed inconsistently", DataType::Int16, true, - )), + ))), ), - ]), - DataType::List(new_box_field( + ])), + Box::new(DataType::List(new_box_field( "level1", DataType::List(new_box_field("level2", DataType::Float32, true)), true, - )), + ))), ), ]; @@ -200,7 +208,7 @@ mod roundtrip_tests { ScalarValue::UInt64(None), 
ScalarValue::Utf8(None), ScalarValue::LargeUtf8(None), - ScalarValue::List(None, DataType::Boolean), + ScalarValue::List(None, Box::new(DataType::Boolean)), ScalarValue::Date32(None), ScalarValue::TimestampMicrosecond(None), ScalarValue::TimestampNanosecond(None), @@ -248,37 +256,49 @@ mod roundtrip_tests { ScalarValue::TimestampMicrosecond(Some(i64::MAX)), ScalarValue::TimestampMicrosecond(None), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::Float32(Some(-213.1)), ScalarValue::Float32(None), ScalarValue::Float32(Some(5.5)), ScalarValue::Float32(Some(2.0)), ScalarValue::Float32(Some(1.0)), - ]), - DataType::List(new_box_field("level1", DataType::Float32, true)), + ])), + Box::new(DataType::List(new_box_field( + "level1", + DataType::Float32, + true, + ))), ), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::List( None, - DataType::List(new_box_field("level2", DataType::Float32, true)), + Box::new(DataType::List(new_box_field( + "level2", + DataType::Float32, + true, + ))), ), ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::Float32(Some(-213.1)), ScalarValue::Float32(None), ScalarValue::Float32(Some(5.5)), ScalarValue::Float32(Some(2.0)), ScalarValue::Float32(Some(1.0)), - ]), - DataType::List(new_box_field("level2", DataType::Float32, true)), + ])), + Box::new(DataType::List(new_box_field( + "level2", + DataType::Float32, + true, + ))), ), - ]), - DataType::List(new_box_field( + ])), + Box::new(DataType::List(new_box_field( "level1", DataType::List(new_box_field("level2", DataType::Float32, true)), true, - )), + ))), ), ]; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 07d7a59c114c6..87f26a118e780 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -565,13 +565,13 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { protobuf::ScalarValue { value: Some(protobuf::scalar_value::Value::ListValue( protobuf::ScalarListValue { - datatype: Some(datatype.try_into()?), + datatype: Some(datatype.as_ref().try_into()?), values: Vec::new(), }, )), } } else { - let scalar_type = match datatype { + let scalar_type = match datatype.as_ref() { DataType::List(field) => field.as_ref().data_type(), _ => todo!("Proper error handling"), }; @@ -579,16 +579,23 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { let type_checked_values: Vec = values .iter() .map(|scalar| match (scalar, scalar_type) { - (scalar::ScalarValue::List(_, DataType::List(list_field)), DataType::List(field)) => { - let scalar_datatype = field.data_type(); - let list_datatype = list_field.data_type(); - if std::mem::discriminant(list_datatype) != std::mem::discriminant(scalar_datatype) { - return Err(proto_error(format!( - "Protobuf serialization error: Lists with inconsistent typing {:?} and {:?} found within list", - list_datatype, scalar_datatype - ))); + (scalar::ScalarValue::List(_, list_type), DataType::List(field)) => { + if let DataType::List(list_field) = list_type.as_ref() { + let scalar_datatype = field.data_type(); + let list_datatype = list_field.data_type(); + if std::mem::discriminant(list_datatype) != std::mem::discriminant(scalar_datatype) { + return Err(proto_error(format!( + "Protobuf serialization error: Lists with inconsistent typing {:?} and {:?} found within list", + list_datatype, scalar_datatype + ))); + } + scalar.try_into() + } else { + 
Err(proto_error(format!( + "Protobuf serialization error, {:?} was inconsistent with designated type {:?}", + scalar, datatype + ))) } - scalar.try_into() } (scalar::ScalarValue::Boolean(_), DataType::Boolean) => scalar.try_into(), (scalar::ScalarValue::Float32(_), DataType::Float32) => scalar.try_into(), @@ -612,7 +619,7 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { protobuf::ScalarValue { value: Some(protobuf::scalar_value::Value::ListValue( protobuf::ScalarListValue { - datatype: Some(datatype.try_into()?), + datatype: Some(datatype.as_ref().try_into()?), values: type_checked_values, }, )), @@ -621,7 +628,7 @@ impl TryFrom<&datafusion::scalar::ScalarValue> for protobuf::ScalarValue { } None => protobuf::ScalarValue { value: Some(protobuf::scalar_value::Value::NullListValue( - datatype.try_into()?, + datatype.as_ref().try_into()?, )), }, } diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index f3513c2950e4d..90c0836f7077c 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -178,7 +178,9 @@ impl Accumulator for DistinctCountAccumulator { .state_data_types .iter() .map(|state_data_type| { - ScalarValue::List(Some(Vec::new()), state_data_type.clone()) + let values = Box::new(Vec::new()); + let data_type = Box::new(state_data_type.clone()); + ScalarValue::List(Some(values), data_type) }) .collect::>(); @@ -254,8 +256,8 @@ mod tests { macro_rules! state_to_vec { ($LIST:expr, $DATA_TYPE:ident, $PRIM_TY:ty) => {{ match $LIST { - ScalarValue::List(_, data_type) => match data_type { - DataType::$DATA_TYPE => (), + ScalarValue::List(_, data_type) => match data_type.as_ref() { + &DataType::$DATA_TYPE => (), _ => panic!("Unexpected DataType for list"), }, _ => panic!("Expected a ScalarValue::List"), diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index ab08364242428..129b4166a4e8c 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -65,8 +65,9 @@ pub enum ScalarValue { Binary(Option>), /// large binary LargeBinary(Option>), - /// list of nested ScalarValue - List(Option>, DataType), + /// list of nested ScalarValue (boxed to reduce size_of(ScalarValue)) + #[allow(clippy::box_vec)] + List(Option>>, Box), /// Date stored as a signed 32bit int Date32(Option), /// Date stored as a signed 64bit int @@ -110,7 +111,7 @@ macro_rules! build_list { ) } Some(values) => { - build_values_list!($VALUE_BUILDER_TY, $SCALAR_TY, values, $SIZE) + build_values_list!($VALUE_BUILDER_TY, $SCALAR_TY, values.as_ref(), $SIZE) } } }}; @@ -130,32 +131,35 @@ macro_rules! 
build_timestamp_list { $SIZE, ) } - Some(values) => match $TIME_UNIT { - TimeUnit::Second => build_values_list!( - TimestampSecondBuilder, - TimestampSecond, - values, - $SIZE - ), - TimeUnit::Microsecond => build_values_list!( - TimestampMillisecondBuilder, - TimestampMillisecond, - values, - $SIZE - ), - TimeUnit::Millisecond => build_values_list!( - TimestampMicrosecondBuilder, - TimestampMicrosecond, - values, - $SIZE - ), - TimeUnit::Nanosecond => build_values_list!( - TimestampNanosecondBuilder, - TimestampNanosecond, - values, - $SIZE - ), - }, + Some(values) => { + let values = values.as_ref(); + match $TIME_UNIT { + TimeUnit::Second => build_values_list!( + TimestampSecondBuilder, + TimestampSecond, + values, + $SIZE + ), + TimeUnit::Microsecond => build_values_list!( + TimestampMillisecondBuilder, + TimestampMillisecond, + values, + $SIZE + ), + TimeUnit::Millisecond => build_values_list!( + TimestampMicrosecondBuilder, + TimestampMicrosecond, + values, + $SIZE + ), + TimeUnit::Nanosecond => build_values_list!( + TimestampNanosecondBuilder, + TimestampNanosecond, + values, + $SIZE + ), + } + } } }}; } @@ -235,9 +239,11 @@ impl ScalarValue { ScalarValue::LargeUtf8(_) => DataType::LargeUtf8, ScalarValue::Binary(_) => DataType::Binary, ScalarValue::LargeBinary(_) => DataType::LargeBinary, - ScalarValue::List(_, data_type) => { - DataType::List(Box::new(Field::new("item", data_type.clone(), true))) - } + ScalarValue::List(_, data_type) => DataType::List(Box::new(Field::new( + "item", + data_type.as_ref().clone(), + true, + ))), ScalarValue::Date32(_) => DataType::Date32, ScalarValue::Date64(_) => DataType::Date64, ScalarValue::IntervalYearMonth(_) => { @@ -415,6 +421,7 @@ impl ScalarValue { for scalar in scalars.into_iter() { match scalar { ScalarValue::List(Some(xs), _) => { + let xs = *xs; for s in xs { match s { ScalarValue::$SCALAR_TY(Some(val)) => { @@ -627,7 +634,7 @@ impl ScalarValue { .collect::(), ), }, - ScalarValue::List(values, data_type) => Arc::new(match data_type { + ScalarValue::List(values, data_type) => Arc::new(match data_type.as_ref() { DataType::Boolean => build_list!(BooleanBuilder, Boolean, values, size), DataType::Int8 => build_list!(Int8Builder, Int8, values, size), DataType::Int16 => build_list!(Int16Builder, Int16, values, size), @@ -643,7 +650,7 @@ impl ScalarValue { DataType::Timestamp(unit, tz) => { build_timestamp_list!(unit.clone(), tz.clone(), values, size) } - DataType::LargeUtf8 => { + &DataType::LargeUtf8 => { build_list!(LargeStringBuilder, LargeUtf8, values, size) } dt => panic!("Unexpected DataType for list {:?}", dt), @@ -705,7 +712,9 @@ impl ScalarValue { Some(scalar_vec) } }; - ScalarValue::List(value, nested_type.data_type().clone()) + let value = value.map(Box::new); + let data_type = Box::new(nested_type.data_type().clone()); + ScalarValue::List(value, data_type) } DataType::Date32 => { typed_cast!(array, index, Date32Array, Date32) @@ -965,7 +974,7 @@ impl TryFrom<&DataType> for ScalarValue { ScalarValue::TimestampNanosecond(None) } DataType::List(ref nested_type) => { - ScalarValue::List(None, nested_type.data_type().clone()) + ScalarValue::List(None, Box::new(nested_type.data_type().clone())) } _ => { return Err(DataFusionError::NotImplemented(format!( @@ -1167,7 +1176,8 @@ mod tests { #[test] fn scalar_list_null_to_array() { - let list_array_ref = ScalarValue::List(None, DataType::UInt64).to_array(); + let list_array_ref = + ScalarValue::List(None, Box::new(DataType::UInt64)).to_array(); let list_array = 
list_array_ref.as_any().downcast_ref::().unwrap(); assert!(list_array.is_null(0)); @@ -1178,12 +1188,12 @@ mod tests { #[test] fn scalar_list_to_array() { let list_array_ref = ScalarValue::List( - Some(vec![ + Some(Box::new(vec![ ScalarValue::UInt64(Some(100)), ScalarValue::UInt64(None), ScalarValue::UInt64(Some(101)), - ]), - DataType::UInt64, + ])), + Box::new(DataType::UInt64), ) .to_array(); @@ -1336,4 +1346,12 @@ mod tests { assert!(result.to_string().contains("Inconsistent types in ScalarValue::iter_to_array. Expected Boolean, got Int32(5)"), "{}", result); } + + #[test] + fn size_of_scalar() { + // Since ScalarValues are used in a non trivial number of places, + // making it larger means significant more memory consumption + // per distinct value. + assert_eq!(std::mem::size_of::(), 32); + } } From a6baf88501765ea4a8884e9e21be23a25b8e76f8 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Wed, 28 Jul 2021 19:46:49 -0700 Subject: [PATCH 300/329] automatically add python label to PRs (#791) also adjust label detection logic for datafusion and ballista --- .github/workflows/dev_pr/labeler.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index df9d41254a932..c27fb2cb4094a 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -17,6 +17,12 @@ datafusion: - datafusion/**/* + - datafusion-cli/**/* + - datafusion-examples/**/* ballista: - ballista/**/* + - ballista-examples/**/* + +python: + - python/**/* From 7dde5b13801dce6450bac10bd385c323a5f1cbff Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 29 Jul 2021 14:18:02 -0400 Subject: [PATCH 301/329] fix: Fix clippy lints introduced in 1.54 (#794) --- ballista/rust/client/src/context.rs | 4 ++-- ballista/rust/core/src/serde/physical_plan/mod.rs | 2 +- ballista/rust/scheduler/src/state/mod.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 16a90b2e27340..162cd68352ff3 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -133,7 +133,7 @@ impl BallistaContext { let mut ctx = create_datafusion_context( &guard.scheduler_host, guard.scheduler_port, - &guard.config(), + guard.config(), ); let df = ctx.read_parquet(path.to_str().unwrap())?; Ok(df) @@ -155,7 +155,7 @@ impl BallistaContext { let mut ctx = create_datafusion_context( &guard.scheduler_host, guard.scheduler_port, - &guard.config(), + guard.config(), ); let df = ctx.read_csv(path.to_str().unwrap(), options)?; Ok(df) diff --git a/ballista/rust/core/src/serde/physical_plan/mod.rs b/ballista/rust/core/src/serde/physical_plan/mod.rs index f544859fa7b2b..4bf013aabfc5c 100644 --- a/ballista/rust/core/src/serde/physical_plan/mod.rs +++ b/ballista/rust/core/src/serde/physical_plan/mod.rs @@ -105,7 +105,7 @@ mod roundtrip_tests { Arc::new(EmptyExec::new(false, schema_left.clone())), Arc::new(EmptyExec::new(false, schema_right.clone())), on.clone(), - &join_type, + join_type, *partition_mode, )?))?; } diff --git a/ballista/rust/scheduler/src/state/mod.rs b/ballista/rust/scheduler/src/state/mod.rs index 0bbab8cebf896..865b44b6e8394 100644 --- a/ballista/rust/scheduler/src/state/mod.rs +++ b/ballista/rust/scheduler/src/state/mod.rs @@ -316,7 +316,7 @@ impl SchedulerState { )) .unwrap(); let task_is_dead = self - .reschedule_dead_task(&referenced_task, &executors) + .reschedule_dead_task(referenced_task, &executors) .await?; if 
task_is_dead { continue 'tasks; From d637871ce969102681616ea113cf84288ff6c252 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 30 Jul 2021 10:30:39 -0700 Subject: [PATCH 302/329] Implement streaming versions of Dataframe.collect methods (#789) --- datafusion/src/dataframe.rs | 31 ++++++++ datafusion/src/execution/dataframe_impl.rs | 44 ++++++++--- datafusion/src/physical_plan/mod.rs | 92 ++++++++++++++++------ 3 files changed, 130 insertions(+), 37 deletions(-) diff --git a/datafusion/src/dataframe.rs b/datafusion/src/dataframe.rs index 507a79861cd53..1d4cffdf89d49 100644 --- a/datafusion/src/dataframe.rs +++ b/datafusion/src/dataframe.rs @@ -24,6 +24,7 @@ use crate::logical_plan::{ }; use std::sync::Arc; +use crate::physical_plan::SendableRecordBatchStream; use async_trait::async_trait; /// DataFrame represents a logical set of rows with the same named columns. @@ -222,6 +223,21 @@ pub trait DataFrame: Send + Sync { /// ``` async fn collect(&self) -> Result>; + /// Executes this DataFrame and returns a stream over a single partition + /// + /// ``` + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let mut ctx = ExecutionContext::new(); + /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + /// let stream = df.execute_stream().await?; + /// # Ok(()) + /// # } + /// ``` + async fn execute_stream(&self) -> Result; + /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. /// @@ -238,6 +254,21 @@ pub trait DataFrame: Send + Sync { /// ``` async fn collect_partitioned(&self) -> Result>>; + /// Executes this DataFrame and returns one stream per partition. + /// + /// ``` + /// # use datafusion::prelude::*; + /// # use datafusion::error::Result; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let mut ctx = ExecutionContext::new(); + /// let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + /// let batches = df.execute_stream_partitioned().await?; + /// # Ok(()) + /// # } + /// ``` + async fn execute_stream_partitioned(&self) -> Result>; + /// Returns the schema describing the output of this DataFrame in terms of columns returned, /// where each column has a name, data type, and nullability attribute. 
diff --git a/datafusion/src/execution/dataframe_impl.rs b/datafusion/src/execution/dataframe_impl.rs index 451c4c7ba5023..1c0094b711d6b 100644 --- a/datafusion/src/execution/dataframe_impl.rs +++ b/datafusion/src/execution/dataframe_impl.rs @@ -31,6 +31,9 @@ use crate::{ physical_plan::{collect, collect_partitioned}, }; +use crate::physical_plan::{ + execute_stream, execute_stream_partitioned, ExecutionPlan, SendableRecordBatchStream, +}; use async_trait::async_trait; /// Implementation of DataFrame API @@ -47,6 +50,14 @@ impl DataFrameImpl { plan: plan.clone(), } } + + /// Create a physical plan + async fn create_physical_plan(&self) -> Result> { + let state = self.ctx_state.lock().unwrap().clone(); + let ctx = ExecutionContext::from(Arc::new(Mutex::new(state))); + let plan = ctx.optimize(&self.plan)?; + ctx.create_physical_plan(&plan) + } } #[async_trait] @@ -138,26 +149,35 @@ impl DataFrame for DataFrameImpl { self.plan.clone() } - // Convert the logical plan represented by this DataFrame into a physical plan and - // execute it + /// Convert the logical plan represented by this DataFrame into a physical plan and + /// execute it, collecting all resulting batches into memory async fn collect(&self) -> Result> { - let state = self.ctx_state.lock().unwrap().clone(); - let ctx = ExecutionContext::from(Arc::new(Mutex::new(state))); - let plan = ctx.optimize(&self.plan)?; - let plan = ctx.create_physical_plan(&plan)?; + let plan = self.create_physical_plan().await?; Ok(collect(plan).await?) } - // Convert the logical plan represented by this DataFrame into a physical plan and - // execute it + /// Convert the logical plan represented by this DataFrame into a physical plan and + /// execute it, returning a stream over a single partition + async fn execute_stream(&self) -> Result { + let plan = self.create_physical_plan().await?; + execute_stream(plan).await + } + + /// Convert the logical plan represented by this DataFrame into a physical plan and + /// execute it, collecting all resulting batches into memory while maintaining + /// partitioning async fn collect_partitioned(&self) -> Result>> { - let state = self.ctx_state.lock().unwrap().clone(); - let ctx = ExecutionContext::from(Arc::new(Mutex::new(state))); - let plan = ctx.optimize(&self.plan)?; - let plan = ctx.create_physical_plan(&plan)?; + let plan = self.create_physical_plan().await?; Ok(collect_partitioned(plan).await?) } + /// Convert the logical plan represented by this DataFrame into a physical plan and + /// execute it, returning a stream for each partition + async fn execute_stream_partitioned(&self) -> Result> { + let plan = self.create_physical_plan().await?; + Ok(execute_stream_partitioned(plan).await?) + } + /// Returns the schema from the logical plan fn schema(&self) -> &DFSchema { self.plan.schema() diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index b3c0dd63e9eda..86bceb11b4756 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -17,6 +17,14 @@ //! Traits for physical query plan, supporting parallel execution for partitioned relations. 
+use std::fmt; +use std::fmt::{Debug, Display}; +use std::ops::Range; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::{any::Any, pin::Pin}; + use self::{ coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, }; @@ -35,12 +43,6 @@ use async_trait::async_trait; pub use display::DisplayFormatType; use futures::stream::Stream; use hashbrown::HashMap; -use std::fmt; -use std::fmt::{Debug, Display}; -use std::ops::Range; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use std::{any::Any, pin::Pin}; /// Trait for types that stream [arrow::record_batch::RecordBatch] pub trait RecordBatchStream: Stream> { @@ -54,6 +56,37 @@ pub trait RecordBatchStream: Stream> { /// Trait for a stream of record batches. pub type SendableRecordBatchStream = Pin>; +/// EmptyRecordBatchStream can be used to create a RecordBatchStream +/// that will produce no results +pub struct EmptyRecordBatchStream { + /// Schema + schema: SchemaRef, +} + +impl EmptyRecordBatchStream { + /// Create an empty RecordBatchStream + pub fn new(schema: SchemaRef) -> Self { + Self { schema } + } +} + +impl RecordBatchStream for EmptyRecordBatchStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +impl Stream for EmptyRecordBatchStream { + type Item = ArrowResult; + + fn poll_next( + self: Pin<&mut Self>, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(None) + } +} + /// SQL metric type #[derive(Debug, Clone)] pub enum MetricType { @@ -313,18 +346,23 @@ pub fn plan_metrics(plan: Arc) -> HashMap /// Execute the [ExecutionPlan] and collect the results in memory pub async fn collect(plan: Arc) -> Result> { + let stream = execute_stream(plan).await?; + common::collect(stream).await +} + +/// Execute the [ExecutionPlan] and return a single stream of results +pub async fn execute_stream( + plan: Arc, +) -> Result { match plan.output_partitioning().partition_count() { - 0 => Ok(vec![]), - 1 => { - let it = plan.execute(0).await?; - common::collect(it).await - } + 0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))), + 1 => plan.execute(0).await, _ => { // merge into a single partition let plan = CoalescePartitionsExec::new(plan.clone()); // CoalescePartitionsExec must produce a single partition assert_eq!(1, plan.output_partitioning().partition_count()); - common::collect(plan.execute(0).await?).await + plan.execute(0).await } } } @@ -333,20 +371,24 @@ pub async fn collect(plan: Arc) -> Result> { pub async fn collect_partitioned( plan: Arc, ) -> Result>> { - match plan.output_partitioning().partition_count() { - 0 => Ok(vec![]), - 1 => { - let it = plan.execute(0).await?; - Ok(vec![common::collect(it).await?]) - } - _ => { - let mut partitions = vec![]; - for i in 0..plan.output_partitioning().partition_count() { - partitions.push(common::collect(plan.execute(i).await?).await?) 
- } - Ok(partitions) - } + let streams = execute_stream_partitioned(plan).await?; + let mut batches = Vec::with_capacity(streams.len()); + for stream in streams { + batches.push(common::collect(stream).await?); + } + Ok(batches) +} + +/// Execute the [ExecutionPlan] and return a vec with one stream per output partition +pub async fn execute_stream_partitioned( + plan: Arc, +) -> Result> { + let num_partitions = plan.output_partitioning().partition_count(); + let mut streams = Vec::with_capacity(num_partitions); + for i in 0..num_partitions { + streams.push(plan.execute(i).await?); } + Ok(streams) } /// Partitioning schemes supported by operators. From a4941ee3a0e9dc630b6a144ccd83e577f61e0958 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 30 Jul 2021 13:40:58 -0400 Subject: [PATCH 303/329] Remove GroupByScalar and use ScalarValue instead (#786) --- .../src/physical_plan/distinct_expressions.rs | 13 +- datafusion/src/physical_plan/group_scalar.rs | 217 ------------------ .../src/physical_plan/hash_aggregate.rs | 139 +---------- datafusion/src/physical_plan/mod.rs | 1 - datafusion/src/scalar.rs | 214 +++++++++++++++-- 5 files changed, 215 insertions(+), 369 deletions(-) delete mode 100644 datafusion/src/physical_plan/group_scalar.rs diff --git a/datafusion/src/physical_plan/distinct_expressions.rs b/datafusion/src/physical_plan/distinct_expressions.rs index 90c0836f7077c..ae6025316bdac 100644 --- a/datafusion/src/physical_plan/distinct_expressions.rs +++ b/datafusion/src/physical_plan/distinct_expressions.rs @@ -18,7 +18,6 @@ //! Implementations for DISTINCT expressions, e.g. `COUNT(DISTINCT c)` use std::any::Any; -use std::convert::TryFrom; use std::fmt::Debug; use std::hash::Hash; use std::sync::Arc; @@ -29,12 +28,11 @@ use ahash::RandomState; use std::collections::HashSet; use crate::error::{DataFusionError, Result}; -use crate::physical_plan::group_scalar::GroupByScalar; use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; use crate::scalar::ScalarValue; #[derive(Debug, PartialEq, Eq, Hash, Clone)] -struct DistinctScalarValues(Vec); +struct DistinctScalarValues(Vec); fn format_state_name(name: &str, state_name: &str) -> String { format!("{}[{}]", name, state_name) @@ -137,12 +135,7 @@ impl Accumulator for DistinctCountAccumulator { fn update(&mut self, values: &[ScalarValue]) -> Result<()> { // If a row has a NULL, it is not included in the final count. if !values.iter().any(|v| v.is_null()) { - self.values.insert(DistinctScalarValues( - values - .iter() - .map(GroupByScalar::try_from) - .collect::>>()?, - )); + self.values.insert(DistinctScalarValues(values.to_vec())); } Ok(()) @@ -195,7 +188,7 @@ impl Accumulator for DistinctCountAccumulator { self.values.iter().for_each(|distinct_values| { distinct_values.0.iter().enumerate().for_each( |(col_index, distinct_value)| { - cols_vec[col_index].push(ScalarValue::from(distinct_value)); + cols_vec[col_index].push(distinct_value.clone()); }, ) }); diff --git a/datafusion/src/physical_plan/group_scalar.rs b/datafusion/src/physical_plan/group_scalar.rs deleted file mode 100644 index d5f72b0d78176..0000000000000 --- a/datafusion/src/physical_plan/group_scalar.rs +++ /dev/null @@ -1,217 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines scalars used to construct groups, ex. in GROUP BY clauses. - -use ordered_float::OrderedFloat; -use std::convert::{From, TryFrom}; - -use crate::error::{DataFusionError, Result}; -use crate::scalar::ScalarValue; - -/// Enumeration of types that can be used in a GROUP BY expression -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] -pub(crate) enum GroupByScalar { - Float32(OrderedFloat), - Float64(OrderedFloat), - UInt8(u8), - UInt16(u16), - UInt32(u32), - UInt64(u64), - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), - Utf8(Box), - LargeUtf8(Box), - Boolean(bool), - TimeMillisecond(i64), - TimeMicrosecond(i64), - TimeNanosecond(i64), - Date32(i32), -} - -impl TryFrom<&ScalarValue> for GroupByScalar { - type Error = DataFusionError; - - fn try_from(scalar_value: &ScalarValue) -> Result { - Ok(match scalar_value { - ScalarValue::Float32(Some(v)) => { - GroupByScalar::Float32(OrderedFloat::from(*v)) - } - ScalarValue::Float64(Some(v)) => { - GroupByScalar::Float64(OrderedFloat::from(*v)) - } - ScalarValue::Boolean(Some(v)) => GroupByScalar::Boolean(*v), - ScalarValue::Int8(Some(v)) => GroupByScalar::Int8(*v), - ScalarValue::Int16(Some(v)) => GroupByScalar::Int16(*v), - ScalarValue::Int32(Some(v)) => GroupByScalar::Int32(*v), - ScalarValue::Int64(Some(v)) => GroupByScalar::Int64(*v), - ScalarValue::UInt8(Some(v)) => GroupByScalar::UInt8(*v), - ScalarValue::UInt16(Some(v)) => GroupByScalar::UInt16(*v), - ScalarValue::UInt32(Some(v)) => GroupByScalar::UInt32(*v), - ScalarValue::UInt64(Some(v)) => GroupByScalar::UInt64(*v), - ScalarValue::TimestampMillisecond(Some(v)) => { - GroupByScalar::TimeMillisecond(*v) - } - ScalarValue::TimestampMicrosecond(Some(v)) => { - GroupByScalar::TimeMicrosecond(*v) - } - ScalarValue::TimestampNanosecond(Some(v)) => { - GroupByScalar::TimeNanosecond(*v) - } - ScalarValue::Utf8(Some(v)) => GroupByScalar::Utf8(Box::new(v.clone())), - ScalarValue::LargeUtf8(Some(v)) => { - GroupByScalar::LargeUtf8(Box::new(v.clone())) - } - ScalarValue::Float32(None) - | ScalarValue::Float64(None) - | ScalarValue::Boolean(None) - | ScalarValue::Int8(None) - | ScalarValue::Int16(None) - | ScalarValue::Int32(None) - | ScalarValue::Int64(None) - | ScalarValue::UInt8(None) - | ScalarValue::UInt16(None) - | ScalarValue::UInt32(None) - | ScalarValue::UInt64(None) - | ScalarValue::Utf8(None) => { - return Err(DataFusionError::Internal(format!( - "Cannot convert a ScalarValue holding NULL ({:?})", - scalar_value - ))); - } - v => { - return Err(DataFusionError::Internal(format!( - "Cannot convert a ScalarValue with associated DataType {:?}", - v.get_datatype() - ))) - } - }) - } -} - -impl From<&GroupByScalar> for ScalarValue { - fn from(group_by_scalar: &GroupByScalar) -> Self { - match group_by_scalar { - GroupByScalar::Float32(v) => ScalarValue::Float32(Some((*v).into())), - GroupByScalar::Float64(v) => ScalarValue::Float64(Some((*v).into())), - GroupByScalar::Boolean(v) => 
ScalarValue::Boolean(Some(*v)), - GroupByScalar::Int8(v) => ScalarValue::Int8(Some(*v)), - GroupByScalar::Int16(v) => ScalarValue::Int16(Some(*v)), - GroupByScalar::Int32(v) => ScalarValue::Int32(Some(*v)), - GroupByScalar::Int64(v) => ScalarValue::Int64(Some(*v)), - GroupByScalar::UInt8(v) => ScalarValue::UInt8(Some(*v)), - GroupByScalar::UInt16(v) => ScalarValue::UInt16(Some(*v)), - GroupByScalar::UInt32(v) => ScalarValue::UInt32(Some(*v)), - GroupByScalar::UInt64(v) => ScalarValue::UInt64(Some(*v)), - GroupByScalar::Utf8(v) => ScalarValue::Utf8(Some(v.to_string())), - GroupByScalar::LargeUtf8(v) => ScalarValue::LargeUtf8(Some(v.to_string())), - GroupByScalar::TimeMillisecond(v) => { - ScalarValue::TimestampMillisecond(Some(*v)) - } - GroupByScalar::TimeMicrosecond(v) => { - ScalarValue::TimestampMicrosecond(Some(*v)) - } - GroupByScalar::TimeNanosecond(v) => { - ScalarValue::TimestampNanosecond(Some(*v)) - } - GroupByScalar::Date32(v) => ScalarValue::Date32(Some(*v)), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::error::DataFusionError; - - macro_rules! scalar_eq_test { - ($TYPE:expr, $VALUE:expr) => {{ - let scalar_value = $TYPE($VALUE); - let a = GroupByScalar::try_from(&scalar_value).unwrap(); - - let scalar_value = $TYPE($VALUE); - let b = GroupByScalar::try_from(&scalar_value).unwrap(); - - assert_eq!(a, b); - }}; - } - - #[test] - fn test_scalar_ne_non_std() { - // Test only Scalars with non native Eq, Hash - scalar_eq_test!(ScalarValue::Float32, Some(1.0)); - scalar_eq_test!(ScalarValue::Float64, Some(1.0)); - } - - macro_rules! scalar_ne_test { - ($TYPE:expr, $LVALUE:expr, $RVALUE:expr) => {{ - let scalar_value = $TYPE($LVALUE); - let a = GroupByScalar::try_from(&scalar_value).unwrap(); - - let scalar_value = $TYPE($RVALUE); - let b = GroupByScalar::try_from(&scalar_value).unwrap(); - - assert_ne!(a, b); - }}; - } - - #[test] - fn test_scalar_eq_non_std() { - // Test only Scalars with non native Eq, Hash - scalar_ne_test!(ScalarValue::Float32, Some(1.0), Some(2.0)); - scalar_ne_test!(ScalarValue::Float64, Some(1.0), Some(2.0)); - } - - #[test] - fn from_scalar_holding_none() { - let scalar_value = ScalarValue::Int8(None); - let result = GroupByScalar::try_from(&scalar_value); - - match result { - Err(DataFusionError::Internal(error_message)) => assert_eq!( - error_message, - String::from("Cannot convert a ScalarValue holding NULL (Int8(NULL))") - ), - _ => panic!("Unexpected result"), - } - } - - #[test] - fn from_scalar_unsupported() { - // Use any ScalarValue type not supported by GroupByScalar. 
- let scalar_value = ScalarValue::Binary(Some(vec![1, 2])); - let result = GroupByScalar::try_from(&scalar_value); - - match result { - Err(DataFusionError::Internal(error_message)) => assert_eq!( - error_message, - String::from( - "Cannot convert a ScalarValue with associated DataType Binary" - ) - ), - _ => panic!("Unexpected result"), - } - } - - #[test] - fn size_of_group_by_scalar() { - assert_eq!(std::mem::size_of::(), 16); - } -} diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index ae513831bef46..eb4a356e88ce8 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -59,7 +59,6 @@ use arrow::{ record_batch::RecordBatch, }; use hashbrown::HashMap; -use ordered_float::OrderedFloat; use pin_project_lite::pin_project; use arrow::array::{ @@ -68,10 +67,7 @@ use arrow::array::{ }; use async_trait::async_trait; -use super::{ - expressions::Column, group_scalar::GroupByScalar, RecordBatchStream, - SendableRecordBatchStream, -}; +use super::{expressions::Column, RecordBatchStream, SendableRecordBatchStream}; /// Hash aggregate modes #[derive(Debug, Copy, Clone, PartialEq, Eq)] @@ -362,7 +358,7 @@ fn group_aggregate_batch( // it will be overwritten on every iteration of the loop below let mut group_by_values = Vec::with_capacity(group_values.len()); for _ in 0..group_values.len() { - group_by_values.push(GroupByScalar::UInt32(0)); + group_by_values.push(ScalarValue::UInt32(Some(0))); } let mut group_by_values = group_by_values.into_boxed_slice(); @@ -730,7 +726,7 @@ impl GroupedHashAggregateStream { type AccumulatorItem = Box; type Accumulators = - HashMap, (Box<[GroupByScalar]>, Vec, Vec), RandomState>; + HashMap, (Box<[ScalarValue]>, Vec, Vec), RandomState>; impl Stream for GroupedHashAggregateStream { type Item = ArrowResult; @@ -1004,9 +1000,11 @@ fn create_batch_from_map( let mut columns = (0..num_group_expr) .map(|i| { - ScalarValue::iter_to_array(accumulators.into_iter().map( - |(_, (group_by_values, _, _))| ScalarValue::from(&group_by_values[i]), - )) + ScalarValue::iter_to_array( + accumulators + .into_iter() + .map(|(_, (group_by_values, _, _))| group_by_values[i].clone()), + ) }) .collect::>>() .map_err(|x| x.into_arrow_external_error())?; @@ -1088,124 +1086,9 @@ fn finalize_aggregation( } } -/// Extract the value in `col[row]` from a dictionary a GroupByScalar -fn dictionary_create_group_by_value( - col: &ArrayRef, - row: usize, -) -> Result { - let dict_col = col.as_any().downcast_ref::>().unwrap(); - - // look up the index in the values dictionary - let keys_col = dict_col.keys(); - let values_index = keys_col.value(row).to_usize().ok_or_else(|| { - DataFusionError::Internal(format!( - "Can not convert index to usize in dictionary of type creating group by value {:?}", - keys_col.data_type() - )) - })?; - - create_group_by_value(dict_col.values(), values_index) -} - /// Extract the value in `col[row]` as a GroupByScalar -fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { - match col.data_type() { - DataType::Float32 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Float32(OrderedFloat::from(array.value(row)))) - } - DataType::Float64 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Float64(OrderedFloat::from(array.value(row)))) - } - DataType::UInt8 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::UInt8(array.value(row))) - } - DataType::UInt16 => { - let 
array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::UInt16(array.value(row))) - } - DataType::UInt32 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::UInt32(array.value(row))) - } - DataType::UInt64 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::UInt64(array.value(row))) - } - DataType::Int8 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Int8(array.value(row))) - } - DataType::Int16 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Int16(array.value(row))) - } - DataType::Int32 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Int32(array.value(row))) - } - DataType::Int64 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Int64(array.value(row))) - } - DataType::Utf8 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Utf8(Box::new(array.value(row).into()))) - } - DataType::LargeUtf8 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::LargeUtf8(Box::new(array.value(row).into()))) - } - DataType::Boolean => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Boolean(array.value(row))) - } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeMillisecond(array.value(row))) - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeMicrosecond(array.value(row))) - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let array = col - .as_any() - .downcast_ref::() - .unwrap(); - Ok(GroupByScalar::TimeNanosecond(array.value(row))) - } - DataType::Date32 => { - let array = col.as_any().downcast_ref::().unwrap(); - Ok(GroupByScalar::Date32(array.value(row))) - } - DataType::Dictionary(index_type, _) => match **index_type { - DataType::Int8 => dictionary_create_group_by_value::(col, row), - DataType::Int16 => dictionary_create_group_by_value::(col, row), - DataType::Int32 => dictionary_create_group_by_value::(col, row), - DataType::Int64 => dictionary_create_group_by_value::(col, row), - DataType::UInt8 => dictionary_create_group_by_value::(col, row), - DataType::UInt16 => dictionary_create_group_by_value::(col, row), - DataType::UInt32 => dictionary_create_group_by_value::(col, row), - DataType::UInt64 => dictionary_create_group_by_value::(col, row), - _ => Err(DataFusionError::NotImplemented(format!( - "Unsupported GROUP BY type (dictionary index type not supported) {}", - col.data_type(), - ))), - }, - _ => Err(DataFusionError::NotImplemented(format!( - "Unsupported GROUP BY type {}", - col.data_type(), - ))), - } +fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { + ScalarValue::try_from_array(col, row) } /// Extract the values in `group_by_keys` arrow arrays into the target vector @@ -1213,7 +1096,7 @@ fn create_group_by_value(col: &ArrayRef, row: usize) -> Result { pub(crate) fn create_group_by_values( group_by_keys: &[ArrayRef], row: usize, - vec: &mut Box<[GroupByScalar]>, + vec: &mut Box<[ScalarValue]>, ) -> Result<()> { for (i, col) in group_by_keys.iter().enumerate() { vec[i] = create_group_by_value(col, row)? 
diff --git a/datafusion/src/physical_plan/mod.rs b/datafusion/src/physical_plan/mod.rs index 86bceb11b4756..0df6e6038e67d 100644 --- a/datafusion/src/physical_plan/mod.rs +++ b/datafusion/src/physical_plan/mod.rs @@ -655,7 +655,6 @@ pub mod explain; pub mod expressions; pub mod filter; pub mod functions; -pub mod group_scalar; pub mod hash_aggregate; pub mod hash_join; pub mod hash_utils; diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 129b4166a4e8c..8efea63e82368 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -27,13 +27,14 @@ use arrow::{ TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }, }; +use ordered_float::OrderedFloat; use std::convert::Infallible; use std::str::FromStr; use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; /// Represents a dynamically typed, nullable single value. /// This is the single-valued counter-part of arrow’s `Array`. -#[derive(Clone, PartialEq)] +#[derive(Clone)] pub enum ScalarValue { /// true or false value Boolean(Option), @@ -86,6 +87,120 @@ pub enum ScalarValue { IntervalDayTime(Option), } +// manual implementation of `PartialEq` that uses OrderedFloat to +// get defined behavior for floating point +impl PartialEq for ScalarValue { + fn eq(&self, other: &Self) -> bool { + use ScalarValue::*; + // This purposely doesn't have a catch-all "(_, _)" so that + // any newly added enum variant will require editing this list + // or else face a compile error + match (self, other) { + (Boolean(v1), Boolean(v2)) => v1.eq(v2), + (Boolean(_), _) => false, + (Float32(v1), Float32(v2)) => { + let v1 = v1.map(OrderedFloat); + let v2 = v2.map(OrderedFloat); + v1.eq(&v2) + } + (Float32(_), _) => false, + (Float64(v1), Float64(v2)) => { + let v1 = v1.map(OrderedFloat); + let v2 = v2.map(OrderedFloat); + v1.eq(&v2) + } + (Float64(_), _) => false, + (Int8(v1), Int8(v2)) => v1.eq(v2), + (Int8(_), _) => false, + (Int16(v1), Int16(v2)) => v1.eq(v2), + (Int16(_), _) => false, + (Int32(v1), Int32(v2)) => v1.eq(v2), + (Int32(_), _) => false, + (Int64(v1), Int64(v2)) => v1.eq(v2), + (Int64(_), _) => false, + (UInt8(v1), UInt8(v2)) => v1.eq(v2), + (UInt8(_), _) => false, + (UInt16(v1), UInt16(v2)) => v1.eq(v2), + (UInt16(_), _) => false, + (UInt32(v1), UInt32(v2)) => v1.eq(v2), + (UInt32(_), _) => false, + (UInt64(v1), UInt64(v2)) => v1.eq(v2), + (UInt64(_), _) => false, + (Utf8(v1), Utf8(v2)) => v1.eq(v2), + (Utf8(_), _) => false, + (LargeUtf8(v1), LargeUtf8(v2)) => v1.eq(v2), + (LargeUtf8(_), _) => false, + (Binary(v1), Binary(v2)) => v1.eq(v2), + (Binary(_), _) => false, + (LargeBinary(v1), LargeBinary(v2)) => v1.eq(v2), + (LargeBinary(_), _) => false, + (List(v1, t1), List(v2, t2)) => v1.eq(v2) && t1.eq(t2), + (List(_, _), _) => false, + (Date32(v1), Date32(v2)) => v1.eq(v2), + (Date32(_), _) => false, + (Date64(v1), Date64(v2)) => v1.eq(v2), + (Date64(_), _) => false, + (TimestampSecond(v1), TimestampSecond(v2)) => v1.eq(v2), + (TimestampSecond(_), _) => false, + (TimestampMillisecond(v1), TimestampMillisecond(v2)) => v1.eq(v2), + (TimestampMillisecond(_), _) => false, + (TimestampMicrosecond(v1), TimestampMicrosecond(v2)) => v1.eq(v2), + (TimestampMicrosecond(_), _) => false, + (TimestampNanosecond(v1), TimestampNanosecond(v2)) => v1.eq(v2), + (TimestampNanosecond(_), _) => false, + (IntervalYearMonth(v1), IntervalYearMonth(v2)) => v1.eq(v2), + (IntervalYearMonth(_), _) => false, + (IntervalDayTime(v1), IntervalDayTime(v2)) => v1.eq(v2), + (IntervalDayTime(_), _) => false, + } + } +} + +impl Eq for 
ScalarValue {} + +// manual implementation of `Hash` that uses OrderedFloat to +// get defined behavior for floating point +impl std::hash::Hash for ScalarValue { + fn hash(&self, state: &mut H) { + use ScalarValue::*; + match self { + Boolean(v) => v.hash(state), + Float32(v) => { + let v = v.map(OrderedFloat); + v.hash(state) + } + Float64(v) => { + let v = v.map(OrderedFloat); + v.hash(state) + } + Int8(v) => v.hash(state), + Int16(v) => v.hash(state), + Int32(v) => v.hash(state), + Int64(v) => v.hash(state), + UInt8(v) => v.hash(state), + UInt16(v) => v.hash(state), + UInt32(v) => v.hash(state), + UInt64(v) => v.hash(state), + Utf8(v) => v.hash(state), + LargeUtf8(v) => v.hash(state), + Binary(v) => v.hash(state), + LargeBinary(v) => v.hash(state), + List(v, t) => { + v.hash(state); + t.hash(state); + } + Date32(v) => v.hash(state), + Date64(v) => v.hash(state), + TimestampSecond(v) => v.hash(state), + TimestampMillisecond(v) => v.hash(state), + TimestampMicrosecond(v) => v.hash(state), + TimestampNanosecond(v) => v.hash(state), + IntervalYearMonth(v) => v.hash(state), + IntervalDayTime(v) => v.hash(state), + } + } +} + macro_rules! typed_cast { ($array:expr, $index:expr, $ARRAYTYPE:ident, $SCALAR:ident) => {{ let array = $array.as_any().downcast_ref::<$ARRAYTYPE>().unwrap(); @@ -795,73 +910,146 @@ impl ScalarValue { impl From for ScalarValue { fn from(value: f64) -> Self { - ScalarValue::Float64(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Float64(value) } } impl From for ScalarValue { fn from(value: f32) -> Self { - ScalarValue::Float32(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Float32(value) } } impl From for ScalarValue { fn from(value: i8) -> Self { - ScalarValue::Int8(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Int8(value) } } impl From for ScalarValue { fn from(value: i16) -> Self { - ScalarValue::Int16(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Int16(value) } } impl From for ScalarValue { fn from(value: i32) -> Self { - ScalarValue::Int32(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Int32(value) } } impl From for ScalarValue { fn from(value: i64) -> Self { - ScalarValue::Int64(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Int64(value) } } impl From for ScalarValue { fn from(value: bool) -> Self { - ScalarValue::Boolean(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::Boolean(value) } } impl From for ScalarValue { fn from(value: u8) -> Self { - ScalarValue::UInt8(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::UInt8(value) } } impl From for ScalarValue { fn from(value: u16) -> Self { - ScalarValue::UInt16(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::UInt16(value) } } impl From for ScalarValue { fn from(value: u32) -> Self { - ScalarValue::UInt32(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::UInt32(value) } } impl 
From for ScalarValue { fn from(value: u64) -> Self { - ScalarValue::UInt64(Some(value)) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option) -> Self { + ScalarValue::UInt64(value) } } impl From<&str> for ScalarValue { fn from(value: &str) -> Self { - ScalarValue::Utf8(Some(value.to_string())) + Some(value).into() + } +} + +impl From> for ScalarValue { + fn from(value: Option<&str>) -> Self { + let value = value.map(|s| s.to_string()); + ScalarValue::Utf8(value) } } From 3eac2e65437de52a26d2380a7d49fbcea9eb2c15 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Sun, 1 Aug 2021 03:36:32 -0700 Subject: [PATCH 304/329] expand file glob within prettier (#803) '**' pattern is not supported to some of the shells including the one we use in CI. --- .github/workflows/dev.yml | 4 +-- ballista/README.md | 20 ++++++------- .../src/distributed/docker-compose.md | 2 +- docs/user-guide/src/distributed/kubernetes.md | 30 +++++++++---------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 8bb35f1ef871b..39c449c50a8ef 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -64,7 +64,7 @@ jobs: # if you encounter error, try rerun the command below with --write instead of --check # and commit the changes npx prettier@2.3.2 --check \ - {ballista,datafusion,datafusion-examples,docs,python}/**/*.md \ + '{ballista,datafusion,datafusion-examples,docs,python}/**/*.md' \ README.md \ DEVELOPERS.md \ - ballista/**/*.{ts,tsx} + 'ballista/**/*.{ts,tsx}' diff --git a/ballista/README.md b/ballista/README.md index 0a8db63a1a6cc..eeb4273ee8938 100644 --- a/ballista/README.md +++ b/ballista/README.md @@ -19,8 +19,8 @@ # Ballista: Distributed Compute with Apache Arrow and DataFusion -Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow and -DataFusion. It is built on an architecture that allows other programming languages (such as Python, C++, and +Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow and +DataFusion. It is built on an architecture that allows other programming languages (such as Python, C++, and Java) to be supported as first-class citizens without paying a penalty for serialization costs. The foundational technologies in Ballista are: @@ -37,23 +37,23 @@ redundancy in the case of a scheduler failing. # Getting Started -Fully working examples are available. Refer to the [Ballista Examples README](../ballista-examples/README.md) for +Fully working examples are available. Refer to the [Ballista Examples README](../ballista-examples/README.md) for more information. ## Distributed Scheduler Overview -Ballista uses the DataFusion query execution framework to create a physical plan and then transforms it into a +Ballista uses the DataFusion query execution framework to create a physical plan and then transforms it into a distributed physical plan by breaking the query down into stages whenever the partitioning scheme changes. -Specifically, any `RepartitionExec` operator is replaced with an `UnresolvedShuffleExec` and the child operator +Specifically, any `RepartitionExec` operator is replaced with an `UnresolvedShuffleExec` and the child operator of the repartition operator is wrapped in a `ShuffleWriterExec` operator and scheduled for execution. -Each executor polls the scheduler for the next task to run. 
Tasks are currently always `ShuffleWriterExec` operators -and each task represents one *input* partition that will be executed. The resulting batches are repartitioned -according to the shuffle partitioning scheme and each *output* partition is streamed to disk in Arrow IPC format. +Each executor polls the scheduler for the next task to run. Tasks are currently always `ShuffleWriterExec` operators +and each task represents one _input_ partition that will be executed. The resulting batches are repartitioned +according to the shuffle partitioning scheme and each _output_ partition is streamed to disk in Arrow IPC format. -The scheduler will replace `UnresolvedShuffleExec` operators with `ShuffleReaderExec` operators once all shuffle -tasks have completed. The `ShuffleReaderExec` operator connects to other executors as required using the Flight +The scheduler will replace `UnresolvedShuffleExec` operators with `ShuffleReaderExec` operators once all shuffle +tasks have completed. The `ShuffleReaderExec` operator connects to other executors as required using the Flight interface, and streams the shuffle IPC files. # How does this compare to Apache Spark? diff --git a/docs/user-guide/src/distributed/docker-compose.md b/docs/user-guide/src/distributed/docker-compose.md index 14989e58034d0..9ada1baa11a9e 100644 --- a/docs/user-guide/src/distributed/docker-compose.md +++ b/docs/user-guide/src/distributed/docker-compose.md @@ -24,7 +24,7 @@ demonstrates how to start a cluster using a single process that acts as both a s volume mounted into the container so that Ballista can access the host file system. ```yaml -version: '2.2' +version: "2.2" services: etcd: image: quay.io/coreos/etcd:v3.4.9 diff --git a/docs/user-guide/src/distributed/kubernetes.md b/docs/user-guide/src/distributed/kubernetes.md index 4b80d1731943c..ef4accaf37995 100644 --- a/docs/user-guide/src/distributed/kubernetes.md +++ b/docs/user-guide/src/distributed/kubernetes.md @@ -129,16 +129,16 @@ spec: ballista-cluster: ballista spec: containers: - - name: ballista-scheduler - image: - command: ["/scheduler"] - args: ["--bind-port=50050"] - ports: - - containerPort: 50050 - name: flight - volumeMounts: - - mountPath: /mnt - name: data + - name: ballista-scheduler + image: + command: ["/scheduler"] + args: ["--bind-port=50050"] + ports: + - containerPort: 50050 + name: flight + volumeMounts: + - mountPath: /mnt + name: data volumes: - name: data persistentVolumeClaim: @@ -245,10 +245,10 @@ spec: minReplicaCount: 0 maxReplicaCount: 5 triggers: - - type: external - metadata: - # Change this DNS if the scheduler isn't deployed in the "default" namespace - scalerAddress: ballista-scheduler.default.svc.cluster.local:50050 + - type: external + metadata: + # Change this DNS if the scheduler isn't deployed in the "default" namespace + scalerAddress: ballista-scheduler.default.svc.cluster.local:50050 ``` And then deploy it into the cluster: @@ -261,4 +261,4 @@ If the cluster is inactive, Keda will now scale the number of executors down to you launch a query. Please note that Keda will perform a scan once every 30 seconds, so it might take a bit to scale the executors. -Please visit Keda's [documentation page](https://keda.sh/docs/2.3/concepts/scaling-deployments/) for more information. \ No newline at end of file +Please visit Keda's [documentation page](https://keda.sh/docs/2.3/concepts/scaling-deployments/) for more information. 
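The `From<Option<T>>` conversions added to `datafusion/src/scalar.rs` earlier in this series (the patch that replaces `GroupByScalar` with `ScalarValue`), together with the manual `PartialEq` implementation shown in those hunks, can be exercised from a downstream crate. The snippet below is a minimal sketch, not part of any patch in this series; it assumes a binary crate that depends on the `datafusion` crate at this revision, and it uses only the conversions and trait impls visible in the hunks above.

```rust
// Minimal sketch (not part of the patch series): exercises the ScalarValue
// conversions introduced in the scalar.rs hunks earlier in this series.
// Assumes a downstream crate that depends on `datafusion` at this revision.
use datafusion::scalar::ScalarValue;

fn main() {
    // Plain values convert as before ...
    let a: ScalarValue = 1.5f64.into(); // ScalarValue::Float64(Some(1.5))

    // ... and the new Option-based impls express NULLs directly.
    let b: ScalarValue = Option::<f64>::None.into(); // ScalarValue::Float64(None)
    let c: ScalarValue = Some("foo").into(); // ScalarValue::Utf8(Some("foo".to_string()))

    // The manual PartialEq compares Float64 through OrderedFloat, and a
    // NULL (None) never equals a non-NULL value.
    assert_ne!(a, b);
    assert_eq!(c, ScalarValue::Utf8(Some("foo".to_string())));

    println!("{:?} {:?} {:?}", a, b, c);
}
```

Note that the plain-value impls now delegate to the Option-based ones (`Some(value).into()`), so the two conversion paths cannot drift apart.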
From 11734799b4c5e0a8627deba18ec40a0f47fea421 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 1 Aug 2021 08:49:13 -0400 Subject: [PATCH 305/329] Move `hash_array` into hash_utils.rs (#807) --- .../src/execution_plans/shuffle_writer.rs | 2 +- datafusion/src/physical_plan/hash_join.rs | 365 +---------------- datafusion/src/physical_plan/hash_utils.rs | 368 +++++++++++++++++- datafusion/src/physical_plan/repartition.rs | 3 +- 4 files changed, 374 insertions(+), 364 deletions(-) diff --git a/ballista/rust/core/src/execution_plans/shuffle_writer.rs b/ballista/rust/core/src/execution_plans/shuffle_writer.rs index 8081dab36ab5f..b1db21fa90a1b 100644 --- a/ballista/rust/core/src/execution_plans/shuffle_writer.rs +++ b/ballista/rust/core/src/execution_plans/shuffle_writer.rs @@ -44,7 +44,7 @@ use datafusion::arrow::ipc::reader::FileReader; use datafusion::arrow::ipc::writer::FileWriter; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result}; -use datafusion::physical_plan::hash_join::create_hashes; +use datafusion::physical_plan::hash_utils::create_hashes; use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::Partitioning::RoundRobinBatch; use datafusion::physical_plan::{ diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 00ca1539d714f..1a174bb11d10f 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -18,18 +18,15 @@ //! Defines the join plan for executing partitions in parallel and then joining the results //! into a set of partitions. -use ahash::CallHasher; use ahash::RandomState; use arrow::{ array::{ - ArrayData, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, - Float64Array, LargeStringArray, PrimitiveArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, UInt32BufferBuilder, - UInt32Builder, UInt64BufferBuilder, UInt64Builder, + ArrayData, ArrayRef, BooleanArray, LargeStringArray, PrimitiveArray, + UInt32BufferBuilder, UInt32Builder, UInt64BufferBuilder, UInt64Builder, }, compute, - datatypes::{TimeUnit, UInt32Type, UInt64Type}, + datatypes::{UInt32Type, UInt64Type}, }; use smallvec::{smallvec, SmallVec}; use std::{any::Any, usize}; @@ -53,6 +50,7 @@ use arrow::array::{ }; use super::expressions::Column; +use super::hash_utils::create_hashes; use super::{ coalesce_partitions::CoalescePartitionsExec, hash_utils::{build_join_schema, check_join_is_valid, JoinOn}, @@ -790,13 +788,6 @@ impl BuildHasher for IdHashBuilder { } } -// Combines two hashes into one hash -#[inline] -fn combine_hashes(l: u64, r: u64) -> u64 { - let hash = (17 * 37u64).wrapping_add(l); - hash.wrapping_mul(37).wrapping_add(r) -} - macro_rules! equal_rows_elem { ($array_type:ident, $l: ident, $r: ident, $left: ident, $right: ident) => {{ let left_array = $l.as_any().downcast_ref::<$array_type>().unwrap(); @@ -848,338 +839,6 @@ fn equal_rows( err.unwrap_or(Ok(res)) } -macro_rules! 
hash_array { - ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - if array.null_count() == 0 { - if $multi_col { - for (i, hash) in $hashes.iter_mut().enumerate() { - *hash = combine_hashes( - $ty::get_hash(&array.value(i), $random_state), - *hash, - ); - } - } else { - for (i, hash) in $hashes.iter_mut().enumerate() { - *hash = $ty::get_hash(&array.value(i), $random_state); - } - } - } else { - if $multi_col { - for (i, hash) in $hashes.iter_mut().enumerate() { - if !array.is_null(i) { - *hash = combine_hashes( - $ty::get_hash(&array.value(i), $random_state), - *hash, - ); - } - } - } else { - for (i, hash) in $hashes.iter_mut().enumerate() { - if !array.is_null(i) { - *hash = $ty::get_hash(&array.value(i), $random_state); - } - } - } - } - }; -} - -macro_rules! hash_array_primitive { - ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let values = array.values(); - - if array.null_count() == 0 { - if $multi_col { - for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = combine_hashes($ty::get_hash(value, $random_state), *hash); - } - } else { - for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = $ty::get_hash(value, $random_state) - } - } - } else { - if $multi_col { - for (i, (hash, value)) in - $hashes.iter_mut().zip(values.iter()).enumerate() - { - if !array.is_null(i) { - *hash = - combine_hashes($ty::get_hash(value, $random_state), *hash); - } - } - } else { - for (i, (hash, value)) in - $hashes.iter_mut().zip(values.iter()).enumerate() - { - if !array.is_null(i) { - *hash = $ty::get_hash(value, $random_state); - } - } - } - } - }; -} - -macro_rules! 
hash_array_float { - ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let values = array.values(); - - if array.null_count() == 0 { - if $multi_col { - for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = combine_hashes( - $ty::get_hash( - &$ty::from_le_bytes(value.to_le_bytes()), - $random_state, - ), - *hash, - ); - } - } else { - for (hash, value) in $hashes.iter_mut().zip(values.iter()) { - *hash = $ty::get_hash( - &$ty::from_le_bytes(value.to_le_bytes()), - $random_state, - ) - } - } - } else { - if $multi_col { - for (i, (hash, value)) in - $hashes.iter_mut().zip(values.iter()).enumerate() - { - if !array.is_null(i) { - *hash = combine_hashes( - $ty::get_hash( - &$ty::from_le_bytes(value.to_le_bytes()), - $random_state, - ), - *hash, - ); - } - } - } else { - for (i, (hash, value)) in - $hashes.iter_mut().zip(values.iter()).enumerate() - { - if !array.is_null(i) { - *hash = $ty::get_hash( - &$ty::from_le_bytes(value.to_le_bytes()), - $random_state, - ); - } - } - } - } - }; -} - -/// Creates hash values for every element in the row based on the values in the columns -pub fn create_hashes<'a>( - arrays: &[ArrayRef], - random_state: &RandomState, - hashes_buffer: &'a mut Vec, -) -> Result<&'a mut Vec> { - // combine hashes with `combine_hashes` if we have more than 1 column - let multi_col = arrays.len() > 1; - - for col in arrays { - match col.data_type() { - DataType::UInt8 => { - hash_array_primitive!( - UInt8Array, - col, - u8, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::UInt16 => { - hash_array_primitive!( - UInt16Array, - col, - u16, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::UInt32 => { - hash_array_primitive!( - UInt32Array, - col, - u32, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::UInt64 => { - hash_array_primitive!( - UInt64Array, - col, - u64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Int8 => { - hash_array_primitive!( - Int8Array, - col, - i8, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Int16 => { - hash_array_primitive!( - Int16Array, - col, - i16, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Int32 => { - hash_array_primitive!( - Int32Array, - col, - i32, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Int64 => { - hash_array_primitive!( - Int64Array, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Float32 => { - hash_array_float!( - Float32Array, - col, - u32, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Float64 => { - hash_array_float!( - Float64Array, - col, - u64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Millisecond, None) => { - hash_array_primitive!( - TimestampMillisecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Microsecond, None) => { - hash_array_primitive!( - TimestampMicrosecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - hash_array_primitive!( - TimestampNanosecondArray, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Date32 => { - hash_array_primitive!( - Date32Array, - col, - i32, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Date64 => { - hash_array_primitive!( - 
Date64Array, - col, - i64, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Boolean => { - hash_array!( - BooleanArray, - col, - u8, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::Utf8 => { - hash_array!( - StringArray, - col, - str, - hashes_buffer, - random_state, - multi_col - ); - } - DataType::LargeUtf8 => { - hash_array!( - LargeStringArray, - col, - str, - hashes_buffer, - random_state, - multi_col - ); - } - _ => { - // This is internal because we should have caught this before. - return Err(DataFusionError::Internal( - "Unsupported data type in hasher".to_string(), - )); - } - } - } - Ok(hashes_buffer) -} - // Produces a batch for left-side rows that have/have not been matched during the whole join fn produce_from_matched( visited_left_side: &[bool], @@ -2115,22 +1774,6 @@ mod tests { Ok(()) } - #[test] - fn create_hashes_for_float_arrays() -> Result<()> { - let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); - let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); - - let random_state = RandomState::with_seeds(0, 0, 0, 0); - let hashes_buff = &mut vec![0; f32_arr.len()]; - let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?; - assert_eq!(hashes.len(), 4,); - - let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?; - assert_eq!(hashes.len(), 4,); - - Ok(()) - } - #[test] fn join_with_hash_collision() -> Result<()> { let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {}); diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index 9243affe9cfc3..e937b4ea549c1 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -18,7 +18,14 @@ //! Functionality used both on logical and physical plans use crate::error::{DataFusionError, Result}; -use arrow::datatypes::{Field, Schema}; +use ahash::{CallHasher, RandomState}; +use arrow::array::{ + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, +}; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use std::collections::HashSet; use crate::logical_plan::JoinType; @@ -101,8 +108,351 @@ pub fn build_join_schema(left: &Schema, right: &Schema, join_type: &JoinType) -> Schema::new(fields) } +// Combines two hashes into one hash +#[inline] +fn combine_hashes(l: u64, r: u64) -> u64 { + let hash = (17 * 37u64).wrapping_add(l); + hash.wrapping_mul(37).wrapping_add(r) +} + +macro_rules! 
hash_array { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + if array.null_count() == 0 { + if $multi_col { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = combine_hashes( + $ty::get_hash(&array.value(i), $random_state), + *hash, + ); + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + *hash = $ty::get_hash(&array.value(i), $random_state); + } + } + } else { + if $multi_col { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = combine_hashes( + $ty::get_hash(&array.value(i), $random_state), + *hash, + ); + } + } + } else { + for (i, hash) in $hashes.iter_mut().enumerate() { + if !array.is_null(i) { + *hash = $ty::get_hash(&array.value(i), $random_state); + } + } + } + } + }; +} + +macro_rules! hash_array_primitive { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + if $multi_col { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = combine_hashes($ty::get_hash(value, $random_state), *hash); + } + } else { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = $ty::get_hash(value, $random_state) + } + } + } else { + if $multi_col { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = + combine_hashes($ty::get_hash(value, $random_state), *hash); + } + } + } else { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = $ty::get_hash(value, $random_state); + } + } + } + } + }; +} + +macro_rules! 
hash_array_float { + ($array_type:ident, $column: ident, $ty: ident, $hashes: ident, $random_state: ident, $multi_col: ident) => { + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let values = array.values(); + + if array.null_count() == 0 { + if $multi_col { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = combine_hashes( + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), + *hash, + ); + } + } else { + for (hash, value) in $hashes.iter_mut().zip(values.iter()) { + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ) + } + } + } else { + if $multi_col { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = combine_hashes( + $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ), + *hash, + ); + } + } + } else { + for (i, (hash, value)) in + $hashes.iter_mut().zip(values.iter()).enumerate() + { + if !array.is_null(i) { + *hash = $ty::get_hash( + &$ty::from_le_bytes(value.to_le_bytes()), + $random_state, + ); + } + } + } + } + }; +} + +/// Creates hash values for every row, based on the values in the columns +/// +/// This implements so-called "vectorized hashing" +pub fn create_hashes<'a>( + arrays: &[ArrayRef], + random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + // combine hashes with `combine_hashes` if we have more than 1 column + let multi_col = arrays.len() > 1; + + for col in arrays { + match col.data_type() { + DataType::UInt8 => { + hash_array_primitive!( + UInt8Array, + col, + u8, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::UInt16 => { + hash_array_primitive!( + UInt16Array, + col, + u16, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::UInt32 => { + hash_array_primitive!( + UInt32Array, + col, + u32, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::UInt64 => { + hash_array_primitive!( + UInt64Array, + col, + u64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Int8 => { + hash_array_primitive!( + Int8Array, + col, + i8, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Int16 => { + hash_array_primitive!( + Int16Array, + col, + i16, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Int32 => { + hash_array_primitive!( + Int32Array, + col, + i32, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Int64 => { + hash_array_primitive!( + Int64Array, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Float32 => { + hash_array_float!( + Float32Array, + col, + u32, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Float64 => { + hash_array_float!( + Float64Array, + col, + u64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Timestamp(TimeUnit::Millisecond, None) => { + hash_array_primitive!( + TimestampMillisecondArray, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + hash_array_primitive!( + TimestampMicrosecondArray, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + hash_array_primitive!( + TimestampNanosecondArray, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Date32 => { + hash_array_primitive!( + Date32Array, + col, + i32, + hashes_buffer, + random_state, + multi_col + ); + } + 
DataType::Date64 => { + hash_array_primitive!( + Date64Array, + col, + i64, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Boolean => { + hash_array!( + BooleanArray, + col, + u8, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::Utf8 => { + hash_array!( + StringArray, + col, + str, + hashes_buffer, + random_state, + multi_col + ); + } + DataType::LargeUtf8 => { + hash_array!( + LargeStringArray, + col, + str, + hashes_buffer, + random_state, + multi_col + ); + } + _ => { + // This is internal because we should have caught this before. + return Err(DataFusionError::Internal( + "Unsupported data type in hasher".to_string(), + )); + } + } + } + Ok(hashes_buffer) +} + #[cfg(test)] mod tests { + use std::sync::Arc; + use super::*; fn check(left: &[Column], right: &[Column], on: &[(Column, Column)]) -> Result<()> { @@ -163,4 +513,20 @@ mod tests { assert!(check(&left, &right, on).is_ok()); } + + #[test] + fn create_hashes_for_float_arrays() -> Result<()> { + let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7])); + let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7])); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + let hashes_buff = &mut vec![0; f32_arr.len()]; + let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?; + assert_eq!(hashes.len(), 4,); + + Ok(()) + } } diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index e67e4c2d44779..b59071adb3a1e 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -25,13 +25,14 @@ use std::time::Instant; use std::{any::Any, vec}; use crate::error::{DataFusionError, Result}; +use crate::physical_plan::hash_utils::create_hashes; use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning, SQLMetric}; use arrow::record_batch::RecordBatch; use arrow::{array::Array, error::Result as ArrowResult}; use arrow::{compute::take, datatypes::SchemaRef}; use tokio_stream::wrappers::UnboundedReceiverStream; -use super::{hash_join::create_hashes, RecordBatchStream, SendableRecordBatchStream}; +use super::{RecordBatchStream, SendableRecordBatchStream}; use async_trait::async_trait; use futures::stream::Stream; From 949bb0e7a75ee44a4ecd5f0ba8dee61a2113ba14 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Sun, 1 Aug 2021 20:16:39 -0700 Subject: [PATCH 306/329] automatically add sql, development-process and documentation labels to PRs (#800) * automatically add SQL label to PRs * automatically add development-process and documentation labels * add more docs matching paths --- .github/workflows/dev_pr/labeler.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index c27fb2cb4094a..5b956a18a43e2 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -26,3 +26,20 @@ ballista: python: - python/**/* + +sql: + - datafusion/src/sql/**/* + +development-process: + - dev/**.* + - .github/**.* + - ci/**.* + - .asf.yaml + +documentation: + - docs/**.* + - README.md + - ./**/README.md + - DEVELOPERS.md + - ballista/docs/**.* + - datafusion/docs/**.* From 979dc3139c3e2aa1f2995c018375aea36cdf6274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Aug 2021 05:46:43 +0200 Subject: [PATCH 307/329] Convert unsupported 
conditions in left right join to filters (#796) * Unsupported conditions in join * Simplify, support right join, add tests * Add tests * Add comments * Clippy --- benchmarks/src/bin/tpch.rs | 5 ++ datafusion/src/sql/planner.rs | 128 ++++++++++++++++++++++++++++++---- datafusion/tests/sql.rs | 46 +++++++++++- 3 files changed, 164 insertions(+), 15 deletions(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 08b8864acd1b9..42755ec4b1b37 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -744,6 +744,11 @@ mod tests { run_query(12).await } + #[tokio::test] + async fn run_q13() -> Result<()> { + run_query(13).await + } + #[tokio::test] async fn run_q14() -> Result<()> { run_query(14).await diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 6d9484be102ff..481f12b968064 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -31,6 +31,7 @@ use crate::logical_plan::{ DFSchema, Expr, LogicalPlan, LogicalPlanBuilder, Operator, PlanType, ToDFSchema, ToStringifiedPlan, }; +use crate::optimizer::utils::exprlist_to_columns; use crate::prelude::JoinType; use crate::scalar::ScalarValue; use crate::{ @@ -325,16 +326,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let right = self.create_relation(&join.relation, ctes)?; match &join.join_operator { JoinOperator::LeftOuter(constraint) => { - self.parse_join(left, &right, constraint, JoinType::Left) + self.parse_join(left, right, constraint, JoinType::Left) } JoinOperator::RightOuter(constraint) => { - self.parse_join(left, &right, constraint, JoinType::Right) + self.parse_join(left, right, constraint, JoinType::Right) } JoinOperator::Inner(constraint) => { - self.parse_join(left, &right, constraint, JoinType::Inner) + self.parse_join(left, right, constraint, JoinType::Inner) } JoinOperator::FullOuter(constraint) => { - self.parse_join(left, &right, constraint, JoinType::Full) + self.parse_join(left, right, constraint, JoinType::Full) } JoinOperator::CrossJoin => self.parse_cross_join(left, &right), other => Err(DataFusionError::NotImplemented(format!( @@ -354,7 +355,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fn parse_join( &self, left: LogicalPlan, - right: &LogicalPlan, + right: LogicalPlan, constraint: &JoinConstraint, join_type: JoinType, ) -> Result { @@ -372,18 +373,26 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // extract join keys extract_join_keys(&expr, &mut keys, &mut filter); + let mut cols = HashSet::new(); + exprlist_to_columns(&filter, &mut cols)?; + let (left_keys, right_keys): (Vec, Vec) = keys.into_iter().unzip(); - // return the logical plan representing the join - let join = LogicalPlanBuilder::from(left).join( - right, - join_type, - (left_keys, right_keys), - )?; + // return the logical plan representing the join if filter.is_empty() { + let join = LogicalPlanBuilder::from(left).join( + &right, + join_type, + (left_keys, right_keys), + )?; join.build() } else if join_type == JoinType::Inner { + let join = LogicalPlanBuilder::from(left).join( + &right, + join_type, + (left_keys, right_keys), + )?; join.filter( filter .iter() @@ -391,6 +400,64 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .fold(filter[0].clone(), |acc, e| acc.and(e.clone())), )? .build() + } + // Left join with all non-equijoin expressions from the right + // l left join r + // on l1=r1 and r2 > [..] 
+ else if join_type == JoinType::Left + && cols.iter().all( + |Column { + relation: qualifier, + name, + }| { + right + .schema() + .field_with_name(qualifier.as_deref(), name) + .is_ok() + }, + ) + { + LogicalPlanBuilder::from(left) + .join( + &LogicalPlanBuilder::from(right) + .filter( + filter + .iter() + .skip(1) + .fold(filter[0].clone(), |acc, e| { + acc.and(e.clone()) + }), + )? + .build()?, + join_type, + (left_keys, right_keys), + )? + .build() + } + // Right join with all non-equijoin expressions from the left + // l right join r + // on l1=r1 and l2 > [..] + else if join_type == JoinType::Right + && cols.iter().all( + |Column { + relation: qualifier, + name, + }| { + left.schema() + .field_with_name(qualifier.as_deref(), name) + .is_ok() + }, + ) + { + LogicalPlanBuilder::from(left) + .filter( + filter + .iter() + .skip(1) + .fold(filter[0].clone(), |acc, e| acc.and(e.clone())), + )? + .join(&right, join_type, (left_keys, right_keys))? + .build() } else { Err(DataFusionError::NotImplemented(format!( "Unsupported expressions in {:?} JOIN: {:?}", @@ -404,7 +471,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .map(|x| Column::from_name(x.value.clone())) .collect(); LogicalPlanBuilder::from(left) - .join_using(right, join_type, keys)? + .join_using(&right, join_type, keys)? .build() } JoinConstraint::Natural => { @@ -1650,9 +1717,16 @@ fn extract_join_keys( extract_join_keys(left, accum, accum_filter); extract_join_keys(right, accum, accum_filter); } - _other => { + _other + if matches!(**left, Expr::Column(_)) + || matches!(**right, Expr::Column(_)) => + { accum_filter.push(expr.clone()); } + _other => { + extract_join_keys(left, accum, accum_filter); + extract_join_keys(right, accum, accum_filter); + } }, _other => { accum_filter.push(expr.clone()); @@ -2811,6 +2885,34 @@ mod tests { quick_test(sql, expected); } + #[test] + fn left_equijoin_unsupported_expression() { + let sql = "SELECT id, order_id \ + FROM person \ + LEFT JOIN orders \ + ON id = customer_id AND order_id > 1"; + let expected = "Projection: #person.id, #orders.order_id\ + \n Join: #person.id = #orders.customer_id\ + \n TableScan: person projection=None\ + \n Filter: #orders.order_id Gt Int64(1)\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + + #[test] + fn right_equijoin_unsupported_expression() { + let sql = "SELECT id, order_id \ + FROM person \ + RIGHT JOIN orders \ + ON id = customer_id AND id > 1"; + let expected = "Projection: #person.id, #orders.order_id\ + \n Join: #person.id = #orders.customer_id\ + \n Filter: #person.id Gt Int64(1)\ + \n TableScan: person projection=None\ + \n TableScan: orders projection=None"; + quick_test(sql, expected); + } + #[test] fn join_with_table_name() { let sql = "SELECT id, order_id \ diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index bfe2f2fc49138..42a7d20c6e5a7 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1765,14 +1765,56 @@ async fn equijoin_and_other_condition() -> Result<()> { Ok(()) } +#[tokio::test] +async fn equijoin_left_and_condition_from_right() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let sql = + "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id AND t2_name >= 'y' ORDER BY t1_id"; + let res = ctx.create_logical_plan(sql); + assert!(res.is_ok()); + let actual = execute(&mut ctx, sql).await; + + let expected = vec![ + vec!["11", "a", "z"], + vec!["22", "b", "y"], + vec!["33", "c", "NULL"], + vec!["44", "d", "NULL"], + ]; + 
assert_eq!(expected, actual); + + Ok(()) +} + +#[tokio::test] +async fn equijoin_right_and_condition_from_left() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let sql = + "SELECT t1_id, t1_name, t2_name FROM t1 RIGHT JOIN t2 ON t1_id = t2_id AND t1_id >= 22 ORDER BY t2_name"; + let res = ctx.create_logical_plan(sql); + assert!(res.is_ok()); + let actual = execute(&mut ctx, sql).await; + + let expected = vec![ + vec!["NULL", "NULL", "w"], + vec!["44", "d", "x"], + vec!["22", "b", "y"], + vec!["NULL", "NULL", "z"], + ]; + assert_eq!(expected, actual); + + Ok(()) +} + #[tokio::test] async fn equijoin_and_unsupported_condition() -> Result<()> { let ctx = create_join_context("t1_id", "t2_id")?; let sql = - "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id AND t2_name >= 'y' ORDER BY t1_id"; + "SELECT t1_id, t1_name, t2_name FROM t1 LEFT JOIN t2 ON t1_id = t2_id AND t1_id >= '44' ORDER BY t1_id"; let res = ctx.create_logical_plan(sql); + assert!(res.is_err()); - assert_eq!(format!("{}", res.unwrap_err()), "This feature is not implemented: Unsupported expressions in Left JOIN: [#t2_name GtEq Utf8(\"y\")]"); + assert_eq!(format!("{}", res.unwrap_err()), "This feature is not implemented: Unsupported expressions in Left JOIN: [#t1_id GtEq Utf8(\"44\")]"); + Ok(()) } From 2bcf04017d710a6b8684617e81a64c9db1184f5c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Aug 2021 07:23:49 -0400 Subject: [PATCH 308/329] Produce correct answers for Group BY NULL (Option 1) (#793) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add support for group by hash of a null column, tests for same * Update datafusion/src/physical_plan/hash_aggregate.rs Co-authored-by: Daniël Heres Co-authored-by: Daniël Heres --- .../src/physical_plan/hash_aggregate.rs | 60 +++++++++- datafusion/src/scalar.rs | 37 +++++- datafusion/tests/sql.rs | 110 ++++++++++++++++++ 3 files changed, 202 insertions(+), 5 deletions(-) diff --git a/datafusion/src/physical_plan/hash_aggregate.rs b/datafusion/src/physical_plan/hash_aggregate.rs index eb4a356e88ce8..5c3c57695d0f0 100644 --- a/datafusion/src/physical_plan/hash_aggregate.rs +++ b/datafusion/src/physical_plan/hash_aggregate.rs @@ -395,7 +395,10 @@ fn group_aggregate_batch( // We can safely unwrap here as we checked we can create an accumulator before let accumulator_set = create_accumulators(aggr_expr).unwrap(); batch_keys.push(key.clone()); - let _ = create_group_by_values(&group_values, row, &mut group_by_values); + // Note it would be nice to make this a real error (rather than panic) + // but it is better than silently ignoring the issue and getting wrong results + create_group_by_values(&group_values, row, &mut group_by_values) + .expect("can not create group by value"); ( key.clone(), (group_by_values.clone(), accumulator_set, vec![row as u32]), @@ -508,7 +511,9 @@ fn dictionary_create_key_for_col( } /// Appends a sequence of [u8] bytes for the value in `col[row]` to -/// `vec` to be used as a key into the hash map +/// `vec` to be used as a key into the hash map. +/// +/// NOTE: This function does not check col.is_valid(). 
Caller must do so fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<()> { match col.data_type() { DataType::Boolean => { @@ -640,6 +645,50 @@ fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec) -> Result<( } /// Create a key `Vec` that is used as key for the hashmap +/// +/// This looks like +/// [null_byte][col_value_bytes][null_byte][col_value_bytes] +/// +/// Note that relatively uncommon patterns (e.g. not 0x00) are chosen +/// for the null_byte to make debugging easier. The actual values are +/// arbitrary. +/// +/// For a NULL value in a column, the key looks like +/// [0xFE] +/// +/// For a Non-NULL value in a column, this looks like: +/// [0xFF][byte representation of column value] +/// +/// Example of a key with no NULL values: +/// ```text +/// 0xFF byte at the start of each column +/// signifies the value is non-null +/// │ +/// +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ─ ─ ─ ┐ +/// +/// │ string len │ 0x1234 +/// { ▼ (as usize le) "foo" ▼(as u16 le) +/// k1: "foo" ╔ ═┌──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──╦ ═┌──┬──┐ +/// k2: 0x1234u16 FF║03│00│00│00│00│00│00│00│"f│"o│"o│FF║34│12│ +/// } ╚ ═└──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──╩ ═└──┴──┘ +/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 +/// ``` +/// +/// Example of a key with NULL values: +/// +///```text +/// 0xFE byte at the start of k1 column +/// ┌ ─ signifies the value is NULL +/// +/// └ ┐ +/// 0x1234 +/// { ▼ (as u16 le) +/// k1: NULL ╔ ═╔ ═┌──┬──┐ +/// k2: 0x1234u16 FE║FF║12│34│ +/// } ╚ ═╚ ═└──┴──┘ +/// 0 1 2 3 +///``` pub(crate) fn create_key( group_by_keys: &[ArrayRef], row: usize, @@ -647,7 +696,12 @@ pub(crate) fn create_key( ) -> Result<()> { vec.clear(); for col in group_by_keys { - create_key_for_col(col, row, vec)? + if !col.is_valid(row) { + vec.push(0xFE); + } else { + vec.push(0xFF); + create_key_for_col(col, row, vec)? 
+ } } Ok(()) } diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 8efea63e82368..90c9bf7369d4a 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -28,7 +28,7 @@ use arrow::{ }, }; use ordered_float::OrderedFloat; -use std::convert::Infallible; +use std::convert::{Infallible, TryInto}; use std::str::FromStr; use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; @@ -796,6 +796,11 @@ impl ScalarValue { /// Converts a value in `array` at `index` into a ScalarValue pub fn try_from_array(array: &ArrayRef, index: usize) -> Result { + // handle NULL value + if !array.is_valid(index) { + return array.data_type().try_into(); + } + Ok(match array.data_type() { DataType::Boolean => typed_cast!(array, index, BooleanArray, Boolean), DataType::Float64 => typed_cast!(array, index, Float64Array, Float64), @@ -897,6 +902,7 @@ impl ScalarValue { let dict_array = array.as_any().downcast_ref::>().unwrap(); // look up the index in the values dictionary + // (note validity was previously checked in `try_from_array`) let keys_col = dict_array.keys(); let values_index = keys_col.value(index).to_usize().ok_or_else(|| { DataFusionError::Internal(format!( @@ -1132,6 +1138,7 @@ impl_try_from!(Boolean, bool); impl TryFrom<&DataType> for ScalarValue { type Error = DataFusionError; + /// Create a Null instance of ScalarValue for this datatype fn try_from(datatype: &DataType) -> Result { Ok(match datatype { DataType::Boolean => ScalarValue::Boolean(None), @@ -1161,12 +1168,15 @@ impl TryFrom<&DataType> for ScalarValue { DataType::Timestamp(TimeUnit::Nanosecond, _) => { ScalarValue::TimestampNanosecond(None) } + DataType::Dictionary(_index_type, value_type) => { + value_type.as_ref().try_into()? + } DataType::List(ref nested_type) => { ScalarValue::List(None, Box::new(nested_type.data_type().clone())) } _ => { return Err(DataFusionError::NotImplemented(format!( - "Can't create a scalar of type \"{:?}\"", + "Can't create a scalar from data_type \"{:?}\"", datatype ))) } @@ -1535,6 +1545,29 @@ mod tests { "{}", result); } + #[test] + fn scalar_try_from_array_null() { + let array = vec![Some(33), None].into_iter().collect::(); + let array: ArrayRef = Arc::new(array); + + assert_eq!( + ScalarValue::Int64(Some(33)), + ScalarValue::try_from_array(&array, 0).unwrap() + ); + assert_eq!( + ScalarValue::Int64(None), + ScalarValue::try_from_array(&array, 1).unwrap() + ); + } + + #[test] + fn scalar_try_from_dict_datatype() { + let data_type = + DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)); + let data_type = &data_type; + assert_eq!(ScalarValue::Utf8(None), data_type.try_into().unwrap()) + } + #[test] fn size_of_scalar() { // Since ScalarValues are used in a non trivial number of places, diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 42a7d20c6e5a7..3a83f205aa753 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -3056,6 +3056,109 @@ async fn query_count_distinct() -> Result<()> { Ok(()) } +#[tokio::test] +async fn query_group_on_null() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, true)])); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![ + Some(0), + Some(3), + None, + Some(1), + Some(3), + ]))], + )?; + + let table = MemTable::try_new(schema, vec![vec![data]])?; + + let mut ctx = ExecutionContext::new(); + ctx.register_table("test", Arc::new(table))?; + let sql = "SELECT COUNT(*), c1 FROM test GROUP BY c1"; + + let actual = 
execute_to_batches(&mut ctx, sql).await; + + // Note that the results also + // include a row for NULL (c1=NULL, count = 1) + let expected = vec![ + "+-----------------+----+", + "| COUNT(UInt8(1)) | c1 |", + "+-----------------+----+", + "| 1 | |", + "| 1 | 0 |", + "| 1 | 1 |", + "| 2 | 3 |", + "+-----------------+----+", + ]; + assert_batches_sorted_eq!(expected, &actual); + Ok(()) +} + +#[tokio::test] +async fn query_group_on_null_multi_col() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Utf8, true), + ])); + + let data = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![ + Some(0), + Some(0), + Some(3), + None, + None, + Some(3), + Some(0), + None, + Some(3), + ])), + Arc::new(StringArray::from(vec![ + None, + None, + Some("foo"), + None, + Some("bar"), + Some("foo"), + None, + Some("bar"), + Some("foo"), + ])), + ], + )?; + + let table = MemTable::try_new(schema, vec![vec![data]])?; + + let mut ctx = ExecutionContext::new(); + ctx.register_table("test", Arc::new(table))?; + let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c1, c2"; + + let actual = execute_to_batches(&mut ctx, sql).await; + + // Note that the results also include values for null + // include a row for NULL (c1=NULL, count = 1) + let expected = vec![ + "+-----------------+----+-----+", + "| COUNT(UInt8(1)) | c1 | c2 |", + "+-----------------+----+-----+", + "| 1 | | |", + "| 2 | | bar |", + "| 3 | 0 | |", + "| 3 | 3 | foo |", + "+-----------------+----+-----+", + ]; + assert_batches_sorted_eq!(expected, &actual); + + // Also run query with group columns reversed (results shoudl be the same) + let sql = "SELECT COUNT(*), c1, c2 FROM test GROUP BY c2, c1"; + let actual = execute_to_batches(&mut ctx, sql).await; + assert_batches_sorted_eq!(expected, &actual); + Ok(()) +} + #[tokio::test] async fn query_on_string_dictionary() -> Result<()> { // Test to ensure DataFusion can operate on dictionary types @@ -3109,6 +3212,13 @@ async fn query_on_string_dictionary() -> Result<()> { let expected = vec![vec!["2"]]; assert_eq!(expected, actual); + // grouping + let sql = "SELECT d1, COUNT(*) FROM test group by d1"; + let mut actual = execute(&mut ctx, sql).await; + actual.sort(); + let expected = vec![vec!["NULL", "1"], vec!["one", "1"], vec!["three", "1"]]; + assert_eq!(expected, actual); + Ok(()) } From 96cc4e9ca562f5f3d22af6f13b8a08ef04b9bfc4 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 2 Aug 2021 12:31:34 -0700 Subject: [PATCH 309/329] show prettier diff in CI (#802) --- .github/workflows/dev.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 39c449c50a8ef..f49c7b18b419a 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -63,8 +63,9 @@ jobs: run: | # if you encounter error, try rerun the command below with --write instead of --check # and commit the changes - npx prettier@2.3.2 --check \ + npx prettier@2.3.2 --write \ '{ballista,datafusion,datafusion-examples,docs,python}/**/*.md' \ README.md \ DEVELOPERS.md \ 'ballista/**/*.{ts,tsx}' + git diff --exit-code \ No newline at end of file From e18d79f7c2cae44de28af2b643c0e45085212f48 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 2 Aug 2021 15:45:23 -0400 Subject: [PATCH 310/329] Update dependencies: prost to 0.8 and tonic to 0.5 (#818) --- ballista-examples/Cargo.toml | 4 ++-- ballista/rust/core/Cargo.toml | 6 +++--- ballista/rust/executor/Cargo.toml 
| 2 +- ballista/rust/executor/src/flight_service.rs | 4 ++-- ballista/rust/scheduler/Cargo.toml | 6 +++--- datafusion-examples/Cargo.toml | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ballista-examples/Cargo.toml b/ballista-examples/Cargo.toml index b7d40223c4693..dbcfad44f62fe 100644 --- a/ballista-examples/Cargo.toml +++ b/ballista-examples/Cargo.toml @@ -31,8 +31,8 @@ publish = false arrow-flight = { version = "5.0" } datafusion = { path = "../datafusion" } ballista = { path = "../ballista/rust/client" } -prost = "0.7" -tonic = "0.4" +prost = "0.8" +tonic = "0.5" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } futures = "0.3" num_cpus = "1.13.0" diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index ce72d2fda92d4..2495343b3e878 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -35,11 +35,11 @@ async-trait = "0.1.36" futures = "0.3" hashbrown = "0.11" log = "0.4" -prost = "0.7" +prost = "0.8" serde = {version = "1", features = ["derive"]} sqlparser = "0.9.0" tokio = "1.0" -tonic = "0.4" +tonic = "0.5" uuid = { version = "0.8", features = ["v4"] } arrow-flight = { version = "5.0" } @@ -50,4 +50,4 @@ datafusion = { path = "../../../datafusion" } tempfile = "3" [build-dependencies] -tonic-build = { version = "0.4" } +tonic-build = { version = "0.5" } diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 428a5bb0f01f5..5d6ecb986a15e 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -40,7 +40,7 @@ snmalloc-rs = {version = "0.2", features= ["cache-friendly"], optional = true} tempfile = "3" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } -tonic = "0.4" +tonic = "0.5" uuid = { version = "0.8", features = ["v4"] } arrow = { version = "5.0" } diff --git a/ballista/rust/executor/src/flight_service.rs b/ballista/rust/executor/src/flight_service.rs index 73dd1a946d554..27b1a33b7c877 100644 --- a/ballista/rust/executor/src/flight_service.rs +++ b/ballista/rust/executor/src/flight_service.rs @@ -218,8 +218,8 @@ where let batch_flight_data: Vec<_> = batch .map(|b| create_flight_iter(&b, &options).collect()) .map_err(|e| from_arrow_err(&e))?; - for batch in &batch_flight_data { - send_response(&tx, batch.clone()).await?; + for batch in batch_flight_data.into_iter() { + send_response(&tx, batch).await?; } } info!("FetchPartition streamed {} rows", row_count); diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 9bca8d9695714..382f7c62af10e 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -43,13 +43,13 @@ http-body = "0.4" hyper = "0.14.4" log = "0.4" parse_arg = "0.1.3" -prost = "0.7" +prost = "0.8" rand = "0.8" serde = {version = "1", features = ["derive"]} sled_package = { package = "sled", version = "0.34", optional = true } tokio = { version = "1.0", features = ["full"] } tokio-stream = { version = "0.1", features = ["net"], optional = true } -tonic = "0.4" +tonic = "0.5" tower = { version = "0.4" } warp = "0.3" @@ -61,7 +61,7 @@ uuid = { version = "0.8", features = ["v4"] } [build-dependencies] configure_me_codegen = "0.4.0" -tonic-build = { version = "0.4" } +tonic-build = { version = "0.5" } [package.metadata.configure_me.bin] scheduler = "scheduler_config_spec.toml" diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml 
index 35aa3764d6dc4..3ddcdf41bf218 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -31,8 +31,8 @@ publish = false [dev-dependencies] arrow-flight = { version = "5.0" } datafusion = { path = "../datafusion" } -prost = "0.7" -tonic = "0.4" +prost = "0.8" +tonic = "0.5" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } futures = "0.3" num_cpus = "1.13.0" From 15ea3b9e6ac485ec8b8604fb3409f19aaa295d5c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 3 Aug 2021 00:02:31 -0700 Subject: [PATCH 311/329] Support date datatypes in max/min (#820) * Support date datatype in max/min. * fix format. --- datafusion/src/physical_plan/aggregates.rs | 3 + .../src/physical_plan/expressions/min_max.rs | 70 +++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/datafusion/src/physical_plan/aggregates.rs b/datafusion/src/physical_plan/aggregates.rs index c297a959639a5..57c9b61c91fd4 100644 --- a/datafusion/src/physical_plan/aggregates.rs +++ b/datafusion/src/physical_plan/aggregates.rs @@ -188,6 +188,8 @@ static TIMESTAMPS: &[DataType] = &[ DataType::Timestamp(TimeUnit::Nanosecond, None), ]; +static DATES: &[DataType] = &[DataType::Date32, DataType::Date64]; + /// the signatures supported by the function `fun`. pub fn signature(fun: &AggregateFunction) -> Signature { // note: the physical expression must accept the type returned by this function or the execution panics. @@ -198,6 +200,7 @@ pub fn signature(fun: &AggregateFunction) -> Signature { .iter() .chain(NUMERICS.iter()) .chain(TIMESTAMPS.iter()) + .chain(DATES.iter()) .cloned() .collect::>(); Signature::Uniform(1, valid) diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index 46e41f46a0e53..6bb4c5b21b861 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -28,10 +28,10 @@ use arrow::compute; use arrow::datatypes::{DataType, TimeUnit}; use arrow::{ array::{ - ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeStringArray, StringArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + ArrayRef, Date32Array, Date64Array, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }, datatypes::Field, }; @@ -158,6 +158,8 @@ macro_rules! min_max_batch { TimestampNanosecond, $OP ), + DataType::Date32 => typed_min_max_batch!($VALUES, Date32Array, Date32, $OP), + DataType::Date64 => typed_min_max_batch!($VALUES, Date64Array, Date64, $OP), other => { // This should have been handled before return Err(DataFusionError::Internal(format!( @@ -280,6 +282,18 @@ macro_rules! 
min_max { ) => { typed_min_max!(lhs, rhs, TimestampNanosecond, $OP) } + ( + ScalarValue::Date32(lhs), + ScalarValue::Date32(rhs), + ) => { + typed_min_max!(lhs, rhs, Date32, $OP) + } + ( + ScalarValue::Date64(lhs), + ScalarValue::Date64(rhs), + ) => { + typed_min_max!(lhs, rhs, Date64, $OP) + } e => { return Err(DataFusionError::Internal(format!( "MIN/MAX is not expected to receive scalars of incompatible types {:?}", @@ -668,4 +682,52 @@ mod tests { DataType::Float64 ) } + + #[test] + fn min_date32() -> Result<()> { + let a: ArrayRef = Arc::new(Date32Array::from(vec![1, 2, 3, 4, 5])); + generic_test_op!( + a, + DataType::Date32, + Min, + ScalarValue::Date32(Some(1)), + DataType::Date32 + ) + } + + #[test] + fn min_date64() -> Result<()> { + let a: ArrayRef = Arc::new(Date64Array::from(vec![1, 2, 3, 4, 5])); + generic_test_op!( + a, + DataType::Date64, + Min, + ScalarValue::Date64(Some(1)), + DataType::Date64 + ) + } + + #[test] + fn max_date32() -> Result<()> { + let a: ArrayRef = Arc::new(Date32Array::from(vec![1, 2, 3, 4, 5])); + generic_test_op!( + a, + DataType::Date32, + Max, + ScalarValue::Date32(Some(5)), + DataType::Date32 + ) + } + + #[test] + fn max_date64() -> Result<()> { + let a: ArrayRef = Arc::new(Date64Array::from(vec![1, 2, 3, 4, 5])); + generic_test_op!( + a, + DataType::Date64, + Max, + ScalarValue::Date64(Some(5)), + DataType::Date64 + ) + } } From b8b046508dbadd6441241dd033d55c44e19ba3b4 Mon Sep 17 00:00:00 2001 From: Mike Seddon Date: Wed, 4 Aug 2021 21:11:27 +1000 Subject: [PATCH 312/329] Qualified field resolution too strict (#810) * rewrite qualifiers * refactor based on review * simplify resolution code * simplify further based on reviews --- datafusion/src/logical_plan/dfschema.rs | 7 +++-- datafusion/src/sql/planner.rs | 39 ++++++++++++++++++++++++- datafusion/tests/sql.rs | 15 ++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/datafusion/src/logical_plan/dfschema.rs b/datafusion/src/logical_plan/dfschema.rs index 217e2de6d4ff3..c067b5f963eeb 100644 --- a/datafusion/src/logical_plan/dfschema.rs +++ b/datafusion/src/logical_plan/dfschema.rs @@ -289,7 +289,10 @@ impl DFSchema { fn get_field_names(&self) -> String { self.fields .iter() - .map(|f| format!("'{}'", f.name())) + .map(|f| match f.qualifier() { + Some(qualifier) => format!("'{}.{}'", qualifier, f.name()), + None => format!("'{}'", f.name()), + }) .collect::>() .join(", ") } @@ -619,7 +622,7 @@ mod tests { #[test] fn helpful_error_messages() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; - let expected_help = "Valid fields are \'c0\', \'c1\'."; + let expected_help = "Valid fields are \'t1.c0\', \'t1.c1\'."; assert!(schema .field_with_qualified_name("x", "y") .unwrap_err() diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 481f12b968064..ef2b63464969b 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -992,11 +992,40 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Generate a relational expression from a SQL expression pub fn sql_to_rex(&self, sql: &SQLExpr, schema: &DFSchema) -> Result { - let expr = self.sql_expr_to_logical_expr(sql, schema)?; + let mut expr = self.sql_expr_to_logical_expr(sql, schema)?; + expr = self.rewrite_partial_qualifier(expr, schema); self.validate_schema_satisfies_exprs(schema, &[expr.clone()])?; Ok(expr) } + /// Rewrite aliases which are not-complete (e.g. 
ones that include only the table qualifier in a schema.table qualified relation) + fn rewrite_partial_qualifier(&self, expr: Expr, schema: &DFSchema) -> Expr { + match expr { + Expr::Column(col) => match &col.relation { + Some(q) => { + match schema + .fields() + .iter() + .find(|field| match field.qualifier() { + Some(field_q) => { + field.name() == &col.name + && field_q.ends_with(&format!(".{}", q)) + } + _ => false, + }) { + Some(df_field) => Expr::Column(Column { + relation: df_field.qualifier().cloned(), + name: df_field.name().clone(), + }), + None => Expr::Column(col), + } + } + None => Expr::Column(col), + }, + _ => expr, + } + } + fn sql_fn_arg_to_logical_expr( &self, sql: &FunctionArg, @@ -3490,4 +3519,12 @@ mod tests { unimplemented!() } } + + #[test] + fn select_partially_qualified_column() { + let sql = r#"SELECT person.first_name FROM public.person"#; + let expected = "Projection: #public.person.first_name\ + \n TableScan: public.person projection=None"; + quick_test(sql, expected); + } } diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 3a83f205aa753..379cad6233292 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -4210,3 +4210,18 @@ fn normalize_vec_for_explain(v: Vec>) -> Vec> { }) .collect::>() } + +#[tokio::test] +async fn test_partial_qualified_name() -> Result<()> { + let mut ctx = create_join_context("t1_id", "t2_id")?; + let sql = "SELECT t1.t1_id, t1_name FROM public.t1"; + let expected = vec![ + vec!["11", "a"], + vec!["22", "b"], + vec!["33", "c"], + vec!["44", "d"], + ]; + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + Ok(()) +} From a5a58c4f23720eda63b02a6cad2902b715288db6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 4 Aug 2021 12:31:48 -0400 Subject: [PATCH 313/329] Implement vectorized hashing for DictionaryArray types (#812) * Implement vectorized hashing for DictionaryArray types * improve comments * Check is_multicol outside of the loop --- datafusion/src/physical_plan/hash_utils.rs | 224 ++++++++++++++++++++- 1 file changed, 214 insertions(+), 10 deletions(-) diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index e937b4ea549c1..abfa09a98ccdb 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -20,13 +20,17 @@ use crate::error::{DataFusionError, Result}; use ahash::{CallHasher, RandomState}; use arrow::array::{ - Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray, - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, DictionaryArray, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + LargeStringArray, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, +}; +use arrow::datatypes::{ + ArrowDictionaryKeyType, ArrowNativeType, DataType, Field, Int16Type, Int32Type, + Int64Type, Int8Type, Schema, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; -use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use std::collections::HashSet; +use std::sync::Arc; use crate::logical_plan::JoinType; use crate::physical_plan::expressions::Column; @@ -245,9 +249,60 @@ macro_rules! 
hash_array_float { }; } -/// Creates hash values for every row, based on the values in the columns +/// Hash the values in a dictionary array +fn create_hashes_dictionary( + array: &ArrayRef, + random_state: &RandomState, + hashes_buffer: &mut Vec, + multi_col: bool, +) -> Result<()> { + let dict_array = array.as_any().downcast_ref::>().unwrap(); + + // Hash each dictionary value once, and then use that computed + // hash for each key value to avoid a potentially expensive + // redundant hashing for large dictionary elements (e.g. strings) + let dict_values = Arc::clone(dict_array.values()); + let mut dict_hashes = vec![0; dict_values.len()]; + create_hashes(&[dict_values], random_state, &mut dict_hashes)?; + + // combine hash for each index in values + if multi_col { + for (hash, key) in hashes_buffer.iter_mut().zip(dict_array.keys().iter()) { + if let Some(key) = key { + let idx = key + .to_usize() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Can not convert key value {:?} to usize in dictionary of type {:?}", + key, dict_array.data_type() + )) + })?; + *hash = combine_hashes(dict_hashes[idx], *hash) + } // no update for Null, consistent with other hashes + } + } else { + for (hash, key) in hashes_buffer.iter_mut().zip(dict_array.keys().iter()) { + if let Some(key) = key { + let idx = key + .to_usize() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Can not convert key value {:?} to usize in dictionary of type {:?}", + key, dict_array.data_type() + )) + })?; + *hash = dict_hashes[idx] + } // no update for Null, consistent with other hashes + } + } + Ok(()) +} + +/// Creates hash values for every row, based on the values in the +/// columns. /// -/// This implements so-called "vectorized hashing" +/// The number of rows to hash is determined by `hashes_buffer.len()`. +/// `hashes_buffer` should be pre-sized appropriately pub fn create_hashes<'a>( arrays: &[ArrayRef], random_state: &RandomState, @@ -438,11 +493,84 @@ pub fn create_hashes<'a>( multi_col ); } + DataType::Dictionary(index_type, _) => match **index_type { + DataType::Int8 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::Int16 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::Int32 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::Int64 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::UInt8 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::UInt16 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::UInt32 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + DataType::UInt64 => { + create_hashes_dictionary::( + col, + random_state, + hashes_buffer, + multi_col, + )?; + } + _ => { + return Err(DataFusionError::Internal(format!( + "Unsupported dictionary type in hasher hashing: {}", + col.data_type(), + ))) + } + }, _ => { // This is internal because we should have caught this before. 
- return Err(DataFusionError::Internal( - "Unsupported data type in hasher".to_string(), - )); + return Err(DataFusionError::Internal(format!( + "Unsupported data type in hasher: {}", + col.data_type() + ))); } } } @@ -453,6 +581,8 @@ pub fn create_hashes<'a>( mod tests { use std::sync::Arc; + use arrow::{array::DictionaryArray, datatypes::Int8Type}; + use super::*; fn check(left: &[Column], right: &[Column], on: &[(Column, Column)]) -> Result<()> { @@ -529,4 +659,78 @@ mod tests { Ok(()) } + + #[test] + fn create_hashes_for_dict_arrays() { + let strings = vec![Some("foo"), None, Some("bar"), Some("foo"), None]; + + let string_array = Arc::new(strings.iter().cloned().collect::()); + let dict_array = Arc::new( + strings + .iter() + .cloned() + .collect::>(), + ); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + let mut string_hashes = vec![0; strings.len()]; + create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap(); + + let mut dict_hashes = vec![0; strings.len()]; + create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap(); + + // Null values result in a zero hash, + for (val, hash) in strings.iter().zip(string_hashes.iter()) { + match val { + Some(_) => assert_ne!(*hash, 0), + None => assert_eq!(*hash, 0), + } + } + + // same logical values should hash to the same hash value + assert_eq!(string_hashes, dict_hashes); + + // Same values should map to same hash values + assert_eq!(strings[1], strings[4]); + assert_eq!(dict_hashes[1], dict_hashes[4]); + assert_eq!(strings[0], strings[3]); + assert_eq!(dict_hashes[0], dict_hashes[3]); + + // different strings should map to different hash values + assert_ne!(strings[0], strings[2]); + assert_ne!(dict_hashes[0], dict_hashes[2]); + } + + #[test] + fn create_multi_column_hash_for_dict_arrays() { + let strings1 = vec![Some("foo"), None, Some("bar")]; + let strings2 = vec![Some("blarg"), Some("blah"), None]; + + let string_array = Arc::new(strings1.iter().cloned().collect::()); + let dict_array = Arc::new( + strings2 + .iter() + .cloned() + .collect::>(), + ); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + let mut one_col_hashes = vec![0; strings1.len()]; + create_hashes(&[dict_array.clone()], &random_state, &mut one_col_hashes).unwrap(); + + let mut two_col_hashes = vec![0; strings1.len()]; + create_hashes( + &[dict_array, string_array], + &random_state, + &mut two_col_hashes, + ) + .unwrap(); + + assert_eq!(one_col_hashes.len(), 3); + assert_eq!(two_col_hashes.len(), 3); + + assert_ne!(one_col_hashes, two_col_hashes); + } } From 119948f1ccfa4dc26c2fa9400c69d0d676936fa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Aug 2021 23:26:47 +0200 Subject: [PATCH 314/329] Speed up inlist for strings and primitives (#813) * Speed up inlist for strings * Add implementation for primitives * Fix bench code, add bench for in * Feedback --- datafusion/benches/filter_query_sql.rs | 20 +- .../src/physical_plan/expressions/in_list.rs | 297 ++++++++++++++---- 2 files changed, 252 insertions(+), 65 deletions(-) diff --git a/datafusion/benches/filter_query_sql.rs b/datafusion/benches/filter_query_sql.rs index 253ef455f5af2..aac7f96248725 100644 --- a/datafusion/benches/filter_query_sql.rs +++ b/datafusion/benches/filter_query_sql.rs @@ -25,16 +25,14 @@ use datafusion::prelude::ExecutionContext; use datafusion::{datasource::MemTable, error::Result}; use futures::executor::block_on; use std::sync::Arc; +use tokio::runtime::Runtime; async fn query(ctx: &mut ExecutionContext, 
sql: &str) { + let rt = Runtime::new().unwrap(); + // execute the query let df = ctx.sql(sql).unwrap(); - let results = df.collect().await.unwrap(); - - // display the relation - for _batch in results { - // println!("num_rows: {}", _batch.num_rows()); - } + criterion::black_box(rt.block_on(df.collect()).unwrap()); } fn create_context(array_len: usize, batch_size: usize) -> Result { @@ -85,6 +83,16 @@ fn criterion_benchmark(c: &mut Criterion) { )) }) }); + + c.bench_function("filter_scalar in list", |b| { + let mut ctx = create_context(array_len, batch_size).unwrap(); + b.iter(|| { + block_on(query( + &mut ctx, + "select f32, f64 from t where f32 in (10, 20, 30, 40)", + )) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/src/physical_plan/expressions/in_list.rs b/datafusion/src/physical_plan/expressions/in_list.rs index 38b2b9d45b9bb..00767c7a67079 100644 --- a/datafusion/src/physical_plan/expressions/in_list.rs +++ b/datafusion/src/physical_plan/expressions/in_list.rs @@ -26,14 +26,39 @@ use arrow::array::{ Int64Array, Int8Array, StringOffsetSizeTrait, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; +use arrow::datatypes::ArrowPrimitiveType; use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use crate::error::Result; +use crate::error::{DataFusionError, Result}; use crate::physical_plan::{ColumnarValue, PhysicalExpr}; use crate::scalar::ScalarValue; +use arrow::array::*; +use arrow::buffer::{Buffer, MutableBuffer}; + +macro_rules! compare_op_scalar { + ($left: expr, $right:expr, $op:expr) => {{ + let null_bit_buffer = $left.data().null_buffer().cloned(); + + let comparison = + (0..$left.len()).map(|i| unsafe { $op($left.value_unchecked(i), $right) }); + // same as $left.len() + let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; + + let data = ArrayData::new( + DataType::Boolean, + $left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ); + Ok(BooleanArray::from(data)) + }}; +} /// InList #[derive(Debug)] @@ -47,20 +72,16 @@ macro_rules! make_contains { ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, $ARRAY_TYPE:ident) => {{ let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); - let mut contains_null = false; + let contains_null = $LIST_VALUES + .iter() + .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null())); let values = $LIST_VALUES .iter() .flat_map(|expr| match expr { ColumnarValue::Scalar(s) => match s { ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v), - ScalarValue::$SCALAR_VALUE(None) => { - contains_null = true; - None - } - ScalarValue::Utf8(None) => { - contains_null = true; - None - } + ScalarValue::$SCALAR_VALUE(None) => None, + ScalarValue::Utf8(None) => None, datatype => unimplemented!("Unexpected type {} for InList", datatype), }, ColumnarValue::Array(_) => { @@ -99,6 +120,103 @@ macro_rules! make_contains { }}; } +macro_rules! 
make_contains_primitive { + ($ARRAY:expr, $LIST_VALUES:expr, $NEGATED:expr, $SCALAR_VALUE:ident, $ARRAY_TYPE:ident) => {{ + let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap(); + + let contains_null = $LIST_VALUES + .iter() + .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null())); + let values = $LIST_VALUES + .iter() + .flat_map(|expr| match expr { + ColumnarValue::Scalar(s) => match s { + ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v), + ScalarValue::$SCALAR_VALUE(None) => None, + ScalarValue::Utf8(None) => None, + datatype => unimplemented!("Unexpected type {} for InList", datatype), + }, + ColumnarValue::Array(_) => { + unimplemented!("InList does not yet support nested columns.") + } + }) + .collect::>(); + + if $NEGATED { + if contains_null { + Ok(ColumnarValue::Array(Arc::new( + array + .iter() + .map(|x| match x.map(|v| !values.contains(&v)) { + Some(true) => None, + x => x, + }) + .collect::(), + ))) + } else { + Ok(ColumnarValue::Array(Arc::new( + not_in_list_primitive(array, &values)?, + ))) + } + } else { + if contains_null { + Ok(ColumnarValue::Array(Arc::new( + array + .iter() + .map(|x| match x.map(|v| values.contains(&v)) { + Some(false) => None, + x => x, + }) + .collect::(), + ))) + } else { + Ok(ColumnarValue::Array(Arc::new(in_list_primitive( + array, &values, + )?))) + } + } + }}; +} + +// whether each value on the left (can be null) is contained in the non-null list +fn in_list_primitive( + array: &PrimitiveArray, + values: &[::Native], +) -> Result { + compare_op_scalar!( + array, + values, + |x, v: &[::Native]| v.contains(&x) + ) +} + +// whether each value on the left (can be null) is contained in the non-null list +fn not_in_list_primitive( + array: &PrimitiveArray, + values: &[::Native], +) -> Result { + compare_op_scalar!( + array, + values, + |x, v: &[::Native]| !v.contains(&x) + ) +} + +// whether each value on the left (can be null) is contained in the non-null list +fn in_list_utf8( + array: &GenericStringArray, + values: &[&str], +) -> Result { + compare_op_scalar!(array, values, |x, v: &[&str]| v.contains(&x)) +} + +fn not_in_list_utf8( + array: &GenericStringArray, + values: &[&str], +) -> Result { + compare_op_scalar!(array, values, |x, v: &[&str]| !v.contains(&x)) +} + impl InListExpr { /// Create a new InList expression pub fn new( @@ -141,21 +259,17 @@ impl InListExpr { .downcast_ref::>() .unwrap(); - let mut contains_null = false; + let contains_null = list_values + .iter() + .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null())); let values = list_values .iter() .flat_map(|expr| match expr { ColumnarValue::Scalar(s) => match s { ScalarValue::Utf8(Some(v)) => Some(v.as_str()), - ScalarValue::Utf8(None) => { - contains_null = true; - None - } + ScalarValue::Utf8(None) => None, ScalarValue::LargeUtf8(Some(v)) => Some(v.as_str()), - ScalarValue::LargeUtf8(None) => { - contains_null = true; - None - } + ScalarValue::LargeUtf8(None) => None, datatype => unimplemented!("Unexpected type {} for InList", datatype), }, ColumnarValue::Array(_) => { @@ -164,33 +278,37 @@ impl InListExpr { }) .collect::>(); - Ok(ColumnarValue::Array(Arc::new( - array - .iter() - .map(|x| { - let contains = x.map(|x| values.contains(&x)); - match contains { - Some(true) => { - if negated { - Some(false) - } else { - Some(true) - } - } - Some(false) => { - if contains_null { - None - } else if negated { - Some(true) - } else { - Some(false) - } - } - None => None, - } - }) - .collect::(), - ))) + if negated { + if contains_null { + 
Ok(ColumnarValue::Array(Arc::new( + array + .iter() + .map(|x| match x.map(|v| !values.contains(&v)) { + Some(true) => None, + x => x, + }) + .collect::(), + ))) + } else { + Ok(ColumnarValue::Array(Arc::new(not_in_list_utf8( + array, &values, + )?))) + } + } else if contains_null { + Ok(ColumnarValue::Array(Arc::new( + array + .iter() + .map(|x| match x.map(|v| values.contains(&v)) { + Some(false) => None, + x => x, + }) + .collect::(), + ))) + } else { + Ok(ColumnarValue::Array(Arc::new(in_list_utf8( + array, &values, + )?))) + } } } @@ -234,34 +352,94 @@ impl PhysicalExpr for InListExpr { match value_data_type { DataType::Float32 => { - make_contains!(array, list_values, self.negated, Float32, Float32Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Float32, + Float32Array + ) } DataType::Float64 => { - make_contains!(array, list_values, self.negated, Float64, Float64Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Float64, + Float64Array + ) } DataType::Int16 => { - make_contains!(array, list_values, self.negated, Int16, Int16Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Int16, + Int16Array + ) } DataType::Int32 => { - make_contains!(array, list_values, self.negated, Int32, Int32Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Int32, + Int32Array + ) } DataType::Int64 => { - make_contains!(array, list_values, self.negated, Int64, Int64Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Int64, + Int64Array + ) } DataType::Int8 => { - make_contains!(array, list_values, self.negated, Int8, Int8Array) + make_contains_primitive!( + array, + list_values, + self.negated, + Int8, + Int8Array + ) } DataType::UInt16 => { - make_contains!(array, list_values, self.negated, UInt16, UInt16Array) + make_contains_primitive!( + array, + list_values, + self.negated, + UInt16, + UInt16Array + ) } DataType::UInt32 => { - make_contains!(array, list_values, self.negated, UInt32, UInt32Array) + make_contains_primitive!( + array, + list_values, + self.negated, + UInt32, + UInt32Array + ) } DataType::UInt64 => { - make_contains!(array, list_values, self.negated, UInt64, UInt64Array) + make_contains_primitive!( + array, + list_values, + self.negated, + UInt64, + UInt64Array + ) } DataType::UInt8 => { - make_contains!(array, list_values, self.negated, UInt8, UInt8Array) + make_contains_primitive!( + array, + list_values, + self.negated, + UInt8, + UInt8Array + ) } DataType::Boolean => { make_contains!(array, list_values, self.negated, Boolean, BooleanArray) @@ -270,9 +448,10 @@ impl PhysicalExpr for InListExpr { DataType::LargeUtf8 => { self.compare_utf8::(array, list_values, self.negated) } - datatype => { - unimplemented!("InList does not support datatype {:?}.", datatype) - } + datatype => Result::Err(DataFusionError::NotImplemented(format!( + "InList does not support datatype {:?}.", + datatype + ))), } } } From 3908b7b172c217e7a73d17af5f38d243df519fae Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Aug 2021 08:00:51 -0400 Subject: [PATCH 315/329] Add test for window functions on dictionary (#823) --- datafusion/tests/sql.rs | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 379cad6233292..19ed1b3cc1538 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -3163,22 +3163,14 @@ async fn query_group_on_null_multi_col() -> Result<()> { async fn 
query_on_string_dictionary() -> Result<()> { // Test to ensure DataFusion can operate on dictionary types // Use StringDictionary (32 bit indexes = keys) - let field_type = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - let schema = Arc::new(Schema::new(vec![Field::new("d1", field_type, true)])); + let array = vec![Some("one"), None, Some("three")] + .into_iter() + .collect::>(); - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = StringBuilder::new(10); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); + let batch = + RecordBatch::try_from_iter(vec![("d1", Arc::new(array) as ArrayRef)]).unwrap(); - builder.append("one")?; - builder.append_null()?; - builder.append("three")?; - let array = Arc::new(builder.finish()); - - let data = RecordBatch::try_new(schema.clone(), vec![array])?; - - let table = MemTable::try_new(schema, vec![vec![data]])?; + let table = MemTable::try_new(batch.schema(), vec![vec![batch]])?; let mut ctx = ExecutionContext::new(); ctx.register_table("test", Arc::new(table))?; @@ -3219,6 +3211,13 @@ async fn query_on_string_dictionary() -> Result<()> { let expected = vec![vec!["NULL", "1"], vec!["one", "1"], vec!["three", "1"]]; assert_eq!(expected, actual); + // window functions + let sql = "SELECT d1, row_number() OVER (partition by d1) FROM test"; + let mut actual = execute(&mut ctx, sql).await; + actual.sort(); + let expected = vec![vec!["NULL", "1"], vec!["one", "1"], vec!["three", "1"]]; + assert_eq!(expected, actual); + Ok(()) } From db214bc8bff682ea295b8c77807dc0a38f274f66 Mon Sep 17 00:00:00 2001 From: Mike Seddon Date: Fri, 6 Aug 2021 03:37:16 +1000 Subject: [PATCH 316/329] Better join order resolution logic (#797) * better join order resolution logic * simplify by using existing methods * remove unneeded clone and minor formatting --- datafusion/src/logical_plan/builder.rs | 65 ++++++++++++++++++++++---- datafusion/tests/sql.rs | 11 +++++ 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/datafusion/src/logical_plan/builder.rs b/datafusion/src/logical_plan/builder.rs index a742f346207ad..0dfc1e7aa0480 100644 --- a/datafusion/src/logical_plan/builder.rs +++ b/datafusion/src/logical_plan/builder.rs @@ -287,16 +287,63 @@ impl LogicalPlanBuilder { .into_iter() .zip(join_keys.1.into_iter()) .map(|(l, r)| { - let mut swap = false; let l = l.into(); - let left_key = l.clone().normalize(&self.plan).or_else(|_| { - swap = true; - l.normalize(right) - }); - if swap { - (r.into().normalize(&self.plan), left_key) - } else { - (left_key, r.into().normalize(right)) + let r = r.into(); + + match (&l.relation, &r.relation) { + (Some(lr), Some(rr)) => { + let l_is_left = + self.plan.schema().field_with_qualified_name(lr, &l.name); + let l_is_right = + right.schema().field_with_qualified_name(lr, &l.name); + let r_is_left = + self.plan.schema().field_with_qualified_name(rr, &r.name); + let r_is_right = + right.schema().field_with_qualified_name(rr, &r.name); + + match (l_is_left, l_is_right, r_is_left, r_is_right) { + (_, Ok(_), Ok(_), _) => (Ok(r), Ok(l)), + (Ok(_), _, _, Ok(_)) => (Ok(l), Ok(r)), + _ => (l.normalize(&self.plan), r.normalize(right)), + } + } + (Some(lr), None) => { + let l_is_left = + self.plan.schema().field_with_qualified_name(lr, &l.name); + let l_is_right = + right.schema().field_with_qualified_name(lr, &l.name); + + match (l_is_left, l_is_right) { + (Ok(_), _) => (Ok(l), r.normalize(right)), + (_, Ok(_)) => (r.normalize(&self.plan), Ok(l)), + _ => 
(l.normalize(&self.plan), r.normalize(right)), + } + } + (None, Some(rr)) => { + let r_is_left = + self.plan.schema().field_with_qualified_name(rr, &r.name); + let r_is_right = + right.schema().field_with_qualified_name(rr, &r.name); + + match (r_is_left, r_is_right) { + (Ok(_), _) => (Ok(r), l.normalize(right)), + (_, Ok(_)) => (l.normalize(&self.plan), Ok(r)), + _ => (l.normalize(&self.plan), r.normalize(right)), + } + } + (None, None) => { + let mut swap = false; + let left_key = + l.clone().normalize(&self.plan).or_else(|_| { + swap = true; + l.normalize(right) + }); + if swap { + (r.normalize(&self.plan), left_key) + } else { + (left_key, r.normalize(right)) + } + } } }) .unzip(); diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 19ed1b3cc1538..0c33bd4772668 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1730,6 +1730,17 @@ async fn equijoin() -> Result<()> { let actual = execute(&mut ctx, sql).await; assert_eq!(expected, actual); } + + let mut ctx = create_join_context_qualified()?; + let equivalent_sql = [ + "SELECT t1.a, t2.b FROM t1 INNER JOIN t2 ON t1.a = t2.a ORDER BY t1.a", + "SELECT t1.a, t2.b FROM t1 INNER JOIN t2 ON t2.a = t1.a ORDER BY t1.a", + ]; + let expected = vec![vec!["1", "100"], vec!["2", "200"], vec!["4", "400"]]; + for sql in equivalent_sql.iter() { + let actual = execute(&mut ctx, sql).await; + assert_eq!(expected, actual); + } Ok(()) } From 01a51aceab09d96931a151d17257e54a57c3e44f Mon Sep 17 00:00:00 2001 From: baishen Date: Fri, 6 Aug 2021 09:35:03 -0500 Subject: [PATCH 317/329] Optimize min/max queries with table statistics (#719) * support statistics max min * fix test * add test * make pub(crate) * update arrow vertion to 5.1 * fix clippy --- datafusion/Cargo.toml | 4 +- .../src/optimizer/aggregate_statistics.rs | 166 +++++++++++- .../src/physical_plan/expressions/min_max.rs | 6 +- .../src/physical_plan/expressions/mod.rs | 1 + datafusion/src/physical_plan/parquet.rs | 252 ++++++++++++++++-- datafusion/src/scalar.rs | 2 + 6 files changed, 400 insertions(+), 31 deletions(-) diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 2f1e997c3596f..bfb3a93e3249e 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -46,8 +46,8 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = "0.7" hashbrown = "0.11" -arrow = { version = "5.0", features = ["prettyprint"] } -parquet = { version = "5.0", features = ["arrow"] } +arrow = { version = "5.1", features = ["prettyprint"] } +parquet = { version = "5.1", features = ["arrow"] } sqlparser = "0.9.0" paste = "^1.0" num_cpus = "1.13.0" diff --git a/datafusion/src/optimizer/aggregate_statistics.rs b/datafusion/src/optimizer/aggregate_statistics.rs index a20eafc688b8d..e2d9054642019 100644 --- a/datafusion/src/optimizer/aggregate_statistics.rs +++ b/datafusion/src/optimizer/aggregate_statistics.rs @@ -16,6 +16,7 @@ // under the License. //! Utilizing exact statistics from sources to avoid scanning data +use std::collections::HashMap; use std::{sync::Arc, vec}; use crate::{ @@ -55,12 +56,40 @@ impl OptimizerRule for AggregateStatistics { // aggregations that can not be replaced // using statistics let mut agg = vec![]; + let mut max_values = HashMap::new(); + let mut min_values = HashMap::new(); + // expressions that can be replaced by constants let mut projections = vec![]; if let Some(num_rows) = match input.as_ref() { - LogicalPlan::TableScan { source, .. 
} - if source.has_exact_statistics() => - { + LogicalPlan::TableScan { + table_name, source, .. + } if source.has_exact_statistics() => { + let schema = source.schema(); + let fields = schema.fields(); + if let Some(column_statistics) = + source.statistics().column_statistics + { + if fields.len() == column_statistics.len() { + for (i, field) in fields.iter().enumerate() { + if let Some(max_value) = + column_statistics[i].max_value.clone() + { + let max_key = + format!("{}.{}", table_name, field.name()); + max_values.insert(max_key, max_value); + } + if let Some(min_value) = + column_statistics[i].min_value.clone() + { + let min_key = + format!("{}.{}", table_name, field.name()); + min_values.insert(min_key, min_value); + } + } + } + } + source.statistics().num_rows } _ => None, @@ -81,6 +110,60 @@ impl OptimizerRule for AggregateStatistics { "COUNT(Uint8(1))".to_string(), )); } + Expr::AggregateFunction { + fun: AggregateFunction::Max, + args, + .. + } => match &args[0] { + Expr::Column(c) => match max_values.get(&c.flat_name()) { + Some(max_value) => { + if !max_value.is_null() { + let name = format!("MAX({})", c.name); + projections.push(Expr::Alias( + Box::new(Expr::Literal( + max_value.clone(), + )), + name, + )); + } else { + agg.push(expr.clone()); + } + } + None => { + agg.push(expr.clone()); + } + }, + _ => { + agg.push(expr.clone()); + } + }, + Expr::AggregateFunction { + fun: AggregateFunction::Min, + args, + .. + } => match &args[0] { + Expr::Column(c) => match min_values.get(&c.flat_name()) { + Some(min_value) => { + if !min_value.is_null() { + let name = format!("MIN({})", c.name); + projections.push(Expr::Alias( + Box::new(Expr::Literal( + min_value.clone(), + )), + name, + )); + } else { + agg.push(expr.clone()); + } + } + None => { + agg.push(expr.clone()); + } + }, + _ => { + agg.push(expr.clone()); + } + }, _ => { agg.push(expr.clone()); } @@ -159,13 +242,18 @@ mod tests { use crate::logical_plan::LogicalPlan; use crate::optimizer::aggregate_statistics::AggregateStatistics; use crate::optimizer::optimizer::OptimizerRule; + use crate::scalar::ScalarValue; use crate::{ - datasource::{datasource::Statistics, TableProvider}, + datasource::{ + datasource::{ColumnStatistics, Statistics}, + TableProvider, + }, logical_plan::Expr, }; struct TestTableProvider { num_rows: usize, + column_statistics: Vec, is_exact: bool, } @@ -186,11 +274,11 @@ mod tests { ) -> Result> { unimplemented!() } - fn statistics(&self) -> crate::datasource::datasource::Statistics { + fn statistics(&self) -> Statistics { Statistics { num_rows: Some(self.num_rows), total_byte_size: None, - column_statistics: None, + column_statistics: Some(self.column_statistics.clone()), } } fn has_exact_statistics(&self) -> bool { @@ -206,6 +294,7 @@ mod tests { "test", Arc::new(TestTableProvider { num_rows: 100, + column_statistics: Vec::new(), is_exact: true, }), ) @@ -231,6 +320,7 @@ mod tests { "test", Arc::new(TestTableProvider { num_rows: 100, + column_statistics: Vec::new(), is_exact: false, }), ) @@ -256,6 +346,7 @@ mod tests { "test", Arc::new(TestTableProvider { num_rows: 100, + column_statistics: Vec::new(), is_exact: true, }), ) @@ -282,6 +373,7 @@ mod tests { "test", Arc::new(TestTableProvider { num_rows: 100, + column_statistics: Vec::new(), is_exact: true, }), ) @@ -307,6 +399,7 @@ mod tests { "test", Arc::new(TestTableProvider { num_rows: 100, + column_statistics: Vec::new(), is_exact: true, }), ) @@ -325,6 +418,67 @@ mod tests { Ok(()) } + #[test] + fn optimize_max_min_using_statistics() -> Result<()> { + 
use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + + let column_statistic = ColumnStatistics { + null_count: None, + max_value: Some(ScalarValue::from(100_i64)), + min_value: Some(ScalarValue::from(1_i64)), + distinct_count: None, + }; + let column_statistics = vec![column_statistic]; + + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + column_statistics, + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("select max(a), min(a) from test") + .unwrap(); + let expected = "\ + Projection: #MAX(test.a), #MIN(test.a)\ + \n Projection: Int64(100) AS MAX(a), Int64(1) AS MIN(a)\ + \n EmptyRelation"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + + #[test] + fn optimize_max_min_not_using_statistics() -> Result<()> { + use crate::execution::context::ExecutionContext; + let mut ctx = ExecutionContext::new(); + ctx.register_table( + "test", + Arc::new(TestTableProvider { + num_rows: 100, + column_statistics: Vec::new(), + is_exact: true, + }), + ) + .unwrap(); + + let plan = ctx + .create_logical_plan("select max(a), min(a) from test") + .unwrap(); + let expected = "\ + Projection: #MAX(test.a), #MIN(test.a)\ + \n Aggregate: groupBy=[[]], aggr=[[MAX(#test.a), MIN(#test.a)]]\ + \n TableScan: test projection=None"; + + assert_optimized_plan_eq(&plan, expected); + Ok(()) + } + fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { let opt = AggregateStatistics::new(); let optimized_plan = opt.optimize(plan, &ExecutionProps::new()).unwrap(); diff --git a/datafusion/src/physical_plan/expressions/min_max.rs b/datafusion/src/physical_plan/expressions/min_max.rs index 6bb4c5b21b861..21cf95d6d626f 100644 --- a/datafusion/src/physical_plan/expressions/min_max.rs +++ b/datafusion/src/physical_plan/expressions/min_max.rs @@ -314,8 +314,9 @@ fn max(lhs: &ScalarValue, rhs: &ScalarValue) -> Result { min_max!(lhs, rhs, max) } +/// An accumulator to compute the maximum value #[derive(Debug)] -struct MaxAccumulator { +pub(crate) struct MaxAccumulator { max: ScalarValue, } @@ -419,8 +420,9 @@ impl AggregateExpr for Min { } } +/// An accumulator to compute the minimum value #[derive(Debug)] -struct MinAccumulator { +pub(crate) struct MinAccumulator { min: ScalarValue, } diff --git a/datafusion/src/physical_plan/expressions/mod.rs b/datafusion/src/physical_plan/expressions/mod.rs index bd3dab65b05de..d60a871baa800 100644 --- a/datafusion/src/physical_plan/expressions/mod.rs +++ b/datafusion/src/physical_plan/expressions/mod.rs @@ -62,6 +62,7 @@ pub use is_null::{is_null, IsNullExpr}; pub use lead_lag::{lag, lead}; pub use literal::{lit, Literal}; pub use min_max::{Max, Min}; +pub(crate) use min_max::{MaxAccumulator, MinAccumulator}; pub use negative::{negative, NegativeExpr}; pub use not::{not, NotExpr}; pub use nth_value::NthValue; diff --git a/datafusion/src/physical_plan/parquet.rs b/datafusion/src/physical_plan/parquet.rs index f606b5315281e..ec5611f962922 100644 --- a/datafusion/src/physical_plan/parquet.rs +++ b/datafusion/src/physical_plan/parquet.rs @@ -36,7 +36,7 @@ use crate::{ use arrow::{ array::ArrayRef, - datatypes::{Schema, SchemaRef}, + datatypes::{DataType, Schema, SchemaRef}, error::{ArrowError, Result as ArrowResult}, record_batch::RecordBatch, }; @@ -62,6 +62,8 @@ use async_trait::async_trait; use futures::stream::{Stream, StreamExt}; use super::SQLMetric; +use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator}; +use crate::physical_plan::Accumulator; /// 
Execution plan for scanning one or more Parquet partitions #[derive(Debug, Clone)] @@ -173,8 +175,12 @@ impl ParquetExec { let filenames: Vec = filenames.iter().map(|s| s.to_string()).collect(); let chunks = split_files(&filenames, max_concurrency); let mut num_rows = 0; + let mut num_fields = 0; + let mut fields = Vec::new(); let mut total_byte_size = 0; let mut null_counts = Vec::new(); + let mut max_values: Vec> = Vec::new(); + let mut min_values: Vec> = Vec::new(); let mut limit_exhausted = false; for chunk in chunks { let mut filenames: Vec = @@ -188,11 +194,23 @@ impl ParquetExec { let meta_data = arrow_reader.get_metadata(); // collect all the unique schemas in this data set let schema = arrow_reader.get_schema()?; - let num_fields = schema.fields().len(); if schemas.is_empty() || schema != schemas[0] { + fields = schema.fields().to_vec(); + num_fields = schema.fields().len(); + null_counts = vec![0; num_fields]; + max_values = schema + .fields() + .iter() + .map(|field| MaxAccumulator::try_new(field.data_type()).ok()) + .collect::>(); + min_values = schema + .fields() + .iter() + .map(|field| MinAccumulator::try_new(field.data_type()).ok()) + .collect::>(); schemas.push(schema); - null_counts = vec![0; num_fields] } + for row_group_meta in meta_data.row_groups() { num_rows += row_group_meta.num_rows(); total_byte_size += row_group_meta.total_byte_size(); @@ -207,20 +225,167 @@ impl ParquetExec { for (i, cnt) in columns_null_counts.enumerate() { null_counts[i] += cnt } + + for (i, column) in row_group_meta.columns().iter().enumerate() { + if let Some(stat) = column.statistics() { + match stat { + ParquetStatistics::Boolean(s) => { + if let DataType::Boolean = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ + ScalarValue::Boolean(Some(*s.max())), + ]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ + ScalarValue::Boolean(Some(*s.min())), + ]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Int32(s) => { + if let DataType::Int32 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ + ScalarValue::Int32(Some(*s.max())), + ]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ + ScalarValue::Int32(Some(*s.min())), + ]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Int64(s) => { + if let DataType::Int64 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ + ScalarValue::Int64(Some(*s.max())), + ]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ + ScalarValue::Int64(Some(*s.min())), + ]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Float(s) => { + if let DataType::Float32 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ + ScalarValue::Float32(Some(*s.max())), + ]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ + ScalarValue::Float32(Some(*s.min())), + ]) { + Ok(_) 
=> {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + ParquetStatistics::Double(s) => { + if let DataType::Float64 = fields[i].data_type() { + if s.has_min_max_set() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[ + ScalarValue::Float64(Some(*s.max())), + ]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[ + ScalarValue::Float64(Some(*s.min())), + ]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } + } + } + _ => {} + } + } + } + if limit.map(|x| num_rows >= x as i64).unwrap_or(false) { limit_exhausted = true; break; } } } - - let column_stats = null_counts - .iter() - .map(|null_count| ColumnStatistics { - null_count: Some(*null_count as usize), - max_value: None, - min_value: None, - distinct_count: None, + let column_stats = (0..num_fields) + .map(|i| { + let max_value = match &max_values[i] { + Some(max_value) => max_value.evaluate().ok(), + None => None, + }; + let min_value = match &min_values[i] { + Some(min_value) => min_value.evaluate().ok(), + None => None, + }; + ColumnStatistics { + null_count: Some(null_counts[i] as usize), + max_value, + min_value, + distinct_count: None, + } }) .collect(); @@ -301,7 +466,17 @@ impl ParquetExec { let mut num_rows: Option = None; let mut total_byte_size: Option = None; let mut null_counts: Vec = vec![0; schema.fields().len()]; - let mut has_null_counts = false; + let mut has_statistics = false; + let mut max_values = schema + .fields() + .iter() + .map(|field| MaxAccumulator::try_new(field.data_type()).ok()) + .collect::>(); + let mut min_values = schema + .fields() + .iter() + .map(|field| MinAccumulator::try_new(field.data_type()).ok()) + .collect::>(); for part in &partitions { if let Some(n) = part.statistics.num_rows { num_rows = Some(num_rows.unwrap_or(0) + n) @@ -312,22 +487,57 @@ impl ParquetExec { if let Some(x) = &part.statistics.column_statistics { let part_nulls: Vec> = x.iter().map(|c| c.null_count).collect(); - has_null_counts = true; + has_statistics = true; + + let part_max_values: Vec> = + x.iter().map(|c| c.max_value.clone()).collect(); + let part_min_values: Vec> = + x.iter().map(|c| c.min_value.clone()).collect(); for &i in projection.iter() { null_counts[i] = part_nulls[i].unwrap_or(0); + if let Some(part_max_value) = part_max_values[i].clone() { + if let Some(max_value) = &mut max_values[i] { + match max_value.update(&[part_max_value]) { + Ok(_) => {} + Err(_) => { + max_values[i] = None; + } + } + } + } + if let Some(part_min_value) = part_min_values[i].clone() { + if let Some(min_value) = &mut min_values[i] { + match min_value.update(&[part_min_value]) { + Ok(_) => {} + Err(_) => { + min_values[i] = None; + } + } + } + } } } } - let column_stats = if has_null_counts { + + let column_stats = if has_statistics { Some( - null_counts - .iter() - .map(|null_count| ColumnStatistics { - null_count: Some(*null_count), - distinct_count: None, - max_value: None, - min_value: None, + (0..schema.fields().len()) + .map(|i| { + let max_value = match &max_values[i] { + Some(max_value) => max_value.evaluate().ok(), + None => None, + }; + let min_value = match &min_values[i] { + Some(min_value) => min_value.evaluate().ok(), + None => None, + }; + ColumnStatistics { + null_count: Some(null_counts[i] as usize), + max_value, + min_value, + distinct_count: None, + } }) .collect(), ) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 90c9bf7369d4a..3896055e9233b 
100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -402,6 +402,8 @@ impl ScalarValue { | ScalarValue::Int64(None) | ScalarValue::Float32(None) | ScalarValue::Float64(None) + | ScalarValue::Date32(None) + | ScalarValue::Date64(None) | ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::List(None, _) From 5a7bbccf9580897443aac85db6df04608a9d84d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Fri, 6 Aug 2021 20:53:14 +0200 Subject: [PATCH 318/329] Expose ExecutionContext.register_csv to the python bindings (#524) * Expose register_csv * Validate delimiter * Fix tests * Pass schema * unused imports * add linting * Update deps * Restore venv --- .github/workflows/python_test.yaml | 19 +- python/requirements.in | 10 +- python/requirements.txt | 285 +++++++++++++++------------- python/src/context.rs | 45 +++++ python/src/to_rust.rs | 9 + python/tests/generic.py | 13 +- python/tests/test_math_functions.py | 16 +- python/tests/test_pa_types.py | 5 +- python/tests/test_sql.py | 65 ++++++- 9 files changed, 303 insertions(+), 164 deletions(-) diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml index ebf5e9f594c0b..8d2eb85809c49 100644 --- a/.github/workflows/python_test.yaml +++ b/.github/workflows/python_test.yaml @@ -41,18 +41,21 @@ jobs: - uses: actions/setup-python@v2 with: python-version: "3.9" - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests + - name: Create Virtualenv run: | - cd python/ - python -m venv venv source venv/bin/activate - - pip install -r requirements.txt + pip install -r python/requirements.txt + - name: Run Linters + run: | + source venv/bin/activate + flake8 python + black --line-length 79 --check python + - name: Run tests + run: | + source venv/bin/activate + cd python maturin develop - pytest -v . env: CARGO_HOME: "/home/runner/.cargo" diff --git a/python/requirements.in b/python/requirements.in index 5f145dc3b9276..7e54705fc8ab2 100644 --- a/python/requirements.in +++ b/python/requirements.in @@ -14,12 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-maturin -toml -pyarrow -pytest + black +flake8 isort +maturin mypy numpy pandas +pyarrow +pytest +toml diff --git a/python/requirements.txt b/python/requirements.txt index b7f0080f7296b..2176988ef1232 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with python 3.9 +# This file is autogenerated by pip-compile with python 3.8 # To update, run: # -# pip-compile --generate-hashes +# pip-compile --generate-hashes requirements.in # appdirs==1.4.4 \ --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \ @@ -20,27 +20,35 @@ click==8.0.1 \ --hash=sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a \ --hash=sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6 # via black +flake8==3.9.2 \ + --hash=sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b \ + --hash=sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907 + # via -r requirements.in iniconfig==1.1.1 \ --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 # via pytest -isort==5.9.2 \ - --hash=sha256:eed17b53c3e7912425579853d078a0832820f023191561fcee9d7cae424e0813 \ - --hash=sha256:f65ce5bd4cbc6abdfbe29afc2f0245538ab358c14590912df638033f157d555e +isort==5.9.3 \ + --hash=sha256:9c2ea1e62d871267b78307fe511c0838ba0da28698c5732d54e2790bf3ba9899 \ + --hash=sha256:e17d6e2b81095c9db0a03a8025a957f334d6ea30b26f9ec70805411e5c7c81f2 # via -r requirements.in -maturin==0.11.1 \ - --hash=sha256:1d8a276b4c4ac74ecf9624ebc718982cdd0f86581d6338c877d7eb2833b89a13 \ - --hash=sha256:56b1dc8651a40d024a0ac59720ffeb61a41059fcd836f1742ad828b78650fc1a \ - --hash=sha256:70b35e77e60772002e279e87e936dd5467a7952b7ccc37054a1e478d3b25c279 \ - --hash=sha256:7b9f66a5425cf9f04276effbe31f1fea331a1bd742e1726effd72e48a98da0e1 \ - --hash=sha256:8921ab6dccde53625075b5d24d2e817bf36abaeef4387237fedb4e298e73e77d \ - --hash=sha256:a2a22ea9c8448796ce8078c7706623b212f2940ac79adaebe17000ada3d6647a \ - --hash=sha256:b322f36ee7ff67870fe6d0b5fcd41226a7eca05e6819ae812875ed5f2116038d \ - --hash=sha256:ba23ac9ca8d4a23ad794b9d966f09959dc5d511afd23992afc93b720dbe0f676 \ - --hash=sha256:c7dca1e2d8eabeb3dbd9b08a182be85621b2519a9968c728b9db73023bbdd823 \ - --hash=sha256:e1598a844fdc7b5093749feb0b373fb2f7545033bb1f00779cfbf173906e374a \ - --hash=sha256:e60308dd43eb5f763126d0651827683141b12878541c6ede008f77ef655d1343 +maturin==0.11.2 \ + --hash=sha256:14afc56be161e52bad8211eba84d8f8e8fad90a86d1c1894a703a8801c6d81df \ + --hash=sha256:2631a52d4936a550baa645921fe290749df3bc3c8be85255e650cb3b419dd445 \ + --hash=sha256:27cbc80e9adb19957a4006274edddf6184e639b0c096dfa2f8fdd652401b2b69 \ + --hash=sha256:520d3aaf4e0a8d520f9498ffd7f106641d5e5782cbf26d83da66c26c50b0a7c4 \ + --hash=sha256:5c492ea891602088528485f6c257ffbe02143ea651b3287dcaccf3d542ebd6f7 \ + --hash=sha256:8f520eccda84f028bce5556b2c8b6dce5eced105734fe8f645c72937f05cf054 \ + --hash=sha256:99aed60b0427233e9d7005eea3d42887455c3be1e52a3b50246a86c4df215cd1 \ + --hash=sha256:c13392b4e6913c35655bbbf62ed4f65a340c1501734ab172410b03068c4633f0 \ + --hash=sha256:d2ed778b67317efd73f9ce580a6dd6569474e5a9bc3bfff4aa4d7e9a0bd9f888 \ + --hash=sha256:de439e1572eb0711caeb6e4f4aec692829c7cc856b264861c73219b16be8b8d4 \ + --hash=sha256:f14ddc72a3a37c72b8449704816e670f557e6d9148d35d342750dded7709eae4 # via -r requirements.in +mccabe==0.6.1 \ + 
--hash=sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42 \ + --hash=sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f + # via flake8 mypy==0.910 \ --hash=sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9 \ --hash=sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a \ @@ -72,35 +80,35 @@ mypy-extensions==0.4.3 \ # via # black # mypy -numpy==1.21.0 \ - --hash=sha256:1a784e8ff7ea2a32e393cc53eb0003eca1597c7ca628227e34ce34eb11645a0e \ - --hash=sha256:2ba579dde0563f47021dcd652253103d6fd66165b18011dce1a0609215b2791e \ - --hash=sha256:3537b967b350ad17633b35c2f4b1a1bbd258c018910b518c30b48c8e41272717 \ - --hash=sha256:3c40e6b860220ed862e8097b8f81c9af6d7405b723f4a7af24a267b46f90e461 \ - --hash=sha256:598fe100b2948465cf3ed64b1a326424b5e4be2670552066e17dfaa67246011d \ - --hash=sha256:620732f42259eb2c4642761bd324462a01cdd13dd111740ce3d344992dd8492f \ - --hash=sha256:709884863def34d72b183d074d8ba5cfe042bc3ff8898f1ffad0209161caaa99 \ - --hash=sha256:75579acbadbf74e3afd1153da6177f846212ea2a0cc77de53523ae02c9256513 \ - --hash=sha256:7c55407f739f0bfcec67d0df49103f9333edc870061358ac8a8c9e37ea02fcd2 \ - --hash=sha256:a1f2fb2da242568af0271455b89aee0f71e4e032086ee2b4c5098945d0e11cf6 \ - --hash=sha256:a290989cd671cd0605e9c91a70e6df660f73ae87484218e8285c6522d29f6e38 \ - --hash=sha256:ac4fd578322842dbda8d968e3962e9f22e862b6ec6e3378e7415625915e2da4d \ - --hash=sha256:ad09f55cc95ed8d80d8ab2052f78cc21cb231764de73e229140d81ff49d8145e \ - --hash=sha256:b9205711e5440954f861ceeea8f1b415d7dd15214add2e878b4d1cf2bcb1a914 \ - --hash=sha256:bba474a87496d96e61461f7306fba2ebba127bed7836212c360f144d1e72ac54 \ - --hash=sha256:bebab3eaf0641bba26039fb0b2c5bf9b99407924b53b1ea86e03c32c64ef5aef \ - --hash=sha256:cc367c86eb87e5b7c9592935620f22d13b090c609f1b27e49600cd033b529f54 \ - --hash=sha256:ccc6c650f8700ce1e3a77668bb7c43e45c20ac06ae00d22bdf6760b38958c883 \ - --hash=sha256:cf680682ad0a3bef56dae200dbcbac2d57294a73e5b0f9864955e7dd7c2c2491 \ - --hash=sha256:d2910d0a075caed95de1a605df00ee03b599de5419d0b95d55342e9a33ad1fb3 \ - --hash=sha256:d5caa946a9f55511e76446e170bdad1d12d6b54e17a2afe7b189112ed4412bb8 \ - --hash=sha256:d89b0dc7f005090e32bb4f9bf796e1dcca6b52243caf1803fdd2b748d8561f63 \ - --hash=sha256:d95d16204cd51ff1a1c8d5f9958ce90ae190be81d348b514f9be39f878b8044a \ - --hash=sha256:e4d5a86a5257843a18fb1220c5f1c199532bc5d24e849ed4b0289fb59fbd4d8f \ - --hash=sha256:e58ddb53a7b4959932f5582ac455ff90dcb05fac3f8dcc8079498d43afbbde6c \ - --hash=sha256:e80fe25cba41c124d04c662f33f6364909b985f2eb5998aaa5ae4b9587242cce \ - --hash=sha256:eda2829af498946c59d8585a9fd74da3f810866e05f8df03a86f70079c7531dd \ - --hash=sha256:fd0a359c1c17f00cb37de2969984a74320970e0ceef4808c32e00773b06649d9 +numpy==1.21.1 \ + --hash=sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33 \ + --hash=sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5 \ + --hash=sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1 \ + --hash=sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1 \ + --hash=sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac \ + --hash=sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4 \ + --hash=sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50 \ + --hash=sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6 \ + --hash=sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267 \ + 
--hash=sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172 \ + --hash=sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af \ + --hash=sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8 \ + --hash=sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2 \ + --hash=sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63 \ + --hash=sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1 \ + --hash=sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8 \ + --hash=sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16 \ + --hash=sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214 \ + --hash=sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd \ + --hash=sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68 \ + --hash=sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062 \ + --hash=sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e \ + --hash=sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f \ + --hash=sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b \ + --hash=sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd \ + --hash=sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671 \ + --hash=sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a \ + --hash=sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a # via # -r requirements.in # pandas @@ -109,26 +117,26 @@ packaging==21.0 \ --hash=sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7 \ --hash=sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14 # via pytest -pandas==1.3.0 \ - --hash=sha256:08eeff3da6a188e24db7f292b39a8ca9e073bf841fbbeadb946b3ad5c19d843e \ - --hash=sha256:1ff13eed501e07e7fb26a4ea18a846b6e5d7de549b497025601fd9ccb7c1d123 \ - --hash=sha256:522bfea92f3ef6207cadc7428bda1e7605dae0383b8065030e7b5d0266717b48 \ - --hash=sha256:7897326cae660eee69d501cbfa950281a193fcf407393965e1bc07448e1cc35a \ - --hash=sha256:798675317d0e4863a92a9a6bc5bd2490b5f6fef8c17b95f29e2e33f28bef9eca \ - --hash=sha256:7d3cd2c99faa94d717ca00ea489264a291ad7209453dffbf059bfb7971fd3a61 \ - --hash=sha256:823737830364d0e2af8c3912a28ba971296181a07950873492ed94e12d28c405 \ - --hash=sha256:872aa91e0f9ca913046ab639d4181a899f5e592030d954d28c2529b88756a736 \ - --hash=sha256:88864c1e28353b958b1f30e4193818519624ad9a1776921622a6a2a016d5d807 \ - --hash=sha256:92835113a67cbd34747c198d41f09f4b63f6fe11ca5643baebc7ab1e30e89e95 \ - --hash=sha256:98efc2d4983d5bb47662fe2d97b2c81b91566cb08b266490918b9c7d74a5ef64 \ - --hash=sha256:b10d7910ae9d7920a5ff7816d794d99acbc361f7b16a0f017d4fa83ced8cb55e \ - --hash=sha256:c554e6c9cf2d5ea1aba5979cc837b3649539ced0e18ece186f055450c86622e2 \ - --hash=sha256:c746876cdd8380be0c3e70966d4566855901ac9aaa5e4b9ccaa5ca5311457d11 \ - --hash=sha256:c81b8d91e9ae861eb4406b4e0f8d4dabbc105b9c479b3d1e921fba1d35b5b62a \ - --hash=sha256:e6b75091fa54a53db3927b4d1bc997c23c5ba6f87acdfe1ee5a92c38c6b2ed6a \ - --hash=sha256:ed4fc66f23fe17c93a5d439230ca2d6b5f8eac7154198d327dbe8a16d98f3f10 \ - --hash=sha256:f058c786e7b0a9e7fa5e0b9f4422e0ccdd3bf3aa3053c18d77ed2a459bd9a45a \ - --hash=sha256:fe7a549d10ca534797095586883a5c17d140d606747591258869c56e14d1b457 +pandas==1.3.1 \ + --hash=sha256:0c976e023ed580e60a82ccebdca8e1cc24d8b1fbb28175eb6521025c127dab66 \ + 
--hash=sha256:114c6789d15862508900a25cb4cb51820bfdd8595ea306bab3b53cd19f990b65 \ + --hash=sha256:1ee8418d0f936ff2216513aa03e199657eceb67690995d427a4a7ecd2e68f442 \ + --hash=sha256:22f3fcc129fb482ef44e7df2a594f0bd514ac45aabe50da1a10709de1b0f9d84 \ + --hash=sha256:23c7452771501254d2ae23e9e9dac88417de7e6eff3ce64ee494bb94dc88c300 \ + --hash=sha256:341935a594db24f3ff07d1b34d1d231786aa9adfa84b76eab10bf42907c8aed3 \ + --hash=sha256:45656cd59ae9745a1a21271a62001df58342b59c66d50754390066db500a8362 \ + --hash=sha256:527c43311894aff131dea99cf418cd723bfd4f0bcf3c3da460f3b57e52a64da5 \ + --hash=sha256:5c09a2538f0fddf3895070579082089ff4ae52b6cb176d8ec7a4dacf7e3676c1 \ + --hash=sha256:5d9acfca191140a518779d1095036d842d5e5bc8e8ad8b5eaad1aff90fe1870d \ + --hash=sha256:5ee927c70794e875a59796fab8047098aa59787b1be680717c141cd7873818ae \ + --hash=sha256:7150039e78a81eddd9f5a05363a11cadf90a4968aac6f086fd83e66cf1c8d1d6 \ + --hash=sha256:905fc3e0fcd86b0a9f1f97abee7d36894698d2592b22b859f08ea5a8fe3d3aab \ + --hash=sha256:9d06661c6eb741ae633ee1c57e8c432bb4203024e263fe1a077fa3fda7817fdb \ + --hash=sha256:9e1fe6722cbe27eb5891c1977bca62d456c19935352eea64d33956db46139364 \ + --hash=sha256:be12d77f7e03c40a2466ed00ccd1a5f20a574d3c622fe1516037faa31aa448aa \ + --hash=sha256:c28760932283d2c9f6fa5e53d2f77a514163b9e67fd0ee0879081be612567195 \ + --hash=sha256:e323028ab192fcfe1e8999c012a0fa96d066453bb354c7e7a4a267b25e73d3c8 \ + --hash=sha256:fdb3b33dde260b1766ea4d3c6b8fbf6799cee18d50a2a8bc534cf3550b7c819a # via -r requirements.in pathspec==0.9.0 \ --hash=sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a \ @@ -142,33 +150,44 @@ py==1.10.0 \ --hash=sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3 \ --hash=sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a # via pytest -pyarrow==4.0.1 \ - --hash=sha256:04be0f7cb9090bd029b5b53bed628548fef569e5d0b5c6cd7f6d0106dbbc782d \ - --hash=sha256:0fde9c7a3d5d37f3fe5d18c4ed015e8f585b68b26d72a10d7012cad61afe43ff \ - --hash=sha256:11517f0b4f4acbab0c37c674b4d1aad3c3dfea0f6b1bb322e921555258101ab3 \ - --hash=sha256:150db335143edd00d3ec669c7c8167d401c4aa0a290749351c80bbf146892b2e \ - --hash=sha256:24040a20208e9b16ba7b284624ebfe67e40f5c40b5dc8d874da322ac0053f9d3 \ - --hash=sha256:33c457728a1ce825b80aa8c8ed573709f1efe72003d45fa6fdbb444de9cc0b74 \ - --hash=sha256:423cd6a14810f4e40cb76e13d4240040fc1594d69fe1c4f2c70be00ad512ade5 \ - --hash=sha256:5387db80c6a7b5598884bf4df3fc546b3373771ad614548b782e840b71704877 \ - --hash=sha256:5a76ec44af838862b23fb5cfc48765bc7978f7b58a181c96ad92856280de548b \ - --hash=sha256:5f2660f59dfcfd34adac7c08dc7f615920de703f191066ed6277628975f06878 \ - --hash=sha256:6b7bd8f5aa327cc32a1b9b02a76502851575f5edb110f93c59a45c70211a5618 \ - --hash=sha256:72cf3477538bd8504f14d6299a387cc335444f7a188f548096dfea9533551f02 \ - --hash=sha256:76b75a9cfc572e890a1e000fd532bdd2084ec3f1ee94ee51802a477913a21072 \ - --hash=sha256:a81adbfbe2f6528d4593b5a8962b2751838517401d14e9d4cab6787478802693 \ - --hash=sha256:a968375c66e505f72b421f5864a37f51aad5da61b6396fa283f956e9f2b2b923 \ - --hash=sha256:afd4f7c0a225a326d2c0039cdc8631b5e8be30f78f6b7a3e5ce741cf5dd81c72 \ - --hash=sha256:b05bdd513f045d43228247ef4d9269c88139788e2d566f4cb3e855e282ad0330 \ - --hash=sha256:c2733c9bcd00074ce5497dd0a7b8a10c91d3395ddce322d7021c7fdc4ea6f610 \ - --hash=sha256:d0f080b2d9720bec42624cb0df66f60ae66b84a2ccd1fe2c291322df915ac9db \ - --hash=sha256:dcd20ee0240a88772eeb5691102c276f5cdec79527fb3a0679af7f93f93cb4bd \ - 
--hash=sha256:e1351576877764fb4d5690e4721ce902e987c85f4ab081c70a34e1d24646586e \ - --hash=sha256:e44dfd7e61c9eb6dda59bc49ad69e77945f6d049185a517c130417e3ca0494d8 \ - --hash=sha256:ee3d87615876550fee9a523307dd4b00f0f44cf47a94a32a07793da307df31a0 \ - --hash=sha256:fa7b165cfa97158c1e6d15c68428317b4f4ae786d1dc2dbab43f1328c1eb43aa \ - --hash=sha256:fe976695318560a97c6d31bba828eeca28c44c6f6401005e54ba476a28ac0a10 +pyarrow==5.0.0 \ + --hash=sha256:1832709281efefa4f199c639e9f429678286329860188e53beeda71750775923 \ + --hash=sha256:1d9485741e497ccc516cb0a0c8f56e22be55aea815be185c3f9a681323b0e614 \ + --hash=sha256:24e64ea33eed07441cc0e80c949e3a1b48211a1add8953268391d250f4d39922 \ + --hash=sha256:2d26186ca9748a1fb89ae6c1fa04fb343a4279b53f118734ea8096f15d66c820 \ + --hash=sha256:357605665fbefb573d40939b13a684c2490b6ed1ab4a5de8dd246db4ab02e5a4 \ + --hash=sha256:4341ac0f552dc04c450751e049976940c7f4f8f2dae03685cc465ebe0a61e231 \ + --hash=sha256:456a4488ae810a0569d1adf87dbc522bcc9a0e4a8d1809b934ca28c163d8edce \ + --hash=sha256:4d8adda1892ef4553c4804af7f67cce484f4d6371564e2d8374b8e2bc85293e2 \ + --hash=sha256:53e550dec60d1ab86cba3afa1719dc179a8bc9632a0e50d9fe91499cf0a7f2bc \ + --hash=sha256:5c0d1b68e67bb334a5af0cecdf9b6a702aaa4cc259c5cbb71b25bbed40fcedaf \ + --hash=sha256:601b0aabd6fb066429e706282934d4d8d38f53bdb8d82da9576be49f07eedf5c \ + --hash=sha256:64f30aa6b28b666a925d11c239344741850eb97c29d3aa0f7187918cf82494f7 \ + --hash=sha256:6e1f0e4374061116f40e541408a8a170c170d0a070b788717e18165ebfdd2a54 \ + --hash=sha256:6e937ce4a40ea0cc7896faff96adecadd4485beb53fbf510b46858e29b2e75ae \ + --hash=sha256:7560332e5846f0e7830b377c14c93624e24a17f91c98f0b25dafb0ca1ea6ba02 \ + --hash=sha256:7c4edd2bacee3eea6c8c28bddb02347f9d41a55ec9692c71c6de6e47c62a7f0d \ + --hash=sha256:99c8b0f7e2ce2541dd4c0c0101d9944bb8e592ae3295fe7a2f290ab99222666d \ + --hash=sha256:9e04d3621b9f2f23898eed0d044203f66c156d880f02c5534a7f9947ebb1a4af \ + --hash=sha256:b1453c2411b5062ba6bf6832dbc4df211ad625f678c623a2ee177aee158f199b \ + --hash=sha256:b3115df938b8d7a7372911a3cb3904196194bcea8bb48911b4b3eafee3ab8d90 \ + --hash=sha256:b6387d2058d95fa48ccfedea810a768187affb62f4a3ef6595fa30bf9d1a65cf \ + --hash=sha256:bbe2e439bec2618c74a3bb259700c8a7353dc2ea0c5a62686b6cf04a50ab1e0d \ + --hash=sha256:c3fc856f107ca2fb3c9391d7ea33bbb33f3a1c2b4a0e2b41f7525c626214cc03 \ + --hash=sha256:c5493d2414d0d690a738aac8dd6d38518d1f9b870e52e24f89d8d7eb3afd4161 \ + --hash=sha256:e9ec80f4a77057498cf4c5965389e42e7f6a618b6859e6dd615e57505c9167a6 \ + --hash=sha256:ed135a99975380c27077f9d0e210aea8618ed9fadcec0e71f8a3190939557afe \ + --hash=sha256:f4db312e9ba80e730cefcae0a05b63ea5befc7634c28df56682b628ad8e1c25c \ + --hash=sha256:ff21711f6ff3b0bc90abc8ca8169e676faeb2401ddc1a0bc1c7dc181708a3406 # via -r requirements.in +pycodestyle==2.7.0 \ + --hash=sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068 \ + --hash=sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef + # via flake8 +pyflakes==2.3.1 \ + --hash=sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3 \ + --hash=sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db + # via flake8 pyparsing==2.4.7 \ --hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 \ --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b @@ -185,48 +204,40 @@ pytz==2021.1 \ --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da \ --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 # 
via pandas -regex==2021.7.6 \ - --hash=sha256:0eb2c6e0fcec5e0f1d3bcc1133556563222a2ffd2211945d7b1480c1b1a42a6f \ - --hash=sha256:15dddb19823f5147e7517bb12635b3c82e6f2a3a6b696cc3e321522e8b9308ad \ - --hash=sha256:173bc44ff95bc1e96398c38f3629d86fa72e539c79900283afa895694229fe6a \ - --hash=sha256:1c78780bf46d620ff4fff40728f98b8afd8b8e35c3efd638c7df67be2d5cddbf \ - --hash=sha256:2366fe0479ca0e9afa534174faa2beae87847d208d457d200183f28c74eaea59 \ - --hash=sha256:2bceeb491b38225b1fee4517107b8491ba54fba77cf22a12e996d96a3c55613d \ - --hash=sha256:2ddeabc7652024803666ea09f32dd1ed40a0579b6fbb2a213eba590683025895 \ - --hash=sha256:2fe5e71e11a54e3355fa272137d521a40aace5d937d08b494bed4529964c19c4 \ - --hash=sha256:319eb2a8d0888fa6f1d9177705f341bc9455a2c8aca130016e52c7fe8d6c37a3 \ - --hash=sha256:3f5716923d3d0bfb27048242a6e0f14eecdb2e2a7fac47eda1d055288595f222 \ - --hash=sha256:422dec1e7cbb2efbbe50e3f1de36b82906def93ed48da12d1714cabcd993d7f0 \ - --hash=sha256:4c9c3155fe74269f61e27617529b7f09552fbb12e44b1189cebbdb24294e6e1c \ - --hash=sha256:4f64fc59fd5b10557f6cd0937e1597af022ad9b27d454e182485f1db3008f417 \ - --hash=sha256:564a4c8a29435d1f2256ba247a0315325ea63335508ad8ed938a4f14c4116a5d \ - --hash=sha256:59506c6e8bd9306cd8a41511e32d16d5d1194110b8cfe5a11d102d8b63cf945d \ - --hash=sha256:598c0a79b4b851b922f504f9f39a863d83ebdfff787261a5ed061c21e67dd761 \ - --hash=sha256:59c00bb8dd8775473cbfb967925ad2c3ecc8886b3b2d0c90a8e2707e06c743f0 \ - --hash=sha256:6110bab7eab6566492618540c70edd4d2a18f40ca1d51d704f1d81c52d245026 \ - --hash=sha256:6afe6a627888c9a6cfbb603d1d017ce204cebd589d66e0703309b8048c3b0854 \ - --hash=sha256:791aa1b300e5b6e5d597c37c346fb4d66422178566bbb426dd87eaae475053fb \ - --hash=sha256:8394e266005f2d8c6f0bc6780001f7afa3ef81a7a2111fa35058ded6fce79e4d \ - --hash=sha256:875c355360d0f8d3d827e462b29ea7682bf52327d500a4f837e934e9e4656068 \ - --hash=sha256:89e5528803566af4df368df2d6f503c84fbfb8249e6631c7b025fe23e6bd0cde \ - --hash=sha256:99d8ab206a5270c1002bfcf25c51bf329ca951e5a169f3b43214fdda1f0b5f0d \ - --hash=sha256:9a854b916806c7e3b40e6616ac9e85d3cdb7649d9e6590653deb5b341a736cec \ - --hash=sha256:b85ac458354165405c8a84725de7bbd07b00d9f72c31a60ffbf96bb38d3e25fa \ - --hash=sha256:bc84fb254a875a9f66616ed4538542fb7965db6356f3df571d783f7c8d256edd \ - --hash=sha256:c92831dac113a6e0ab28bc98f33781383fe294df1a2c3dfd1e850114da35fd5b \ - --hash=sha256:cbe23b323988a04c3e5b0c387fe3f8f363bf06c0680daf775875d979e376bd26 \ - --hash=sha256:ccb3d2190476d00414aab36cca453e4596e8f70a206e2aa8db3d495a109153d2 \ - --hash=sha256:d8bbce0c96462dbceaa7ac4a7dfbbee92745b801b24bce10a98d2f2b1ea9432f \ - --hash=sha256:db2b7df831c3187a37f3bb80ec095f249fa276dbe09abd3d35297fc250385694 \ - --hash=sha256:e586f448df2bbc37dfadccdb7ccd125c62b4348cb90c10840d695592aa1b29e0 \ - --hash=sha256:e5983c19d0beb6af88cb4d47afb92d96751fb3fa1784d8785b1cdf14c6519407 \ - --hash=sha256:e6a1e5ca97d411a461041d057348e578dc344ecd2add3555aedba3b408c9f874 \ - --hash=sha256:eaf58b9e30e0e546cdc3ac06cf9165a1ca5b3de8221e9df679416ca667972035 \ - --hash=sha256:ed693137a9187052fc46eedfafdcb74e09917166362af4cc4fddc3b31560e93d \ - --hash=sha256:edd1a68f79b89b0c57339bce297ad5d5ffcc6ae7e1afdb10f1947706ed066c9c \ - --hash=sha256:f080248b3e029d052bf74a897b9d74cfb7643537fbde97fe8225a6467fb559b5 \ - --hash=sha256:f9392a4555f3e4cb45310a65b403d86b589adc773898c25a39184b1ba4db8985 \ - --hash=sha256:f98dc35ab9a749276f1a4a38ab3e0e2ba1662ce710f6530f5b0a6656f1c32b58 +regex==2021.8.3 \ + --hash=sha256:026beb631097a4a3def7299aa5825e05e057de3c6d72b139c37813bfa351274b \ + 
--hash=sha256:14caacd1853e40103f59571f169704367e79fb78fac3d6d09ac84d9197cadd16 \ + --hash=sha256:16d9eaa8c7e91537516c20da37db975f09ac2e7772a0694b245076c6d68f85da \ + --hash=sha256:18fdc51458abc0a974822333bd3a932d4e06ba2a3243e9a1da305668bd62ec6d \ + --hash=sha256:28e8af338240b6f39713a34e337c3813047896ace09d51593d6907c66c0708ba \ + --hash=sha256:3835de96524a7b6869a6c710b26c90e94558c31006e96ca3cf6af6751b27dca1 \ + --hash=sha256:3905c86cc4ab6d71635d6419a6f8d972cab7c634539bba6053c47354fd04452c \ + --hash=sha256:3c09d88a07483231119f5017904db8f60ad67906efac3f1baa31b9b7f7cca281 \ + --hash=sha256:4551728b767f35f86b8e5ec19a363df87450c7376d7419c3cac5b9ceb4bce576 \ + --hash=sha256:459bbe342c5b2dec5c5223e7c363f291558bc27982ef39ffd6569e8c082bdc83 \ + --hash=sha256:4f421e3cdd3a273bace013751c345f4ebeef08f05e8c10757533ada360b51a39 \ + --hash=sha256:577737ec3d4c195c4aef01b757905779a9e9aee608fa1cf0aec16b5576c893d3 \ + --hash=sha256:57fece29f7cc55d882fe282d9de52f2f522bb85290555b49394102f3621751ee \ + --hash=sha256:7976d410e42be9ae7458c1816a416218364e06e162b82e42f7060737e711d9ce \ + --hash=sha256:85f568892422a0e96235eb8ea6c5a41c8ccbf55576a2260c0160800dbd7c4f20 \ + --hash=sha256:8764a78c5464ac6bde91a8c87dd718c27c1cabb7ed2b4beaf36d3e8e390567f9 \ + --hash=sha256:8935937dad2c9b369c3d932b0edbc52a62647c2afb2fafc0c280f14a8bf56a6a \ + --hash=sha256:8fe58d9f6e3d1abf690174fd75800fda9bdc23d2a287e77758dc0e8567e38ce6 \ + --hash=sha256:937b20955806381e08e54bd9d71f83276d1f883264808521b70b33d98e4dec5d \ + --hash=sha256:9569da9e78f0947b249370cb8fadf1015a193c359e7e442ac9ecc585d937f08d \ + --hash=sha256:a3b73390511edd2db2d34ff09aa0b2c08be974c71b4c0505b4a048d5dc128c2b \ + --hash=sha256:a4eddbe2a715b2dd3849afbdeacf1cc283160b24e09baf64fa5675f51940419d \ + --hash=sha256:a5c6dbe09aff091adfa8c7cfc1a0e83fdb8021ddb2c183512775a14f1435fe16 \ + --hash=sha256:b63e3571b24a7959017573b6455e05b675050bbbea69408f35f3cb984ec54363 \ + --hash=sha256:bb350eb1060591d8e89d6bac4713d41006cd4d479f5e11db334a48ff8999512f \ + --hash=sha256:bf6d987edd4a44dd2fa2723fca2790f9442ae4de2c8438e53fcb1befdf5d823a \ + --hash=sha256:bfa6a679410b394600eafd16336b2ce8de43e9b13f7fb9247d84ef5ad2b45e91 \ + --hash=sha256:c856ec9b42e5af4fe2d8e75970fcc3a2c15925cbcc6e7a9bcb44583b10b95e80 \ + --hash=sha256:cea56288eeda8b7511d507bbe7790d89ae7049daa5f51ae31a35ae3c05408531 \ + --hash=sha256:ea212df6e5d3f60341aef46401d32fcfded85593af1d82b8b4a7a68cd67fdd6b \ + --hash=sha256:f35567470ee6dbfb946f069ed5f5615b40edcbb5f1e6e1d3d2b114468d505fc6 \ + --hash=sha256:fbc20975eee093efa2071de80df7f972b7b35e560b213aafabcec7c0bd00bd8c \ + --hash=sha256:ff4a8ad9638b7ca52313d8732f37ecd5fd3c8e3aff10a8ccb93176fd5b3812f6 # via black six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ @@ -240,9 +251,9 @@ toml==0.10.2 \ # maturin # mypy # pytest -tomli==1.0.4 \ - --hash=sha256:0713b16ff91df8638a6a694e295c8159ab35ba93e3424a626dd5226d386057be \ - --hash=sha256:be670d0d8d7570fd0ea0113bd7bb1ba3ac6706b4de062cc4c952769355c9c268 +tomli==1.2.0 \ + --hash=sha256:056f0376bf5a6b182c513f9582c1e5b0487265eb6c48842b69aa9ca1cd5f640a \ + --hash=sha256:d60e681734099207a6add7a10326bc2ddd1fdc36c1b0f547d00ef73ac63739c2 # via black typing-extensions==3.10.0.0 \ --hash=sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497 \ diff --git a/python/src/context.rs b/python/src/context.rs index 14ef0f7321f15..9acc14a5e2609 100644 --- a/python/src/context.rs +++ b/python/src/context.rs @@ -15,16 +15,19 @@ // specific language governing permissions and limitations // under the 
License. +use std::path::PathBuf; use std::{collections::HashSet, sync::Arc}; use rand::distributions::Alphanumeric; use rand::Rng; +use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use datafusion::arrow::record_batch::RecordBatch; use datafusion::datasource::MemTable; use datafusion::execution::context::ExecutionContext as _ExecutionContext; +use datafusion::prelude::CsvReadOptions; use crate::dataframe; use crate::errors; @@ -97,6 +100,48 @@ impl ExecutionContext { Ok(()) } + #[args( + schema = "None", + has_header = "true", + delimiter = "\",\"", + schema_infer_max_records = "1000", + file_extension = "\".csv\"" + )] + fn register_csv( + &mut self, + name: &str, + path: PathBuf, + schema: Option<&PyAny>, + has_header: bool, + delimiter: &str, + schema_infer_max_records: usize, + file_extension: &str, + ) -> PyResult<()> { + let path = path + .to_str() + .ok_or(PyValueError::new_err("Unable to convert path to a string"))?; + let schema = match schema { + Some(s) => Some(to_rust::to_rust_schema(s)?), + None => None, + }; + let delimiter = delimiter.as_bytes(); + if delimiter.len() != 1 { + return Err(PyValueError::new_err( + "Delimiter must be a single character", + )); + } + + let mut options = CsvReadOptions::new() + .has_header(has_header) + .delimiter(delimiter[0]) + .schema_infer_max_records(schema_infer_max_records) + .file_extension(file_extension); + options.schema = schema.as_ref(); + + errors::wrap(self.ctx.register_csv(name, path, options))?; + Ok(()) + } + fn register_udf( &mut self, name: &str, diff --git a/python/src/to_rust.rs b/python/src/to_rust.rs index e7957ec42d92f..7977fe4ff8ce1 100644 --- a/python/src/to_rust.rs +++ b/python/src/to_rust.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use std::convert::TryFrom; use std::sync::Arc; use datafusion::arrow::{ @@ -111,3 +112,11 @@ pub fn to_rust_scalar(ob: &PyAny) -> PyResult { } }) } + +pub fn to_rust_schema(ob: &PyAny) -> PyResult { + let c_schema = ffi::FFI_ArrowSchema::empty(); + let c_schema_ptr = &c_schema as *const ffi::FFI_ArrowSchema; + ob.call_method1("_export_to_c", (c_schema_ptr as uintptr_t,))?; + let schema = Schema::try_from(&c_schema).map_err(errors::DataFusionError::from)?; + Ok(schema) +} diff --git a/python/tests/generic.py b/python/tests/generic.py index 5871c5e891b28..8d5adaaaf9563 100644 --- a/python/tests/generic.py +++ b/python/tests/generic.py @@ -19,6 +19,7 @@ import numpy as np import pyarrow as pa +import pyarrow.csv import pyarrow.parquet as pq # used to write parquet files @@ -49,7 +50,9 @@ def data_datetime(f): datetime.datetime.now() - datetime.timedelta(days=1), datetime.datetime.now() + datetime.timedelta(days=1), ] - return pa.array(data, type=pa.timestamp(f), mask=np.array([False, True, False])) + return pa.array( + data, type=pa.timestamp(f), mask=np.array([False, True, False]) + ) def data_date32(): @@ -58,7 +61,9 @@ def data_date32(): datetime.date(1980, 1, 1), datetime.date(2030, 1, 1), ] - return pa.array(data, type=pa.date32(), mask=np.array([False, True, False])) + return pa.array( + data, type=pa.date32(), mask=np.array([False, True, False]) + ) def data_timedelta(f): @@ -67,7 +72,9 @@ def data_timedelta(f): datetime.timedelta(days=1), datetime.timedelta(seconds=1), ] - return pa.array(data, type=pa.duration(f), mask=np.array([False, True, False])) + return pa.array( + data, type=pa.duration(f), mask=np.array([False, True, False]) + ) def data_binary_other(): diff --git a/python/tests/test_math_functions.py b/python/tests/test_math_functions.py index cb03753121fa0..98656b8c4f422 100644 --- a/python/tests/test_math_functions.py +++ b/python/tests/test_math_functions.py @@ -26,7 +26,9 @@ def df(): ctx = ExecutionContext() # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays([pa.array([0.1, -0.7, 0.55])], names=["value"]) + batch = pa.RecordBatch.from_arrays( + [pa.array([0.1, -0.7, 0.55])], names=["value"] + ) return ctx.create_dataframe([[batch]]) @@ -56,7 +58,13 @@ def test_math_functions(df): np.testing.assert_array_almost_equal(result.column(4), np.arcsin(values)) np.testing.assert_array_almost_equal(result.column(5), np.arccos(values)) np.testing.assert_array_almost_equal(result.column(6), np.exp(values)) - np.testing.assert_array_almost_equal(result.column(7), np.log(values + 1.0)) - np.testing.assert_array_almost_equal(result.column(8), np.log2(values + 1.0)) - np.testing.assert_array_almost_equal(result.column(9), np.log10(values + 1.0)) + np.testing.assert_array_almost_equal( + result.column(7), np.log(values + 1.0) + ) + np.testing.assert_array_almost_equal( + result.column(8), np.log2(values + 1.0) + ) + np.testing.assert_array_almost_equal( + result.column(9), np.log10(values + 1.0) + ) np.testing.assert_array_less(result.column(10), np.ones_like(values)) diff --git a/python/tests/test_pa_types.py b/python/tests/test_pa_types.py index 069343f8a45f4..04f6110e3a429 100644 --- a/python/tests/test_pa_types.py +++ b/python/tests/test_pa_types.py @@ -19,8 +19,8 @@ def test_type_ids(): - """having this fixed is very important because internally we rely on this id to parse from - python""" + # Having this fixed is very important because internally we rely on this id + # to parse from python for idx, arrow_type in [ (0, pa.null()), (1, 
pa.bool_()), @@ -47,5 +47,4 @@ def test_type_ids(): (34, pa.large_utf8()), (35, pa.large_binary()), ]: - assert idx == arrow_type.id diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 62d6c0975f3db..669f640529eb5 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -18,8 +18,8 @@ import numpy as np import pyarrow as pa import pytest -from datafusion import ExecutionContext +from datafusion import ExecutionContext from . import generic as helpers @@ -33,12 +33,63 @@ def test_no_table(ctx): ctx.sql("SELECT a FROM b").collect() -def test_register(ctx, tmp_path): +def test_register_csv(ctx, tmp_path): + path = tmp_path / "test.csv" + + table = pa.Table.from_arrays( + [ + [1, 2, 3, 4], + ["a", "b", "c", "d"], + [1.1, 2.2, 3.3, 4.4], + ], + names=["int", "str", "float"], + ) + pa.csv.write_csv(table, path) + + ctx.register_csv("csv", path) + ctx.register_csv("csv1", str(path)) + ctx.register_csv( + "csv2", + path, + has_header=True, + delimiter=",", + schema_infer_max_records=10, + ) + alternative_schema = pa.schema( + [ + ("some_int", pa.int16()), + ("some_bytes", pa.string()), + ("some_floats", pa.float32()), + ] + ) + ctx.register_csv("csv3", path, schema=alternative_schema) + + assert ctx.tables() == {"csv", "csv1", "csv2", "csv3"} + + for table in ["csv", "csv1", "csv2"]: + result = ctx.sql(f"SELECT COUNT(int) FROM {table}").collect() + result = pa.Table.from_batches(result) + assert result.to_pydict() == {"COUNT(int)": [4]} + + result = ctx.sql("SELECT * FROM csv3").collect() + result = pa.Table.from_batches(result) + assert result.schema == alternative_schema + + with pytest.raises( + ValueError, match="Delimiter must be a single character" + ): + ctx.register_csv("csv4", path, delimiter="wrong") + + +def test_register_parquet(ctx, tmp_path): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) ctx.register_parquet("t", path) - assert ctx.tables() == {"t"} + result = ctx.sql("SELECT COUNT(a) FROM t").collect() + result = pa.Table.from_batches(result) + assert result.to_pydict() == {"COUNT(a)": [100]} + def test_execute(ctx, tmp_path): data = [1, 1, 2, 2, 3, 11, 12] @@ -112,7 +163,9 @@ def test_cast(ctx, tmp_path): "float", ] - select = ", ".join([f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)]) + select = ", ".join( + [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] + ) # can execute, which implies that we can cast ctx.sql(f"SELECT {select} FROM t").collect() @@ -141,7 +194,9 @@ def test_udf( ctx, tmp_path, fn, input_types, output_type, input_values, expected_values ): # write to disk - path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(input_values)) + path = helpers.write_parquet( + tmp_path / "a.parquet", pa.array(input_values) + ) ctx.register_parquet("t", path) ctx.register_udf("udf", fn, input_types, output_type) From 2c0c06248667bfeb9c56a4c2119b3a7994b9fc1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 7 Aug 2021 07:55:04 +0200 Subject: [PATCH 319/329] Use `RawTable` API in hash join (#827) * Use rawtable API * Avoid changes * Check on hash again * Test fix --- datafusion/src/physical_plan/hash_join.rs | 107 ++++++++-------------- 1 file changed, 36 insertions(+), 71 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 1a174bb11d10f..1a57c404e96e5 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -29,8 +29,8 @@ use arrow::{ 
datatypes::{UInt32Type, UInt64Type}, }; use smallvec::{smallvec, SmallVec}; +use std::sync::Arc; use std::{any::Any, usize}; -use std::{hash::Hasher, sync::Arc}; use std::{time::Instant, vec}; use async_trait::async_trait; @@ -49,6 +49,8 @@ use arrow::array::{ UInt64Array, UInt8Array, }; +use hashbrown::raw::RawTable; + use super::expressions::Column; use super::hash_utils::create_hashes; use super::{ @@ -65,6 +67,7 @@ use super::{ use crate::physical_plan::coalesce_batches::concat_batches; use crate::physical_plan::{PhysicalExpr, SQLMetric}; use log::debug; +use std::fmt; // Maps a `u64` hash value based on the left ["on" values] to a list of indices with this key's value. // @@ -78,7 +81,14 @@ use log::debug; // but the values don't match. Those are checked in the [equal_rows] macro // TODO: speed up collission check and move away from using a hashbrown HashMap // https://github.com/apache/arrow-datafusion/issues/50 -type JoinHashMap = HashMap<(), SmallVec<[u64; 1]>, IdHashBuilder>; +struct JoinHashMap(RawTable<(u64, SmallVec<[u64; 1]>)>); + +impl fmt::Debug for JoinHashMap { + fn fmt(&self, _f: &mut fmt::Formatter<'_>) -> fmt::Result { + Ok(()) + } +} + type JoinLeftData = Arc<(JoinHashMap, RecordBatch)>; /// join execution plan executes partitions in parallel and combines them into a set of @@ -303,10 +313,8 @@ impl ExecutionPlan for HashJoinExec { Ok(acc) }) .await?; - let mut hashmap = JoinHashMap::with_capacity_and_hasher( - num_rows, - IdHashBuilder {}, - ); + let mut hashmap = + JoinHashMap(RawTable::with_capacity(num_rows)); let mut hashes_buffer = Vec::new(); let mut offset = 0; for batch in batches.iter() { @@ -358,8 +366,7 @@ impl ExecutionPlan for HashJoinExec { Ok(acc) }) .await?; - let mut hashmap = - JoinHashMap::with_capacity_and_hasher(num_rows, IdHashBuilder {}); + let mut hashmap = JoinHashMap(RawTable::with_capacity(num_rows)); let mut hashes_buffer = Vec::new(); let mut offset = 0; for batch in batches.iter() { @@ -460,7 +467,7 @@ impl ExecutionPlan for HashJoinExec { fn update_hash( on: &[Column], batch: &RecordBatch, - hash: &mut JoinHashMap, + hash_map: &mut JoinHashMap, offset: usize, random_state: &RandomState, hashes_buffer: &mut Vec, @@ -476,18 +483,18 @@ fn update_hash( // insert hashes to key of the hashmap for (row, hash_value) in hash_values.iter().enumerate() { - match hash.raw_entry_mut().from_hash(*hash_value, |_| true) { - hashbrown::hash_map::RawEntryMut::Occupied(mut entry) => { - entry.get_mut().push((row + offset) as u64); - } - hashbrown::hash_map::RawEntryMut::Vacant(entry) => { - entry.insert_hashed_nocheck( - *hash_value, - (), - smallvec![(row + offset) as u64], - ); - } - }; + let item = hash_map + .0 + .get_mut(*hash_value, |(hash, _)| *hash_value == *hash); + if let Some((_, indices)) = item { + indices.push((row + offset) as u64); + } else { + hash_map.0.insert( + *hash_value, + (*hash_value, smallvec![(row + offset) as u64]), + |(hash, _)| *hash, + ); + } } Ok(()) } @@ -678,7 +685,7 @@ fn build_join_indexes( // This possibly contains rows with hash collisions, // So we have to check here whether rows are equal or not if let Some((_, indices)) = - left.raw_entry().from_hash(*hash_value, |_| true) + left.0.get(*hash_value, |(hash, _)| *hash_value == *hash) { for &i in indices { // Check hash collisions @@ -710,7 +717,7 @@ fn build_join_indexes( // First visit all of the rows for (row, hash_value) in hash_values.iter().enumerate() { if let Some((_, indices)) = - left.raw_entry().from_hash(*hash_value, |_| true) + left.0.get(*hash_value, 
|(hash, _)| *hash_value == *hash) { for &i in indices { // Collision check @@ -728,7 +735,7 @@ fn build_join_indexes( let mut right_indices = UInt32Builder::new(0); for (row, hash_value) in hash_values.iter().enumerate() { - match left.raw_entry().from_hash(*hash_value, |_| true) { + match left.0.get(*hash_value, |(hash, _)| *hash_value == *hash) { Some((_, indices)) => { for &i in indices { if equal_rows( @@ -755,38 +762,6 @@ fn build_join_indexes( } } } -use core::hash::BuildHasher; - -/// `Hasher` that returns the same `u64` value as a hash, to avoid re-hashing -/// it when inserting/indexing or regrowing the `HashMap` -struct IdHasher { - hash: u64, -} - -impl Hasher for IdHasher { - fn finish(&self) -> u64 { - self.hash - } - - fn write_u64(&mut self, i: u64) { - self.hash = i; - } - - fn write(&mut self, _bytes: &[u8]) { - unreachable!("IdHasher should only be used for u64 keys") - } -} - -#[derive(Debug)] -struct IdHashBuilder {} - -impl BuildHasher for IdHashBuilder { - type Hasher = IdHasher; - - fn build_hasher(&self) -> Self::Hasher { - IdHasher { hash: 0 } - } -} macro_rules! equal_rows_elem { ($array_type:ident, $l: ident, $r: ident, $left: ident, $right: ident) => {{ @@ -1776,7 +1751,7 @@ mod tests { #[test] fn join_with_hash_collision() -> Result<()> { - let mut hashmap_left = HashMap::with_capacity_and_hasher(2, IdHashBuilder {}); + let mut hashmap_left = RawTable::with_capacity(2); let left = build_table_i32( ("a", &vec![10, 20]), ("x", &vec![100, 200]), @@ -1788,19 +1763,9 @@ mod tests { let hashes = create_hashes(&[left.columns()[0].clone()], &random_state, hashes_buff)?; - // Create hash collisions - match hashmap_left.raw_entry_mut().from_hash(hashes[0], |_| true) { - hashbrown::hash_map::RawEntryMut::Vacant(entry) => { - entry.insert_hashed_nocheck(hashes[0], (), smallvec![0, 1]) - } - _ => unreachable!("Hash should not be vacant"), - }; - match hashmap_left.raw_entry_mut().from_hash(hashes[1], |_| true) { - hashbrown::hash_map::RawEntryMut::Vacant(entry) => { - entry.insert_hashed_nocheck(hashes[1], (), smallvec![0, 1]) - } - _ => unreachable!("Hash should not be vacant"), - }; + // Create hash collisions (same hashes) + hashmap_left.insert(hashes[0], (hashes[0], smallvec![0, 1]), |(h, _)| *h); + hashmap_left.insert(hashes[1], (hashes[1], smallvec![0, 1]), |(h, _)| *h); let right = build_table_i32( ("a", &vec![10, 20]), @@ -1808,7 +1773,7 @@ mod tests { ("c", &vec![30, 40]), ); - let left_data = JoinLeftData::new((hashmap_left, left)); + let left_data = JoinLeftData::new((JoinHashMap(hashmap_left), left)); let (l, r) = build_join_indexes( &left_data, &right, From dd14e1806576cff995d8d4655e04050cba08923f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 7 Aug 2021 07:25:01 -0600 Subject: [PATCH 320/329] Add ballista-examples to docker build (#829) --- dev/docker/ballista.dockerfile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dev/docker/ballista.dockerfile b/dev/docker/ballista.dockerfile index 730e86749a63b..11b788e7a2b1a 100644 --- a/dev/docker/ballista.dockerfile +++ b/dev/docker/ballista.dockerfile @@ -29,15 +29,17 @@ RUN cargo install cargo-chef FROM base as planner RUN mkdir /tmp/ballista/ballista +RUN mkdir /tmp/ballista/ballista-examples RUN mkdir /tmp/ballista/benchmarks RUN mkdir /tmp/ballista/datafusion RUN mkdir /tmp/ballista/datafusion-examples ADD Cargo.toml . 
+COPY ballista ./ballista/ +COPY ballista-examples ./ballista-examples/ COPY benchmarks ./benchmarks/ COPY datafusion ./datafusion/ COPY datafusion-cli ./datafusion-cli/ COPY datafusion-examples ./datafusion-examples/ -COPY ballista ./ballista/ RUN cargo chef prepare --recipe-path recipe.json FROM base as cacher @@ -46,14 +48,16 @@ RUN cargo chef cook $RELEASE_FLAG --recipe-path recipe.json FROM base as builder RUN mkdir /tmp/ballista/ballista +RUN mkdir /tmp/ballista/ballista-examples RUN mkdir /tmp/ballista/benchmarks RUN mkdir /tmp/ballista/datafusion RUN mkdir /tmp/ballista/datafusion-cli RUN mkdir /tmp/ballista/datafusion-examples ADD Cargo.toml . +COPY ballista ./ballista/ +COPY ballista-examples ./ballista-examples/ COPY benchmarks ./benchmarks/ COPY datafusion ./datafusion/ -COPY ballista ./ballista/ COPY datafusion-cli ./datafusion-cli/ COPY datafusion-examples ./datafusion-examples/ COPY --from=cacher /tmp/ballista/target target From ea1356f6af2ca7cc54b9fdca56ad861d3f7f0976 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 7 Aug 2021 11:35:30 -0600 Subject: [PATCH 321/329] Implement serde for MIN and MAX (#833) --- ballista/rust/core/src/serde/physical_plan/to_proto.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ballista/rust/core/src/serde/physical_plan/to_proto.rs b/ballista/rust/core/src/serde/physical_plan/to_proto.rs index ec5ec7cb7affa..48b21345525bc 100644 --- a/ballista/rust/core/src/serde/physical_plan/to_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/to_proto.rs @@ -47,7 +47,7 @@ use datafusion::{ use datafusion::physical_plan::{ empty::EmptyExec, - expressions::{Avg, BinaryExpr, Column, Sum}, + expressions::{Avg, BinaryExpr, Column, Max, Min, Sum}, Partitioning, }; use datafusion::physical_plan::{AggregateExpr, ExecutionPlan, PhysicalExpr}; @@ -421,6 +421,10 @@ impl TryInto for Arc { Ok(protobuf::AggregateFunction::Sum.into()) } else if self.as_any().downcast_ref::().is_some() { Ok(protobuf::AggregateFunction::Count.into()) + } else if self.as_any().downcast_ref::().is_some() { + Ok(protobuf::AggregateFunction::Min.into()) + } else if self.as_any().downcast_ref::().is_some() { + Ok(protobuf::AggregateFunction::Max.into()) } else { Err(BallistaError::NotImplemented(format!( "Aggregate function not supported: {:?}", From 7c751de70076b6a218fccb9bb6d32e65c1cfeb34 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 7 Aug 2021 14:13:12 -0600 Subject: [PATCH 322/329] Add minimal crate documentation for Ballista crates (#831) --- ballista/rust/client/src/lib.rs | 98 ++++++++++++++++++++++++++++++ ballista/rust/core/src/lib.rs | 7 ++- ballista/rust/executor/src/lib.rs | 5 +- ballista/rust/scheduler/src/lib.rs | 5 +- 4 files changed, 112 insertions(+), 3 deletions(-) diff --git a/ballista/rust/client/src/lib.rs b/ballista/rust/client/src/lib.rs index c3c62918680d3..35bd12bb9c253 100644 --- a/ballista/rust/client/src/lib.rs +++ b/ballista/rust/client/src/lib.rs @@ -15,6 +15,104 @@ // specific language governing permissions and limitations // under the License. +//! Ballista is a distributed compute platform primarily implemented in Rust, and powered by Apache Arrow and +//! DataFusion. It is built on an architecture that allows other programming languages (such as Python, C++, and +//! Java) to be supported as first-class citizens without paying a penalty for serialization costs. +//! +//! The foundational technologies in Ballista are: +//! +//! 
- [Apache Arrow](https://arrow.apache.org/) memory model and compute kernels for efficient processing of data. +//! - [Apache Arrow Flight Protocol](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for efficient +//! data transfer between processes. +//! - [Google Protocol Buffers](https://developers.google.com/protocol-buffers) for serializing query plans. +//! - [Docker](https://www.docker.com/) for packaging up executors along with user-defined code. +//! +//! Ballista can be deployed as a standalone cluster and also supports [Kubernetes](https://kubernetes.io/). In either +//! case, the scheduler can be configured to use [etcd](https://etcd.io/) as a backing store to (eventually) provide +//! redundancy in the case of a scheduler failing. +//! +//! ## Starting a cluster +//! +//! There are numerous ways to start a Ballista cluster, including support for Docker and +//! Kubernetes. For full documentation, refer to the +//! [DataFusion User Guide](https://github.com/apache/arrow-datafusion/tree/master/docs/user-guide) +//! +//! A simple way to start a local cluster for testing purposes is to use cargo to install +//! the scheduler and executor crates. +//! +//! ```bash +//! cargo install ballista-scheduler +//! cargo install ballista-executor +//! ``` +//! +//! With these crates installed, it is now possible to start a scheduler process. +//! +//! ```bash +//! RUST_LOG=info ballista-scheduler +//! ``` +//! +//! The scheduler will bind to port 50050 by default. +//! +//! Next, start an executor processes in a new terminal session with the specified concurrency +//! level. +//! +//! ```bash +//! RUST_LOG=info ballista-executor -c 4 +//! ``` +//! +//! The executor will bind to port 50051 by default. Additional executors can be started by +//! manually specifying a bind port. For example: +//! +//! ```bash +//! RUST_LOG=info ballista-executor --bind-port 50052 -c 4 +//! ``` +//! +//! ## Executing a query +//! +//! Ballista provides a `BallistaContext` as a starting point for creating queries. DataFrames can be created +//! by invoking the `read_csv`, `read_parquet`, and `sql` methods. +//! +//! The following example runs a simple aggregate SQL query against a CSV file from the +//! [New York Taxi and Limousine Commission](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) +//! data set. +//! +//! ```no_run +//! use ballista::prelude::*; +//! use datafusion::arrow::util::pretty; +//! use datafusion::prelude::CsvReadOptions; +//! +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! // create configuration +//! let config = BallistaConfig::builder() +//! .set("ballista.shuffle.partitions", "4") +//! .build()?; +//! +//! // connect to Ballista scheduler +//! let ctx = BallistaContext::remote("localhost", 50050, &config); +//! +//! // register csv file with the execution context +//! ctx.register_csv( +//! "tripdata", +//! "/path/to/yellow_tripdata_2020-01.csv", +//! CsvReadOptions::new(), +//! )?; +//! +//! // execute the query +//! let df = ctx.sql( +//! "SELECT passenger_count, MIN(fare_amount), MAX(fare_amount), AVG(fare_amount), SUM(fare_amount) +//! FROM tripdata +//! GROUP BY passenger_count +//! ORDER BY passenger_count", +//! )?; +//! +//! // collect the results and print them to stdout +//! let results = df.collect().await?; +//! pretty::print_batches(&results)?; +//! Ok(()) +//! } +//! 
``` + pub mod columnar_batch; pub mod context; pub mod prelude; diff --git a/ballista/rust/core/src/lib.rs b/ballista/rust/core/src/lib.rs index 2a8486945ad0a..614bf9ab84c8c 100644 --- a/ballista/rust/core/src/lib.rs +++ b/ballista/rust/core/src/lib.rs @@ -15,7 +15,12 @@ // specific language governing permissions and limitations // under the License. -//! Ballista Distributed Compute +//! Ballista Core Library +//! +//! This crate contains the Ballista core library which is used as a dependency by the ballista, +//! ballista-scheduler, and ballista-executor crates. Refer to for +//! general Ballista documentation. + #![allow(unused_imports)] pub const BALLISTA_VERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/ballista/rust/executor/src/lib.rs b/ballista/rust/executor/src/lib.rs index f3ab7dcf5ae51..f2abf31e8166d 100644 --- a/ballista/rust/executor/src/lib.rs +++ b/ballista/rust/executor/src/lib.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -//! Core executor logic for executing queries and storing results in memory. +//! Ballista Executor Process +//! +//! This crate contains the Ballista executor process. Refer to for +//! documentation. pub mod collect; pub mod execution_loop; diff --git a/ballista/rust/scheduler/src/lib.rs b/ballista/rust/scheduler/src/lib.rs index 3e4e73586d539..676975fcaec9d 100644 --- a/ballista/rust/scheduler/src/lib.rs +++ b/ballista/rust/scheduler/src/lib.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -//! Support for distributed schedulers, such as Kubernetes +//! Ballista Scheduler Process +//! +//! This crate contains the Ballista scheduler process. Refer to for +//! documentation. pub mod api; pub mod planner; From ee27f6ec11fb31df7d17cd1431890c35885732ad Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 7 Aug 2021 15:47:41 -0600 Subject: [PATCH 323/329] Change datatype of tpch keys from Int32 to UInt64 to support sf=1000 (#836) * Change datatype of tpch keys from Int32 to UInt64 to support sf=1000 * revert accidental change --- benchmarks/src/bin/tpch.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 42755ec4b1b37..978fbaa9afe71 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -490,7 +490,7 @@ fn get_schema(table: &str) -> Schema { match table { "part" => Schema::new(vec![ - Field::new("p_partkey", DataType::Int32, false), + Field::new("p_partkey", DataType::UInt64, false), Field::new("p_name", DataType::Utf8, false), Field::new("p_mfgr", DataType::Utf8, false), Field::new("p_brand", DataType::Utf8, false), @@ -502,28 +502,28 @@ fn get_schema(table: &str) -> Schema { ]), "supplier" => Schema::new(vec![ - Field::new("s_suppkey", DataType::Int32, false), + Field::new("s_suppkey", DataType::UInt64, false), Field::new("s_name", DataType::Utf8, false), Field::new("s_address", DataType::Utf8, false), - Field::new("s_nationkey", DataType::Int32, false), + Field::new("s_nationkey", DataType::UInt64, false), Field::new("s_phone", DataType::Utf8, false), Field::new("s_acctbal", DataType::Float64, false), Field::new("s_comment", DataType::Utf8, false), ]), "partsupp" => Schema::new(vec![ - Field::new("ps_partkey", DataType::Int32, false), - Field::new("ps_suppkey", DataType::Int32, false), + Field::new("ps_partkey", DataType::UInt64, false), + Field::new("ps_suppkey", DataType::UInt64, false), 
Field::new("ps_availqty", DataType::Int32, false), Field::new("ps_supplycost", DataType::Float64, false), Field::new("ps_comment", DataType::Utf8, false), ]), "customer" => Schema::new(vec![ - Field::new("c_custkey", DataType::Int32, false), + Field::new("c_custkey", DataType::UInt64, false), Field::new("c_name", DataType::Utf8, false), Field::new("c_address", DataType::Utf8, false), - Field::new("c_nationkey", DataType::Int32, false), + Field::new("c_nationkey", DataType::UInt64, false), Field::new("c_phone", DataType::Utf8, false), Field::new("c_acctbal", DataType::Float64, false), Field::new("c_mktsegment", DataType::Utf8, false), @@ -531,8 +531,8 @@ fn get_schema(table: &str) -> Schema { ]), "orders" => Schema::new(vec![ - Field::new("o_orderkey", DataType::Int32, false), - Field::new("o_custkey", DataType::Int32, false), + Field::new("o_orderkey", DataType::UInt64, false), + Field::new("o_custkey", DataType::UInt64, false), Field::new("o_orderstatus", DataType::Utf8, false), Field::new("o_totalprice", DataType::Float64, false), Field::new("o_orderdate", DataType::Date32, false), @@ -543,9 +543,9 @@ fn get_schema(table: &str) -> Schema { ]), "lineitem" => Schema::new(vec![ - Field::new("l_orderkey", DataType::Int32, false), - Field::new("l_partkey", DataType::Int32, false), - Field::new("l_suppkey", DataType::Int32, false), + Field::new("l_orderkey", DataType::UInt64, false), + Field::new("l_partkey", DataType::UInt64, false), + Field::new("l_suppkey", DataType::UInt64, false), Field::new("l_linenumber", DataType::Int32, false), Field::new("l_quantity", DataType::Float64, false), Field::new("l_extendedprice", DataType::Float64, false), @@ -562,14 +562,14 @@ fn get_schema(table: &str) -> Schema { ]), "nation" => Schema::new(vec![ - Field::new("n_nationkey", DataType::Int32, false), + Field::new("n_nationkey", DataType::UInt64, false), Field::new("n_name", DataType::Utf8, false), - Field::new("n_regionkey", DataType::Int32, false), + Field::new("n_regionkey", DataType::UInt64, false), Field::new("n_comment", DataType::Utf8, false), ]), "region" => Schema::new(vec![ - Field::new("r_regionkey", DataType::Int32, false), + Field::new("r_regionkey", DataType::UInt64, false), Field::new("r_name", DataType::Utf8, false), Field::new("r_comment", DataType::Utf8, false), ]), From 4ddd2f5e7582ffe662aea27bbb74c58cd0715152 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 8 Aug 2021 15:42:31 -0700 Subject: [PATCH 324/329] Implement PartialOrd for ScalarValue (#838) * Implement PartialOrd for ScalarValue. * Avoid catch all match. 
--- datafusion/src/scalar.rs | 146 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/datafusion/src/scalar.rs b/datafusion/src/scalar.rs index 3896055e9233b..3fbcadd3de5a3 100644 --- a/datafusion/src/scalar.rs +++ b/datafusion/src/scalar.rs @@ -28,6 +28,7 @@ use arrow::{ }, }; use ordered_float::OrderedFloat; +use std::cmp::Ordering; use std::convert::{Infallible, TryInto}; use std::str::FromStr; use std::{convert::TryFrom, fmt, iter::repeat, sync::Arc}; @@ -156,6 +157,81 @@ impl PartialEq for ScalarValue { } } +// manual implementation of `PartialOrd` that uses OrderedFloat to +// get defined behavior for floating point +impl PartialOrd for ScalarValue { + fn partial_cmp(&self, other: &Self) -> Option { + use ScalarValue::*; + // This purposely doesn't have a catch-all "(_, _)" so that + // any newly added enum variant will require editing this list + // or else face a compile error + match (self, other) { + (Boolean(v1), Boolean(v2)) => v1.partial_cmp(v2), + (Boolean(_), _) => None, + (Float32(v1), Float32(v2)) => { + let v1 = v1.map(OrderedFloat); + let v2 = v2.map(OrderedFloat); + v1.partial_cmp(&v2) + } + (Float32(_), _) => None, + (Float64(v1), Float64(v2)) => { + let v1 = v1.map(OrderedFloat); + let v2 = v2.map(OrderedFloat); + v1.partial_cmp(&v2) + } + (Float64(_), _) => None, + (Int8(v1), Int8(v2)) => v1.partial_cmp(v2), + (Int8(_), _) => None, + (Int16(v1), Int16(v2)) => v1.partial_cmp(v2), + (Int16(_), _) => None, + (Int32(v1), Int32(v2)) => v1.partial_cmp(v2), + (Int32(_), _) => None, + (Int64(v1), Int64(v2)) => v1.partial_cmp(v2), + (Int64(_), _) => None, + (UInt8(v1), UInt8(v2)) => v1.partial_cmp(v2), + (UInt8(_), _) => None, + (UInt16(v1), UInt16(v2)) => v1.partial_cmp(v2), + (UInt16(_), _) => None, + (UInt32(v1), UInt32(v2)) => v1.partial_cmp(v2), + (UInt32(_), _) => None, + (UInt64(v1), UInt64(v2)) => v1.partial_cmp(v2), + (UInt64(_), _) => None, + (Utf8(v1), Utf8(v2)) => v1.partial_cmp(v2), + (Utf8(_), _) => None, + (LargeUtf8(v1), LargeUtf8(v2)) => v1.partial_cmp(v2), + (LargeUtf8(_), _) => None, + (Binary(v1), Binary(v2)) => v1.partial_cmp(v2), + (Binary(_), _) => None, + (LargeBinary(v1), LargeBinary(v2)) => v1.partial_cmp(v2), + (LargeBinary(_), _) => None, + (List(v1, t1), List(v2, t2)) => { + if t1.eq(t2) { + v1.partial_cmp(v2) + } else { + None + } + } + (List(_, _), _) => None, + (Date32(v1), Date32(v2)) => v1.partial_cmp(v2), + (Date32(_), _) => None, + (Date64(v1), Date64(v2)) => v1.partial_cmp(v2), + (Date64(_), _) => None, + (TimestampSecond(v1), TimestampSecond(v2)) => v1.partial_cmp(v2), + (TimestampSecond(_), _) => None, + (TimestampMillisecond(v1), TimestampMillisecond(v2)) => v1.partial_cmp(v2), + (TimestampMillisecond(_), _) => None, + (TimestampMicrosecond(v1), TimestampMicrosecond(v2)) => v1.partial_cmp(v2), + (TimestampMicrosecond(_), _) => None, + (TimestampNanosecond(v1), TimestampNanosecond(v2)) => v1.partial_cmp(v2), + (TimestampNanosecond(_), _) => None, + (IntervalYearMonth(v1), IntervalYearMonth(v2)) => v1.partial_cmp(v2), + (IntervalYearMonth(_), _) => None, + (IntervalDayTime(v1), IntervalDayTime(v2)) => v1.partial_cmp(v2), + (IntervalDayTime(_), _) => None, + } + } +} + impl Eq for ScalarValue {} // manual implementation of `Hash` that uses OrderedFloat to @@ -1577,4 +1653,74 @@ mod tests { // per distinct value. 
assert_eq!(std::mem::size_of::(), 32); } + + #[test] + fn scalar_partial_ordering() { + use ScalarValue::*; + + assert_eq!( + Int64(Some(33)).partial_cmp(&Int64(Some(0))), + Some(Ordering::Greater) + ); + assert_eq!( + Int64(Some(0)).partial_cmp(&Int64(Some(33))), + Some(Ordering::Less) + ); + assert_eq!( + Int64(Some(33)).partial_cmp(&Int64(Some(33))), + Some(Ordering::Equal) + ); + // For different data type, `partial_cmp` returns None. + assert_eq!(Int64(Some(33)).partial_cmp(&Int32(Some(33))), None); + assert_eq!(Int32(Some(33)).partial_cmp(&Int64(Some(33))), None); + + assert_eq!( + List( + Some(Box::new(vec![Int32(Some(1)), Int32(Some(5))])), + Box::new(DataType::Int32) + ) + .partial_cmp(&List( + Some(Box::new(vec![Int32(Some(1)), Int32(Some(5))])), + Box::new(DataType::Int32) + )), + Some(Ordering::Equal) + ); + + assert_eq!( + List( + Some(Box::new(vec![Int32(Some(10)), Int32(Some(5))])), + Box::new(DataType::Int32) + ) + .partial_cmp(&List( + Some(Box::new(vec![Int32(Some(1)), Int32(Some(5))])), + Box::new(DataType::Int32) + )), + Some(Ordering::Greater) + ); + + assert_eq!( + List( + Some(Box::new(vec![Int32(Some(1)), Int32(Some(5))])), + Box::new(DataType::Int32) + ) + .partial_cmp(&List( + Some(Box::new(vec![Int32(Some(10)), Int32(Some(5))])), + Box::new(DataType::Int32) + )), + Some(Ordering::Less) + ); + + // For different data type, `partial_cmp` returns None. + assert_eq!( + List( + Some(Box::new(vec![Int64(Some(1)), Int64(Some(5))])), + Box::new(DataType::Int64) + ) + .partial_cmp(&List( + Some(Box::new(vec![Int32(Some(1)), Int32(Some(5))])), + Box::new(DataType::Int32) + )), + None + ); + } } From 30cc6746ec9b7f5b181c589f6687fa2ebfaa5d18 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 9 Aug 2021 11:37:43 -0400 Subject: [PATCH 325/329] Add tests for hash collisions (#842) * Add force_hash_collisions feature flag * Add CI job for hash collisions * Disable tests that check values of hashes * Disable failing join tests when force_hash_collisions is enabled --- .github/workflows/rust.yml | 48 +++++++++++++++++++++ datafusion/Cargo.toml | 3 ++ datafusion/src/physical_plan/hash_join.rs | 8 ++++ datafusion/src/physical_plan/hash_utils.rs | 23 +++++++++- datafusion/src/physical_plan/repartition.rs | 3 ++ datafusion/tests/sql.rs | 6 +++ 6 files changed, 90 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c56b5fcedad..5246db6727409 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -312,6 +312,54 @@ jobs: # Ignore MIRI errors until we can get a clean run cargo miri test || true + + # Check answers are correct when hash values collide + hash-collisions: + name: Test Hash Collisions on AMD64 Rust ${{ matrix.rust }} + needs: [linux-build-lib] + runs-on: ubuntu-latest + strategy: + matrix: + arch: [amd64] + rust: [stable] + container: + image: ${{ matrix.arch }}/rust + env: + # Disable full debug symbol generation to speed up CI build and keep memory down + # "1" means line tables only, which is useful for panic tracebacks. 
+ RUSTFLAGS: "-C debuginfo=1" + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache Cargo + uses: actions/cache@v2 + with: + path: /github/home/.cargo + # this key equals the ones on `linux-build-lib` for re-use + key: cargo-cache- + - name: Cache Rust dependencies + uses: actions/cache@v2 + with: + path: /github/home/target + # this key equals the ones on `linux-build-lib` for re-use + key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} + - name: Setup Rust toolchain + run: | + rustup toolchain install ${{ matrix.rust }} + rustup default ${{ matrix.rust }} + rustup component add rustfmt + - name: Run tests + run: | + export ARROW_TEST_DATA=$(pwd)/testing/data + export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data + cd datafusion + # Force all hash values to collide + cargo test --features=force_hash_collisions + env: + CARGO_HOME: "/github/home/.cargo" + CARGO_TARGET_DIR: "/github/home/target" + # Coverage job was failing. https://github.com/apache/arrow-datafusion/issues/590 tracks re-instating it # coverage: diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index bfb3a93e3249e..9b094ac1a828c 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -42,6 +42,9 @@ simd = ["arrow/simd"] crypto_expressions = ["md-5", "sha2"] regex_expressions = ["regex", "lazy_static"] unicode_expressions = ["unicode-segmentation"] +# Used for testing ONLY: causes all values to hash to the same value (test for collisions) +force_hash_collisions = [] + [dependencies] ahash = "0.7" diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index 1a57c404e96e5..fa75437e3fea0 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -1372,6 +1372,8 @@ mod tests { } #[tokio::test] + // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed + #[cfg(not(feature = "force_hash_collisions"))] async fn join_full_multi_batch() { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1637,6 +1639,8 @@ mod tests { } #[tokio::test] + // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed + #[cfg(not(feature = "force_hash_collisions"))] async fn join_right_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1673,6 +1677,8 @@ mod tests { } #[tokio::test] + // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed + #[cfg(not(feature = "force_hash_collisions"))] async fn partitioned_join_right_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1710,6 +1716,8 @@ mod tests { } #[tokio::test] + // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed + #[cfg(not(feature = "force_hash_collisions"))] async fn join_full_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), diff --git a/datafusion/src/physical_plan/hash_utils.rs b/datafusion/src/physical_plan/hash_utils.rs index abfa09a98ccdb..6a622df4f68da 100644 --- a/datafusion/src/physical_plan/hash_utils.rs +++ b/datafusion/src/physical_plan/hash_utils.rs @@ -298,11 +298,28 @@ fn create_hashes_dictionary( Ok(()) } +/// Test version of `create_hashes` that produces the same value for +/// all hashes (to test collisions) +/// +/// See comments on `hashes_buffer` for more details +#[cfg(feature = "force_hash_collisions")] +pub fn create_hashes<'a>( + _arrays: &[ArrayRef], + _random_state: &RandomState, + hashes_buffer: &'a mut Vec, +) -> Result<&'a mut Vec> { + for hash in 
hashes_buffer.iter_mut() { + *hash = 0 + } + return Ok(hashes_buffer); +} + /// Creates hash values for every row, based on the values in the /// columns. /// /// The number of rows to hash is determined by `hashes_buffer.len()`. /// `hashes_buffer` should be pre-sized appropriately +#[cfg(not(feature = "force_hash_collisions"))] pub fn create_hashes<'a>( arrays: &[ArrayRef], random_state: &RandomState, @@ -661,6 +678,8 @@ mod tests { } #[test] + // Tests actual values of hashes, which are different if forcing collisions + #[cfg(not(feature = "force_hash_collisions"))] fn create_hashes_for_dict_arrays() { let strings = vec![Some("foo"), None, Some("bar"), Some("foo"), None]; @@ -697,12 +716,14 @@ mod tests { assert_eq!(strings[0], strings[3]); assert_eq!(dict_hashes[0], dict_hashes[3]); - // different strings should matp to different hash values + // different strings should map to different hash values assert_ne!(strings[0], strings[2]); assert_ne!(dict_hashes[0], dict_hashes[2]); } #[test] + // Tests actual values of hashes, which are different if forcing collisions + #[cfg(not(feature = "force_hash_collisions"))] fn create_multi_column_hash_for_dict_arrays() { let strings1 = vec![Some("foo"), None, Some("bar")]; let strings2 = vec![Some("blarg"), Some("blah"), None]; diff --git a/datafusion/src/physical_plan/repartition.rs b/datafusion/src/physical_plan/repartition.rs index b59071adb3a1e..eb3fe5560fd6d 100644 --- a/datafusion/src/physical_plan/repartition.rs +++ b/datafusion/src/physical_plan/repartition.rs @@ -732,6 +732,9 @@ mod tests { } #[tokio::test] + // skip this test when hash function is different because the hard + // coded expected output is a function of the hash values + #[cfg(not(feature = "force_hash_collisions"))] async fn repartition_with_dropping_output_stream() { #[derive(Debug)] struct Case<'a> { diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 0c33bd4772668..046e4f28ec427 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1797,6 +1797,8 @@ async fn equijoin_left_and_condition_from_right() -> Result<()> { } #[tokio::test] +// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed +#[cfg(not(feature = "force_hash_collisions"))] async fn equijoin_right_and_condition_from_left() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let sql = @@ -1850,6 +1852,8 @@ async fn left_join() -> Result<()> { } #[tokio::test] +// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed +#[cfg(not(feature = "force_hash_collisions"))] async fn right_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let equivalent_sql = [ @@ -1870,6 +1874,8 @@ async fn right_join() -> Result<()> { } #[tokio::test] +// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed +#[cfg(not(feature = "force_hash_collisions"))] async fn full_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let equivalent_sql = [ From 0125451e5fc194b1b1e4828bae5350bcd8ac24f9 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Mon, 9 Aug 2021 10:34:41 -0700 Subject: [PATCH 326/329] [ballista] support date_part and date_turnc ser/de, pass tpch 7 (#840) --- ballista/rust/core/proto/ballista.proto | 17 ++-- .../core/src/serde/logical_plan/from_proto.rs | 99 ++++++++----------- .../core/src/serde/logical_plan/to_proto.rs | 5 +- .../src/serde/physical_plan/from_proto.rs | 1 + benchmarks/run.sh | 2 +- benchmarks/src/bin/tpch.rs | 1 + datafusion/src/logical_plan/expr.rs | 19 
+++- datafusion/src/logical_plan/mod.rs | 15 +-- datafusion/src/physical_plan/functions.rs | 4 +- datafusion/src/prelude.rs | 9 +- 10 files changed, 88 insertions(+), 84 deletions(-) diff --git a/ballista/rust/core/proto/ballista.proto b/ballista/rust/core/proto/ballista.proto index 9dbce81c21f1f..2538a10ceda3b 100644 --- a/ballista/rust/core/proto/ballista.proto +++ b/ballista/rust/core/proto/ballista.proto @@ -144,18 +144,19 @@ enum ScalarFunction { TOTIMESTAMP = 24; ARRAY = 25; NULLIF = 26; - DATETRUNC = 27; - MD5 = 28; - SHA224 = 29; - SHA256 = 30; - SHA384 = 31; - SHA512 = 32; - LN = 33; + DATEPART = 27; + DATETRUNC = 28; + MD5 = 29; + SHA224 = 30; + SHA256 = 31; + SHA384 = 32; + SHA512 = 33; + LN = 34; } message ScalarFunctionNode { ScalarFunction fun = 1; - repeated LogicalExprNode expr = 2; + repeated LogicalExprNode args = 2; } enum AggregateFunction { diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs b/ballista/rust/core/src/serde/logical_plan/from_proto.rs index 2665e33137b5d..31b8b6d3bcbcd 100644 --- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs @@ -988,77 +988,58 @@ impl TryInto for &protobuf::LogicalExprNode { expr.fun )) })?; + let args = &expr.args; + match scalar_function { - protobuf::ScalarFunction::Sqrt => { - Ok(sqrt((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Sin => Ok(sin((&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Cos => Ok(cos((&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Tan => Ok(tan((&expr.expr[0]).try_into()?)), - // protobuf::ScalarFunction::Asin => Ok(asin(&expr.expr[0]).try_into()?)), - // protobuf::ScalarFunction::Acos => Ok(acos(&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Atan => { - Ok(atan((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Exp => Ok(exp((&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Log2 => { - Ok(log2((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Ln => Ok(ln((&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Log10 => { - Ok(log10((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Floor => { - Ok(floor((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Ceil => { - Ok(ceil((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Round => { - Ok(round((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Trunc => { - Ok(trunc((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Abs => Ok(abs((&expr.expr[0]).try_into()?)), + protobuf::ScalarFunction::Sqrt => Ok(sqrt((&args[0]).try_into()?)), + protobuf::ScalarFunction::Sin => Ok(sin((&args[0]).try_into()?)), + protobuf::ScalarFunction::Cos => Ok(cos((&args[0]).try_into()?)), + protobuf::ScalarFunction::Tan => Ok(tan((&args[0]).try_into()?)), + // protobuf::ScalarFunction::Asin => Ok(asin(&args[0]).try_into()?)), + // protobuf::ScalarFunction::Acos => Ok(acos(&args[0]).try_into()?)), + protobuf::ScalarFunction::Atan => Ok(atan((&args[0]).try_into()?)), + protobuf::ScalarFunction::Exp => Ok(exp((&args[0]).try_into()?)), + protobuf::ScalarFunction::Log2 => Ok(log2((&args[0]).try_into()?)), + protobuf::ScalarFunction::Ln => Ok(ln((&args[0]).try_into()?)), + protobuf::ScalarFunction::Log10 => Ok(log10((&args[0]).try_into()?)), + protobuf::ScalarFunction::Floor => Ok(floor((&args[0]).try_into()?)), + protobuf::ScalarFunction::Ceil => Ok(ceil((&args[0]).try_into()?)), + protobuf::ScalarFunction::Round => Ok(round((&args[0]).try_into()?)), + 
protobuf::ScalarFunction::Trunc => Ok(trunc((&args[0]).try_into()?)), + protobuf::ScalarFunction::Abs => Ok(abs((&args[0]).try_into()?)), protobuf::ScalarFunction::Signum => { - Ok(signum((&expr.expr[0]).try_into()?)) + Ok(signum((&args[0]).try_into()?)) } protobuf::ScalarFunction::Octetlength => { - Ok(length((&expr.expr[0]).try_into()?)) - } - // // protobuf::ScalarFunction::Concat => Ok(concat((&expr.expr[0]).try_into()?)), - protobuf::ScalarFunction::Lower => { - Ok(lower((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Upper => { - Ok(upper((&expr.expr[0]).try_into()?)) - } - protobuf::ScalarFunction::Trim => { - Ok(trim((&expr.expr[0]).try_into()?)) + Ok(length((&args[0]).try_into()?)) } - protobuf::ScalarFunction::Ltrim => { - Ok(ltrim((&expr.expr[0]).try_into()?)) + // // protobuf::ScalarFunction::Concat => Ok(concat((&args[0]).try_into()?)), + protobuf::ScalarFunction::Lower => Ok(lower((&args[0]).try_into()?)), + protobuf::ScalarFunction::Upper => Ok(upper((&args[0]).try_into()?)), + protobuf::ScalarFunction::Trim => Ok(trim((&args[0]).try_into()?)), + protobuf::ScalarFunction::Ltrim => Ok(ltrim((&args[0]).try_into()?)), + protobuf::ScalarFunction::Rtrim => Ok(rtrim((&args[0]).try_into()?)), + // protobuf::ScalarFunction::Totimestamp => Ok(to_timestamp((&args[0]).try_into()?)), + // protobuf::ScalarFunction::Array => Ok(array((&args[0]).try_into()?)), + // // protobuf::ScalarFunction::Nullif => Ok(nulli((&args[0]).try_into()?)), + protobuf::ScalarFunction::Datepart => { + Ok(date_part((&args[0]).try_into()?, (&args[1]).try_into()?)) } - protobuf::ScalarFunction::Rtrim => { - Ok(rtrim((&expr.expr[0]).try_into()?)) + protobuf::ScalarFunction::Datetrunc => { + Ok(date_trunc((&args[0]).try_into()?, (&args[1]).try_into()?)) } - // protobuf::ScalarFunction::Totimestamp => Ok(to_timestamp((&expr.expr[0]).try_into()?)), - // protobuf::ScalarFunction::Array => Ok(array((&expr.expr[0]).try_into()?)), - // // protobuf::ScalarFunction::Nullif => Ok(nulli((&expr.expr[0]).try_into()?)), - // protobuf::ScalarFunction::Datetrunc => Ok(date_trunc((&expr.expr[0]).try_into()?)), - // protobuf::ScalarFunction::Md5 => Ok(md5((&expr.expr[0]).try_into()?)), + // protobuf::ScalarFunction::Md5 => Ok(md5((&args[0]).try_into()?)), protobuf::ScalarFunction::Sha224 => { - Ok(sha224((&expr.expr[0]).try_into()?)) + Ok(sha224((&args[0]).try_into()?)) } protobuf::ScalarFunction::Sha256 => { - Ok(sha256((&expr.expr[0]).try_into()?)) + Ok(sha256((&args[0]).try_into()?)) } protobuf::ScalarFunction::Sha384 => { - Ok(sha384((&expr.expr[0]).try_into()?)) + Ok(sha384((&args[0]).try_into()?)) } protobuf::ScalarFunction::Sha512 => { - Ok(sha512((&expr.expr[0]).try_into()?)) + Ok(sha512((&args[0]).try_into()?)) } _ => Err(proto_error( "Protobuf deserialization error: Unsupported scalar function", @@ -1119,10 +1100,10 @@ impl TryInto for &protobuf::Field { } } -use datafusion::physical_plan::datetime_expressions::{date_trunc, to_timestamp}; use datafusion::physical_plan::{aggregates, windows}; use datafusion::prelude::{ - array, length, lower, ltrim, md5, rtrim, sha224, sha256, sha384, sha512, trim, upper, + array, date_part, date_trunc, length, lower, ltrim, md5, rtrim, sha224, sha256, + sha384, sha512, trim, upper, }; use std::convert::TryFrom; diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs b/ballista/rust/core/src/serde/logical_plan/to_proto.rs index 87f26a118e780..1a3834af59d9c 100644 --- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs +++ 
b/ballista/rust/core/src/serde/logical_plan/to_proto.rs @@ -1065,7 +1065,7 @@ impl TryInto for &Expr { Expr::ScalarVariable(_) => unimplemented!(), Expr::ScalarFunction { ref fun, ref args } => { let fun: protobuf::ScalarFunction = fun.try_into()?; - let expr: Vec = args + let args: Vec = args .iter() .map(|e| e.try_into()) .collect::, BallistaError>>()?; @@ -1074,7 +1074,7 @@ impl TryInto for &Expr { protobuf::logical_expr_node::ExprType::ScalarFunction( protobuf::ScalarFunctionNode { fun: fun.into(), - expr, + args, }, ), ), @@ -1374,6 +1374,7 @@ impl TryInto for &BuiltinScalarFunction { } BuiltinScalarFunction::Array => Ok(protobuf::ScalarFunction::Array), BuiltinScalarFunction::NullIf => Ok(protobuf::ScalarFunction::Nullif), + BuiltinScalarFunction::DatePart => Ok(protobuf::ScalarFunction::Datepart), BuiltinScalarFunction::DateTrunc => Ok(protobuf::ScalarFunction::Datetrunc), BuiltinScalarFunction::MD5 => Ok(protobuf::ScalarFunction::Md5), BuiltinScalarFunction::SHA224 => Ok(protobuf::ScalarFunction::Sha224), diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs b/ballista/rust/core/src/serde/physical_plan/from_proto.rs index 509044b3d1bac..678bcde8fa737 100644 --- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs +++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs @@ -501,6 +501,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Totimestamp => BuiltinScalarFunction::ToTimestamp, ScalarFunction::Array => BuiltinScalarFunction::Array, ScalarFunction::Nullif => BuiltinScalarFunction::NullIf, + ScalarFunction::Datepart => BuiltinScalarFunction::DatePart, ScalarFunction::Datetrunc => BuiltinScalarFunction::DateTrunc, ScalarFunction::Md5 => BuiltinScalarFunction::MD5, ScalarFunction::Sha224 => BuiltinScalarFunction::SHA224, diff --git a/benchmarks/run.sh b/benchmarks/run.sh index 8e36424da89f0..b1f47a24c2d8d 100755 --- a/benchmarks/run.sh +++ b/benchmarks/run.sh @@ -20,7 +20,7 @@ set -e # This bash script is meant to be run inside the docker-compose environment. Check the README for instructions cd / -for query in 1 3 5 6 10 12 +for query in 1 3 5 6 7 10 12 do /tpch benchmark ballista --host ballista-scheduler --port 50050 --query $query --path /data --format tbl --iterations 1 --debug done diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 978fbaa9afe71..10b5c2db795f4 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -1140,6 +1140,7 @@ mod tests { test_round_trip!(q3, 3); test_round_trip!(q5, 5); test_round_trip!(q6, 6); + test_round_trip!(q7, 7); test_round_trip!(q10, 10); test_round_trip!(q12, 12); } diff --git a/datafusion/src/logical_plan/expr.rs b/datafusion/src/logical_plan/expr.rs index 8b0e647261da8..e4952840487b0 100644 --- a/datafusion/src/logical_plan/expr.rs +++ b/datafusion/src/logical_plan/expr.rs @@ -1421,7 +1421,20 @@ macro_rules! unary_scalar_expr { }; } -// generate methods for creating the supported unary expressions +/// Create an convenience function representing a /binaryunary scalar function +macro_rules! 
binary_scalar_expr { + ($ENUM:ident, $FUNC:ident) => { + #[doc = "this scalar function is not documented yet"] + pub fn $FUNC(arg1: Expr, arg2: Expr) -> Expr { + Expr::ScalarFunction { + fun: functions::BuiltinScalarFunction::$ENUM, + args: vec![arg1, arg2], + } + } + }; +} + +// generate methods for creating the supported unary/binary expressions // math functions unary_scalar_expr!(Sqrt, sqrt); @@ -1478,6 +1491,10 @@ unary_scalar_expr!(Translate, translate); unary_scalar_expr!(Trim, trim); unary_scalar_expr!(Upper, upper); +// date functions +binary_scalar_expr!(DatePart, date_part); +binary_scalar_expr!(DateTrunc, date_trunc); + /// returns an array of fixed size with each argument on it. pub fn array(args: Vec) -> Expr { Expr::ScalarFunction { diff --git a/datafusion/src/logical_plan/mod.rs b/datafusion/src/logical_plan/mod.rs index a021d06f09502..7f5ac2491843f 100644 --- a/datafusion/src/logical_plan/mod.rs +++ b/datafusion/src/logical_plan/mod.rs @@ -38,13 +38,14 @@ pub use display::display_schema; pub use expr::{ abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length, btrim, case, ceil, character_length, chr, col, columnize_expr, combine_filters, concat, concat_ws, - cos, count, count_distinct, create_udaf, create_udf, exp, exprlist_to_fields, floor, - in_list, initcap, left, length, lit, ln, log10, log2, lower, lpad, ltrim, max, md5, - min, normalize_col, normalize_cols, now, octet_length, or, random, regexp_match, - regexp_replace, repeat, replace, replace_col, reverse, right, round, rpad, rtrim, - sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, starts_with, strpos, - substr, sum, tan, to_hex, translate, trim, trunc, unnormalize_col, unnormalize_cols, - upper, when, Column, Expr, ExprRewriter, ExpressionVisitor, Literal, Recursion, + cos, count, count_distinct, create_udaf, create_udf, date_part, date_trunc, exp, + exprlist_to_fields, floor, in_list, initcap, left, length, lit, ln, log10, log2, + lower, lpad, ltrim, max, md5, min, normalize_col, normalize_cols, now, octet_length, + or, random, regexp_match, regexp_replace, repeat, replace, replace_col, reverse, + right, round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, split_part, + sqrt, starts_with, strpos, substr, sum, tan, to_hex, translate, trim, trunc, + unnormalize_col, unnormalize_cols, upper, when, Column, Expr, ExprRewriter, + ExpressionVisitor, Literal, Recursion, }; pub use extension::UserDefinedLogicalNode; pub use operators::Operator; diff --git a/datafusion/src/physical_plan/functions.rs b/datafusion/src/physical_plan/functions.rs index 7bb3cb456e9fd..a005f56dd02af 100644 --- a/datafusion/src/physical_plan/functions.rs +++ b/datafusion/src/physical_plan/functions.rs @@ -277,8 +277,8 @@ impl FromStr for BuiltinScalarFunction { "concat" => BuiltinScalarFunction::Concat, "concat_ws" => BuiltinScalarFunction::ConcatWithSeparator, "chr" => BuiltinScalarFunction::Chr, - "date_part" => BuiltinScalarFunction::DatePart, - "date_trunc" => BuiltinScalarFunction::DateTrunc, + "date_part" | "datepart" => BuiltinScalarFunction::DatePart, + "date_trunc" | "datetrunc" => BuiltinScalarFunction::DateTrunc, "initcap" => BuiltinScalarFunction::InitCap, "left" => BuiltinScalarFunction::Left, "length" => BuiltinScalarFunction::CharacterLength, diff --git a/datafusion/src/prelude.rs b/datafusion/src/prelude.rs index e7ad04e74d1a0..168e1d5df41ac 100644 --- a/datafusion/src/prelude.rs +++ b/datafusion/src/prelude.rs @@ -29,9 +29,10 @@ pub use crate::dataframe::DataFrame; pub use 
crate::execution::context::{ExecutionConfig, ExecutionContext}; pub use crate::logical_plan::{ array, ascii, avg, bit_length, btrim, character_length, chr, col, concat, concat_ws, - count, create_udf, in_list, initcap, left, length, lit, lower, lpad, ltrim, max, md5, - min, now, octet_length, random, regexp_replace, repeat, replace, reverse, right, - rpad, rtrim, sha224, sha256, sha384, sha512, split_part, starts_with, strpos, substr, - sum, to_hex, translate, trim, upper, Column, JoinType, Partitioning, + count, create_udf, date_part, date_trunc, in_list, initcap, left, length, lit, lower, + lpad, ltrim, max, md5, min, now, octet_length, random, regexp_replace, repeat, + replace, reverse, right, rpad, rtrim, sha224, sha256, sha384, sha512, split_part, + starts_with, strpos, substr, sum, to_hex, translate, trim, upper, Column, JoinType, + Partitioning, }; pub use crate::physical_plan::csv::CsvReadOptions; From 5cadc6a45a7a58db8c1c9b2c4834953233d365e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 10 Aug 2021 08:50:19 +0200 Subject: [PATCH 327/329] Fix right, full join handling when having multiple non-matching rows at the left side (#845) * Fix right, full join handling * Remove comment * Clippy * Update datafusion/src/physical_plan/hash_join.rs Co-authored-by: Andrew Lamb * Fmt Co-authored-by: Andrew Lamb --- datafusion/src/physical_plan/hash_join.rs | 20 +++++++++----------- datafusion/tests/sql.rs | 6 ------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs index fa75437e3fea0..99708249fc6a7 100644 --- a/datafusion/src/physical_plan/hash_join.rs +++ b/datafusion/src/physical_plan/hash_join.rs @@ -737,6 +737,7 @@ fn build_join_indexes( for (row, hash_value) in hash_values.iter().enumerate() { match left.0.get(*hash_value, |(hash, _)| *hash_value == *hash) { Some((_, indices)) => { + let mut no_match = true; for &i in indices { if equal_rows( i as usize, @@ -745,9 +746,14 @@ fn build_join_indexes( &keys_values, )? { left_indices.append_value(i)?; - } else { - left_indices.append_null()?; + right_indices.append_value(row as u32)?; + no_match = false; } + } + // If no rows matched left, still must keep the right + // with all nulls for left + if no_match { + left_indices.append_null()?; right_indices.append_value(row as u32)?; } } @@ -768,7 +774,7 @@ macro_rules! 
equal_rows_elem { let left_array = $l.as_any().downcast_ref::<$array_type>().unwrap(); let right_array = $r.as_any().downcast_ref::<$array_type>().unwrap(); - match (left_array.is_null($left), left_array.is_null($right)) { + match (left_array.is_null($left), right_array.is_null($right)) { (false, false) => left_array.value($left) == right_array.value($right), _ => false, } @@ -1372,8 +1378,6 @@ mod tests { } #[tokio::test] - // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed - #[cfg(not(feature = "force_hash_collisions"))] async fn join_full_multi_batch() { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1639,8 +1643,6 @@ mod tests { } #[tokio::test] - // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed - #[cfg(not(feature = "force_hash_collisions"))] async fn join_right_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1677,8 +1679,6 @@ mod tests { } #[tokio::test] - // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed - #[cfg(not(feature = "force_hash_collisions"))] async fn partitioned_join_right_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), @@ -1716,8 +1716,6 @@ mod tests { } #[tokio::test] - // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed - #[cfg(not(feature = "force_hash_collisions"))] async fn join_full_one() -> Result<()> { let left = build_table( ("a1", &vec![1, 2, 3]), diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs index 046e4f28ec427..0c33bd4772668 100644 --- a/datafusion/tests/sql.rs +++ b/datafusion/tests/sql.rs @@ -1797,8 +1797,6 @@ async fn equijoin_left_and_condition_from_right() -> Result<()> { } #[tokio::test] -// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed -#[cfg(not(feature = "force_hash_collisions"))] async fn equijoin_right_and_condition_from_left() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let sql = @@ -1852,8 +1850,6 @@ async fn left_join() -> Result<()> { } #[tokio::test] -// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed -#[cfg(not(feature = "force_hash_collisions"))] async fn right_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let equivalent_sql = [ @@ -1874,8 +1870,6 @@ async fn right_join() -> Result<()> { } #[tokio::test] -// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed -#[cfg(not(feature = "force_hash_collisions"))] async fn full_join() -> Result<()> { let mut ctx = create_join_context("t1_id", "t2_id")?; let equivalent_sql = [ From 96658eb100436c47601ed10095d74299d2229020 Mon Sep 17 00:00:00 2001 From: QP Hou Date: Tue, 10 Aug 2021 09:02:57 -0700 Subject: [PATCH 328/329] Create changelog for datafusion and ballista release (#801) * create changelog for datafusion and ballista release Created changelog for the following projects: * datafusion 5.0.0 * python 0.3.0 * ballista 0.5.0 Other changes: * updated CHANGELOG.md in the repo root to point to changelogs in subproject folders * updated dev/release/update_change_log.sh to take subproject as argument * added dev/update_ballista_versions.py to help update ballista crate versions. * ignore autogenerated CHANGELOG.md from prettier run * expand file glob within prettier '**' pattern is not supported to some of the shells including the one we use in CI. 
* exclude subproject changelog from rat * update changelog to latest master * update changelog with updated pr labels * update datafusion version in readme * add verify-release-candidate.sh * update changelog for latest master * update release email formatting * update verify-release-candidate.sh permission * cargo test all in release verify script --- .github/workflows/dev.yml | 5 +- .github_changelog_generator | 6 +- CHANGELOG.md | 9513 +------------------ README.md | 2 +- ballista-examples/Cargo.toml | 2 +- ballista/CHANGELOG.md | 180 + ballista/rust/client/Cargo.toml | 4 +- ballista/rust/core/Cargo.toml | 2 +- ballista/rust/executor/Cargo.toml | 2 +- ballista/rust/scheduler/Cargo.toml | 2 +- datafusion/CHANGELOG.md | 318 + datafusion/Cargo.toml | 2 +- dev/release/create-tarball.sh | 13 +- dev/release/rat_exclude_files.txt | 3 + dev/release/release-tarball.sh | 0 dev/release/update_change_log-all.sh | 29 + dev/release/update_change_log-ballista.sh | 28 + dev/release/update_change_log-datafusion.sh | 28 + dev/release/update_change_log-python.sh | 28 + dev/release/update_change_log.sh | 41 +- dev/release/verify-release-candidate.sh | 146 + dev/update_arrow_deps.py | 2 +- dev/update_ballista_versions.py | 68 + python/CHANGELOG.md | 72 + python/Cargo.toml | 2 +- 25 files changed, 990 insertions(+), 9508 deletions(-) create mode 100644 ballista/CHANGELOG.md create mode 100644 datafusion/CHANGELOG.md mode change 100644 => 100755 dev/release/release-tarball.sh create mode 100755 dev/release/update_change_log-all.sh create mode 100755 dev/release/update_change_log-ballista.sh create mode 100755 dev/release/update_change_log-datafusion.sh create mode 100755 dev/release/update_change_log-python.sh create mode 100755 dev/release/verify-release-candidate.sh create mode 100755 dev/update_ballista_versions.py create mode 100644 python/CHANGELOG.md diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index f49c7b18b419a..fc8740b7757ae 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -63,9 +63,12 @@ jobs: run: | # if you encounter error, try rerun the command below with --write instead of --check # and commit the changes + # + # ignore subproject CHANGELOG.md because they are machine generated npx prettier@2.3.2 --write \ '{ballista,datafusion,datafusion-examples,docs,python}/**/*.md' \ + '!{ballista,datafusion,python}/CHANGELOG.md' \ README.md \ DEVELOPERS.md \ 'ballista/**/*.{ts,tsx}' - git diff --exit-code \ No newline at end of file + git diff --exit-code diff --git a/.github_changelog_generator b/.github_changelog_generator index 49d20dcd9e5ce..6ee6508b7216f 100644 --- a/.github_changelog_generator +++ b/.github_changelog_generator @@ -21,10 +21,10 @@ # point to the old changelog in apache/arrow front-matter=For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md)\n # some issues are just documentation -add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]}} +add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} # uncomment to not show PRs. TBD if we shown them or not. 
#pull-requests=false # so that the component is shown associated with the issue -issue-line-labels=ballista,datafusion,python +issue-line-labels=sql exclude-labels=development-process,invalid -breaking_labels=api-change +breaking-labels=api change diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ed715d7f4fc4..467cddc62c885 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9486 +1,27 @@ - -# Apache Arrow 3.0.0 (2021-01-18) - -## New Features and Improvements - -* [ARROW-1846](https://issues.apache.org/jira/browse/ARROW-1846) - [C++] Implement "any" reduction kernel for boolean data -* [ARROW-3850](https://issues.apache.org/jira/browse/ARROW-3850) - [Python] Support MapType and StructType for enhanced PySpark integration -* [ARROW-4193](https://issues.apache.org/jira/browse/ARROW-4193) - [Rust] Add support for decimal data type -* [ARROW-4544](https://issues.apache.org/jira/browse/ARROW-4544) - [Rust] Read nested JSON structs into StructArrays -* [ARROW-4804](https://issues.apache.org/jira/browse/ARROW-4804) - [Rust] Read temporal values from CSV - Parse Date32 and Date64 in CSV reader -* [ARROW-4960](https://issues.apache.org/jira/browse/ARROW-4960) - [R] Add crossbow task for r-arrow-feedstock -* [ARROW-4970](https://issues.apache.org/jira/browse/ARROW-4970) - [C++][Parquet] Implement parquet::FileMetaData::Equals -* [ARROW-5336](https://issues.apache.org/jira/browse/ARROW-5336) - [C++] Implement arrow::Concatenate for dictionary-encoded arrays with unequal dictionaries -* [ARROW-5350](https://issues.apache.org/jira/browse/ARROW-5350) - [Rust] Support filtering on primitive/string lists -* [ARROW-5394](https://issues.apache.org/jira/browse/ARROW-5394) - [C++] Benchmarks for IsIn Kernel -* [ARROW-5679](https://issues.apache.org/jira/browse/ARROW-5679) - [Python] Drop Python 3.5 from support matrix -* [ARROW-5950](https://issues.apache.org/jira/browse/ARROW-5950) - [Rust] [DataFusion] Add ability to log via logger dependency -* [ARROW-6071](https://issues.apache.org/jira/browse/ARROW-6071) - [C++] Implement casting Binary <-\> LargeBinary -* [ARROW-6697](https://issues.apache.org/jira/browse/ARROW-6697) - [Rust] [DataFusion] Validate that all parquet partitions have the same schema -* [ARROW-6715](https://issues.apache.org/jira/browse/ARROW-6715) - [Website] Describe "non-free" component is needed for Plasma packages in install page -* [ARROW-6883](https://issues.apache.org/jira/browse/ARROW-6883) - [C++] Support sending delta DictionaryBatch or replacement DictionaryBatch in IPC stream writer class -* [ARROW-6995](https://issues.apache.org/jira/browse/ARROW-6995) - [Packaging][Crossbow] The windows conda artifacts are not uploaded to GitHub releases -* [ARROW-7531](https://issues.apache.org/jira/browse/ARROW-7531) - [C++] Investigate header cost reduction -* [ARROW-7800](https://issues.apache.org/jira/browse/ARROW-7800) - [Python] Expose GetRecordBatchReader API in PyArrow -* [ARROW-7842](https://issues.apache.org/jira/browse/ARROW-7842) - [Rust] [Parquet] Implement array reader for list type -* [ARROW-8113](https://issues.apache.org/jira/browse/ARROW-8113) - [C++] Implement a lighter-weight variant -* [ARROW-8199](https://issues.apache.org/jira/browse/ARROW-8199) - [C++] Add support for multi-column sort on Table -* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer -* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata 
when writing parquet -* [ARROW-8425](https://issues.apache.org/jira/browse/ARROW-8425) - [Rust] [Parquet] Add support for writing temporal types -* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types -* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types -* [ARROW-8853](https://issues.apache.org/jira/browse/ARROW-8853) - [Rust] [Integration Testing] Enable Flight tests -* [ARROW-8876](https://issues.apache.org/jira/browse/ARROW-8876) - [C++] Implement casts from date types to Timestamp -* [ARROW-8883](https://issues.apache.org/jira/browse/ARROW-8883) - [Rust] [Integration Testing] Enable passing tests and update spec doc -* [ARROW-9001](https://issues.apache.org/jira/browse/ARROW-9001) - [R] Box outputs as correct type in call\_function -* [ARROW-9164](https://issues.apache.org/jira/browse/ARROW-9164) - [C++] Provide APIs for adding "docstrings" to arrow::compute::Function classes that can be accessed by bindings -* [ARROW-9187](https://issues.apache.org/jira/browse/ARROW-9187) - [R] Add bindings for arithmetic kernels -* [ARROW-9296](https://issues.apache.org/jira/browse/ARROW-9296) - [CI][Rust] Enable more clippy lint checks -* [ARROW-9304](https://issues.apache.org/jira/browse/ARROW-9304) - [C++] Add "AppendEmptyValue" builder APIs for use inside StructBuilder::AppendNull -* [ARROW-9361](https://issues.apache.org/jira/browse/ARROW-9361) - [Rust] Move other array types into their own modules -* [ARROW-9400](https://issues.apache.org/jira/browse/ARROW-9400) - [Python] Do not depend on conda-forge static libraries in Windows wheel builds -* [ARROW-9475](https://issues.apache.org/jira/browse/ARROW-9475) - [Java] Clean up usages of BaseAllocator, use BufferAllocator instead -* [ARROW-9489](https://issues.apache.org/jira/browse/ARROW-9489) - [C++] Add fill\_null kernel implementation for (array[string], scalar[string]) -* [ARROW-9555](https://issues.apache.org/jira/browse/ARROW-9555) - [Rust] [DataFusion] Add inner (hash) equijoin physical plan -* [ARROW-9564](https://issues.apache.org/jira/browse/ARROW-9564) - [Packaging] Vendor r-arrow-feedstock conda-forge recipe -* [ARROW-9674](https://issues.apache.org/jira/browse/ARROW-9674) - [Rust] Parquet reader should implement Send + Sync -* [ARROW-9704](https://issues.apache.org/jira/browse/ARROW-9704) - [Java] TestEndianness.testLittleEndian fails on big endian platform -* [ARROW-9707](https://issues.apache.org/jira/browse/ARROW-9707) - [Rust] [DataFusion] Re-implement threading model -* [ARROW-9709](https://issues.apache.org/jira/browse/ARROW-9709) - [Java] Test cases in arrow-vector assume little-endian platform -* [ARROW-9728](https://issues.apache.org/jira/browse/ARROW-9728) - [Rust] [Parquet] Compute nested definition and repetition for structs -* [ARROW-9747](https://issues.apache.org/jira/browse/ARROW-9747) - [C++][Java][Format] Support Decimal256 Type -* [ARROW-9771](https://issues.apache.org/jira/browse/ARROW-9771) - [Rust] [DataFusion] Predicate Pushdown Improvement: treat predicates separated by AND separately -* [ARROW-9803](https://issues.apache.org/jira/browse/ARROW-9803) - [Go] Add initial support for s390x -* [ARROW-9804](https://issues.apache.org/jira/browse/ARROW-9804) - [FlightRPC] Authentication Redesign -* [ARROW-9828](https://issues.apache.org/jira/browse/ARROW-9828) - [Rust] [DataFusion] TableProvider trait should support predicate push-down -* 
[ARROW-9861](https://issues.apache.org/jira/browse/ARROW-9861) - [Java] Failed Arrow Vector on big-endian platform -* [ARROW-9862](https://issues.apache.org/jira/browse/ARROW-9862) - Throw an exception in UnsafeDirectLittleEndian on Big-Endian platform -* [ARROW-9911](https://issues.apache.org/jira/browse/ARROW-9911) - [Rust][DataFusion] SELECT with no FROM clause should produce a single row of output -* [ARROW-9945](https://issues.apache.org/jira/browse/ARROW-9945) - [C++][Dataset] Refactor Expression::Assume to return a Result -* [ARROW-9991](https://issues.apache.org/jira/browse/ARROW-9991) - [C++] split kernels for strings/binary -* [ARROW-10002](https://issues.apache.org/jira/browse/ARROW-10002) - [Rust] Trait-specialization requires nightly -* [ARROW-10021](https://issues.apache.org/jira/browse/ARROW-10021) - [C++][Compute] Support finding nth frequently used value in mode kernel -* [ARROW-10032](https://issues.apache.org/jira/browse/ARROW-10032) - [Documentation] C++ Windows docs are out of date -* [ARROW-10079](https://issues.apache.org/jira/browse/ARROW-10079) - [Rust]: Benchmark and improve count\_set\_bits function -* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes -* [ARROW-10097](https://issues.apache.org/jira/browse/ARROW-10097) - [C++] Persist SetLookupState in between usages of IsIn when filtering dataset batches -* [ARROW-10106](https://issues.apache.org/jira/browse/ARROW-10106) - [FlightRPC][Java] Expose onIsReady() callback on OutboundStreamListener -* [ARROW-10108](https://issues.apache.org/jira/browse/ARROW-10108) - [Rust] [Parquet] Fix compiler warning about unused return value -* [ARROW-10109](https://issues.apache.org/jira/browse/ARROW-10109) - [Rust] Add support to produce a C Data interface -* [ARROW-10110](https://issues.apache.org/jira/browse/ARROW-10110) - [Rust] Add support to consume C Data Interface -* [ARROW-10131](https://issues.apache.org/jira/browse/ARROW-10131) - [C++][Dataset] Lazily parse parquet metadata / statistics in ParquetDatasetFactory and ParquetFileFragment -* [ARROW-10135](https://issues.apache.org/jira/browse/ARROW-10135) - [Rust] [Parquet] Refactor file module to help adding sources -* [ARROW-10143](https://issues.apache.org/jira/browse/ARROW-10143) - [C++] ArrayRangeEquals should accept EqualOptions -* [ARROW-10144](https://issues.apache.org/jira/browse/ARROW-10144) - [Flight] Add support for using the TLS\_SNI extension -* [ARROW-10149](https://issues.apache.org/jira/browse/ARROW-10149) - [Rust] Add support to external release of un-owned buffers -* [ARROW-10163](https://issues.apache.org/jira/browse/ARROW-10163) - [Rust] [DataFusion] Add DictionaryArray coercion support -* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields -* [ARROW-10173](https://issues.apache.org/jira/browse/ARROW-10173) - [Rust][DataFusion] Improve performance of equality to a constant predicate support -* [ARROW-10180](https://issues.apache.org/jira/browse/ARROW-10180) - [C++][Doc] Update dependency management docs following aws-sdk-cpp addition -* [ARROW-10182](https://issues.apache.org/jira/browse/ARROW-10182) - [C++] Add basic continuation support to futures -* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches -* [ARROW-10197](https://issues.apache.org/jira/browse/ARROW-10197) - [Gandiva][python] Execute expression on filtered data -* 
[ARROW-10203](https://issues.apache.org/jira/browse/ARROW-10203) - [Doc] Capture guidance for endianness support in contributors guide. -* [ARROW-10207](https://issues.apache.org/jira/browse/ARROW-10207) - [C++] Unary kernels that results in a list have no preallocated offset buffer -* [ARROW-10208](https://issues.apache.org/jira/browse/ARROW-10208) - [C++] String split kernels do not propagate nulls correctly on sliced input -* [ARROW-10216](https://issues.apache.org/jira/browse/ARROW-10216) - [Rust] Simd implementation of min/max aggregation kernels for primitive types -* [ARROW-10224](https://issues.apache.org/jira/browse/ARROW-10224) - [Python] Add support for Python 3.9 except macOS wheel and Windows wheel -* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests -* [ARROW-10228](https://issues.apache.org/jira/browse/ARROW-10228) - [Julia] Donate Julia Implementation -* [ARROW-10236](https://issues.apache.org/jira/browse/ARROW-10236) - [Rust] [DataFusion] Make DataFusion casting rules consistent with cast kernel -* [ARROW-10241](https://issues.apache.org/jira/browse/ARROW-10241) - [C++][Compute] Add variance kernel benchmark -* [ARROW-10249](https://issues.apache.org/jira/browse/ARROW-10249) - [Rust]: Support Dictionary types for ListArrays in arrow json reader -* [ARROW-10259](https://issues.apache.org/jira/browse/ARROW-10259) - [Rust] Support field metadata -* [ARROW-10261](https://issues.apache.org/jira/browse/ARROW-10261) - [Rust] [BREAKING] Lists should take Field instead of DataType -* [ARROW-10263](https://issues.apache.org/jira/browse/ARROW-10263) - [C++][Compute] Improve numerical stability of variances merging -* [ARROW-10268](https://issues.apache.org/jira/browse/ARROW-10268) - [Rust] Support writing dictionaries to IPC file and stream -* [ARROW-10269](https://issues.apache.org/jira/browse/ARROW-10269) - [Rust] Update nightly: Oct 2020 Edition -* [ARROW-10277](https://issues.apache.org/jira/browse/ARROW-10277) - [C++] Support comparing scalars approximately -* [ARROW-10289](https://issues.apache.org/jira/browse/ARROW-10289) - [Rust] Support reading dictionary streams -* [ARROW-10292](https://issues.apache.org/jira/browse/ARROW-10292) - [Rust] [DataFusion] Simplify merge -* [ARROW-10295](https://issues.apache.org/jira/browse/ARROW-10295) - [Rust] [DataFusion] Simplify accumulators -* [ARROW-10300](https://issues.apache.org/jira/browse/ARROW-10300) - [Rust] Improve benchmark documentation for generating/converting TPC-H data -* [ARROW-10301](https://issues.apache.org/jira/browse/ARROW-10301) - [C++] Add "all" boolean reducing kernel -* [ARROW-10302](https://issues.apache.org/jira/browse/ARROW-10302) - [Python] Don't double-package plasma-store-server -* [ARROW-10304](https://issues.apache.org/jira/browse/ARROW-10304) - [C++][Compute] Optimize variance kernel for integers -* [ARROW-10310](https://issues.apache.org/jira/browse/ARROW-10310) - [C++][Gandiva] Add single argument round() in Gandiva -* [ARROW-10311](https://issues.apache.org/jira/browse/ARROW-10311) - [Release] Update crossbow verification process -* [ARROW-10313](https://issues.apache.org/jira/browse/ARROW-10313) - [C++] Improve UTF8 validation speed and CSV string conversion -* [ARROW-10318](https://issues.apache.org/jira/browse/ARROW-10318) - [C++] Use pimpl idiom in CSV parser -* [ARROW-10319](https://issues.apache.org/jira/browse/ARROW-10319) - [Flight][Go] Add Context to Client Auth Handler functions for Flight -* 
[ARROW-10320](https://issues.apache.org/jira/browse/ARROW-10320) - [Rust] Convert RecordBatchIterator to a Stream -* [ARROW-10322](https://issues.apache.org/jira/browse/ARROW-10322) - [C++][Dataset] Minimize Expression to a wrapper around compute::Function -* [ARROW-10323](https://issues.apache.org/jira/browse/ARROW-10323) - [Release][wheel] Add missing verification setup step -* [ARROW-10325](https://issues.apache.org/jira/browse/ARROW-10325) - [C++][Compute] Separate aggregate kernel registration -* [ARROW-10328](https://issues.apache.org/jira/browse/ARROW-10328) - [C++] Consider using fast-double-parser -* [ARROW-10330](https://issues.apache.org/jira/browse/ARROW-10330) - [Rust][Datafusion] Implement nullif() function for DataFusion -* [ARROW-10331](https://issues.apache.org/jira/browse/ARROW-10331) - [Rust] [DataFusion] Re-organize errors -* [ARROW-10332](https://issues.apache.org/jira/browse/ARROW-10332) - [Rust] Allow CSV reader to start from a line -* [ARROW-10334](https://issues.apache.org/jira/browse/ARROW-10334) - [Rust] [Parquet] Support reading and writing Arrow NullArray -* [ARROW-10336](https://issues.apache.org/jira/browse/ARROW-10336) - [Rust] Support fromIter and toIter for strings -* [ARROW-10337](https://issues.apache.org/jira/browse/ARROW-10337) - [C++] More liberal parsing of ISO8601 timestamps with fractional seconds -* [ARROW-10338](https://issues.apache.org/jira/browse/ARROW-10338) - [Rust]: Use const fn for applicable methods -* [ARROW-10340](https://issues.apache.org/jira/browse/ARROW-10340) - [Packaging][deb][RPM] Use Python 3.8 for pygit2 -* [ARROW-10356](https://issues.apache.org/jira/browse/ARROW-10356) - [Rust] [DataFusion] Add support for is\_in -* [ARROW-10363](https://issues.apache.org/jira/browse/ARROW-10363) - [Python] Remove workaround for CMake bug in manylinux -* [ARROW-10366](https://issues.apache.org/jira/browse/ARROW-10366) - [Rust] [DataFusion] Remove collect from merge -* [ARROW-10375](https://issues.apache.org/jira/browse/ARROW-10375) - [Rust] Remove PrimitiveArrayOps -* [ARROW-10378](https://issues.apache.org/jira/browse/ARROW-10378) - [Rust] Update take() kernel with support for large lists -* [ARROW-10381](https://issues.apache.org/jira/browse/ARROW-10381) - [Rust] Generalize Arrow to support MergeSort -* [ARROW-10382](https://issues.apache.org/jira/browse/ARROW-10382) - [Rust] Fix typos and spelling -* [ARROW-10383](https://issues.apache.org/jira/browse/ARROW-10383) - [Doc] Fix typos and spelling -* [ARROW-10384](https://issues.apache.org/jira/browse/ARROW-10384) - [C++] Fix typos and spelling -* [ARROW-10385](https://issues.apache.org/jira/browse/ARROW-10385) - [C++][Gandiva] Add support for LLVM 11 -* [ARROW-10389](https://issues.apache.org/jira/browse/ARROW-10389) - [Rust][DataFusion] Make the custom source implementation API more explicit -* [ARROW-10392](https://issues.apache.org/jira/browse/ARROW-10392) - [C++][Gandiva] Avoid string copy while evaluating IN expression -* [ARROW-10396](https://issues.apache.org/jira/browse/ARROW-10396) - [Rust] [Parquet] Expose SliceableCursor and FileSource -* [ARROW-10398](https://issues.apache.org/jira/browse/ARROW-10398) - [Rust] [Parquet] Re-export parquet::record::api::Field -* [ARROW-10400](https://issues.apache.org/jira/browse/ARROW-10400) - Propagate TLS client peer\_identity when using mutual TLS -* [ARROW-10402](https://issues.apache.org/jira/browse/ARROW-10402) - [Rust] Improve array equality -* [ARROW-10407](https://issues.apache.org/jira/browse/ARROW-10407) - [C++] Division Support in 
Decimal256 -* [ARROW-10408](https://issues.apache.org/jira/browse/ARROW-10408) - [Java] Upgrade Avro dependency to 1.10 -* [ARROW-10410](https://issues.apache.org/jira/browse/ARROW-10410) - [Rust] Some refactorings -* [ARROW-10416](https://issues.apache.org/jira/browse/ARROW-10416) - [R] Support Tables in Flight -* [ARROW-10422](https://issues.apache.org/jira/browse/ARROW-10422) - [Rust] Removed unused BinaryArrayBuilder -* [ARROW-10424](https://issues.apache.org/jira/browse/ARROW-10424) - [Rust] Simplify code for impl PrimitiveArray -* [ARROW-10428](https://issues.apache.org/jira/browse/ARROW-10428) - [FlightRPC][Java] Add support for HTTP cookies -* [ARROW-10445](https://issues.apache.org/jira/browse/ARROW-10445) - [Rust] Add DoubleEnded to PrimitiveArrayIter -* [ARROW-10449](https://issues.apache.org/jira/browse/ARROW-10449) - [Rust] Make dictionary keys be a PrimitiveArray -* [ARROW-10454](https://issues.apache.org/jira/browse/ARROW-10454) - [Rust][Datafusion] support creating ParquetExec from externally resolved file list and schema -* [ARROW-10455](https://issues.apache.org/jira/browse/ARROW-10455) - [Rust] Fix CI cache misses on windows -* [ARROW-10458](https://issues.apache.org/jira/browse/ARROW-10458) - [Rust] [Datafusion] context.create\_logical\_plan should not take a mutable self reference -* [ARROW-10464](https://issues.apache.org/jira/browse/ARROW-10464) - [Rust] Implement utility to convert TPC-H tbl files to CSV and Parquet -* [ARROW-10466](https://issues.apache.org/jira/browse/ARROW-10466) - [Rust] [Website] Update implementation status page -* [ARROW-10467](https://issues.apache.org/jira/browse/ARROW-10467) - [FlightRPC][Java] Ability to pass arbitrary client properties to server -* [ARROW-10468](https://issues.apache.org/jira/browse/ARROW-10468) - [C++][Compute] Refactor FunctionExecutor -\> KernelExecutor -* [ARROW-10476](https://issues.apache.org/jira/browse/ARROW-10476) - [Rust] Allow string array to be built from iterator of &str -* [ARROW-10477](https://issues.apache.org/jira/browse/ARROW-10477) - [Rust] Add support for iterators over binary arrays -* [ARROW-10478](https://issues.apache.org/jira/browse/ARROW-10478) - [Dev][Release] Correct Java versions to 3.0.0-SNAPSHOT -* [ARROW-10481](https://issues.apache.org/jira/browse/ARROW-10481) - [R] Bindings to add, remove, replace Table columns -* [ARROW-10483](https://issues.apache.org/jira/browse/ARROW-10483) - [C++] Move Executor into a separate header -* [ARROW-10484](https://issues.apache.org/jira/browse/ARROW-10484) - [C++] Future<{void,Status}\> could be more generic -* [ARROW-10487](https://issues.apache.org/jira/browse/ARROW-10487) - [FlightRPC][C++] Header-based auth in clients -* [ARROW-10490](https://issues.apache.org/jira/browse/ARROW-10490) - [C++][GLib] Fail to build with Xcode 12.0.1 -* [ARROW-10492](https://issues.apache.org/jira/browse/ARROW-10492) - [Java][JDBC] Allow users to config the mapping between SQL types and Arrow types -* [ARROW-10504](https://issues.apache.org/jira/browse/ARROW-10504) - [C++] Suppress UBSAN pointer-overflow warning in RapidJSON -* [ARROW-10510](https://issues.apache.org/jira/browse/ARROW-10510) - [Rust] [DataFusion] Add benchmarks for COUNT(DISTINCT) -* [ARROW-10515](https://issues.apache.org/jira/browse/ARROW-10515) - [Julia][Doc] Update lists of supported languages to include Julia -* [ARROW-10522](https://issues.apache.org/jira/browse/ARROW-10522) - [R] Allow rename Table and RecordBatch columns with names() -* 
[ARROW-10526](https://issues.apache.org/jira/browse/ARROW-10526) - [FlightRPC][C++] HTTP cookie handling in clients -* [ARROW-10530](https://issues.apache.org/jira/browse/ARROW-10530) - [R] Optionally use distro package in linuxlibs.R -* [ARROW-10531](https://issues.apache.org/jira/browse/ARROW-10531) - [Rust] [DataFusion] Better display for logical plans: Graphviz and Schema information -* [ARROW-10539](https://issues.apache.org/jira/browse/ARROW-10539) - [Packaging][Python] Use GitHub Actions to build wheels for Windows -* [ARROW-10540](https://issues.apache.org/jira/browse/ARROW-10540) - [Rust] Allow unary kernels of arbitrary array types -* [ARROW-10541](https://issues.apache.org/jira/browse/ARROW-10541) - [C++] Add re2 library to core arrow / ARROW\_WITH\_RE2 -* [ARROW-10542](https://issues.apache.org/jira/browse/ARROW-10542) - [C\#][Flight] Add beginning on flight code for net core -* [ARROW-10543](https://issues.apache.org/jira/browse/ARROW-10543) - [Developer] Update dev instructions to note there may be a timelag -* [ARROW-10552](https://issues.apache.org/jira/browse/ARROW-10552) - [Rust] Remove un-used Result from Buffer -* [ARROW-10559](https://issues.apache.org/jira/browse/ARROW-10559) - [Rust] [DataFusion] Break up logical\_plan/mod.rs into smaller modules -* [ARROW-10561](https://issues.apache.org/jira/browse/ARROW-10561) - [Rust] Simplify \`MutableBuffer::write\` and \`MutableBuffer::write\_bytes\` -* [ARROW-10562](https://issues.apache.org/jira/browse/ARROW-10562) - [Rust] Potential UB on unsafe code -* [ARROW-10566](https://issues.apache.org/jira/browse/ARROW-10566) - [C++] Array validation should work on ArrayData -* [ARROW-10567](https://issues.apache.org/jira/browse/ARROW-10567) - [C++][FlightRPC] Add options to help increase precision of arrow-flight-benchmark -* [ARROW-10572](https://issues.apache.org/jira/browse/ARROW-10572) - [Rust][DataFusion] Use aHash and std::collections hashmap for aggregates / distinct -* [ARROW-10574](https://issues.apache.org/jira/browse/ARROW-10574) - [Python][Parquet] Allow collections for 'in' / 'not in' filter (in addition to sets) -* [ARROW-10575](https://issues.apache.org/jira/browse/ARROW-10575) - [Rust] Rename union.rs to be cosistent with other arrays -* [ARROW-10581](https://issues.apache.org/jira/browse/ARROW-10581) - [Doc] IPC dictionary reference to relevant section -* [ARROW-10582](https://issues.apache.org/jira/browse/ARROW-10582) - [Rust] [DataFusion] Implement "repartition" operator -* [ARROW-10584](https://issues.apache.org/jira/browse/ARROW-10584) - [Rust] [DataFusion] Implement SQL join support using explicit JOIN ON syntax -* [ARROW-10585](https://issues.apache.org/jira/browse/ARROW-10585) - [Rust] [DataFusion] Add join support to DataFrame and LogicalPlan -* [ARROW-10586](https://issues.apache.org/jira/browse/ARROW-10586) - [Rust] [DataFusion] Add join support to query planner -* [ARROW-10589](https://issues.apache.org/jira/browse/ARROW-10589) - [Rust]: Implement AVX-512 bit and operation -* [ARROW-10590](https://issues.apache.org/jira/browse/ARROW-10590) - [Rust] Remove Date32(Millisecond) from test -* [ARROW-10591](https://issues.apache.org/jira/browse/ARROW-10591) - [Rust] Add support to structArrays for MutableArrayData -* [ARROW-10595](https://issues.apache.org/jira/browse/ARROW-10595) - [Rust] Simplify inner loop of min/max kernels for non-null case -* [ARROW-10596](https://issues.apache.org/jira/browse/ARROW-10596) - [Rust] Improve take benchmark -* [ARROW-10598](https://issues.apache.org/jira/browse/ARROW-10598) 
- [C++] Improve performance of GenerateBitsUnrolled -* [ARROW-10604](https://issues.apache.org/jira/browse/ARROW-10604) - [Ruby] Support Decimal256 type -* [ARROW-10607](https://issues.apache.org/jira/browse/ARROW-10607) - [C++][Parquet] Support Reading/Writing Decimal256 type in Parquet -* [ARROW-10609](https://issues.apache.org/jira/browse/ARROW-10609) - [Rust] Optimize min/max of non null strings -* [ARROW-10628](https://issues.apache.org/jira/browse/ARROW-10628) - [Rust] Make clippy error on clippy warnings -* [ARROW-10633](https://issues.apache.org/jira/browse/ARROW-10633) - [Rust][DataFusion] Dependency version upgrades -* [ARROW-10634](https://issues.apache.org/jira/browse/ARROW-10634) - [C\#][CI] Change the build version from 2.2 to 3.1 in CI -* [ARROW-10636](https://issues.apache.org/jira/browse/ARROW-10636) - [Rust] Remove specialisation from Rust parquet -* [ARROW-10637](https://issues.apache.org/jira/browse/ARROW-10637) - [Rust] Add examples to boolean kernels -* [ARROW-10638](https://issues.apache.org/jira/browse/ARROW-10638) - [Rust] Improve tests of boolean kernels -* [ARROW-10639](https://issues.apache.org/jira/browse/ARROW-10639) - [Rust] Simplify signature of is\_null and add example -* [ARROW-10644](https://issues.apache.org/jira/browse/ARROW-10644) - [Python] Consolidate path/filesystem handling in pyarrow.dataset and pyarrow.fs -* [ARROW-10646](https://issues.apache.org/jira/browse/ARROW-10646) - [C++][FlightRPC] Disable flaky test -* [ARROW-10648](https://issues.apache.org/jira/browse/ARROW-10648) - [Java] Prepare Java codebase for source release without requiring any git tags to be created or pushed -* [ARROW-10651](https://issues.apache.org/jira/browse/ARROW-10651) - [C++] alloc-dealloc-mismatch in s3fs.cc -* [ARROW-10652](https://issues.apache.org/jira/browse/ARROW-10652) - [C++][Gandiva] Make gandiva cache size configurable -* [ARROW-10653](https://issues.apache.org/jira/browse/ARROW-10653) - [Rust]: Update toolchain version to bring new features -* [ARROW-10654](https://issues.apache.org/jira/browse/ARROW-10654) - [Rust] Specialize parsing of floats / bools -* [ARROW-10660](https://issues.apache.org/jira/browse/ARROW-10660) - [Rust] Implement AVX-512 bit or operation -* [ARROW-10665](https://issues.apache.org/jira/browse/ARROW-10665) - [Rust] Add fast paths for common utf8 like patterns -* [ARROW-10666](https://issues.apache.org/jira/browse/ARROW-10666) - [Rust] [DataFusion] Support nested SELECT statements -* [ARROW-10669](https://issues.apache.org/jira/browse/ARROW-10669) - [C++][Compute] Support Scalar inputs to boolean kernels -* [ARROW-10672](https://issues.apache.org/jira/browse/ARROW-10672) - [Rust] [DataFusion] Make limit be computed as a stream -* [ARROW-10673](https://issues.apache.org/jira/browse/ARROW-10673) - [Rust] [DataFusion] Make sort be computed on the stream -* [ARROW-10674](https://issues.apache.org/jira/browse/ARROW-10674) - [Rust] Add integration tests for Decimal type -* [ARROW-10677](https://issues.apache.org/jira/browse/ARROW-10677) - [Rust] Fix Bug and Add tests as documentation showing supported csv parsing -* [ARROW-10679](https://issues.apache.org/jira/browse/ARROW-10679) - [Rust] [DataFusion] Implement SQL CASE WHEN physical expression -* [ARROW-10680](https://issues.apache.org/jira/browse/ARROW-10680) - [Rust] [DataFusion] Implement TPC-H Query 12 -* [ARROW-10682](https://issues.apache.org/jira/browse/ARROW-10682) - [Rust] Sort kernel performance tuning -* [ARROW-10685](https://issues.apache.org/jira/browse/ARROW-10685) - [Rust] 
[DataFusion] Add support for join on filter pushdown optimizer -* [ARROW-10688](https://issues.apache.org/jira/browse/ARROW-10688) - [Rust] [DataFusion] Support CASE WHEN from DataFrame API -* [ARROW-10689](https://issues.apache.org/jira/browse/ARROW-10689) - [Rust] [DataFusion] Support CASE WHEN from SQL -* [ARROW-10693](https://issues.apache.org/jira/browse/ARROW-10693) - [Rust] [DataFusion] Add support for the left join -* [ARROW-10696](https://issues.apache.org/jira/browse/ARROW-10696) - [C++] Investigate a bit run reader that would only return runs of set bits -* [ARROW-10697](https://issues.apache.org/jira/browse/ARROW-10697) - [C++] Consolidate bitmap word readers -* [ARROW-10703](https://issues.apache.org/jira/browse/ARROW-10703) - [Rust] [DataFusion] Make join not collect left on every part -* [ARROW-10704](https://issues.apache.org/jira/browse/ARROW-10704) - [Rust][DataFusion] Remove Nested from expression enum -* [ARROW-10708](https://issues.apache.org/jira/browse/ARROW-10708) - [Packaging][deb] Add support for Ubuntu 20.10 -* [ARROW-10709](https://issues.apache.org/jira/browse/ARROW-10709) - [Python] Difficult to make an efficient zero-copy file reader in Python -* [ARROW-10712](https://issues.apache.org/jira/browse/ARROW-10712) - [Rust] [DataFusion] Add tests to TPC-H benchmarks -* [ARROW-10717](https://issues.apache.org/jira/browse/ARROW-10717) - [Rust] [DataFusion] Add support for right join -* [ARROW-10720](https://issues.apache.org/jira/browse/ARROW-10720) - [C++] Add BasicDecimal256 Rescale Support -* [ARROW-10721](https://issues.apache.org/jira/browse/ARROW-10721) - [C\#][CI] Use .NET 3.1 by default -* [ARROW-10722](https://issues.apache.org/jira/browse/ARROW-10722) - [Rust][DataFusion] Reduce overhead in data types in aggregations / joins, improve benchmarks -* [ARROW-10723](https://issues.apache.org/jira/browse/ARROW-10723) - [Packaging][deb][RPM] Enable Parquet encription -* [ARROW-10724](https://issues.apache.org/jira/browse/ARROW-10724) - [Developer Tools] Add labeler to when PRs need rebase -* [ARROW-10725](https://issues.apache.org/jira/browse/ARROW-10725) - [Python][Compute] Exposing bindings for sort options -* [ARROW-10728](https://issues.apache.org/jira/browse/ARROW-10728) - [Rust] [DataFusion] Add SQL support for JOIN with USING clause -* [ARROW-10729](https://issues.apache.org/jira/browse/ARROW-10729) - [Rust] [DataFusion] Add SQL support for JOIN using implicit syntax -* [ARROW-10732](https://issues.apache.org/jira/browse/ARROW-10732) - [Rust] [DataFusion] Add SQL support for table/relation aliases and compound identifiers -* [ARROW-10733](https://issues.apache.org/jira/browse/ARROW-10733) - [R] Improvements to Linux installation troubleshooting -* [ARROW-10740](https://issues.apache.org/jira/browse/ARROW-10740) - [Rust][DataFusion] Remove redundant clones found by clippy -* [ARROW-10741](https://issues.apache.org/jira/browse/ARROW-10741) - Apply clippy lints to source code, remove them from ignore list -* [ARROW-10742](https://issues.apache.org/jira/browse/ARROW-10742) - [Python] Mask not checked when creating array from numpy array -* [ARROW-10745](https://issues.apache.org/jira/browse/ARROW-10745) - [Rust] Allocate padding bytes in filter context -* [ARROW-10747](https://issues.apache.org/jira/browse/ARROW-10747) - [Rust] Optimizations for csv reader -* [ARROW-10750](https://issues.apache.org/jira/browse/ARROW-10750) - [Rust] [DataFusion] Add SQL support for LEFT and RIGHT join -* [ARROW-10752](https://issues.apache.org/jira/browse/ARROW-10752) - [GLib] 
Add garrow\_schema\_has\_metadata() -* [ARROW-10754](https://issues.apache.org/jira/browse/ARROW-10754) - [GLib] Add support for metadata to GArrowField -* [ARROW-10755](https://issues.apache.org/jira/browse/ARROW-10755) - [Rust] [Parquet] Add support for writing boolean type -* [ARROW-10756](https://issues.apache.org/jira/browse/ARROW-10756) - [Rust] Clippy - fix reduntant clone -* [ARROW-10759](https://issues.apache.org/jira/browse/ARROW-10759) - [Rust][DataFusion] Implement support for casting string to date in sql expressions -* [ARROW-10763](https://issues.apache.org/jira/browse/ARROW-10763) - [Rust] Speed up take kernels -* [ARROW-10765](https://issues.apache.org/jira/browse/ARROW-10765) - [Rust] Optimize take strings for non-null arrays -* [ARROW-10767](https://issues.apache.org/jira/browse/ARROW-10767) - [Rust] Speed up sum kernel with nulls -* [ARROW-10770](https://issues.apache.org/jira/browse/ARROW-10770) - [Rust] Support reading nested JSON lists -* [ARROW-10772](https://issues.apache.org/jira/browse/ARROW-10772) - [Rust] Improve take performance -* [ARROW-10775](https://issues.apache.org/jira/browse/ARROW-10775) - [Rust][DataFusion] Use ahash in hash join -* [ARROW-10776](https://issues.apache.org/jira/browse/ARROW-10776) - [C++] Provide iterator access to primitive elements inside an Array -* [ARROW-10781](https://issues.apache.org/jira/browse/ARROW-10781) - [Rust] [DataFusion] TableProvider should provide row count statistics -* [ARROW-10783](https://issues.apache.org/jira/browse/ARROW-10783) - [Rust] [DataFusion] Implement row count statistics for Parquet TableProvider -* [ARROW-10785](https://issues.apache.org/jira/browse/ARROW-10785) - Further optimize take string -* [ARROW-10786](https://issues.apache.org/jira/browse/ARROW-10786) - [Packaging][RPM] Drop support for CentOS 6 -* [ARROW-10788](https://issues.apache.org/jira/browse/ARROW-10788) - [C++] Make S3 recursive walks parallel -* [ARROW-10789](https://issues.apache.org/jira/browse/ARROW-10789) - [Rust][DataFusion] Make TableProvider dynamically typed -* [ARROW-10790](https://issues.apache.org/jira/browse/ARROW-10790) - [C++][Compute] Investigate ChunkedArray sort performance -* [ARROW-10792](https://issues.apache.org/jira/browse/ARROW-10792) - [Rust] [CI] Modulararize CI for faster and smaller builds -* [ARROW-10795](https://issues.apache.org/jira/browse/ARROW-10795) - [Rust] Fix specialization for arrow datatypes -* [ARROW-10796](https://issues.apache.org/jira/browse/ARROW-10796) - [C++] Investigate RecordBatch sort performance -* [ARROW-10800](https://issues.apache.org/jira/browse/ARROW-10800) - [Rust] [Parquet] Provide access to the elements of parquet::record::{List, Map} -* [ARROW-10802](https://issues.apache.org/jira/browse/ARROW-10802) - [C++] Remove Dictionary[NullType] special casing in parquet column writer -* [ARROW-10808](https://issues.apache.org/jira/browse/ARROW-10808) - [Rust] [DataFusion] Support nested expressions in aggregations -* [ARROW-10809](https://issues.apache.org/jira/browse/ARROW-10809) - [C++] Use Datum for SortIndices() input -* [ARROW-10812](https://issues.apache.org/jira/browse/ARROW-10812) - [Rust] Make BooleanArray not a PrimitiveArray -* [ARROW-10813](https://issues.apache.org/jira/browse/ARROW-10813) - [Rust] [DataFusion] Implement DFSchema -* [ARROW-10814](https://issues.apache.org/jira/browse/ARROW-10814) - [Packaging][deb] Drop support for Debian GNU/Linux Stretch -* [ARROW-10817](https://issues.apache.org/jira/browse/ARROW-10817) - [Rust] [DataFusion] Implement TypedString -* 
[ARROW-10820](https://issues.apache.org/jira/browse/ARROW-10820) - [Rust] [DataFusion] Complete TPC-H Benchmark Queries -* [ARROW-10821](https://issues.apache.org/jira/browse/ARROW-10821) - [Rust] [Datafusion] implement negative expression -* [ARROW-10822](https://issues.apache.org/jira/browse/ARROW-10822) - [Rust] [Datafusion] support compiling datafusion with simd support -* [ARROW-10824](https://issues.apache.org/jira/browse/ARROW-10824) - [Rust] Added PartialEq for NullArray -* [ARROW-10825](https://issues.apache.org/jira/browse/ARROW-10825) - [Rust] Add support to NullArrays for MutableArrayData -* [ARROW-10826](https://issues.apache.org/jira/browse/ARROW-10826) - [Rust] Add support for FixedSizeBinary to MutableArrayData -* [ARROW-10827](https://issues.apache.org/jira/browse/ARROW-10827) - [Rust] Extend concatenate to all types -* [ARROW-10828](https://issues.apache.org/jira/browse/ARROW-10828) - [Rust][DataFusion] Enable more clippy lints -* [ARROW-10829](https://issues.apache.org/jira/browse/ARROW-10829) - [Rust] [DataFusion] Implement Into for DFSchema -* [ARROW-10832](https://issues.apache.org/jira/browse/ARROW-10832) - [Rust] Evaluate latest snapshot flatc -* [ARROW-10836](https://issues.apache.org/jira/browse/ARROW-10836) - [Rust] Extend take kernel to FixedSizeListArray -* [ARROW-10838](https://issues.apache.org/jira/browse/ARROW-10838) - [Rust] [CI] Add CI for wasm32 target -* [ARROW-10839](https://issues.apache.org/jira/browse/ARROW-10839) - [Rust] [DataFusion] Implement BETWEEN Operator -* [ARROW-10843](https://issues.apache.org/jira/browse/ARROW-10843) - [C++] Add support for temporal types in sort family kernels -* [ARROW-10845](https://issues.apache.org/jira/browse/ARROW-10845) - [Python][CI] Add python CI build using numpy nightly -* [ARROW-10849](https://issues.apache.org/jira/browse/ARROW-10849) - [Python] Handle numpy deprecation warnings for builtin type aliases -* [ARROW-10851](https://issues.apache.org/jira/browse/ARROW-10851) - [C++] Reduce code size of vector\_sort.cc -* [ARROW-10857](https://issues.apache.org/jira/browse/ARROW-10857) - [Packaging] Follow PowerTools repository name change on CentOS 8 -* [ARROW-10858](https://issues.apache.org/jira/browse/ARROW-10858) - [C++][MSVC] Add missing Boost dependency -* [ARROW-10861](https://issues.apache.org/jira/browse/ARROW-10861) - [Python] Update minimal NumPy version to 1.16.6 -* [ARROW-10864](https://issues.apache.org/jira/browse/ARROW-10864) - [Rust] Use standard ordering for floats -* [ARROW-10865](https://issues.apache.org/jira/browse/ARROW-10865) - [Rust][DataFusion] More ergonomic conversion between Schema, SchemaRef, DFSchema, and DFSchemaRef -* [ARROW-10867](https://issues.apache.org/jira/browse/ARROW-10867) - build failure on aarch64 with -DARROW\_PYTHON=ON and gcc -* [ARROW-10869](https://issues.apache.org/jira/browse/ARROW-10869) - [GLib] Add garrow\_\*\_sort\_indices() and related options -* [ARROW-10870](https://issues.apache.org/jira/browse/ARROW-10870) - [Julia] Update website with Julia implementation -* [ARROW-10871](https://issues.apache.org/jira/browse/ARROW-10871) - [Julia] Setup Julia CI via GitHub Actions -* [ARROW-10873](https://issues.apache.org/jira/browse/ARROW-10873) - [C++] Apple Silicon is reported as arm64 in CMake -* [ARROW-10874](https://issues.apache.org/jira/browse/ARROW-10874) - [Rust][DataFusion] Add table statistics for MemTable -* [ARROW-10877](https://issues.apache.org/jira/browse/ARROW-10877) - [Rust] [DataFusion] Add benchmark based on kaggle movies -* 
[ARROW-10878](https://issues.apache.org/jira/browse/ARROW-10878) - [Rust] Simplify extend\_from\_slice -* [ARROW-10879](https://issues.apache.org/jira/browse/ARROW-10879) - [Packaging][deb] Restore Debian GNU/Linux Buster configuration -* [ARROW-10881](https://issues.apache.org/jira/browse/ARROW-10881) - [C++] EXC\_BAD\_ACCESS in BaseSetBitRunReader::NextRun -* [ARROW-10885](https://issues.apache.org/jira/browse/ARROW-10885) - [Rust][DataFusion] Optimize join build vs probe based on statistics on row number -* [ARROW-10887](https://issues.apache.org/jira/browse/ARROW-10887) - [C++][Doc] Document IPC API -* [ARROW-10889](https://issues.apache.org/jira/browse/ARROW-10889) - [Rust] Document our approach to unsafe code in README -* [ARROW-10890](https://issues.apache.org/jira/browse/ARROW-10890) - [Rust] [DataFusion] JOIN support -* [ARROW-10891](https://issues.apache.org/jira/browse/ARROW-10891) - [Rust][DataFusion] More clippy lints -* [ARROW-10893](https://issues.apache.org/jira/browse/ARROW-10893) - [Rust] [DataFusion] Easier clippy fixes -* [ARROW-10896](https://issues.apache.org/jira/browse/ARROW-10896) - [C++][CMake] Rename internal RE2 package name to "re2" from "RE2" -* [ARROW-10900](https://issues.apache.org/jira/browse/ARROW-10900) - [Rust][DataFusion] Resolve TableScan provider eagerly -* [ARROW-10904](https://issues.apache.org/jira/browse/ARROW-10904) - [Python] Add support for Python 3.9 macOS wheels -* [ARROW-10905](https://issues.apache.org/jira/browse/ARROW-10905) - [Python] Add support for Python 3.9 windows wheels -* [ARROW-10908](https://issues.apache.org/jira/browse/ARROW-10908) - [Rust] [DataFusion] Update relevant tpch-queries with BETWEEN -* [ARROW-10917](https://issues.apache.org/jira/browse/ARROW-10917) - [Rust][Doc] Update feature matrix -* [ARROW-10918](https://issues.apache.org/jira/browse/ARROW-10918) - [C++][Doc] Document supported Parquet features -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary -* [ARROW-10927](https://issues.apache.org/jira/browse/ARROW-10927) - Add Decimal to ArrayBuilderReader for physical type fixed size binary -* [ARROW-10929](https://issues.apache.org/jira/browse/ARROW-10929) - [Rust] Migrate CI tests to stable rust -* [ARROW-10933](https://issues.apache.org/jira/browse/ARROW-10933) - [Rust] Update docs in regard to stable rust -* [ARROW-10934](https://issues.apache.org/jira/browse/ARROW-10934) - [Python] Tests are failed with fsspec-0.8.5 -* [ARROW-10938](https://issues.apache.org/jira/browse/ARROW-10938) - [Rust] upgrade dependency "flatbuffers" to 0.8 -* [ARROW-10940](https://issues.apache.org/jira/browse/ARROW-10940) - [Rust] Extend sort kernel to ListArray -* [ARROW-10941](https://issues.apache.org/jira/browse/ARROW-10941) - [Doc][C++] Document supported Parquet encryption features -* [ARROW-10944](https://issues.apache.org/jira/browse/ARROW-10944) - [Rust] Implement min/max kernels for BooleanArray -* [ARROW-10946](https://issues.apache.org/jira/browse/ARROW-10946) - [Rust] Make ChunkIter not depend on a buffer -* [ARROW-10947](https://issues.apache.org/jira/browse/ARROW-10947) - [Rust][DataFusion] Refactor UTF8 to Date32 for Performance -* [ARROW-10948](https://issues.apache.org/jira/browse/ARROW-10948) - [C++] Always use GTestConfig.cmake -* 
[ARROW-10949](https://issues.apache.org/jira/browse/ARROW-10949) - [Rust] Avoid clones in getting values of boolean arrays -* [ARROW-10951](https://issues.apache.org/jira/browse/ARROW-10951) - [Python][CI] Nightly pandas builds failing because of pytest monkeypatch issue -* [ARROW-10952](https://issues.apache.org/jira/browse/ARROW-10952) - [Rust] Add pre-commit hook -* [ARROW-10966](https://issues.apache.org/jira/browse/ARROW-10966) - [C++] Use FnOnce for ThreadPool's tasks instead of std::function -* [ARROW-10968](https://issues.apache.org/jira/browse/ARROW-10968) - [Rust][DataFusion] Don't build hash table for right side of the join -* [ARROW-10969](https://issues.apache.org/jira/browse/ARROW-10969) - [Rust][DataFusion] Implement ANSI SQL Functions -* [ARROW-10985](https://issues.apache.org/jira/browse/ARROW-10985) - [Rust] Update unsafe guidelines for adding JIRA references -* [ARROW-10986](https://issues.apache.org/jira/browse/ARROW-10986) - [Rust][DataFusion] Add average statistic to TCP-H benchmark too -* [ARROW-10988](https://issues.apache.org/jira/browse/ARROW-10988) - [C++] Require CMake 3.5 or later -* [ARROW-10989](https://issues.apache.org/jira/browse/ARROW-10989) - [Rust] Use slices for iterating primitive arrays -* [ARROW-10993](https://issues.apache.org/jira/browse/ARROW-10993) - [CI][macOS] Fix Python 3.9 installation by Homebrew -* [ARROW-10995](https://issues.apache.org/jira/browse/ARROW-10995) - [Rust] [DataFusion] Improve parallelism when reading Parquet files -* [ARROW-11004](https://issues.apache.org/jira/browse/ARROW-11004) - [FlightRPC][Python] Header-based auth in clients -* [ARROW-11005](https://issues.apache.org/jira/browse/ARROW-11005) - [Rust] Remove indirection from take kernel and simplify interface -* [ARROW-11008](https://issues.apache.org/jira/browse/ARROW-11008) - [Rust][DataFusion] Simplify count accumulator -* [ARROW-11009](https://issues.apache.org/jira/browse/ARROW-11009) - [Python] Add environment variable to elect default usage of system memory allocator instead of jemalloc/mimalloc -* [ARROW-11010](https://issues.apache.org/jira/browse/ARROW-11010) - [Python] \`np.float\` deprecation warning in \`\_pandas\_logical\_type\_map\` -* [ARROW-11012](https://issues.apache.org/jira/browse/ARROW-11012) - [Rust] [DataFusion] Make write\_csv and write\_parquet concurrent -* [ARROW-11015](https://issues.apache.org/jira/browse/ARROW-11015) - [CI][Gandiva] Move gandiva nightly build from travis to github action -* [ARROW-11018](https://issues.apache.org/jira/browse/ARROW-11018) - [Rust][DataFusion] Add null count column statistics -* [ARROW-11026](https://issues.apache.org/jira/browse/ARROW-11026) - [Rust]: Run tests without requiring environment variables -* [ARROW-11028](https://issues.apache.org/jira/browse/ARROW-11028) - [Rust] Somewhat pedantic pattern-matches -* [ARROW-11029](https://issues.apache.org/jira/browse/ARROW-11029) - [Rust] [DataFusion] Document why join order optimization does not work with filter pushdown -* [ARROW-11032](https://issues.apache.org/jira/browse/ARROW-11032) - [C++][FlightRPC] Add benchmark for local RPC through unix socket -* [ARROW-11033](https://issues.apache.org/jira/browse/ARROW-11033) - [Rust] CSV writer performance improvements -* [ARROW-11034](https://issues.apache.org/jira/browse/ARROW-11034) - [Rust] rustfmt cleanup -* [ARROW-11035](https://issues.apache.org/jira/browse/ARROW-11035) - [Rust] Improve performance of cast to utf8 via FromIter -* [ARROW-11037](https://issues.apache.org/jira/browse/ARROW-11037) - [Rust] 
Improve performance of string fromIter -* [ARROW-11038](https://issues.apache.org/jira/browse/ARROW-11038) - [Rust] Remove \`BufferBuilderTrait\` and associated Result requirement. -* [ARROW-11039](https://issues.apache.org/jira/browse/ARROW-11039) - [Rust] Improve performance for utf8 to float cast -* [ARROW-11040](https://issues.apache.org/jira/browse/ARROW-11040) - [Rust] Simplify builders with generics -* [ARROW-11042](https://issues.apache.org/jira/browse/ARROW-11042) - [Rust][DataFusion] Increase default batch size -* [ARROW-11043](https://issues.apache.org/jira/browse/ARROW-11043) - [C++] Add "is\_nan" kernel -* [ARROW-11046](https://issues.apache.org/jira/browse/ARROW-11046) - [Rust][DataFusion] Add count\_distinct to dataframe API -* [ARROW-11049](https://issues.apache.org/jira/browse/ARROW-11049) - [Python] Expose alternate memory pools -* [ARROW-11052](https://issues.apache.org/jira/browse/ARROW-11052) - [Rust] [DataFusion] Implement metrics in join operator -* [ARROW-11053](https://issues.apache.org/jira/browse/ARROW-11053) - [Rust] [DataFusion] Optimize joins with dynamic capacity for output batches -* [ARROW-11054](https://issues.apache.org/jira/browse/ARROW-11054) - Update SQLParser to 0.70 -* [ARROW-11055](https://issues.apache.org/jira/browse/ARROW-11055) - [Rust] [DataFusion] Support date\_trunc function -* [ARROW-11058](https://issues.apache.org/jira/browse/ARROW-11058) - [Rust] [DataFusion] Implement "coalesce batches" operator -* [ARROW-11063](https://issues.apache.org/jira/browse/ARROW-11063) - [Rust] Validate null counts when building arrays -* [ARROW-11064](https://issues.apache.org/jira/browse/ARROW-11064) - [Rust][DataFusion] Speed up hash join on smaller batches -* [ARROW-11072](https://issues.apache.org/jira/browse/ARROW-11072) - [Rust] [Parquet] Support int32 and int64 physical types -* [ARROW-11076](https://issues.apache.org/jira/browse/ARROW-11076) - [Rust][DataFusion] Refactor usage of right indices in hash join -* [ARROW-11079](https://issues.apache.org/jira/browse/ARROW-11079) - [R] Catch up on changelog since 2.0 -* [ARROW-11080](https://issues.apache.org/jira/browse/ARROW-11080) - [C++][Dataset] Improvements to implicit casting -* [ARROW-11082](https://issues.apache.org/jira/browse/ARROW-11082) - [Rust] Add FFI for LargeUtf8 -* [ARROW-11086](https://issues.apache.org/jira/browse/ARROW-11086) - [Rust] Extend take to support more index types -* [ARROW-11091](https://issues.apache.org/jira/browse/ARROW-11091) - [Rust][DataFusion] Fix clippy warning in rust 1.49 -* [ARROW-11095](https://issues.apache.org/jira/browse/ARROW-11095) - [Python] Access pyarrow.RecordBatch column by name -* [ARROW-11096](https://issues.apache.org/jira/browse/ARROW-11096) - [Rust] Add FFI for [Large]Binary -* [ARROW-11097](https://issues.apache.org/jira/browse/ARROW-11097) - [Rust] Simplify tests -* [ARROW-11099](https://issues.apache.org/jira/browse/ARROW-11099) - [Rust]: Remove unsafe value\_slice method from PrimitiveArray and BooleanArray -* [ARROW-11100](https://issues.apache.org/jira/browse/ARROW-11100) - [Rust] Speed up numeric to string cast using lexical\_core -* [ARROW-11101](https://issues.apache.org/jira/browse/ARROW-11101) - [Rust] enable "cargo +nightly fmt" in git pre-commit hook -* [ARROW-11104](https://issues.apache.org/jira/browse/ARROW-11104) - [GLib] Add append\_null/append\_nulls to GArrowArrayBuilder and use them -* [ARROW-11105](https://issues.apache.org/jira/browse/ARROW-11105) - [Rust] Favor From/Into traits in MutableBuffer -* 
[ARROW-11109](https://issues.apache.org/jira/browse/ARROW-11109) - [GLib] Add garrow\_array\_builder\_append\_empty\_value() and values() -* [ARROW-11110](https://issues.apache.org/jira/browse/ARROW-11110) - [Rust] [Datafusion] context.table should not take a mutable self reference -* [ARROW-11111](https://issues.apache.org/jira/browse/ARROW-11111) - [GLib] Add GArrowFixedSizeBinaryArrayBuilder -* [ARROW-11121](https://issues.apache.org/jira/browse/ARROW-11121) - [Developer] Use pull\_request\_target for PR JIRA integration -* [ARROW-11122](https://issues.apache.org/jira/browse/ARROW-11122) - [Rust] Add FFI for date and time -* [ARROW-11124](https://issues.apache.org/jira/browse/ARROW-11124) - [Doc] Update status matrix for Decimal256 -* [ARROW-11125](https://issues.apache.org/jira/browse/ARROW-11125) - [Rust] Implement logical equality for list arrays -* [ARROW-11126](https://issues.apache.org/jira/browse/ARROW-11126) - [Rust] Document and test ARROW-10656 -* [ARROW-11127](https://issues.apache.org/jira/browse/ARROW-11127) - [C++] Unused cpu\_info on non-x86 architecture -* [ARROW-11129](https://issues.apache.org/jira/browse/ARROW-11129) - [Rust][DataFusion] Use tokio thread pool for loading parquet -* [ARROW-11130](https://issues.apache.org/jira/browse/ARROW-11130) - [Website][CentOS 8][RHEL 8] Enable all required repositories by default -* [ARROW-11131](https://issues.apache.org/jira/browse/ARROW-11131) - [Rust] Improve performance of bool\_equal -* [ARROW-11136](https://issues.apache.org/jira/browse/ARROW-11136) - [R] Bindings for is.nan -* [ARROW-11137](https://issues.apache.org/jira/browse/ARROW-11137) - [Rust][DataFusion] Fix Clippy needless\_range\_loop, needless\_lifetimes -* [ARROW-11138](https://issues.apache.org/jira/browse/ARROW-11138) - [Rust] [DataFusion] Support ltrim, rtrim -* [ARROW-11139](https://issues.apache.org/jira/browse/ARROW-11139) - [GLib] Add support for extension type -* [ARROW-11155](https://issues.apache.org/jira/browse/ARROW-11155) - [C++][Packaging] Move gandiva crossbow jobs off of Travis-CI -* [ARROW-11158](https://issues.apache.org/jira/browse/ARROW-11158) - [Julia] Implement Decimal256 support -* [ARROW-11159](https://issues.apache.org/jira/browse/ARROW-11159) - [Developer] Consolidate pull request related jobs -* [ARROW-11165](https://issues.apache.org/jira/browse/ARROW-11165) - [Rust] [DataFusion] Document the desired SQL dialect for DataFusion -* [ARROW-11168](https://issues.apache.org/jira/browse/ARROW-11168) - [Rust] Fix cargo doc warnings -* [ARROW-11169](https://issues.apache.org/jira/browse/ARROW-11169) - [Rust] Add a comment explaining where float total\_order algorithm came from -* [ARROW-11175](https://issues.apache.org/jira/browse/ARROW-11175) - [R] Small docs fixes -* [ARROW-11176](https://issues.apache.org/jira/browse/ARROW-11176) - [R] Expose memory pool name and document setting it -* [ARROW-11187](https://issues.apache.org/jira/browse/ARROW-11187) - [Rust] [Parquet] Pin specific parquet-format-rs version -* [ARROW-11188](https://issues.apache.org/jira/browse/ARROW-11188) - [Rust] Implement crypto functions from PostgreSQL dialect -* [ARROW-11193](https://issues.apache.org/jira/browse/ARROW-11193) - [Documentation] Add docs for Java ListVector -* [ARROW-11194](https://issues.apache.org/jira/browse/ARROW-11194) - [Rust] Enable SIMD for aarch64 -* [ARROW-11195](https://issues.apache.org/jira/browse/ARROW-11195) - [Rust] [DataFusion] Built-in table providers should expose relevant fields -* 
[ARROW-11196](https://issues.apache.org/jira/browse/ARROW-11196) - [GLib] Add support for mock, HDFS and S3 file systems with factory function -* [ARROW-11198](https://issues.apache.org/jira/browse/ARROW-11198) - [Packaging][Python] Ensure setuptools version during build supports markdown -* [ARROW-11200](https://issues.apache.org/jira/browse/ARROW-11200) - [Rust] [DateFusion] Physical operators and expressions should have public accessor methods -* [ARROW-11201](https://issues.apache.org/jira/browse/ARROW-11201) - [Rust] create\_batch\_empty - support more types -* [ARROW-11203](https://issues.apache.org/jira/browse/ARROW-11203) - [Developer][Website] Enable JIRA and pull request integration -* [ARROW-11204](https://issues.apache.org/jira/browse/ARROW-11204) - [C++] Fix build failure with bundled gRPC and Protobuf -* [ARROW-11205](https://issues.apache.org/jira/browse/ARROW-11205) - [GLib][Dataset] Add GADFileFormat and its family -* [ARROW-11209](https://issues.apache.org/jira/browse/ARROW-11209) - [Rust] DF - Provide better error message on unsupported GROUP BY -* [ARROW-11210](https://issues.apache.org/jira/browse/ARROW-11210) - [CI] Restore workflows that had been blocked by INFRA -* [ARROW-11212](https://issues.apache.org/jira/browse/ARROW-11212) - [Packaging][Python] Use vcpkg as dependency source for manylinux and windows wheels -* [ARROW-11213](https://issues.apache.org/jira/browse/ARROW-11213) - [Packaging][Python] Dockerize wheel building on windows -* [ARROW-11215](https://issues.apache.org/jira/browse/ARROW-11215) - [CI] Use named volumes by default for caching in docker-compose -* [ARROW-11218](https://issues.apache.org/jira/browse/ARROW-11218) - [R] Make SubTreeFileSystem print method more informative -* [ARROW-11219](https://issues.apache.org/jira/browse/ARROW-11219) - [CI][Ruby][MinGW] Reduce CI time -* [ARROW-11221](https://issues.apache.org/jira/browse/ARROW-11221) - [Rust] DF Implement GROUP BY support for Float32/Float64 -* [ARROW-11231](https://issues.apache.org/jira/browse/ARROW-11231) - [Packaging] Add mimalloc to Linux builds -* [ARROW-11234](https://issues.apache.org/jira/browse/ARROW-11234) - [CI][Ruby][macOS] Reduce CI time -* [ARROW-11236](https://issues.apache.org/jira/browse/ARROW-11236) - [Java] Bump Jackson to 2.11.4 -* [ARROW-11240](https://issues.apache.org/jira/browse/ARROW-11240) - [Packaging][R] Add mimalloc to R packaging -* [ARROW-11242](https://issues.apache.org/jira/browse/ARROW-11242) - [CI] Remove CMake 3.2 job -* [ARROW-11245](https://issues.apache.org/jira/browse/ARROW-11245) - [C++][Gandiva] Add support for LLVM 11.1 -* [ARROW-11247](https://issues.apache.org/jira/browse/ARROW-11247) - [C++] Infer date32 columns in CSV -* [ARROW-11256](https://issues.apache.org/jira/browse/ARROW-11256) - [Packaging][Linux] Don't buffer packaging output -* [ARROW-11272](https://issues.apache.org/jira/browse/ARROW-11272) - [Release][wheel] Remove unsupported Python 3.5 and manylinux1 -* [ARROW-11273](https://issues.apache.org/jira/browse/ARROW-11273) - [Release][deb] Remove unsupported Debian GNU/Linux stretch -* [ARROW-11278](https://issues.apache.org/jira/browse/ARROW-11278) - [Release][NodeJS] Don't touch \~/.bash\_profile -* [ARROW-11280](https://issues.apache.org/jira/browse/ARROW-11280) - [Release][APT] Fix minimal build example check -* [ARROW-11281](https://issues.apache.org/jira/browse/ARROW-11281) - [C++] Remove needless runtime RapidJSON dependency -* [ARROW-11282](https://issues.apache.org/jira/browse/ARROW-11282) - [Packaging][deb] Add missing 
libgflags-dev dependency -* [ARROW-11285](https://issues.apache.org/jira/browse/ARROW-11285) - [Release][APT] Add support for Ubuntu Groovy -* [ARROW-11292](https://issues.apache.org/jira/browse/ARROW-11292) - [Release][JS] Use Node.JS LTS -* [ARROW-11293](https://issues.apache.org/jira/browse/ARROW-11293) - [C++] Don't require Boost and gflags with find\_package(Arrow) -* [ARROW-11307](https://issues.apache.org/jira/browse/ARROW-11307) - [Release][Ubuntu][20.10] Add workaround for dependency issue -* [PARQUET-1566](https://issues.apache.org/jira/browse/PARQUET-1566) - [C++] Indicate if null count, distinct count are present in column statistics - - -## Bug Fixes - -* [ARROW-2616](https://issues.apache.org/jira/browse/ARROW-2616) - [Python] Cross-compiling Pyarrow -* [ARROW-6582](https://issues.apache.org/jira/browse/ARROW-6582) - [R] Arrow to R fails with embedded nuls in strings -* [ARROW-7363](https://issues.apache.org/jira/browse/ARROW-7363) - [Python] Add combine\_chunks method to ChunkedArray -* [ARROW-7909](https://issues.apache.org/jira/browse/ARROW-7909) - [Website] Add how to install on Red Hat Enterprise Linux -* [ARROW-8258](https://issues.apache.org/jira/browse/ARROW-8258) - [Rust] [Parquet] ArrowReader fails on some timestamp types -* [ARROW-9027](https://issues.apache.org/jira/browse/ARROW-9027) - [Python] Split in multiple files + clean-up pyarrow.parquet tests -* [ARROW-9479](https://issues.apache.org/jira/browse/ARROW-9479) - [JS] Table.from fails for zero-item Lists, FixedSizeLists, Maps. ditto Table.empty -* [ARROW-9636](https://issues.apache.org/jira/browse/ARROW-9636) - [Python] Update documentation about 'LZO' compression in parquet.write\_table -* [ARROW-9776](https://issues.apache.org/jira/browse/ARROW-9776) - [R] read\_feather causes segfault in R if file doesn't exist -* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern -* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern -* [ARROW-9898](https://issues.apache.org/jira/browse/ARROW-9898) - [C++][Gandiva] Error handling in castINT fails in some enviroments -* [ARROW-9903](https://issues.apache.org/jira/browse/ARROW-9903) - [R] open\_dataset freezes opening feather files on Windows -* [ARROW-9963](https://issues.apache.org/jira/browse/ARROW-9963) - [Python] Recognize datetime.timezone.utc as UTC on conversion python-\>pyarrow -* [ARROW-10039](https://issues.apache.org/jira/browse/ARROW-10039) - [Rust] Do not require memory alignment of buffers -* [ARROW-10042](https://issues.apache.org/jira/browse/ARROW-10042) - [Rust] Buffer equalities may be incorrect -* [ARROW-10080](https://issues.apache.org/jira/browse/ARROW-10080) - [R] Arrow does not release unused memory -* [ARROW-10122](https://issues.apache.org/jira/browse/ARROW-10122) - [Python] Selecting one column of multi-index results in a duplicated value column. 
-* [ARROW-10145](https://issues.apache.org/jira/browse/ARROW-10145) - [C++][Dataset] Assert integer overflow in partitioning falls back to string -* [ARROW-10146](https://issues.apache.org/jira/browse/ARROW-10146) - [Python] Parquet metadata to\_dict raises attribute error -* [ARROW-10174](https://issues.apache.org/jira/browse/ARROW-10174) - [Java] Reading of Dictionary encoded struct vector fails -* [ARROW-10177](https://issues.apache.org/jira/browse/ARROW-10177) - [CI][Gandiva] Nightly gandiva-jar-xenial fails -* [ARROW-10186](https://issues.apache.org/jira/browse/ARROW-10186) - [Rust] Tests fail when following instructions in README -* [ARROW-10247](https://issues.apache.org/jira/browse/ARROW-10247) - [C++][Dataset] Cannot write dataset with dictionary column as partition field -* [ARROW-10264](https://issues.apache.org/jira/browse/ARROW-10264) - [C++][Python] Parquet test failing with HadoopFileSystem URI -* [ARROW-10270](https://issues.apache.org/jira/browse/ARROW-10270) - [R] Fix CSV timestamp\_parsers test on R-devel -* [ARROW-10283](https://issues.apache.org/jira/browse/ARROW-10283) - [Python] Python deprecation warning for "PY\_SSIZE\_T\_CLEAN will be required for '\#' formats" -* [ARROW-10293](https://issues.apache.org/jira/browse/ARROW-10293) - [Rust] [DataFusion] Fix benchmarks -* [ARROW-10294](https://issues.apache.org/jira/browse/ARROW-10294) - [Java] Resolve problems of DecimalVector APIs on ArrowBufs -* [ARROW-10321](https://issues.apache.org/jira/browse/ARROW-10321) - [C++] Building AVX512 code when we should not -* [ARROW-10333](https://issues.apache.org/jira/browse/ARROW-10333) - [Java] Remove split packages in arrow-memory-core and arrow-vectors -* [ARROW-10345](https://issues.apache.org/jira/browse/ARROW-10345) - [C++] NaN breaks sorting -* [ARROW-10346](https://issues.apache.org/jira/browse/ARROW-10346) - [Python] Default S3 region is eu-central-1 even with LANG=C -* [ARROW-10348](https://issues.apache.org/jira/browse/ARROW-10348) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz) -* [ARROW-10350](https://issues.apache.org/jira/browse/ARROW-10350) - [Rust] parquet\_derive crate cannot be published to crates.io -* [ARROW-10353](https://issues.apache.org/jira/browse/ARROW-10353) - [C++] Parquet decompresses DataPageV2 pages even if is\_compressed==0 -* [ARROW-10358](https://issues.apache.org/jira/browse/ARROW-10358) - [R] Followups to 2.0.0 release -* [ARROW-10365](https://issues.apache.org/jira/browse/ARROW-10365) - [R] Remove duplicate setting of S3 flag on macOS -* [ARROW-10369](https://issues.apache.org/jira/browse/ARROW-10369) - [Dev] Fix archery release utility test cases -* [ARROW-10371](https://issues.apache.org/jira/browse/ARROW-10371) - [R] Linux system requirements check needs to support older cmake versions -* [ARROW-10386](https://issues.apache.org/jira/browse/ARROW-10386) - [R] List column class attributes not preserved in roundtrip -* [ARROW-10388](https://issues.apache.org/jira/browse/ARROW-10388) - [Java] Fix Spark integration build failure -* [ARROW-10390](https://issues.apache.org/jira/browse/ARROW-10390) - [Rust] [Parquet] Regression Can not implement custom ParquetWriter because \`TryClone\` is not publically exported -* [ARROW-10393](https://issues.apache.org/jira/browse/ARROW-10393) - [Rust]: Fix null value reading in jsonreader for both dictionary and stringbuilders -* [ARROW-10394](https://issues.apache.org/jira/browse/ARROW-10394) - [Rust] [Large]BinaryArray can be created from non-binary datatypes -* 
[ARROW-10397](https://issues.apache.org/jira/browse/ARROW-10397) - [C++] Outdated and confusing comment on dictionary indices -* [ARROW-10399](https://issues.apache.org/jira/browse/ARROW-10399) - [R] Fix performance regression from cpp11::r\_string -* [ARROW-10411](https://issues.apache.org/jira/browse/ARROW-10411) - [C++] Fix incorrect child array lengths for Concatenate of FixedSizeList -* [ARROW-10412](https://issues.apache.org/jira/browse/ARROW-10412) - [C++] CMake Build Fails with grpc 1.33.1, "GRPC\_CPP\_PLUGIN-NOTFOUND: program not found or is not executable" -* [ARROW-10413](https://issues.apache.org/jira/browse/ARROW-10413) - [Rust] [Parquet] Unignore some roundtrip tests that are passing now -* [ARROW-10414](https://issues.apache.org/jira/browse/ARROW-10414) - [R] open\_dataset doesn't work with absolute/expanded paths on Windows -* [ARROW-10426](https://issues.apache.org/jira/browse/ARROW-10426) - [C++] Arrow type large\_string cannot be written to Parquet type column descriptor -* [ARROW-10433](https://issues.apache.org/jira/browse/ARROW-10433) - [Python] pyarrow doesn't work with s3fs\>=0.5 -* [ARROW-10434](https://issues.apache.org/jira/browse/ARROW-10434) - [Rust] Debug formatting arrays with lengths greater than 10 and less than 20 produces incorrect values -* [ARROW-10441](https://issues.apache.org/jira/browse/ARROW-10441) - [FlightRPC][Java] FlightClients from FlightGrpcUtils\#createFlightClient shutdown gRPC channel when closed -* [ARROW-10446](https://issues.apache.org/jira/browse/ARROW-10446) - [C++][Python] Timezone aware pd.Timestamp's are incorrectly converted to Timestamp arrys -* [ARROW-10448](https://issues.apache.org/jira/browse/ARROW-10448) - [Rust] PrimitiveArray::new can create arrays not in spec -* [ARROW-10453](https://issues.apache.org/jira/browse/ARROW-10453) - [Rust] [DataFusion] Performance degredation after removing specialization -* [ARROW-10457](https://issues.apache.org/jira/browse/ARROW-10457) - [CI] Fix Spark branch-3.0 integration tests -* [ARROW-10461](https://issues.apache.org/jira/browse/ARROW-10461) - [Rust] Offset related bug in BitChunks::remainder\_bits -* [ARROW-10462](https://issues.apache.org/jira/browse/ARROW-10462) - [Python] ParquetDatasetPiece's path broken when using fsspec fs on Windows -* [ARROW-10463](https://issues.apache.org/jira/browse/ARROW-10463) - [R] Better messaging for currently unsupported CSV options in open\_dataset -* [ARROW-10470](https://issues.apache.org/jira/browse/ARROW-10470) - [R] Fix missing file error causing NYC taxi example to fail -* [ARROW-10471](https://issues.apache.org/jira/browse/ARROW-10471) - [CI][Python] Ensure we have tests with s3fs and run those on CI -* [ARROW-10472](https://issues.apache.org/jira/browse/ARROW-10472) - [C++][Python] casting a scalar timestamp to date32 results in Aborted (core dump) -* [ARROW-10475](https://issues.apache.org/jira/browse/ARROW-10475) - [С++][FlightRPC] Arrow Flight Server / Client cannot be initialized with Ipv6 host -* [ARROW-10480](https://issues.apache.org/jira/browse/ARROW-10480) - [Python] Parquet write\_table creates gzipped Parquet file, not Parquet with gzip compression -* [ARROW-10482](https://issues.apache.org/jira/browse/ARROW-10482) - [Python] Specifying compression type on a column basis when writing Parquet not working -* [ARROW-10489](https://issues.apache.org/jira/browse/ARROW-10489) - [C++] Unable to configure or make with intel compiler -* [ARROW-10491](https://issues.apache.org/jira/browse/ARROW-10491) - [FlightRPC][Java] Fix NPE when using 
FlightProducer without interceptors -* [ARROW-10493](https://issues.apache.org/jira/browse/ARROW-10493) - [C++][Parquet] Writing nullable nested strings results in wrong data in file -* [ARROW-10495](https://issues.apache.org/jira/browse/ARROW-10495) - [C++] find\_package(Arrow) is broken on Ubuntu 18 -* [ARROW-10496](https://issues.apache.org/jira/browse/ARROW-10496) - [R][CI] Fix conda-r job -* [ARROW-10499](https://issues.apache.org/jira/browse/ARROW-10499) - [C++][Java] Fix ORC Java JNI Crash -* [ARROW-10502](https://issues.apache.org/jira/browse/ARROW-10502) - [C++/Python] CUDA detection messes up nightly conda-win builds -* [ARROW-10503](https://issues.apache.org/jira/browse/ARROW-10503) - [C++] Uriparser will not compile using Intel compiler -* [ARROW-10508](https://issues.apache.org/jira/browse/ARROW-10508) - [Java] Allow FixedSizeListVector to have empty children -* [ARROW-10509](https://issues.apache.org/jira/browse/ARROW-10509) - [C++] Define operator<<(ostream, ParquetException) for clang+Windows -* [ARROW-10511](https://issues.apache.org/jira/browse/ARROW-10511) - [Python] Table.to\_pandas() failing when timezone-awareness mismatch in metadata -* [ARROW-10518](https://issues.apache.org/jira/browse/ARROW-10518) - Fix cast function issues in gandiva -* [ARROW-10519](https://issues.apache.org/jira/browse/ARROW-10519) - [Python] Deadlock when PyArrow imports Pandas from multiple threads -* [ARROW-10525](https://issues.apache.org/jira/browse/ARROW-10525) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz) -* [ARROW-10532](https://issues.apache.org/jira/browse/ARROW-10532) - [Python] Mangled pandas\_metadata when specified schema has different order as DataFrame columns -* [ARROW-10545](https://issues.apache.org/jira/browse/ARROW-10545) - [C++] Fix crash on invalid Parquet file (OSS-Fuzz) -* [ARROW-10546](https://issues.apache.org/jira/browse/ARROW-10546) - [Python] Deprecate the S3FSWrapper class -* [ARROW-10547](https://issues.apache.org/jira/browse/ARROW-10547) - [Rust][DataFusion] Filter pushdown loses filters if below a user defined node -* [ARROW-10551](https://issues.apache.org/jira/browse/ARROW-10551) - [Rust]: Fix unreproducible benchmarks -* [ARROW-10558](https://issues.apache.org/jira/browse/ARROW-10558) - [Python] Filesystem S3 tests not independent (native s3 influences s3fs) -* [ARROW-10560](https://issues.apache.org/jira/browse/ARROW-10560) - [Python] Crash when creating array with string over 2GB -* [ARROW-10563](https://issues.apache.org/jira/browse/ARROW-10563) - [Packaging][C++] CMake find\_package(Arrow 2.0 CONFIG REQUIRED) broken -* [ARROW-10565](https://issues.apache.org/jira/browse/ARROW-10565) - [Python] Table.from\_batches and Table.from\_pandas have argument Schema\_schema in documentation instead of schema -* [ARROW-10568](https://issues.apache.org/jira/browse/ARROW-10568) - [C++][Parquet] Parquet writer crashes process when Tell() does not succeed -* [ARROW-10569](https://issues.apache.org/jira/browse/ARROW-10569) - [C++][Python] Poor Table filtering performance -* [ARROW-10577](https://issues.apache.org/jira/browse/ARROW-10577) - [Rust][DataFusion] Hash Aggregator stream finishes unexpectedly after going to Pending state -* [ARROW-10578](https://issues.apache.org/jira/browse/ARROW-10578) - [C++] Comparison kernels crashing for string array with null string scalar -* [ARROW-10610](https://issues.apache.org/jira/browse/ARROW-10610) - [C++] arrow-utility-test and arrow-csv-test causes failures on a big-endian platform -* 
[ARROW-10616](https://issues.apache.org/jira/browse/ARROW-10616) - [Developer] Expand PR labeler to all supported languages -* [ARROW-10617](https://issues.apache.org/jira/browse/ARROW-10617) - [Python] RecordBatchStreamReader's iterator doesn't work with python 3.8 -* [ARROW-10619](https://issues.apache.org/jira/browse/ARROW-10619) - [C++] Fix crash on unsupported IPC stream (OSS-Fuzz) -* [ARROW-10620](https://issues.apache.org/jira/browse/ARROW-10620) - [Rust][Parquet] move column chunk range logic to metadata.rs -* [ARROW-10621](https://issues.apache.org/jira/browse/ARROW-10621) - [Java] flight-cpre test causes a failure on s390x -* [ARROW-10622](https://issues.apache.org/jira/browse/ARROW-10622) - [R] Nameof<\>() is incorrect in r-arrow build environment -* [ARROW-10623](https://issues.apache.org/jira/browse/ARROW-10623) - [R] Version 1.0.1 breaks data.frame attributes when reading file written by 2.0.0 -* [ARROW-10624](https://issues.apache.org/jira/browse/ARROW-10624) - [R] Proactively remove "problems" attributes -* [ARROW-10627](https://issues.apache.org/jira/browse/ARROW-10627) - [Rust] Github master does not compile for WASM target -* [ARROW-10629](https://issues.apache.org/jira/browse/ARROW-10629) - [CI] MinGW builds broken on Github Actions -* [ARROW-10631](https://issues.apache.org/jira/browse/ARROW-10631) - [Rust] Equality of fixed-sized binary is incorrect. -* [ARROW-10642](https://issues.apache.org/jira/browse/ARROW-10642) - [R] Can't get Table from RecordBatchReader with 0 batches -* [ARROW-10656](https://issues.apache.org/jira/browse/ARROW-10656) - [Rust] New RecordBatch requires exact match of Data Types -* [ARROW-10656](https://issues.apache.org/jira/browse/ARROW-10656) - [Rust] New RecordBatch requires exact match of Data Types -* [ARROW-10661](https://issues.apache.org/jira/browse/ARROW-10661) - [C\#] Fix benchmarking project -* [ARROW-10662](https://issues.apache.org/jira/browse/ARROW-10662) - [Java] Avoid integer overflow for Json file reader -* [ARROW-10663](https://issues.apache.org/jira/browse/ARROW-10663) - [C++/Doc] The IsIn kernel ignores the skip\_nulls option of SetLookupOptions -* [ARROW-10667](https://issues.apache.org/jira/browse/ARROW-10667) - [Rust] [Parquet] Add a convenience type for writing Parquet to memory -* [ARROW-10668](https://issues.apache.org/jira/browse/ARROW-10668) - [R] Filtering does not work with .data pronoun -* [ARROW-10681](https://issues.apache.org/jira/browse/ARROW-10681) - [Rust] [DataFusion] TPC-H Query 12 fails with scheduler error -* [ARROW-10684](https://issues.apache.org/jira/browse/ARROW-10684) - [Rust] Logical equality should consider parent array nullability -* [ARROW-10690](https://issues.apache.org/jira/browse/ARROW-10690) - [Java] ComplexCopier gives incorrect result for list vector if target vector is non-empty -* [ARROW-10692](https://issues.apache.org/jira/browse/ARROW-10692) - [Rust] Segfault while array buffer append -* [ARROW-10699](https://issues.apache.org/jira/browse/ARROW-10699) - [C++] BitmapUInt64Reader doesn't work on big-endian -* [ARROW-10701](https://issues.apache.org/jira/browse/ARROW-10701) - [Rust] [Datafusion] Benchmark sort\_limit\_query\_sql fails because order by clause specifies column index instead of expression -* [ARROW-10705](https://issues.apache.org/jira/browse/ARROW-10705) - [Rust] Lifetime annotations in the IPC writer are too strict, preventing code reuse -* [ARROW-10710](https://issues.apache.org/jira/browse/ARROW-10710) - [Rust] Example flight server is broken after tokio upgrade (among 
other things) -* [ARROW-10711](https://issues.apache.org/jira/browse/ARROW-10711) - [CI] Remove set-env from auto-tune to work with new GHA settings -* [ARROW-10719](https://issues.apache.org/jira/browse/ARROW-10719) - [C\#] ArrowStreamWriter doesn't write schema metadata -* [ARROW-10746](https://issues.apache.org/jira/browse/ARROW-10746) - [C++] Use GTEST\_SKIP in parquet encoding tests -* [ARROW-10748](https://issues.apache.org/jira/browse/ARROW-10748) - [Java] TimeStampMilliVector cannot be cast to TimeStampMilliTZVector -* [ARROW-10749](https://issues.apache.org/jira/browse/ARROW-10749) - [C++] Incorrect string format for Datum with the collection type -* [ARROW-10751](https://issues.apache.org/jira/browse/ARROW-10751) - [C++] Add RE2 to minimal build example -* [ARROW-10753](https://issues.apache.org/jira/browse/ARROW-10753) - [Rust] [DataFusion] Negative numbers in SQL WHERE clause not parsed correctly -* [ARROW-10757](https://issues.apache.org/jira/browse/ARROW-10757) - [Rust] [CI] Sporadic failures due to disk filling up -* [ARROW-10760](https://issues.apache.org/jira/browse/ARROW-10760) - [Rust] [DataFusion] Predicate push down does not support joins correctly -* [ARROW-10769](https://issues.apache.org/jira/browse/ARROW-10769) - [CI] Integration tests are failing in master -* [ARROW-10774](https://issues.apache.org/jira/browse/ARROW-10774) - [R] Set minimum cpp11 version -* [ARROW-10777](https://issues.apache.org/jira/browse/ARROW-10777) - [Packaging][Python] PyPI pyarrow source dist (sdist) contains architecture dependent binaries -* [ARROW-10778](https://issues.apache.org/jira/browse/ARROW-10778) - [Python] RowGroupInfo.statistics errors for empty row group -* [ARROW-10779](https://issues.apache.org/jira/browse/ARROW-10779) - [Java] writeNull method in UnionListWriter doesn't work correctly if validity at that index is already set -* [ARROW-10780](https://issues.apache.org/jira/browse/ARROW-10780) - [R] Update known R installation issues for CentOS 7 -* [ARROW-10791](https://issues.apache.org/jira/browse/ARROW-10791) - [Rust] StreamReader, read\_dictionary duplicating schema info -* [ARROW-10801](https://issues.apache.org/jira/browse/ARROW-10801) - [Rust] [Flight] Support sending FlightData for Dictionaries with that of a RecordBatch -* [ARROW-10803](https://issues.apache.org/jira/browse/ARROW-10803) - [R] Support R \>= 3.3 and add CI -* [ARROW-10804](https://issues.apache.org/jira/browse/ARROW-10804) - [Rust] Remove UB on parquet crate -* [ARROW-10807](https://issues.apache.org/jira/browse/ARROW-10807) - [Rust][DataFusion] Avoid double hashing -* [ARROW-10810](https://issues.apache.org/jira/browse/ARROW-10810) - [Rust] Speed up comparison kernels -* [ARROW-10811](https://issues.apache.org/jira/browse/ARROW-10811) - [R][CI] Remove nightly centos6 build -* [ARROW-10823](https://issues.apache.org/jira/browse/ARROW-10823) - MutableArrayData with use\_null false yields wrong results -* [ARROW-10830](https://issues.apache.org/jira/browse/ARROW-10830) - [Rust] json reader should not hard crash on invalid json -* [ARROW-10833](https://issues.apache.org/jira/browse/ARROW-10833) - [Python] Avoid usage of NumPy's PyArray\_DescrCheck macro -* [ARROW-10834](https://issues.apache.org/jira/browse/ARROW-10834) - [R] Fix print method for SubTreeFileSystem -* [ARROW-10837](https://issues.apache.org/jira/browse/ARROW-10837) - [Rust] Use \`Vec\` for hash key instead -* [ARROW-10840](https://issues.apache.org/jira/browse/ARROW-10840) - [C++] Parquet FileMetaData does not have key\_value\_metadata 
when built from FileMetaDataBuilder -* [ARROW-10842](https://issues.apache.org/jira/browse/ARROW-10842) - [Rust] decouple IO from json schema inference code -* [ARROW-10844](https://issues.apache.org/jira/browse/ARROW-10844) - [Rust] [DataFusion] join of two DataFrames is not possible -* [ARROW-10850](https://issues.apache.org/jira/browse/ARROW-10850) - [R] Unrecognized compression type: LZ4 -* [ARROW-10852](https://issues.apache.org/jira/browse/ARROW-10852) - [C++] AssertTablesEqual(verbose=true) segfaults if the left array has more rows -* [ARROW-10854](https://issues.apache.org/jira/browse/ARROW-10854) - [Rust] [DataFusion] Simplified logical scans -* [ARROW-10855](https://issues.apache.org/jira/browse/ARROW-10855) - [Python][Numpy] ArrowTypeError after upgrading NumPy to 1.20.0rc1 -* [ARROW-10856](https://issues.apache.org/jira/browse/ARROW-10856) - [R] CentOS 7 not correctly identifying compiler version -* [ARROW-10859](https://issues.apache.org/jira/browse/ARROW-10859) - [Rust] [DataFusion] Make collect not require ExecutionContext -* [ARROW-10860](https://issues.apache.org/jira/browse/ARROW-10860) - [Java] Avoid integer overflow for generated classes in Vector -* [ARROW-10863](https://issues.apache.org/jira/browse/ARROW-10863) - [Python] ExtensionArray.to\_pandas not working -* [ARROW-10863](https://issues.apache.org/jira/browse/ARROW-10863) - [Python] ExtensionArray.to\_pandas not working -* [ARROW-10875](https://issues.apache.org/jira/browse/ARROW-10875) - simplify simd cfg check -* [ARROW-10876](https://issues.apache.org/jira/browse/ARROW-10876) - [Rust] json reader should validate value type -* [ARROW-10897](https://issues.apache.org/jira/browse/ARROW-10897) - [Rust] Replace Arc by String in DataType::Timestamp -* [ARROW-10907](https://issues.apache.org/jira/browse/ARROW-10907) - [Rust] Cast UTF8 to Date64 Incorrect -* [ARROW-10913](https://issues.apache.org/jira/browse/ARROW-10913) - [Python][Doc] Code block typo in filesystems docs -* [ARROW-10914](https://issues.apache.org/jira/browse/ARROW-10914) - [Rust]: SIMD implementation of arithmetic kernels reads out of bounds -* [ARROW-10915](https://issues.apache.org/jira/browse/ARROW-10915) - [Rust] Make ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA absolute dirs -* [ARROW-10921](https://issues.apache.org/jira/browse/ARROW-10921) - \`TypeError: 'coroutine' object is not iterable\` when reading parquet partitions via s3fs \>= 0.5 with pyarrow -* [ARROW-10930](https://issues.apache.org/jira/browse/ARROW-10930) - [Python] LargeListType doesn't have a value\_field -* [ARROW-10932](https://issues.apache.org/jira/browse/ARROW-10932) - [C++] BinaryMemoTable::CopyOffsets access out-of-bound address when data is empty -* [ARROW-10932](https://issues.apache.org/jira/browse/ARROW-10932) - [C++] BinaryMemoTable::CopyOffsets access out-of-bound address when data is empty -* [ARROW-10942](https://issues.apache.org/jira/browse/ARROW-10942) - [C++] S3FileSystem::Impl::IsEmptyDirectory fails on Amazon S3 -* [ARROW-10943](https://issues.apache.org/jira/browse/ARROW-10943) - [Rust] Intermittent build failure in parquet encoding -* [ARROW-10954](https://issues.apache.org/jira/browse/ARROW-10954) - [C++][Doc] PlasmaClient is threadSafe now, doc not update -* [ARROW-10955](https://issues.apache.org/jira/browse/ARROW-10955) - [C++] Reading empty json lists results in invalid non-nullable null type -* [ARROW-10960](https://issues.apache.org/jira/browse/ARROW-10960) - [C++][FlightRPC] Missing protobuf data\_body should result in default value of empty bytes, 
not null -* [ARROW-10962](https://issues.apache.org/jira/browse/ARROW-10962) - [Java][FlightRPC] FlightData deserializer should accept missing fields -* [ARROW-10967](https://issues.apache.org/jira/browse/ARROW-10967) - [Rust] Make env vars ARROW\_TEST\_DATA and PARQUET\_TEST\_DATA optional -* [ARROW-10990](https://issues.apache.org/jira/browse/ARROW-10990) - [Rust]: SIMD implementation of compare kernels reads out of bounds -* [ARROW-10994](https://issues.apache.org/jira/browse/ARROW-10994) - [Rust] Fix bugs in TPC-H file conversion -* [ARROW-10996](https://issues.apache.org/jira/browse/ARROW-10996) - [Rust] Return error messages via Result for get\_arrow\_schema\_from\_metadata -* [ARROW-10999](https://issues.apache.org/jira/browse/ARROW-10999) - [Rust] TPC-H parquet files cannot be read by Apache Spark -* [ARROW-11014](https://issues.apache.org/jira/browse/ARROW-11014) - [Rust] [DataFusion] ParquetExec reports incorrect statistics -* [ARROW-11023](https://issues.apache.org/jira/browse/ARROW-11023) - [C++][CMake] gRPC doesn't respect CMAKE\_CXX\_COMPILER -* [ARROW-11024](https://issues.apache.org/jira/browse/ARROW-11024) - [C++][Parquet] Writing List to parquet sometimes writes wrong data -* [ARROW-11025](https://issues.apache.org/jira/browse/ARROW-11025) - [Rust] Bench for boolean kernels measure array creation -* [ARROW-11030](https://issues.apache.org/jira/browse/ARROW-11030) - [Rust] [DataFusion] HashJoinExec slow with many batches -* [ARROW-11048](https://issues.apache.org/jira/browse/ARROW-11048) - [Rust] Add bench to MutableBuffer -* [ARROW-11050](https://issues.apache.org/jira/browse/ARROW-11050) - [R] Handle RecordBatch in write\_parquet -* [ARROW-11067](https://issues.apache.org/jira/browse/ARROW-11067) - [C++] CSV reader returns nulls for some strings on macOS -* [ARROW-11069](https://issues.apache.org/jira/browse/ARROW-11069) - [C++] Parquet writer incorrect data being written when data type is struct -* [ARROW-11073](https://issues.apache.org/jira/browse/ARROW-11073) - [Rust] Lint Error on CI Tests in /arrow/rust/arrow/src/ipc/reader.rs -* [ARROW-11083](https://issues.apache.org/jira/browse/ARROW-11083) - [CI] Build "Source Release and Merge Script" is broken -* [ARROW-11084](https://issues.apache.org/jira/browse/ARROW-11084) - [Rust] Clippy failing in master -* [ARROW-11085](https://issues.apache.org/jira/browse/ARROW-11085) - [Rust] Rust CI no longer works b/c it uses action-rs: Migrate CI away from action-rs/\* -* [ARROW-11092](https://issues.apache.org/jira/browse/ARROW-11092) - [CI] (Temporarily) move offending workflows to separate files -* [ARROW-11102](https://issues.apache.org/jira/browse/ARROW-11102) - [Rust][DataFusion] fmt::Debug for ScalarValue(Utf8) is always quoted -* [ARROW-11113](https://issues.apache.org/jira/browse/ARROW-11113) - [Rust] support as\_struct\_array cast -* [ARROW-11114](https://issues.apache.org/jira/browse/ARROW-11114) - [Java] Metadata serialization is broken for Field class -* [ARROW-11132](https://issues.apache.org/jira/browse/ARROW-11132) - [CI] Use pip to install crossbow's dependencies for the comment bot -* [ARROW-11144](https://issues.apache.org/jira/browse/ARROW-11144) - [C++][Python][CI] Fix HDFS nightly build -* [ARROW-11152](https://issues.apache.org/jira/browse/ARROW-11152) - [CI][C++] Fix Homebrew numpy installation on macOS builds -* [ARROW-11162](https://issues.apache.org/jira/browse/ARROW-11162) - [C++] Fix crash on Decimal256 Parquet file (OSS-Fuzz) -* [ARROW-11163](https://issues.apache.org/jira/browse/ARROW-11163) - 
[C++][Python] Compressed Feather file written with pyarrow 0.17 not readable in pyarrow 2.0.0+ -* [ARROW-11166](https://issues.apache.org/jira/browse/ARROW-11166) - [Python][Compute] Add bindings for ProjectOptions -* [ARROW-11171](https://issues.apache.org/jira/browse/ARROW-11171) - [Go] Build fails on s390x with noasm tag -* [ARROW-11189](https://issues.apache.org/jira/browse/ARROW-11189) - [Developer] Achery benchmark diff cannot compare two jsons -* [ARROW-11190](https://issues.apache.org/jira/browse/ARROW-11190) - [C++][Dataset] Clean up compiler warnings -* [ARROW-11202](https://issues.apache.org/jira/browse/ARROW-11202) - [R][CI] Nightly builds not happening (or artifacts not exported) -* [ARROW-11224](https://issues.apache.org/jira/browse/ARROW-11224) - [R] don't test metadata serialization on old R versions -* [ARROW-11226](https://issues.apache.org/jira/browse/ARROW-11226) - [Python][CI] Filesystem tests failing with s3fs 0.5.2 -* [ARROW-11227](https://issues.apache.org/jira/browse/ARROW-11227) - [Python][CI] AMD64 Conda Python 3.7 Pandas 0.24 cron job failing in to\_pandas extension dtype test -* [ARROW-11229](https://issues.apache.org/jira/browse/ARROW-11229) - [C++][Dataset] Static build is failed -* [ARROW-11230](https://issues.apache.org/jira/browse/ARROW-11230) - [R] Fix build failures on Windows when multiple libarrow binaries found -* [ARROW-11232](https://issues.apache.org/jira/browse/ARROW-11232) - [C++] Table::CombineChunks() returns incorrect results if Table has no column -* [ARROW-11233](https://issues.apache.org/jira/browse/ARROW-11233) - [C++][Flight] Fail to link with bundled gRPC and Abseil -* [ARROW-11237](https://issues.apache.org/jira/browse/ARROW-11237) - [C++] Compiler error with GLog and unity build enabled -* [ARROW-11251](https://issues.apache.org/jira/browse/ARROW-11251) - [CI] Make sure that devtoolset-8 is really installed + being used -* [ARROW-11253](https://issues.apache.org/jira/browse/ARROW-11253) - [R] Make sure that large metadata tests are reproducible -* [ARROW-11255](https://issues.apache.org/jira/browse/ARROW-11255) - [Packaging][Conda][macOS] Fix Python version -* [ARROW-11271](https://issues.apache.org/jira/browse/ARROW-11271) - [Rust] [Parquet] List schema to Arrow parser misinterpreting child nullability -* [ARROW-11274](https://issues.apache.org/jira/browse/ARROW-11274) - [Packaging][wheel][Windows] Fix wheels path for Gemfury -* [ARROW-11275](https://issues.apache.org/jira/browse/ARROW-11275) - [Packaging][wheel][Linux] Fix paths for Gemfury -* [ARROW-11283](https://issues.apache.org/jira/browse/ARROW-11283) - [Julia] Fix install link -* [ARROW-11286](https://issues.apache.org/jira/browse/ARROW-11286) - [Release][Yum] Fix minimal build example check -* [ARROW-11287](https://issues.apache.org/jira/browse/ARROW-11287) - [Packaging][RPM] Add missing dependencies -* [ARROW-11301](https://issues.apache.org/jira/browse/ARROW-11301) - [C++] Fix reading LZ4-compressed Parquet files produced by Java Parquet implementation -* [ARROW-11302](https://issues.apache.org/jira/browse/ARROW-11302) - [Release][Python] Remove verification of python 3.5 wheel on macOS -* [ARROW-11306](https://issues.apache.org/jira/browse/ARROW-11306) - [Packaging][Ubuntu][16.04] Add missing libprotobuf-dev dependency -* [PARQUET-1935](https://issues.apache.org/jira/browse/PARQUET-1935) - [C++][Parquet] nullptr access violation when writing arrays of non-nullable values - - - -# Apache Arrow 2.0.0 (2020-10-13) - -## Bug Fixes - -* 
[ARROW-2367](https://issues.apache.org/jira/browse/ARROW-2367) - [Python] ListArray has trouble with sizes greater than kMaximumCapacity -* [ARROW-4189](https://issues.apache.org/jira/browse/ARROW-4189) - [CI] [Rust] Fix broken cargo coverage -* [ARROW-4917](https://issues.apache.org/jira/browse/ARROW-4917) - [C++] orc\_ep fails in cpp-alpine docker -* [ARROW-5578](https://issues.apache.org/jira/browse/ARROW-5578) - [C++][Flight] Flight does not build out of the box on Alpine Linux -* [ARROW-7226](https://issues.apache.org/jira/browse/ARROW-7226) - [JSON][Python] Json loader fails on example in documentation. -* [ARROW-7384](https://issues.apache.org/jira/browse/ARROW-7384) - [Website] Fix search indexing warning reported by Google -* [ARROW-7517](https://issues.apache.org/jira/browse/ARROW-7517) - [C++] Builder does not honour dictionary type provided during initialization -* [ARROW-7663](https://issues.apache.org/jira/browse/ARROW-7663) - [Python] from\_pandas gives TypeError instead of ArrowTypeError in some cases -* [ARROW-7903](https://issues.apache.org/jira/browse/ARROW-7903) - [Rust] [DataFusion] Upgrade SQLParser dependency for DataFusion -* [ARROW-7957](https://issues.apache.org/jira/browse/ARROW-7957) - [Python] ParquetDataset cannot take HadoopFileSystem as filesystem -* [ARROW-8265](https://issues.apache.org/jira/browse/ARROW-8265) - [Rust] [DataFusion] Table API collect() should not require context -* [ARROW-8394](https://issues.apache.org/jira/browse/ARROW-8394) - [JS] Typescript compiler errors for arrow d.ts files, when using es2015-esm package -* [ARROW-8735](https://issues.apache.org/jira/browse/ARROW-8735) - [Rust] [Parquet] Parquet crate fails to compile on Arm architecture -* [ARROW-8749](https://issues.apache.org/jira/browse/ARROW-8749) - [C++] IpcFormatWriter writes dictionary batches with wrong ID -* [ARROW-8773](https://issues.apache.org/jira/browse/ARROW-8773) - [Python] pyarrow schema.empty\_table() does not preserve nullability of fields -* [ARROW-9028](https://issues.apache.org/jira/browse/ARROW-9028) - [R] Should be able to convert an empty table -* [ARROW-9096](https://issues.apache.org/jira/browse/ARROW-9096) - [Python] Pandas roundtrip with object-dtype column labels with integer values: data type "integer" not understood -* [ARROW-9177](https://issues.apache.org/jira/browse/ARROW-9177) - [C++][Parquet] Tracking issue for cross-implementation LZ4 Parquet compression compatibility -* [ARROW-9414](https://issues.apache.org/jira/browse/ARROW-9414) - [C++] apt package includes headers for S3 interface, but no support -* [ARROW-9462](https://issues.apache.org/jira/browse/ARROW-9462) - [Go] The Indentation after the first Record arrjson writer is missing -* [ARROW-9463](https://issues.apache.org/jira/browse/ARROW-9463) - [Go] The writer is double closed in TestReadWrite -* [ARROW-9490](https://issues.apache.org/jira/browse/ARROW-9490) - [Python] pyarrow array creation for specific set of numpy scalars fails -* [ARROW-9495](https://issues.apache.org/jira/browse/ARROW-9495) - [C++] Equality assertions don't handle Inf / -Inf properly -* [ARROW-9520](https://issues.apache.org/jira/browse/ARROW-9520) - [Rust] [DataFusion] Can't alias an aggregate expression -* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow -* [ARROW-9532](https://issues.apache.org/jira/browse/ARROW-9532) - [Python] Building pyarrow for MacPorts on macOS -* 
[ARROW-9535](https://issues.apache.org/jira/browse/ARROW-9535) - [Python] Remove symlink fixes from conda recipe -* [ARROW-9536](https://issues.apache.org/jira/browse/ARROW-9536) - Missing parameters in PlasmaOutOfMemoryException.java -* [ARROW-9541](https://issues.apache.org/jira/browse/ARROW-9541) - [C++] CMakeLists requires UTF8PROC\_STATIC when building static library -* [ARROW-9544](https://issues.apache.org/jira/browse/ARROW-9544) - [R] version argument of write\_parquet not working -* [ARROW-9546](https://issues.apache.org/jira/browse/ARROW-9546) - [Python] Clean up Pandas Metadata Conversion test -* [ARROW-9548](https://issues.apache.org/jira/browse/ARROW-9548) - [Go] Test output files in tmp directory are not removed correctly -* [ARROW-9549](https://issues.apache.org/jira/browse/ARROW-9549) - [Rust] Parquet no longer builds -* [ARROW-9554](https://issues.apache.org/jira/browse/ARROW-9554) - [Java] FixedWidthInPlaceVectorSorter sometimes produces wrong result -* [ARROW-9556](https://issues.apache.org/jira/browse/ARROW-9556) - [Python][C++] Segfaults in UnionArray with null values -* [ARROW-9560](https://issues.apache.org/jira/browse/ARROW-9560) - [Packaging] conda recipes failing due to missing conda-forge.yml -* [ARROW-9569](https://issues.apache.org/jira/browse/ARROW-9569) - [CI][R] Fix rtools35 builds for msys2 key change -* [ARROW-9570](https://issues.apache.org/jira/browse/ARROW-9570) - [Doc] Clean up sphinx sidebar -* [ARROW-9573](https://issues.apache.org/jira/browse/ARROW-9573) - [Python] Parquet doesn't load when partitioned column starts with '\_' -* [ARROW-9574](https://issues.apache.org/jira/browse/ARROW-9574) - [R] Cleanups for CRAN 1.0.0 release -* [ARROW-9575](https://issues.apache.org/jira/browse/ARROW-9575) - [R] gcc-UBSAN failure on CRAN -* [ARROW-9577](https://issues.apache.org/jira/browse/ARROW-9577) - [Python][C++] posix\_madvise error on Debian in pyarrow 1.0.0 -* [ARROW-9583](https://issues.apache.org/jira/browse/ARROW-9583) - [Rust] Offset is mishandled in arithmetic and boolean compute kernels -* [ARROW-9588](https://issues.apache.org/jira/browse/ARROW-9588) - [C++] clang/win: Copy constructor of ParquetInvalidOrCorruptedFileException not correctly triggered -* [ARROW-9589](https://issues.apache.org/jira/browse/ARROW-9589) - [C++/R] arrow\_exports.h contains structs declared as class -* [ARROW-9592](https://issues.apache.org/jira/browse/ARROW-9592) - [CI] Update homebrew before calling brew bundle -* [ARROW-9596](https://issues.apache.org/jira/browse/ARROW-9596) - [CI][Crossbow] Fix homebrew-cpp again, again -* [ARROW-9597](https://issues.apache.org/jira/browse/ARROW-9597) - [C++] AddAlias in compute::FunctionRegistry should be synchronized -* [ARROW-9598](https://issues.apache.org/jira/browse/ARROW-9598) - [C++][Parquet] Spaced definition levels is not assigned correctly. 
-* [ARROW-9599](https://issues.apache.org/jira/browse/ARROW-9599) - [CI] Appveyor toolchain build fails because CMake detects different C and C++ compilers -* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build -* [ARROW-9600](https://issues.apache.org/jira/browse/ARROW-9600) - [Rust] When used as a crate dependency, arrow-flight is rebuilt on every invocation of cargo build -* [ARROW-9602](https://issues.apache.org/jira/browse/ARROW-9602) - [R] Improve cmake detection in Linux build -* [ARROW-9603](https://issues.apache.org/jira/browse/ARROW-9603) - [C++][Parquet] Write Arrow relies on unspecified behavior for nested types -* [ARROW-9606](https://issues.apache.org/jira/browse/ARROW-9606) - [C++][Dataset] in expressions don't work with \>1 partition levels -* [ARROW-9609](https://issues.apache.org/jira/browse/ARROW-9609) - [C++] CSV datasets don't materialize virtual columns -* [ARROW-9621](https://issues.apache.org/jira/browse/ARROW-9621) - [Python] test\_move\_file() is failed with fsspec 0.8.0 -* [ARROW-9622](https://issues.apache.org/jira/browse/ARROW-9622) - [Java] ComplexCopier fails if a structvector has a child UnionVector with nulls -* [ARROW-9628](https://issues.apache.org/jira/browse/ARROW-9628) - [Rust] Clippy PR test failing intermittently on Rust / AMD64 MacOS -* [ARROW-9629](https://issues.apache.org/jira/browse/ARROW-9629) - [Python] Kartothek integration tests failing due to missing freezegun module -* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight -* [ARROW-9631](https://issues.apache.org/jira/browse/ARROW-9631) - [Rust] Arrow crate should not depend on flight -* [ARROW-9642](https://issues.apache.org/jira/browse/ARROW-9642) - [C++] Let MakeBuilder refer DictionaryType's index\_type for deciding the starting bit width of the indices -* [ARROW-9643](https://issues.apache.org/jira/browse/ARROW-9643) - [C++] Illegal instruction on haswell cpu -* [ARROW-9644](https://issues.apache.org/jira/browse/ARROW-9644) - [C++][Dataset] Do not check for ignore\_prefixes in the base path -* [ARROW-9652](https://issues.apache.org/jira/browse/ARROW-9652) - [Rust][DataFusion] Panic trying to select \* from a CSV (panicked at 'index out of bounds: the len is 0 but the index is 0) -* [ARROW-9653](https://issues.apache.org/jira/browse/ARROW-9653) - [Rust][DataFusion] Multi-column Group by: Invalid Argument Error -* [ARROW-9659](https://issues.apache.org/jira/browse/ARROW-9659) - [C++] RecordBatchStreamReader throws on CUDA device buffers -* [ARROW-9660](https://issues.apache.org/jira/browse/ARROW-9660) - [C++] IPC - dictionaries in maps -* [ARROW-9666](https://issues.apache.org/jira/browse/ARROW-9666) - [Python][wheel][Windows] library missing failure by ARROW-9412 -* [ARROW-9670](https://issues.apache.org/jira/browse/ARROW-9670) - [C++][FlightRPC] Close()ing a DoPut with an ongoing read locks up the client -* [ARROW-9684](https://issues.apache.org/jira/browse/ARROW-9684) - [C++] Fix undefined behaviour on invalid IPC / Parquet input (OSS-Fuzz) -* [ARROW-9692](https://issues.apache.org/jira/browse/ARROW-9692) - [Python] distutils import warning -* [ARROW-9693](https://issues.apache.org/jira/browse/ARROW-9693) - [CI][Docs] Nightly docs build fails -* [ARROW-9696](https://issues.apache.org/jira/browse/ARROW-9696) - [Rust] [Datafusion] nested binary expressions broken -* 
[ARROW-9698](https://issues.apache.org/jira/browse/ARROW-9698) - [C++] Revert "Add -NDEBUG flag to arrow.pc" -* [ARROW-9700](https://issues.apache.org/jira/browse/ARROW-9700) - [Python] create\_library\_symlinks doesn't work in macos -* [ARROW-9712](https://issues.apache.org/jira/browse/ARROW-9712) - [Rust] [DataFusion] ParquetScanExec panics on error -* [ARROW-9714](https://issues.apache.org/jira/browse/ARROW-9714) - [Rust] [DataFusion] TypeCoercionRule not implemented for Limit or Sort -* [ARROW-9716](https://issues.apache.org/jira/browse/ARROW-9716) - [Rust] [DataFusion] MergeExec should have concurrency limit -* [ARROW-9726](https://issues.apache.org/jira/browse/ARROW-9726) - [Rust] [DataFusion] ParquetScanExec launches threads too early -* [ARROW-9727](https://issues.apache.org/jira/browse/ARROW-9727) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) -* [ARROW-9729](https://issues.apache.org/jira/browse/ARROW-9729) - [Java] Error Prone causes other annotation processors to not work with Eclipse -* [ARROW-9733](https://issues.apache.org/jira/browse/ARROW-9733) - [Rust][DataFusion] Aggregates COUNT/MIN/MAX don't work on VARCHAR columns -* [ARROW-9734](https://issues.apache.org/jira/browse/ARROW-9734) - [Rust] [DataFusion] TableProvider.scan executing partitions prematurely -* [ARROW-9741](https://issues.apache.org/jira/browse/ARROW-9741) - [Rust] [DataFusion] Incorrect count in TPC-H query 1 result set -* [ARROW-9743](https://issues.apache.org/jira/browse/ARROW-9743) - [R] Sanitize paths in open\_dataset -* [ARROW-9744](https://issues.apache.org/jira/browse/ARROW-9744) - [Python] Failed to install on aarch64 -* [ARROW-9764](https://issues.apache.org/jira/browse/ARROW-9764) - [CI][Java] Push wrong Docker image -* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds -* [ARROW-9768](https://issues.apache.org/jira/browse/ARROW-9768) - [Python] Pyarrow allows for unsafe conversions of datetime objects to timestamp nanoseconds -* [ARROW-9778](https://issues.apache.org/jira/browse/ARROW-9778) - [Rust] [DataFusion] Logical and physical schemas' nullability does not match in 8 out of 20 end-to-end tests -* [ARROW-9783](https://issues.apache.org/jira/browse/ARROW-9783) - [Rust] [DataFusion] Logical aggregate expressions require explicit data type -* [ARROW-9785](https://issues.apache.org/jira/browse/ARROW-9785) - [Python] pyarrow/tests/test\_fs.py::test\_s3\_options too slow -* [ARROW-9789](https://issues.apache.org/jira/browse/ARROW-9789) - [C++] Don't install jemalloc in parallel -* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries -* [ARROW-9790](https://issues.apache.org/jira/browse/ARROW-9790) - [Rust] [Parquet] ParquetFileArrowReader fails to decode all pages if batches fall exactly on row group boundaries -* [ARROW-9793](https://issues.apache.org/jira/browse/ARROW-9793) - [Rust] [DataFusion] Tests failing in master -* [ARROW-9797](https://issues.apache.org/jira/browse/ARROW-9797) - [Rust] AMD64 Conda Integration Tests is failing for the Master branch -* [ARROW-9799](https://issues.apache.org/jira/browse/ARROW-9799) - [Rust] [DataFusion] Implementation of physical binary expression get\_type method is incorrect -* [ARROW-9800](https://issues.apache.org/jira/browse/ARROW-9800) - [Rust] [Parquet] "min" and "max" written to standard out when writing columns -* 
[ARROW-9809](https://issues.apache.org/jira/browse/ARROW-9809) - [Rust] [DataFusion] logical schema = physical schema is not true -* [ARROW-9814](https://issues.apache.org/jira/browse/ARROW-9814) - [Python] Crash in test\_parquet.py::test\_read\_partitioned\_directory\_s3fs -* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs -* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs -* [ARROW-9815](https://issues.apache.org/jira/browse/ARROW-9815) - [Rust] [DataFusion] Deadlock in creation of physical plan with two udfs -* [ARROW-9816](https://issues.apache.org/jira/browse/ARROW-9816) - [C++] Escape quotes in config.h -* [ARROW-9827](https://issues.apache.org/jira/browse/ARROW-9827) - [Python] pandas.read\_parquet fails for wide parquet files and pyarrow 1.0.X -* [ARROW-9831](https://issues.apache.org/jira/browse/ARROW-9831) - [Rust] [DataFusion] Fix compilation error -* [ARROW-9840](https://issues.apache.org/jira/browse/ARROW-9840) - [Python] Python fs documentation out of date with code -* [ARROW-9846](https://issues.apache.org/jira/browse/ARROW-9846) - [Rust] Master branch broken build -* [ARROW-9851](https://issues.apache.org/jira/browse/ARROW-9851) - [C++] Valgrind errors due to unrecognized instructions -* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) -* [ARROW-9852](https://issues.apache.org/jira/browse/ARROW-9852) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) -* [ARROW-9855](https://issues.apache.org/jira/browse/ARROW-9855) - [R] Fix bad merge/Rcpp conflict -* [ARROW-9859](https://issues.apache.org/jira/browse/ARROW-9859) - [C++] S3 FileSystemFromUri with special char in secret key fails -* [ARROW-9864](https://issues.apache.org/jira/browse/ARROW-9864) - [Python] pathlib.Path not supported in write\_to\_dataset with partition columns -* [ARROW-9874](https://issues.apache.org/jira/browse/ARROW-9874) - [C++] NewStreamWriter / NewFileWriter don't own output stream -* [ARROW-9876](https://issues.apache.org/jira/browse/ARROW-9876) - [CI][C++] Travis ARM jobs timeout -* [ARROW-9877](https://issues.apache.org/jira/browse/ARROW-9877) - [C++][CI] homebrew-cpp fails due to avx512 -* [ARROW-9879](https://issues.apache.org/jira/browse/ARROW-9879) - [Python] ChunkedArray.\_\_getitem\_\_ doesn't work with numpy scalars -* [ARROW-9882](https://issues.apache.org/jira/browse/ARROW-9882) - [C++/Python] Update conda-forge-pinning to 3 for OSX conda packages -* [ARROW-9883](https://issues.apache.org/jira/browse/ARROW-9883) - [R] Fix linuxlibs.R install script for R < 3.6 -* [ARROW-9888](https://issues.apache.org/jira/browse/ARROW-9888) - [Rust] [DataFusion] ExecutionContext can not be shared between threads -* [ARROW-9889](https://issues.apache.org/jira/browse/ARROW-9889) - [Rust][DataFusion] Datafusion CLI: CREATE EXTERNAL TABLE errors with "Unsupported logical plan variant" -* [ARROW-9897](https://issues.apache.org/jira/browse/ARROW-9897) - [C++][Gandiva] Add to\_date() function from pattern -* [ARROW-9906](https://issues.apache.org/jira/browse/ARROW-9906) - [Python] Crash in test\_parquet.py::test\_parquet\_writer\_filesystem\_s3\_uri (closing NativeFile from S3FileSystem) -* [ARROW-9913](https://issues.apache.org/jira/browse/ARROW-9913) - [C++] Outputs of Decimal128::FromString depend on presence of one another -* 
[ARROW-9920](https://issues.apache.org/jira/browse/ARROW-9920) - [Python] pyarrow.concat\_arrays segfaults when passing it a chunked array -* [ARROW-9922](https://issues.apache.org/jira/browse/ARROW-9922) - [Rust] Add \`try\_from(Vec\>)\` to StructArray -* [ARROW-9924](https://issues.apache.org/jira/browse/ARROW-9924) - [Python] Performance regression reading individual Parquet files using Dataset interface -* [ARROW-9931](https://issues.apache.org/jira/browse/ARROW-9931) - [C++] Fix undefined behaviour on invalid IPC (OSS-Fuzz) -* [ARROW-9932](https://issues.apache.org/jira/browse/ARROW-9932) - [R] Arrow 1.0.1 R package fails to install on R3.4 over linux -* [ARROW-9936](https://issues.apache.org/jira/browse/ARROW-9936) - [Python] Fix / test relative file paths in pyarrow.parquet -* [ARROW-9937](https://issues.apache.org/jira/browse/ARROW-9937) - [Rust] [DataFusion] Average is not correct -* [ARROW-9943](https://issues.apache.org/jira/browse/ARROW-9943) - [C++] Arrow metadata not applied recursively when reading Parquet file -* [ARROW-9946](https://issues.apache.org/jira/browse/ARROW-9946) - [R] ParquetFileWriter segfaults when \`sink\` is a string -* [ARROW-9953](https://issues.apache.org/jira/browse/ARROW-9953) - [R] Declare minimum version for bit64 -* [ARROW-9962](https://issues.apache.org/jira/browse/ARROW-9962) - [Python] Conversion to pandas with index column using fixed timezone fails -* [ARROW-9968](https://issues.apache.org/jira/browse/ARROW-9968) - [C++] UBSAN link failure with \_\_int8\_t -* [ARROW-9969](https://issues.apache.org/jira/browse/ARROW-9969) - [C++] RecordBatchBuilder yields invalid result with dictionary fields -* [ARROW-9970](https://issues.apache.org/jira/browse/ARROW-9970) - [Go] checkptr failures in sum methods -* [ARROW-9972](https://issues.apache.org/jira/browse/ARROW-9972) - [CI] Work around grpc-re2 clash on Homebrew -* [ARROW-9973](https://issues.apache.org/jira/browse/ARROW-9973) - [Java] JDBC DateConsumer does not allow dates before epoch -* [ARROW-9976](https://issues.apache.org/jira/browse/ARROW-9976) - [Python] ArrowCapacityError when doing Table.from\_pandas with large dataframe -* [ARROW-9990](https://issues.apache.org/jira/browse/ARROW-9990) - [Rust] [DataFusion] NOT is not plannable -* [ARROW-9993](https://issues.apache.org/jira/browse/ARROW-9993) - [Python] Tzinfo - string roundtrip fails on pytz.StaticTzInfo objects -* [ARROW-9994](https://issues.apache.org/jira/browse/ARROW-9994) - [C++][Python] Auto chunking nested array containing binary-like fields result malformed output -* [ARROW-9996](https://issues.apache.org/jira/browse/ARROW-9996) - [C++] Dictionary is unset when calling DictionaryArray.GetScalar for null values -* [ARROW-10003](https://issues.apache.org/jira/browse/ARROW-10003) - [C++] Create directories in CopyFiles when copying within the same filesystem -* [ARROW-10008](https://issues.apache.org/jira/browse/ARROW-10008) - [Python] pyarrow.parquet.read\_table fails with predicate pushdown on categorical data with use\_legacy\_dataset=False -* [ARROW-10011](https://issues.apache.org/jira/browse/ARROW-10011) - [C++] Make FindRE2.cmake re-entrant -* [ARROW-10012](https://issues.apache.org/jira/browse/ARROW-10012) - [C++] Sporadic failures in CopyFiles test -* [ARROW-10013](https://issues.apache.org/jira/browse/ARROW-10013) - [C++][CI] Flight test failure in TestFlightClient.GenericOptions -* [ARROW-10017](https://issues.apache.org/jira/browse/ARROW-10017) - [Java] LargeMemoryUtil.checkedCastToInt has buggy logic -* 
[ARROW-10022](https://issues.apache.org/jira/browse/ARROW-10022) - [C++] [Compute] core dumped on some scalar-arithmetic-benchmark -* [ARROW-10027](https://issues.apache.org/jira/browse/ARROW-10027) - [Python] Incorrect null column returned when using a dataset filter expression. -* [ARROW-10034](https://issues.apache.org/jira/browse/ARROW-10034) - [Rust] Master build broken -* [ARROW-10041](https://issues.apache.org/jira/browse/ARROW-10041) - [Rust] Possible to create LargeStringArray with DataType::Utf8 -* [ARROW-10047](https://issues.apache.org/jira/browse/ARROW-10047) - [CI] Conda integration tests failing with cmake error -* [ARROW-10048](https://issues.apache.org/jira/browse/ARROW-10048) - [Rust] Error in aggregate of min/max for strings -* [ARROW-10049](https://issues.apache.org/jira/browse/ARROW-10049) - [C++/Python] Sync conda recipe with conda-forge -* [ARROW-10060](https://issues.apache.org/jira/browse/ARROW-10060) - [Rust] [DataFusion] MergeExec currently discards partitions with errors -* [ARROW-10062](https://issues.apache.org/jira/browse/ARROW-10062) - [Rust]: Fix for null elems for DoubleEndedIter for DictArray -* [ARROW-10073](https://issues.apache.org/jira/browse/ARROW-10073) - [Python] Test test\_parquet\_nested\_storage relies on dict item ordering -* [ARROW-10081](https://issues.apache.org/jira/browse/ARROW-10081) - [C++/Python] Fix bash syntax in drone.io conda builds -* [ARROW-10085](https://issues.apache.org/jira/browse/ARROW-10085) - [C++] S3 tests fail on AppVeyor -* [ARROW-10087](https://issues.apache.org/jira/browse/ARROW-10087) - [CI] Fix nightly docs job -* [ARROW-10098](https://issues.apache.org/jira/browse/ARROW-10098) - [R][Doc] Fix copy\_files doc mismatch -* [ARROW-10104](https://issues.apache.org/jira/browse/ARROW-10104) - [Python] Separate tests into its own conda package -* [ARROW-10114](https://issues.apache.org/jira/browse/ARROW-10114) - [R] Segfault in to\_dataframe\_parallel with deeply nested structs -* [ARROW-10116](https://issues.apache.org/jira/browse/ARROW-10116) - [Python][Packaging] Fix gRPC linking error in macOS wheels builds -* [ARROW-10119](https://issues.apache.org/jira/browse/ARROW-10119) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz) -* [ARROW-10121](https://issues.apache.org/jira/browse/ARROW-10121) - [C++][Python] Variable dictionaries do not survive roundtrip to IPC stream -* [ARROW-10124](https://issues.apache.org/jira/browse/ARROW-10124) - [R] Write functions don't follow umask setting -* [ARROW-10125](https://issues.apache.org/jira/browse/ARROW-10125) - [R] Int64 downcast check doesn't consider all chunks -* [ARROW-10130](https://issues.apache.org/jira/browse/ARROW-10130) - [C++][Dataset] ParquetFileFragment::SplitByRowGroup does not preserve "complete\_metadata" status -* [ARROW-10136](https://issues.apache.org/jira/browse/ARROW-10136) - [Rust][Arrow] Nulls are transformed into "" after filtering for StringArray -* [ARROW-10137](https://issues.apache.org/jira/browse/ARROW-10137) - [R] Fix cpp helper that breaks if libarrow is not present -* [ARROW-10147](https://issues.apache.org/jira/browse/ARROW-10147) - [Python] Constructing pandas metadata fails if an Index name is not JSON-serializable by default -* [ARROW-10150](https://issues.apache.org/jira/browse/ARROW-10150) - [C++] Fix crashes on invalid Parquet file (OSS-Fuzz) -* [ARROW-10169](https://issues.apache.org/jira/browse/ARROW-10169) - [Rust] Nulls should be rendered as "" rather than default value when pretty printing arrays -* 
[ARROW-10175](https://issues.apache.org/jira/browse/ARROW-10175) - [CI] Nightly hdfs integration test job fails -* [ARROW-10176](https://issues.apache.org/jira/browse/ARROW-10176) - [CI] Nightly valgrind job fails -* [ARROW-10178](https://issues.apache.org/jira/browse/ARROW-10178) - [CI] Fix spark master integration test build setup -* [ARROW-10179](https://issues.apache.org/jira/browse/ARROW-10179) - [Rust] Labeler is not labeling -* [ARROW-10181](https://issues.apache.org/jira/browse/ARROW-10181) - [Rust] Arrow tests fail to compile on Raspberry Pi (32 bit) -* [ARROW-10188](https://issues.apache.org/jira/browse/ARROW-10188) - [Rust] [DataFusion] Some examples are broken -* [ARROW-10189](https://issues.apache.org/jira/browse/ARROW-10189) - [Doc] C data interface example for i32 uses \`l\`, not \`i\`, in the format -* [ARROW-10192](https://issues.apache.org/jira/browse/ARROW-10192) - [C++][Python] Segfault when converting nested struct array with dictionary field to pandas series -* [ARROW-10193](https://issues.apache.org/jira/browse/ARROW-10193) - [Python] Segfault when converting to fixed size binary array -* [ARROW-10200](https://issues.apache.org/jira/browse/ARROW-10200) - [Java][CI] Fix failure of Java CI on s390x -* [ARROW-10204](https://issues.apache.org/jira/browse/ARROW-10204) - [RUST] [Datafusion] Test failure in aggregate\_grouped\_empty with simd feature enabled -* [ARROW-10214](https://issues.apache.org/jira/browse/ARROW-10214) - [Python] UnicodeDecodeError when printing schema with binary metadata -* [ARROW-10226](https://issues.apache.org/jira/browse/ARROW-10226) - [Rust] [Parquet] Parquet reader reading wrong columns in some batches within a parquet file -* [ARROW-10230](https://issues.apache.org/jira/browse/ARROW-10230) - [JS][Doc] JavaScript documentation fails to build -* [ARROW-10232](https://issues.apache.org/jira/browse/ARROW-10232) - FixedSizeListArray is incorrectly written/read to/from parquet -* [ARROW-10234](https://issues.apache.org/jira/browse/ARROW-10234) - [C++][Gandiva] Fix logic of round() for floats/decimals in Gandiva -* [ARROW-10237](https://issues.apache.org/jira/browse/ARROW-10237) - [C++] Duplicate values in a dictionary result in corrupted parquet -* [ARROW-10238](https://issues.apache.org/jira/browse/ARROW-10238) - [C\#] List is broken -* [ARROW-10239](https://issues.apache.org/jira/browse/ARROW-10239) - [C++] aws-sdk-cpp apparently requires zlib too -* [ARROW-10244](https://issues.apache.org/jira/browse/ARROW-10244) - [Python][Docs] Add docs on using pyarrow.dataset.parquet\_dataset -* [ARROW-10248](https://issues.apache.org/jira/browse/ARROW-10248) - [C++][Dataset] Dataset writing does not write schema metadata -* [ARROW-10262](https://issues.apache.org/jira/browse/ARROW-10262) - [C++] Some TypeClass in Scalar classes seem incorrect -* [ARROW-10271](https://issues.apache.org/jira/browse/ARROW-10271) - [Rust] packed\_simd is broken and continued under a new project -* [ARROW-10279](https://issues.apache.org/jira/browse/ARROW-10279) - [Release][Python] Fix verification script to align with the new macos wheel platform tags -* [ARROW-10280](https://issues.apache.org/jira/browse/ARROW-10280) - [Packaging][Python] Fix macOS wheel artifact patterns -* [ARROW-10281](https://issues.apache.org/jira/browse/ARROW-10281) - [Python] Fix warnings when running tests -* [ARROW-10284](https://issues.apache.org/jira/browse/ARROW-10284) - [Python] Pyarrow is raising deprecation warning about filesystems on import -* 
[ARROW-10285](https://issues.apache.org/jira/browse/ARROW-10285) - [Python] pyarrow.orc submodule is using deprecated functionality
-* [ARROW-10286](https://issues.apache.org/jira/browse/ARROW-10286) - [C++][Flight] Misleading CMake errors
-* [ARROW-10288](https://issues.apache.org/jira/browse/ARROW-10288) - [C++] Compilation fails on i386
-* [ARROW-10290](https://issues.apache.org/jira/browse/ARROW-10290) - [C++] List POP\_BACK is not available in older CMake versions
-
-
-## New Features and Improvements
-
-* [ARROW-983](https://issues.apache.org/jira/browse/ARROW-983) - [C++] Implement InputStream and OutputStream classes for interacting with socket connections
-* [ARROW-1105](https://issues.apache.org/jira/browse/ARROW-1105) - [C++] SQLite record batch reader
-* [ARROW-1509](https://issues.apache.org/jira/browse/ARROW-1509) - [Python] Write serialized object as a stream of encapsulated IPC messages
-* [ARROW-1669](https://issues.apache.org/jira/browse/ARROW-1669) - [C++] Consider adding Abseil (Google C++11 standard library extensions) to toolchain
-* [ARROW-1797](https://issues.apache.org/jira/browse/ARROW-1797) - [C++] Implement binary arithmetic kernels for numeric arrays
-* [ARROW-2164](https://issues.apache.org/jira/browse/ARROW-2164) - [C++] Clean up unnecessary decimal module refs
-* [ARROW-3080](https://issues.apache.org/jira/browse/ARROW-3080) - [Python] Unify Arrow to Python object conversion paths
-* [ARROW-3757](https://issues.apache.org/jira/browse/ARROW-3757) - [R] R bindings for Flight RPC client
-* [ARROW-3872](https://issues.apache.org/jira/browse/ARROW-3872) - [R] Add ad hoc test of feather compatibility
-* [ARROW-4046](https://issues.apache.org/jira/browse/ARROW-4046) - [Python/CI] Exercise large memory tests
-* [ARROW-4248](https://issues.apache.org/jira/browse/ARROW-4248) - [C++][Plasma] Build on Windows / Visual Studio
-* [ARROW-4685](https://issues.apache.org/jira/browse/ARROW-4685) - [C++] Update Boost to 1.69 in manylinux1 docker image
-* [ARROW-4927](https://issues.apache.org/jira/browse/ARROW-4927) - [Rust] Update top level README to describe current functionality
-* [ARROW-4957](https://issues.apache.org/jira/browse/ARROW-4957) - [Rust] [DataFusion] Implement get\_supertype correctly
-* [ARROW-4965](https://issues.apache.org/jira/browse/ARROW-4965) - [Python] Timestamp array type detection should use tzname of datetime.datetime objects
-* [ARROW-5034](https://issues.apache.org/jira/browse/ARROW-5034) - [C\#] ArrowStreamWriter should expose synchronous Write methods
-* [ARROW-5123](https://issues.apache.org/jira/browse/ARROW-5123) - [Rust] derive RecordWriter from struct definitions
-* [ARROW-6075](https://issues.apache.org/jira/browse/ARROW-6075) - [FlightRPC] Handle uncaught exceptions in middleware
-* [ARROW-6281](https://issues.apache.org/jira/browse/ARROW-6281) - [Python] Produce chunked arrays for nested types in pyarrow.array
-* [ARROW-6282](https://issues.apache.org/jira/browse/ARROW-6282) - [Format] Support lossy compression
-* [ARROW-6437](https://issues.apache.org/jira/browse/ARROW-6437) - [R] Add AWS SDK to system dependencies for macOS and Windows
-* [ARROW-6535](https://issues.apache.org/jira/browse/ARROW-6535) - [C++] Status::WithMessage should accept variadic parameters
-* [ARROW-6537](https://issues.apache.org/jira/browse/ARROW-6537) - [R] Pass column\_types to CSV reader
-* [ARROW-6972](https://issues.apache.org/jira/browse/ARROW-6972) - [C\#] Should support StructField arrays
-* [ARROW-6982](https://issues.apache.org/jira/browse/ARROW-6982) - [R] Add bindings for compare and boolean kernels
-* [ARROW-7136](https://issues.apache.org/jira/browse/ARROW-7136) - [Rust][CI] Pre-install the rust dependencies in the dockerfile
-* [ARROW-7218](https://issues.apache.org/jira/browse/ARROW-7218) - [Python] Conversion from boolean numpy scalars not working
-* [ARROW-7302](https://issues.apache.org/jira/browse/ARROW-7302) - [C++] CSV: allow converting a column to a specific dictionary type
-* [ARROW-7372](https://issues.apache.org/jira/browse/ARROW-7372) - [C++] Allow creating dictionary array from simple JSON
-* [ARROW-7871](https://issues.apache.org/jira/browse/ARROW-7871) - [Python] Expose more compute kernels
-* [ARROW-7960](https://issues.apache.org/jira/browse/ARROW-7960) - [C++][Parquet] Add support for schema translation from parquet nodes back to arrow for missing types
-* [ARROW-8001](https://issues.apache.org/jira/browse/ARROW-8001) - [R][Dataset] Bindings for dataset writing
-* [ARROW-8002](https://issues.apache.org/jira/browse/ARROW-8002) - [C++][Dataset] Dataset writing should let you (re)partition the data
-* [ARROW-8048](https://issues.apache.org/jira/browse/ARROW-8048) - [Python] Run memory leak tests nightly as follow up to ARROW-4120
-* [ARROW-8172](https://issues.apache.org/jira/browse/ARROW-8172) - [C++] ArrayFromJSON for dictionary arrays
-* [ARROW-8205](https://issues.apache.org/jira/browse/ARROW-8205) - [Rust] [DataFusion] DataFusion should enforce unique field names in a schema
-* [ARROW-8253](https://issues.apache.org/jira/browse/ARROW-8253) - [Rust] [DataFusion] Improve ergonomics of registering UDFs
-* [ARROW-8262](https://issues.apache.org/jira/browse/ARROW-8262) - [Rust] [DataFusion] Add example that uses LogicalPlanBuilder
-* [ARROW-8289](https://issues.apache.org/jira/browse/ARROW-8289) - [Rust] [Parquet] Implement minimal Arrow Parquet writer as starting point for full writer
-* [ARROW-8296](https://issues.apache.org/jira/browse/ARROW-8296) - [C++][Dataset] IpcFileFormat should support writing files with compressed buffers
-* [ARROW-8355](https://issues.apache.org/jira/browse/ARROW-8355) - [Python] Reduce the number of pandas dependent test cases in test\_feather
-* [ARROW-8359](https://issues.apache.org/jira/browse/ARROW-8359) - [C++/Python] Enable aarch64/ppc64le build in conda recipes
-* [ARROW-8383](https://issues.apache.org/jira/browse/ARROW-8383) - [Rust] Easier random access to DictionaryArray keys and values
-* [ARROW-8402](https://issues.apache.org/jira/browse/ARROW-8402) - [Java] Support ValidateFull methods in Java
-* [ARROW-8423](https://issues.apache.org/jira/browse/ARROW-8423) - [Rust] [Parquet] Serialize arrow schema into metadata when writing parquet
-* [ARROW-8426](https://issues.apache.org/jira/browse/ARROW-8426) - [Rust] [Parquet] Add support for writing dictionary types
-* [ARROW-8493](https://issues.apache.org/jira/browse/ARROW-8493) - [C++] Create unified schema resolution code for Array reconstruction.
-* [ARROW-8494](https://issues.apache.org/jira/browse/ARROW-8494) - [C++] Implement basic array-by-array reassembly logic -* [ARROW-8581](https://issues.apache.org/jira/browse/ARROW-8581) - [C\#] Date32/64Array.Builder should accept DateTime, not DateTimeOffset -* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface -* [ARROW-8601](https://issues.apache.org/jira/browse/ARROW-8601) - [Go][Flight] Implement Flight Writer interface -* [ARROW-8618](https://issues.apache.org/jira/browse/ARROW-8618) - [C++] ASSIGN\_OR\_RAISE should move its argument -* [ARROW-8678](https://issues.apache.org/jira/browse/ARROW-8678) - [C++][Parquet] Remove legacy arrow to level translation. -* [ARROW-8712](https://issues.apache.org/jira/browse/ARROW-8712) - [R] Expose strptime timestamp parsing in read\_csv conversion options -* [ARROW-8774](https://issues.apache.org/jira/browse/ARROW-8774) - [Rust] [DataFusion] Improve threading model -* [ARROW-8810](https://issues.apache.org/jira/browse/ARROW-8810) - [R] Add documentation about Parquet format, appending to stream format -* [ARROW-8824](https://issues.apache.org/jira/browse/ARROW-8824) - [Rust] [DataFusion] Implement new SQL parser -* [ARROW-8828](https://issues.apache.org/jira/browse/ARROW-8828) - [Rust] Implement SQL tokenizer -* [ARROW-8829](https://issues.apache.org/jira/browse/ARROW-8829) - [Rust] Implement SQL parser -* [ARROW-9010](https://issues.apache.org/jira/browse/ARROW-9010) - [Java] Framework and interface changes for RecordBatch IPC buffer compression -* [ARROW-9065](https://issues.apache.org/jira/browse/ARROW-9065) - [C++] Support parsing date32 in dataset partition folders -* [ARROW-9068](https://issues.apache.org/jira/browse/ARROW-9068) - [C++][Dataset] Simplify Partitioning interface -* [ARROW-9078](https://issues.apache.org/jira/browse/ARROW-9078) - [C++] Parquet writing of extension type with nested storage type fails -* [ARROW-9104](https://issues.apache.org/jira/browse/ARROW-9104) - [C++] Parquet encryption tests should write files to a temporary directory instead of the testing submodule's directory -* [ARROW-9107](https://issues.apache.org/jira/browse/ARROW-9107) - [C++][Dataset] Time-based types support -* [ARROW-9147](https://issues.apache.org/jira/browse/ARROW-9147) - [C++][Dataset] Support null -\> other type promotion in Dataset scanning -* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst -* [ARROW-9266](https://issues.apache.org/jira/browse/ARROW-9266) - [Python][Packaging] Enable S3 support in macOS wheels -* [ARROW-9271](https://issues.apache.org/jira/browse/ARROW-9271) - [R] Preserve data frame metadata in round trip -* [ARROW-9286](https://issues.apache.org/jira/browse/ARROW-9286) - [C++] Add function "aliases" to compute::FunctionRegistry -* [ARROW-9328](https://issues.apache.org/jira/browse/ARROW-9328) - [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string -* [ARROW-9338](https://issues.apache.org/jira/browse/ARROW-9338) - [Rust] Add instructions for running clippy locally -* [ARROW-9344](https://issues.apache.org/jira/browse/ARROW-9344) - [C++][Flight] measure latency quantile in flight benchmark -* [ARROW-9358](https://issues.apache.org/jira/browse/ARROW-9358) - [Integration] Reconsider generated\_large\_batch.json -* [ARROW-9371](https://issues.apache.org/jira/browse/ARROW-9371) - [Java] Run vector tests for both allocators -* [ARROW-9377](https://issues.apache.org/jira/browse/ARROW-9377) - [Java] 
Support unsigned dictionary indices -* [ARROW-9387](https://issues.apache.org/jira/browse/ARROW-9387) - [R] Use new C++ table select method -* [ARROW-9388](https://issues.apache.org/jira/browse/ARROW-9388) - [C++] Division kernels -* [ARROW-9394](https://issues.apache.org/jira/browse/ARROW-9394) - [Python] Support pickling of Scalars -* [ARROW-9398](https://issues.apache.org/jira/browse/ARROW-9398) - [C++] Register the SIMD sum variants under function instance instead a SIMD function -* [ARROW-9402](https://issues.apache.org/jira/browse/ARROW-9402) - [C++] Add portable wrappers for \_\_builtin\_add\_overflow and friends -* [ARROW-9405](https://issues.apache.org/jira/browse/ARROW-9405) - [R] Switch to cpp11 -* [ARROW-9412](https://issues.apache.org/jira/browse/ARROW-9412) - [C++] Add non-BUNDLED dependencies to exported INSTALL\_INTERFACE\_LIBS of arrow\_static and test that it works -* [ARROW-9429](https://issues.apache.org/jira/browse/ARROW-9429) - [Python] ChunkedArray.to\_numpy -* [ARROW-9454](https://issues.apache.org/jira/browse/ARROW-9454) - [GLib] Add binding of some dictionary builders -* [ARROW-9465](https://issues.apache.org/jira/browse/ARROW-9465) - [Python] Improve ergonomics of compute functions -* [ARROW-9469](https://issues.apache.org/jira/browse/ARROW-9469) - [Python] Make more objects weakrefable -* [ARROW-9487](https://issues.apache.org/jira/browse/ARROW-9487) - [Developer] Cover the archery release utilities with unittests -* [ARROW-9488](https://issues.apache.org/jira/browse/ARROW-9488) - [Release] Use the new changelog generation when updating the website -* [ARROW-9507](https://issues.apache.org/jira/browse/ARROW-9507) - [Rust] [DataFusion] PhysicalExpr should implement Display trait -* [ARROW-9508](https://issues.apache.org/jira/browse/ARROW-9508) - [Release][APT][Yum] Enable verification for arm64 binaries -* [ARROW-9516](https://issues.apache.org/jira/browse/ARROW-9516) - [Rust][DataFusion] Refactor physical expressions to not care about their names nor indexes -* [ARROW-9517](https://issues.apache.org/jira/browse/ARROW-9517) - [C++][Python] Allow session\_token argument when initializing S3FileSystem -* [ARROW-9518](https://issues.apache.org/jira/browse/ARROW-9518) - [Python] Deprecate pyarrow serialization -* [ARROW-9521](https://issues.apache.org/jira/browse/ARROW-9521) - [Rust] CsvReadOptions should allow file extension to be specified -* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel -* [ARROW-9534](https://issues.apache.org/jira/browse/ARROW-9534) - [Rust] [DataFusion] Implement functions for creating literal expressions for all types -* [ARROW-9550](https://issues.apache.org/jira/browse/ARROW-9550) - [Rust] [DataFusion] Remove Rc\> from hash aggregate operator -* [ARROW-9553](https://issues.apache.org/jira/browse/ARROW-9553) - [Rust] Release script doesn't bump parquet crate's arrow dependency version -* [ARROW-9557](https://issues.apache.org/jira/browse/ARROW-9557) - [R] Iterating over parquet columns is slow in R -* [ARROW-9559](https://issues.apache.org/jira/browse/ARROW-9559) - [Rust] [DataFusion] Revert privatization of exprlist\_to\_fields -* [ARROW-9563](https://issues.apache.org/jira/browse/ARROW-9563) - [Dev][Release] Use archery's changelog generator when creating release notes for the website -* [ARROW-9568](https://issues.apache.org/jira/browse/ARROW-9568) - [CI] Use official msys action on GHA -* [ARROW-9576](https://issues.apache.org/jira/browse/ARROW-9576) - [Python][Doc] Fix error 
in code example for extension types -* [ARROW-9580](https://issues.apache.org/jira/browse/ARROW-9580) - [JS] Docs have superfluous () -* [ARROW-9581](https://issues.apache.org/jira/browse/ARROW-9581) - [Dev][Release] Bump next snapshot versions to 2.0.0 -* [ARROW-9582](https://issues.apache.org/jira/browse/ARROW-9582) - [Rust] Implement Array::memory\_size() -* [ARROW-9585](https://issues.apache.org/jira/browse/ARROW-9585) - [Rust] Remove duplicated to-do line in DataFusion readme -* [ARROW-9587](https://issues.apache.org/jira/browse/ARROW-9587) - [FlightRPC][Java] Clean up DoPut/FlightStream memory handling -* [ARROW-9593](https://issues.apache.org/jira/browse/ARROW-9593) - [Python] Add custom pickle reducers for DictionaryScalar -* [ARROW-9604](https://issues.apache.org/jira/browse/ARROW-9604) - [C++] Add benchmark for aggregate min/max compute kernels -* [ARROW-9605](https://issues.apache.org/jira/browse/ARROW-9605) - [C++] Optimize performance for aggregate min/max compute kernels -* [ARROW-9607](https://issues.apache.org/jira/browse/ARROW-9607) - [C++][Gandiva] Add bitwise\_and(), bitwise\_or() and bitwise\_not() functions for integers -* [ARROW-9608](https://issues.apache.org/jira/browse/ARROW-9608) - [Rust] Remove arrow flight from parquet's feature gating -* [ARROW-9615](https://issues.apache.org/jira/browse/ARROW-9615) - [Rust] Add kernel to compute length of string array -* [ARROW-9617](https://issues.apache.org/jira/browse/ARROW-9617) - [Rust] [DataFusion] Add length of string array -* [ARROW-9618](https://issues.apache.org/jira/browse/ARROW-9618) - [Rust] [DataFusion] Make it easier to write optimizers -* [ARROW-9619](https://issues.apache.org/jira/browse/ARROW-9619) - [Rust] [DataFusion] Add predicate push-down -* [ARROW-9632](https://issues.apache.org/jira/browse/ARROW-9632) - [Rust] Add a "new" method for ExecutionContextSchemaProvider -* [ARROW-9638](https://issues.apache.org/jira/browse/ARROW-9638) - [C++][Compute] Implement mode(most frequent number) kernel -* [ARROW-9639](https://issues.apache.org/jira/browse/ARROW-9639) - [Ruby] Add dependency version check -* [ARROW-9640](https://issues.apache.org/jira/browse/ARROW-9640) - [C++][Gandiva] Implement round() for integers and long integers -* [ARROW-9641](https://issues.apache.org/jira/browse/ARROW-9641) - [C++][Gandiva] Implement round() for floating point and double floating point numbers -* [ARROW-9645](https://issues.apache.org/jira/browse/ARROW-9645) - [Python] Deprecate the legacy pyarrow.filesystem interface -* [ARROW-9646](https://issues.apache.org/jira/browse/ARROW-9646) - [C++][Dataset] Add support for writing parquet datasets -* [ARROW-9650](https://issues.apache.org/jira/browse/ARROW-9650) - [Packaging][APT] Drop support for Ubuntu 19.10 -* [ARROW-9654](https://issues.apache.org/jira/browse/ARROW-9654) - [Rust][DataFusion] Add an EXPLAIN command to the datafusion CLI -* [ARROW-9656](https://issues.apache.org/jira/browse/ARROW-9656) - [Rust][DataFusion] Slightly confusing error message when unsupported type is provided to CREATE EXTERNAL TABLE -* [ARROW-9658](https://issues.apache.org/jira/browse/ARROW-9658) - [Python][Dataset] Bindings for dataset writing -* [ARROW-9665](https://issues.apache.org/jira/browse/ARROW-9665) - [R] head/tail/take for Datasets -* [ARROW-9667](https://issues.apache.org/jira/browse/ARROW-9667) - [CI][Crossbow] Segfault in 2 nightly R builds -* [ARROW-9671](https://issues.apache.org/jira/browse/ARROW-9671) - [C++] BasicDecimal128 constructor interprets uint64\_t integers with highest bit 
set as negative -* [ARROW-9673](https://issues.apache.org/jira/browse/ARROW-9673) - [Rust] Add a param "dialect" for DFParser::parse\_sql -* [ARROW-9678](https://issues.apache.org/jira/browse/ARROW-9678) - [Rust] [DataFusion] Improve projection push down to remove unused columns -* [ARROW-9679](https://issues.apache.org/jira/browse/ARROW-9679) - [Rust] [DataFusion] HashAggregate walks map many times building final batch -* [ARROW-9681](https://issues.apache.org/jira/browse/ARROW-9681) - [Java] Failed Arrow Memory - Core on big-endian platform -* [ARROW-9683](https://issues.apache.org/jira/browse/ARROW-9683) - [Rust][DataFusion] Implement Debug for ExecutionPlan trait -* [ARROW-9691](https://issues.apache.org/jira/browse/ARROW-9691) - [Rust] [DataFusion] Make sql\_statement\_to\_plan public -* [ARROW-9695](https://issues.apache.org/jira/browse/ARROW-9695) - [Rust][DataFusion] Improve documentation on LogicalPlan variants -* [ARROW-9699](https://issues.apache.org/jira/browse/ARROW-9699) - [C++][Compute] Improve mode kernel performance for small integer types -* [ARROW-9701](https://issues.apache.org/jira/browse/ARROW-9701) - [Java][CI] Add a test job on s390x -* [ARROW-9702](https://issues.apache.org/jira/browse/ARROW-9702) - [C++] Move bpacking simd to runtime path -* [ARROW-9703](https://issues.apache.org/jira/browse/ARROW-9703) - [Developer][Archery] Restartable cherry-picking process for creating maintenance branches -* [ARROW-9706](https://issues.apache.org/jira/browse/ARROW-9706) - [Java] Tests in TestLargeListVector fails on big endian platform -* [ARROW-9710](https://issues.apache.org/jira/browse/ARROW-9710) - [C++] Generalize Decimal ToString in preparation for Decimal256 -* [ARROW-9711](https://issues.apache.org/jira/browse/ARROW-9711) - [Rust] Add benchmark based on TPC-H -* [ARROW-9713](https://issues.apache.org/jira/browse/ARROW-9713) - [Rust][DataFusion] Remove explicit panics -* [ARROW-9715](https://issues.apache.org/jira/browse/ARROW-9715) - [R] changelog/doc updates for 1.0.1 -* [ARROW-9718](https://issues.apache.org/jira/browse/ARROW-9718) - [Python] Make pyarrow.parquet work with the new filesystem interfaces -* [ARROW-9721](https://issues.apache.org/jira/browse/ARROW-9721) - [Packaging][Python] Update wheel dependency files -* [ARROW-9722](https://issues.apache.org/jira/browse/ARROW-9722) - [Rust]: Shorten key lifetime for reverse lookup for dictionary arrays -* [ARROW-9723](https://issues.apache.org/jira/browse/ARROW-9723) - [C++] Expected behaviour of "mode" kernel with NaNs ? 
-* [ARROW-9725](https://issues.apache.org/jira/browse/ARROW-9725) - [Rust] [DataFusion] LimitExec and SortExec should use MergeExec -* [ARROW-9737](https://issues.apache.org/jira/browse/ARROW-9737) - [C++][Gandiva] Add bitwise\_xor() for integers -* [ARROW-9739](https://issues.apache.org/jira/browse/ARROW-9739) - [CI][Ruby] Don't install gem documents -* [ARROW-9742](https://issues.apache.org/jira/browse/ARROW-9742) - [Rust] Create one standard DataFrame API -* [ARROW-9751](https://issues.apache.org/jira/browse/ARROW-9751) - [Rust] [DataFusion] Extend UDFs to accept more than one type per argument -* [ARROW-9752](https://issues.apache.org/jira/browse/ARROW-9752) - [Rust] [DataFusion] Add support for Aggregate UDFs -* [ARROW-9753](https://issues.apache.org/jira/browse/ARROW-9753) - [Rust] [DataFusion] Remove the use of Mutex in ExecutionPlan trait -* [ARROW-9754](https://issues.apache.org/jira/browse/ARROW-9754) - [Rust] [DataFusion] Implement async in DataFusion traits -* [ARROW-9757](https://issues.apache.org/jira/browse/ARROW-9757) - [Rust] [DataFusion] Use "pub use" to expose a clean public API -* [ARROW-9758](https://issues.apache.org/jira/browse/ARROW-9758) - [Rust] [DataFusion] Implement extension API for DataFusion -* [ARROW-9759](https://issues.apache.org/jira/browse/ARROW-9759) - [Rust] [DataFusion] Implement DataFrame::sort -* [ARROW-9760](https://issues.apache.org/jira/browse/ARROW-9760) - [Rust] [DataFusion] Implement DataFrame::explain -* [ARROW-9761](https://issues.apache.org/jira/browse/ARROW-9761) - [C++] Add experimental pull-based iterator structures to C interface implementation -* [ARROW-9762](https://issues.apache.org/jira/browse/ARROW-9762) - [Rust] [DataFusion] ExecutionContext::sql should return DataFrame -* [ARROW-9769](https://issues.apache.org/jira/browse/ARROW-9769) - [Python] Remove skip for in-memory fsspec in test\_move\_file -* [ARROW-9775](https://issues.apache.org/jira/browse/ARROW-9775) - [C++] Automatic S3 region selection -* [ARROW-9781](https://issues.apache.org/jira/browse/ARROW-9781) - [C++] Fix uninitialized value warnings -* [ARROW-9782](https://issues.apache.org/jira/browse/ARROW-9782) - [C++][Dataset] Ability to write ".feather" files with IpcFileFormat -* [ARROW-9784](https://issues.apache.org/jira/browse/ARROW-9784) - [Rust] [DataFusion] Improve instructions for running tpch benchmark -* [ARROW-9786](https://issues.apache.org/jira/browse/ARROW-9786) - [R] Unvendor cpp11 before release -* [ARROW-9788](https://issues.apache.org/jira/browse/ARROW-9788) - Handle naming inconsistencies between SQL, DataFrame API and struct names -* [ARROW-9792](https://issues.apache.org/jira/browse/ARROW-9792) - [Rust] [DataFusion] Logical aggregate functions should not return Result -* [ARROW-9794](https://issues.apache.org/jira/browse/ARROW-9794) - [C++] Add functionality to cpu\_info to discriminate between Intel vs AMD x86 -* [ARROW-9795](https://issues.apache.org/jira/browse/ARROW-9795) - [C++][Gandiva] Implement castTIMESTAMP(int64) in Gandiva -* [ARROW-9806](https://issues.apache.org/jira/browse/ARROW-9806) - [R] More compute kernel bindings -* [ARROW-9807](https://issues.apache.org/jira/browse/ARROW-9807) - [R] News update/version bump post-1.0.1 -* [ARROW-9808](https://issues.apache.org/jira/browse/ARROW-9808) - [Python] parquet.read\_table docstring wrong use\_legacy\_dataset explanation -* [ARROW-9811](https://issues.apache.org/jira/browse/ARROW-9811) - [C++] Unchecked floating point division by 0 should succeed -* 
[ARROW-9813](https://issues.apache.org/jira/browse/ARROW-9813) - [C++] Disable semantic interposition -* [ARROW-9819](https://issues.apache.org/jira/browse/ARROW-9819) - [C++] Bump mimalloc to 1.6.4 -* [ARROW-9821](https://issues.apache.org/jira/browse/ARROW-9821) - [Rust][DataFusion] User Defined PlanNode / Operator API -* [ARROW-9821](https://issues.apache.org/jira/browse/ARROW-9821) - [Rust][DataFusion] User Defined PlanNode / Operator API -* [ARROW-9823](https://issues.apache.org/jira/browse/ARROW-9823) - [CI][C++][MinGW] Enable S3 -* [ARROW-9832](https://issues.apache.org/jira/browse/ARROW-9832) - [Rust] [DataFusion] Refactor PhysicalPlan to remove Partition -* [ARROW-9833](https://issues.apache.org/jira/browse/ARROW-9833) - [Rust] [DataFusion] Refactor TableProvider.scan to return ExecutionPlan -* [ARROW-9834](https://issues.apache.org/jira/browse/ARROW-9834) - [Rust] [DataFusion] Remove Partition trait -* [ARROW-9835](https://issues.apache.org/jira/browse/ARROW-9835) - [Rust] [DataFusion] Remove FunctionMeta -* [ARROW-9836](https://issues.apache.org/jira/browse/ARROW-9836) - [Rust] [DataFusion] Improve API for usage of UDFs -* [ARROW-9837](https://issues.apache.org/jira/browse/ARROW-9837) - [Rust] Add provider for variable -* [ARROW-9838](https://issues.apache.org/jira/browse/ARROW-9838) - [Rust] [DataFusion] DefaultPhysicalPlanner should insert explicit MergeExec nodes -* [ARROW-9839](https://issues.apache.org/jira/browse/ARROW-9839) - [Rust] [DataFusion] Add ability to downcast ExecutionPlan to specific operator -* [ARROW-9841](https://issues.apache.org/jira/browse/ARROW-9841) - [Rust] Update checked-in flatbuffer files -* [ARROW-9844](https://issues.apache.org/jira/browse/ARROW-9844) - [Go][CI] Add Travis CI job for Go on s390x -* [ARROW-9845](https://issues.apache.org/jira/browse/ARROW-9845) - [Rust] [Parquet] serde\_json is only used in tests but isn't in dev-dependencies -* [ARROW-9848](https://issues.apache.org/jira/browse/ARROW-9848) - [Rust] Implement changes to ensure flatbuffer alignment -* [ARROW-9849](https://issues.apache.org/jira/browse/ARROW-9849) - [Rust] [DataFusion] Make UDFs not need a Field -* [ARROW-9850](https://issues.apache.org/jira/browse/ARROW-9850) - [Go] Defer should not be used in the loop -* [ARROW-9853](https://issues.apache.org/jira/browse/ARROW-9853) - [RUST] Implement "take" kernel for dictionary arrays -* [ARROW-9854](https://issues.apache.org/jira/browse/ARROW-9854) - [R] Support reading/writing data to/from S3 -* [ARROW-9858](https://issues.apache.org/jira/browse/ARROW-9858) - [C++][Python][Docs] Expand user guide for FileSystem -* [ARROW-9863](https://issues.apache.org/jira/browse/ARROW-9863) - [C++] [PARQUET] Optimize meta data recovery of ApplicationVersion -* [ARROW-9867](https://issues.apache.org/jira/browse/ARROW-9867) - [C++][Dataset] FileSystemDataset should expose its filesystem -* [ARROW-9868](https://issues.apache.org/jira/browse/ARROW-9868) - [C++] Provide utility for copying files between filesystems -* [ARROW-9869](https://issues.apache.org/jira/browse/ARROW-9869) - [R] Implement full S3FileSystem/S3Options constructor -* [ARROW-9870](https://issues.apache.org/jira/browse/ARROW-9870) - [R] Friendly interface for filesystems (S3) -* [ARROW-9871](https://issues.apache.org/jira/browse/ARROW-9871) - [C++] Add uppercase support to ARROW\_USER\_SIMD\_LEVEL. 
-* [ARROW-9873](https://issues.apache.org/jira/browse/ARROW-9873) - [C++][Compute] Improve mode kernel for intergers within limited value range -* [ARROW-9875](https://issues.apache.org/jira/browse/ARROW-9875) - [Python] Let FileSystem.get\_file\_info accept a single path -* [ARROW-9884](https://issues.apache.org/jira/browse/ARROW-9884) - [R] Bindings for writing datasets to Parquet -* [ARROW-9885](https://issues.apache.org/jira/browse/ARROW-9885) - [Rust] [DataFusion] Simplify code of type coercion for binary types -* [ARROW-9886](https://issues.apache.org/jira/browse/ARROW-9886) - [Rust] [DataFusion] Simplify code to test cast -* [ARROW-9887](https://issues.apache.org/jira/browse/ARROW-9887) - [Rust] [DataFusion] Add support for complex return types of built-in functions -* [ARROW-9890](https://issues.apache.org/jira/browse/ARROW-9890) - [R] Add zstandard compression codec in macOS build -* [ARROW-9891](https://issues.apache.org/jira/browse/ARROW-9891) - [Rust] [DataFusion] Make math functions support f32 -* [ARROW-9892](https://issues.apache.org/jira/browse/ARROW-9892) - [Rust] [DataFusion] Add support for concat -* [ARROW-9893](https://issues.apache.org/jira/browse/ARROW-9893) - [Python] Bindings for writing datasets to Parquet -* [ARROW-9895](https://issues.apache.org/jira/browse/ARROW-9895) - [RUST] Improve sort kernels -* [ARROW-9899](https://issues.apache.org/jira/browse/ARROW-9899) - [Rust] [DataFusion] Switch from Box --\> SchemaRef (Arc) to be consistent with the rest of Arrow -* [ARROW-9900](https://issues.apache.org/jira/browse/ARROW-9900) - [Rust][DataFusion] Use Arc<\> instead of Box<\> in LogicalPlan -* [ARROW-9901](https://issues.apache.org/jira/browse/ARROW-9901) - [C++] Add hand-crafted Parquet to Arrow reconstruction test for nested reading -* [ARROW-9902](https://issues.apache.org/jira/browse/ARROW-9902) - [Rust] [DataFusion] Add support for array() -* [ARROW-9904](https://issues.apache.org/jira/browse/ARROW-9904) - [C++] Unroll the loop manually for CountSetBits -* [ARROW-9908](https://issues.apache.org/jira/browse/ARROW-9908) - [Rust] Support temporal data types in JSON reader -* [ARROW-9910](https://issues.apache.org/jira/browse/ARROW-9910) - [Rust] [DataFusion] Type coercion of Variadic is wrong -* [ARROW-9914](https://issues.apache.org/jira/browse/ARROW-9914) - [Rust][DataFusion] Document the SQL -\> Arrow type mapping -* [ARROW-9916](https://issues.apache.org/jira/browse/ARROW-9916) - [RUST] Avoid cloning ArrayData in several places -* [ARROW-9917](https://issues.apache.org/jira/browse/ARROW-9917) - [Python][Compute] Add bindings for mode kernel -* [ARROW-9919](https://issues.apache.org/jira/browse/ARROW-9919) - [Rust] [DataFusion] Math functions -* [ARROW-9921](https://issues.apache.org/jira/browse/ARROW-9921) - [Rust] Add \`from(Vec\>)\` to [Large]StringArray -* [ARROW-9925](https://issues.apache.org/jira/browse/ARROW-9925) - [GLib] Add low level value readers for GArrowListArray family -* [ARROW-9926](https://issues.apache.org/jira/browse/ARROW-9926) - [GLib] Use placement new for GArrowRecordBatchFileReader -* [ARROW-9928](https://issues.apache.org/jira/browse/ARROW-9928) - [C++] Speed up integer parsing slightly -* [ARROW-9929](https://issues.apache.org/jira/browse/ARROW-9929) - [Developer] Autotune cmake-format -* [ARROW-9933](https://issues.apache.org/jira/browse/ARROW-9933) - [Developer] Add drone as a CI provider for crossbow -* [ARROW-9934](https://issues.apache.org/jira/browse/ARROW-9934) - [Rust] Shape and stride check in tensor -* 
[ARROW-9941](https://issues.apache.org/jira/browse/ARROW-9941) - [Python] Better string representation for extension types -* [ARROW-9944](https://issues.apache.org/jira/browse/ARROW-9944) - [Rust] Implement TO\_TIMESTAMP function -* [ARROW-9949](https://issues.apache.org/jira/browse/ARROW-9949) - [C++] Generalize Decimal128::FromString for reuse in Decimal256 -* [ARROW-9950](https://issues.apache.org/jira/browse/ARROW-9950) - [Rust] [DataFusion] Allow UDF usage without registry -* [ARROW-9952](https://issues.apache.org/jira/browse/ARROW-9952) - [Python] Use pyarrow.dataset writing for pq.write\_to\_dataset -* [ARROW-9954](https://issues.apache.org/jira/browse/ARROW-9954) - [Rust] [DataFusion] Simplify code of aggregate planning -* [ARROW-9956](https://issues.apache.org/jira/browse/ARROW-9956) - [C++][Gandiva] Implement Binary string function in Gandiva -* [ARROW-9957](https://issues.apache.org/jira/browse/ARROW-9957) - [Rust] Remove unmaintained tempdir dependency -* [ARROW-9961](https://issues.apache.org/jira/browse/ARROW-9961) - [Rust][DataFusion] to\_timestamp function parses timestamp without timezone offset as UTC rather than local -* [ARROW-9964](https://issues.apache.org/jira/browse/ARROW-9964) - [C++] CSV date support -* [ARROW-9965](https://issues.apache.org/jira/browse/ARROW-9965) - [Java] Buffer capacity calculations are slow for fixed-width vectors -* [ARROW-9966](https://issues.apache.org/jira/browse/ARROW-9966) - [Rust] Speedup aggregate kernels -* [ARROW-9967](https://issues.apache.org/jira/browse/ARROW-9967) - [Python] Add compute module docs -* [ARROW-9971](https://issues.apache.org/jira/browse/ARROW-9971) - [Rust] Speedup take -* [ARROW-9977](https://issues.apache.org/jira/browse/ARROW-9977) - [Rust] Add min/max for [Large]String -* [ARROW-9979](https://issues.apache.org/jira/browse/ARROW-9979) - [Rust] Fix arrow crate clippy lints -* [ARROW-9980](https://issues.apache.org/jira/browse/ARROW-9980) - [Rust] Fix parquet crate clippy lints -* [ARROW-9981](https://issues.apache.org/jira/browse/ARROW-9981) - [Rust] Allow configuring flight IPC with IpcWriteOptions -* [ARROW-9983](https://issues.apache.org/jira/browse/ARROW-9983) - [C++][Dataset][Python] Use larger default batch size than 32K for Datasets API -* [ARROW-9984](https://issues.apache.org/jira/browse/ARROW-9984) - [Rust] [DataFusion] DRY of function to string -* [ARROW-9986](https://issues.apache.org/jira/browse/ARROW-9986) - [Rust][DataFusion] TO\_TIMESTAMP function erroneously requires fractional seconds when no timezone is present -* [ARROW-9987](https://issues.apache.org/jira/browse/ARROW-9987) - [Rust] [DataFusion] Improve docs of \`Expr\`. 
-* [ARROW-9988](https://issues.apache.org/jira/browse/ARROW-9988) - [Rust] [DataFusion] Added std::ops to logical expressions -* [ARROW-9992](https://issues.apache.org/jira/browse/ARROW-9992) - [C++][Python] Refactor python to arrow conversions based on a reusable conversion API -* [ARROW-9998](https://issues.apache.org/jira/browse/ARROW-9998) - [Python] Support pickling DictionaryScalar -* [ARROW-9999](https://issues.apache.org/jira/browse/ARROW-9999) - [Python] Support constructing dictionary array directly through pa.array() -* [ARROW-10000](https://issues.apache.org/jira/browse/ARROW-10000) - [C++][Python] Support constructing StructArray from list of key-value pairs -* [ARROW-10001](https://issues.apache.org/jira/browse/ARROW-10001) - [Rust] [DataFusion] Add developer guide to README -* [ARROW-10010](https://issues.apache.org/jira/browse/ARROW-10010) - [Rust] Speedup arithmetic -* [ARROW-10015](https://issues.apache.org/jira/browse/ARROW-10015) - [Rust] Implement SIMD for aggregate kernel sum -* [ARROW-10016](https://issues.apache.org/jira/browse/ARROW-10016) - [Rust] [DataFusion] Implement IsNull and IsNotNull -* [ARROW-10018](https://issues.apache.org/jira/browse/ARROW-10018) - [CI] Disable Sphinx and API documentation build since it takes 6 hours on master -* [ARROW-10019](https://issues.apache.org/jira/browse/ARROW-10019) - [Rust] Add substring kernel -* [ARROW-10023](https://issues.apache.org/jira/browse/ARROW-10023) - [Gandiva][C++] Implementing Split part function in gandiva -* [ARROW-10024](https://issues.apache.org/jira/browse/ARROW-10024) - [C++][Parquet] Create nested reading benchmarks -* [ARROW-10028](https://issues.apache.org/jira/browse/ARROW-10028) - [Rust] Simplify macro def\_numeric\_from\_vec -* [ARROW-10030](https://issues.apache.org/jira/browse/ARROW-10030) - [Rust] Support fromIter and toIter -* [ARROW-10035](https://issues.apache.org/jira/browse/ARROW-10035) - [C++] Bump versions of vendored code -* [ARROW-10037](https://issues.apache.org/jira/browse/ARROW-10037) - [C++] Workaround to force find AWS SDK to look for shared libraries -* [ARROW-10040](https://issues.apache.org/jira/browse/ARROW-10040) - [Rust] Create a way to slice unalligned offset buffers -* [ARROW-10043](https://issues.apache.org/jira/browse/ARROW-10043) - [Rust] [DataFusion] Introduce support for DISTINCT by partially implementing COUNT(DISTINCT) -* [ARROW-10044](https://issues.apache.org/jira/browse/ARROW-10044) - [Rust] Improve README -* [ARROW-10046](https://issues.apache.org/jira/browse/ARROW-10046) - [Rust] [DataFusion] Made \`\*Iterator\` implement Iterator -* [ARROW-10050](https://issues.apache.org/jira/browse/ARROW-10050) - [C++][Gandiva] Implement concat() in Gandiva for up to 10 arguments -* [ARROW-10051](https://issues.apache.org/jira/browse/ARROW-10051) - [C++][Compute] Make aggregate kernel merge state mutable -* [ARROW-10054](https://issues.apache.org/jira/browse/ARROW-10054) - [Python] Slice methods should return empty arrays instead of crashing -* [ARROW-10055](https://issues.apache.org/jira/browse/ARROW-10055) - [Rust] Implement DoubleEndedIterator for NullableIter -* [ARROW-10057](https://issues.apache.org/jira/browse/ARROW-10057) - [C++] Add Parquet-Arrow roundtrip tests for nested data -* [ARROW-10058](https://issues.apache.org/jira/browse/ARROW-10058) - [C++] Investigate performance of LevelsToBitmap without BMI2 -* [ARROW-10059](https://issues.apache.org/jira/browse/ARROW-10059) - [R][Doc] Give more advice on how to set up C++ build -* 
[ARROW-10063](https://issues.apache.org/jira/browse/ARROW-10063) - [Archery][CI] Fetch main branch in archery build only when it is a pull request -* [ARROW-10064](https://issues.apache.org/jira/browse/ARROW-10064) - [C++] Resolve compile warnings on Apple Clang 12 -* [ARROW-10065](https://issues.apache.org/jira/browse/ARROW-10065) - [Rust] DRY downcasted Arrays -* [ARROW-10066](https://issues.apache.org/jira/browse/ARROW-10066) - [C++] Make sure that default AWS region is respected -* [ARROW-10068](https://issues.apache.org/jira/browse/ARROW-10068) - [C++] Add bundled external project for aws-sdk-cpp -* [ARROW-10069](https://issues.apache.org/jira/browse/ARROW-10069) - [Java] Support running Java benchmarks from command line -* [ARROW-10070](https://issues.apache.org/jira/browse/ARROW-10070) - [C++][Compute] Implement stdev aggregate kernel -* [ARROW-10071](https://issues.apache.org/jira/browse/ARROW-10071) - [R] segfault with ArrowObject from previous session, or saved -* [ARROW-10074](https://issues.apache.org/jira/browse/ARROW-10074) - [C++] Don't use string\_view.to\_string() -* [ARROW-10075](https://issues.apache.org/jira/browse/ARROW-10075) - [C++] Don't use nonstd::nullopt this breaks out vendoring abstraction. -* [ARROW-10076](https://issues.apache.org/jira/browse/ARROW-10076) - [C++] Use TemporaryDir for all tests that don't already use it. -* [ARROW-10077](https://issues.apache.org/jira/browse/ARROW-10077) - [C++] Potential overflow in bit\_stream\_utils.h multiplication. -* [ARROW-10083](https://issues.apache.org/jira/browse/ARROW-10083) - [C++] Improve Parquet fuzz seed corpus -* [ARROW-10084](https://issues.apache.org/jira/browse/ARROW-10084) - [Rust] [DataFusion] Add length of large string array -* [ARROW-10086](https://issues.apache.org/jira/browse/ARROW-10086) - [Rust] Migrate min\_large\_string -\> min\_string kernels -* [ARROW-10090](https://issues.apache.org/jira/browse/ARROW-10090) - [C++][Compute] Improve mode kernel -* [ARROW-10092](https://issues.apache.org/jira/browse/ARROW-10092) - [Dev][Go] Add grpc generated go files to rat exclusion list -* [ARROW-10093](https://issues.apache.org/jira/browse/ARROW-10093) - [R] Add ability to opt-out of int64 -\> int demotion -* [ARROW-10095](https://issues.apache.org/jira/browse/ARROW-10095) - [Rust] [Parquet] Update for IPC changes -* [ARROW-10096](https://issues.apache.org/jira/browse/ARROW-10096) - [Rust] [DataFusion] Remove unused code -* [ARROW-10099](https://issues.apache.org/jira/browse/ARROW-10099) - [C++][Dataset] Also allow integer partition fields to be dictionary encoded -* [ARROW-10100](https://issues.apache.org/jira/browse/ARROW-10100) - [C++][Dataset] Ability to read/subset a ParquetFileFragment with given set of row group ids -* [ARROW-10102](https://issues.apache.org/jira/browse/ARROW-10102) - [C++] Generalize BasicDecimal128::operator\*= for reuse in Decimal256 -* [ARROW-10103](https://issues.apache.org/jira/browse/ARROW-10103) - [Rust] Add a Contains kernel -* [ARROW-10105](https://issues.apache.org/jira/browse/ARROW-10105) - [FlightRPC] Add client option to disable certificate validation with TLS -* [ARROW-10120](https://issues.apache.org/jira/browse/ARROW-10120) - [C++][Parquet] Create reading benchmarks for 2-level nested data -* [ARROW-10127](https://issues.apache.org/jira/browse/ARROW-10127) - [Format] Update specification to support 256-bit Decimal types -* [ARROW-10129](https://issues.apache.org/jira/browse/ARROW-10129) - [Rust] Cargo build is rebuilding dependencies on arrow changes -* 
[ARROW-10134](https://issues.apache.org/jira/browse/ARROW-10134) - [C++][Dataset] Add ParquetFileFragment::num\_row\_groups property -* [ARROW-10139](https://issues.apache.org/jira/browse/ARROW-10139) - [C++] Add support for building arrow\_testing without building tests -* [ARROW-10148](https://issues.apache.org/jira/browse/ARROW-10148) - [Rust] Add documentation to lib.rs -* [ARROW-10151](https://issues.apache.org/jira/browse/ARROW-10151) - [Python] Add support MapArray to\_pandas conversion -* [ARROW-10155](https://issues.apache.org/jira/browse/ARROW-10155) - [Rust] [DataFusion] Add documentation to lib.rs -* [ARROW-10156](https://issues.apache.org/jira/browse/ARROW-10156) - [Rust] Auto-label PRs -* [ARROW-10157](https://issues.apache.org/jira/browse/ARROW-10157) - [Rust] Add more documentation about take -* [ARROW-10160](https://issues.apache.org/jira/browse/ARROW-10160) - [Rust] Improve documentation of DictionaryType -* [ARROW-10161](https://issues.apache.org/jira/browse/ARROW-10161) - [Rust] [DataFusion] Simplify expression tests -* [ARROW-10162](https://issues.apache.org/jira/browse/ARROW-10162) - [Rust] Support display of DictionaryArrays in pretty printing -* [ARROW-10164](https://issues.apache.org/jira/browse/ARROW-10164) - [Rust] Add support for DictionaryArray types to cast kernels -* [ARROW-10167](https://issues.apache.org/jira/browse/ARROW-10167) - [Rust] Support display of DictionaryArrays in sql.rs -* [ARROW-10168](https://issues.apache.org/jira/browse/ARROW-10168) - [Rust] [Parquet] Extend arrow schema conversion to projected fields -* [ARROW-10171](https://issues.apache.org/jira/browse/ARROW-10171) - [Rust] [DataFusion] Add \`ExecutionContext::from\` -* [ARROW-10190](https://issues.apache.org/jira/browse/ARROW-10190) - [Website] Add Jorge to list of committers -* [ARROW-10191](https://issues.apache.org/jira/browse/ARROW-10191) - [Rust] [Parquet] Add roundtrip tests for single column batches -* [ARROW-10196](https://issues.apache.org/jira/browse/ARROW-10196) - [C++] Add Future::DeferNotOk() -* [ARROW-10199](https://issues.apache.org/jira/browse/ARROW-10199) - [Rust][Parquet] Release Parquet at crates.io to remove debug prints -* [ARROW-10201](https://issues.apache.org/jira/browse/ARROW-10201) - [C++][CI] Disable S3 in arm64 job on Travis CI -* [ARROW-10202](https://issues.apache.org/jira/browse/ARROW-10202) - [CI][Windows] Use sf.net mirror for MSYS2 -* [ARROW-10205](https://issues.apache.org/jira/browse/ARROW-10205) - [Java][FlightRPC] Add client option to disable server verification -* [ARROW-10206](https://issues.apache.org/jira/browse/ARROW-10206) - [Python][C++][FlightRPC] Add client option to disable server validation -* [ARROW-10215](https://issues.apache.org/jira/browse/ARROW-10215) - [Rust] [DataFusion] Rename "Source" typedef -* [ARROW-10217](https://issues.apache.org/jira/browse/ARROW-10217) - [CI] Run fewer GitHub Actions jobs -* [ARROW-10225](https://issues.apache.org/jira/browse/ARROW-10225) - [Rust] [Parquet] Fix null bitmap comparisons in roundtrip tests -* [ARROW-10227](https://issues.apache.org/jira/browse/ARROW-10227) - [Ruby] Use a table size as the default for parquet chunk\_size -* [ARROW-10229](https://issues.apache.org/jira/browse/ARROW-10229) - [C++][Parquet] Remove left over ARROW\_LOG statement. 
-* [ARROW-10231](https://issues.apache.org/jira/browse/ARROW-10231) - [CI] Unable to download minio in arm32v7 docker image -* [ARROW-10233](https://issues.apache.org/jira/browse/ARROW-10233) - [Rust] Make array\_value\_to\_string available in all Arrow builds -* [ARROW-10235](https://issues.apache.org/jira/browse/ARROW-10235) - [Rust][DataFusion] Improve documentation for type coercion -* [ARROW-10240](https://issues.apache.org/jira/browse/ARROW-10240) - [Rust] [Datafusion] Optionally load tpch data into memory before running benchmark query -* [ARROW-10251](https://issues.apache.org/jira/browse/ARROW-10251) - [Rust] [DataFusion] MemTable::load() should load partitions in parallel -* [ARROW-10252](https://issues.apache.org/jira/browse/ARROW-10252) - [Python] Add option to skip inclusion of Arrow headers in Python installation -* [ARROW-10256](https://issues.apache.org/jira/browse/ARROW-10256) - [C++][Flight] Disable -Werror carefully -* [ARROW-10257](https://issues.apache.org/jira/browse/ARROW-10257) - [R] Prepare news/docs for 2.0 release -* [ARROW-10260](https://issues.apache.org/jira/browse/ARROW-10260) - [Python] Missing MapType to Pandas dtype -* [ARROW-10265](https://issues.apache.org/jira/browse/ARROW-10265) - [CI] Use smaler build when cache doesn't exit on Travis CI -* [ARROW-10266](https://issues.apache.org/jira/browse/ARROW-10266) - [CI][macOS] Ensure using Python 3.8 with Homebrew -* [ARROW-10267](https://issues.apache.org/jira/browse/ARROW-10267) - [Python] Skip flight test if disable\_server\_verification feature is not available -* [ARROW-10272](https://issues.apache.org/jira/browse/ARROW-10272) - [Packaging][Python] Pin newer multibuild version to avoid updating homebrew -* [ARROW-10273](https://issues.apache.org/jira/browse/ARROW-10273) - [CI][Homebrew] Fix "brew audit" usage -* [ARROW-10287](https://issues.apache.org/jira/browse/ARROW-10287) - [C++] Avoid std::random\_device whenever possible -* [PARQUET-1845](https://issues.apache.org/jira/browse/PARQUET-1845) - [C++] Int96 memory images in test cases assume only little-endian -* [PARQUET-1878](https://issues.apache.org/jira/browse/PARQUET-1878) - [C++] lz4 codec is not compatible with Hadoop Lz4Codec -* [PARQUET-1904](https://issues.apache.org/jira/browse/PARQUET-1904) - [C++] Export file\_offset in RowGroupMetaData - - - -# Apache Arrow 1.0.0 (2020-07-20) - -## Bug Fixes - -* [ARROW-1692](https://issues.apache.org/jira/browse/ARROW-1692) - [Python, Java] UnionArray round trip not working -* [ARROW-3329](https://issues.apache.org/jira/browse/ARROW-3329) - [Python] Error casting decimal(38, 4) to int64 -* [ARROW-3861](https://issues.apache.org/jira/browse/ARROW-3861) - [Python] ParquetDataset().read columns argument always returns partition column -* [ARROW-4018](https://issues.apache.org/jira/browse/ARROW-4018) - [C++] RLE decoder may not big-endian compatible -* [ARROW-4309](https://issues.apache.org/jira/browse/ARROW-4309) - [Documentation] Add a docker-compose entry which builds the documentation with CUDA enabled -* [ARROW-4600](https://issues.apache.org/jira/browse/ARROW-4600) - [Ruby] Arrow::DictionaryArray\#[] should returns the item in the indices array -* [ARROW-5158](https://issues.apache.org/jira/browse/ARROW-5158) - [Packaging][Wheel] Symlink libraries in wheels -* [ARROW-5310](https://issues.apache.org/jira/browse/ARROW-5310) - [Python] better error message on creating ParquetDataset from empty directory -* [ARROW-5359](https://issues.apache.org/jira/browse/ARROW-5359) - [Python] timestamp\_as\_object 
support for pa.Table.to\_pandas in pyarrow -* [ARROW-5572](https://issues.apache.org/jira/browse/ARROW-5572) - [Python] raise error message when passing invalid filter in parquet reading -* [ARROW-5666](https://issues.apache.org/jira/browse/ARROW-5666) - [Python] Underscores in partition (string) values are dropped when reading dataset -* [ARROW-5744](https://issues.apache.org/jira/browse/ARROW-5744) - [C++] Do not error in Table::CombineChunks for BinaryArray types that overflow 2GB limit -* [ARROW-5875](https://issues.apache.org/jira/browse/ARROW-5875) - [FlightRPC] Test RPC features in integration tests -* [ARROW-6235](https://issues.apache.org/jira/browse/ARROW-6235) - [R] Conversion from arrow::BinaryArray to R character vector not implemented -* [ARROW-6523](https://issues.apache.org/jira/browse/ARROW-6523) - [C++][Dataset] arrow\_dataset target does not depend on anything -* [ARROW-6848](https://issues.apache.org/jira/browse/ARROW-6848) - [C++] Specify -std=c++11 instead of -std=gnu++11 when building -* [ARROW-7018](https://issues.apache.org/jira/browse/ARROW-7018) - [R] Non-UTF-8 data in Arrow <--\> R conversion -* [ARROW-7028](https://issues.apache.org/jira/browse/ARROW-7028) - [R] Date roundtrip results in different R storage mode -* [ARROW-7084](https://issues.apache.org/jira/browse/ARROW-7084) - [C++] ArrayRangeEquals should check for full type equality? -* [ARROW-7173](https://issues.apache.org/jira/browse/ARROW-7173) - [Integration] Add test to verify Map field names can be arbitrary -* [ARROW-7208](https://issues.apache.org/jira/browse/ARROW-7208) - [Python] Passing directory to ParquetFile class gives confusing error message -* [ARROW-7273](https://issues.apache.org/jira/browse/ARROW-7273) - [Python] Non-nullable null field is allowed / crashes when writing to parquet -* [ARROW-7480](https://issues.apache.org/jira/browse/ARROW-7480) - [Rust] [DataFusion] Query fails/incorrect when aggregated + grouped columns don't match the selected columns -* [ARROW-7610](https://issues.apache.org/jira/browse/ARROW-7610) - [Java] Finish support for 64 bit int allocations -* [ARROW-7654](https://issues.apache.org/jira/browse/ARROW-7654) - [Python] Ability to set column\_types to a Schema in csv.ConvertOptions is undocumented -* [ARROW-7681](https://issues.apache.org/jira/browse/ARROW-7681) - [Rust] Explicitly seeking a BufReader will discard the internal buffer -* [ARROW-7702](https://issues.apache.org/jira/browse/ARROW-7702) - [C++][Dataset] Provide (optional) deterministic order of batches -* [ARROW-7782](https://issues.apache.org/jira/browse/ARROW-7782) - [Python] Losing index information when using write\_to\_dataset with partition\_cols -* [ARROW-7840](https://issues.apache.org/jira/browse/ARROW-7840) - [Java] [Integration] Java executables fail -* [ARROW-7925](https://issues.apache.org/jira/browse/ARROW-7925) - [C++][Documentation] Instructions about running IWYU and other tasks in cpp/development.rst have gone stale -* [ARROW-7939](https://issues.apache.org/jira/browse/ARROW-7939) - [Python] crashes when reading parquet file compressed with snappy -* [ARROW-7967](https://issues.apache.org/jira/browse/ARROW-7967) - [CI][Crossbow] Pin macOS version in autobrew job to match CRAN -* [ARROW-8050](https://issues.apache.org/jira/browse/ARROW-8050) - [Python][Packaging] Do not include generated Cython source files in wheel packages -* [ARROW-8078](https://issues.apache.org/jira/browse/ARROW-8078) - [Python] Missing links in the docs regarding field and schema DataTypes -* 
[ARROW-8115](https://issues.apache.org/jira/browse/ARROW-8115) - [Python] Conversion when mixing NaT and datetime objects not working -* [ARROW-8251](https://issues.apache.org/jira/browse/ARROW-8251) - [Python] pandas.ExtensionDtype does not survive round trip with write\_to\_dataset -* [ARROW-8344](https://issues.apache.org/jira/browse/ARROW-8344) - [C\#] StringArray.Builder.Clear() corrupts subsequently-built array contents -* [ARROW-8360](https://issues.apache.org/jira/browse/ARROW-8360) - [C++][Gandiva] Fixes date32 support for date/time functions -* [ARROW-8374](https://issues.apache.org/jira/browse/ARROW-8374) - [R] Table to vector of DictonaryType will error when Arrays don't have the same Dictionary per array -* [ARROW-8392](https://issues.apache.org/jira/browse/ARROW-8392) - [Java] Fix overflow related corner cases for vector value comparison -* [ARROW-8448](https://issues.apache.org/jira/browse/ARROW-8448) - [Package] Can't build apt packages with ubuntu-focal -* [ARROW-8455](https://issues.apache.org/jira/browse/ARROW-8455) - [Rust] [Parquet] Arrow column read on partially compatible files -* [ARROW-8455](https://issues.apache.org/jira/browse/ARROW-8455) - [Rust] [Parquet] Arrow column read on partially compatible files -* [ARROW-8471](https://issues.apache.org/jira/browse/ARROW-8471) - [C++][Integration] Regression to /u?int64/ as JSON::number -* [ARROW-8472](https://issues.apache.org/jira/browse/ARROW-8472) - [Go][Integration] Represent 64 bit integers as JSON::string -* [ARROW-8473](https://issues.apache.org/jira/browse/ARROW-8473) - [Rust] "Statistics support" in rust/parquet readme is incorrect -* [ARROW-8480](https://issues.apache.org/jira/browse/ARROW-8480) - [Rust] There is no check for allocation failure -* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC -* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" is failed by Apache.Arrow.AssemblyInfo.cs -* [ARROW-8508](https://issues.apache.org/jira/browse/ARROW-8508) - [Rust] ListBuilder of FixedSizeListBuilder creates wrong offsets -* [ARROW-8510](https://issues.apache.org/jira/browse/ARROW-8510) - [C++] arrow/dataset/file\_base.cc fails to compile with internal compiler error with "Visual Studio 15 2017 Win64" generator -* [ARROW-8511](https://issues.apache.org/jira/browse/ARROW-8511) - [Developer][Release] Windows release verification script does not halt if C++ compilation fails -* [ARROW-8514](https://issues.apache.org/jira/browse/ARROW-8514) - [Developer] Windows wheel verification script does not check Python 3.5 -* [ARROW-8529](https://issues.apache.org/jira/browse/ARROW-8529) - [C++] Fix usage of NextCounts() in GetBatchWithDict[Spaced] -* [ARROW-8535](https://issues.apache.org/jira/browse/ARROW-8535) - [Rust] Arrow crate does not specify arrow-flight version -* [ARROW-8536](https://issues.apache.org/jira/browse/ARROW-8536) - [Rust] Failed to locate format/Flight.proto in any parent directory -* [ARROW-8537](https://issues.apache.org/jira/browse/ARROW-8537) - [C++] Performance regression from ARROW-8523 -* [ARROW-8539](https://issues.apache.org/jira/browse/ARROW-8539) - [CI] "AMD64 MacOS 10.15 GLib & Ruby" fails -* [ARROW-8554](https://issues.apache.org/jira/browse/ARROW-8554) - [C++][Benchmark] Fix building error "cannot bind lvalue" -* [ARROW-8556](https://issues.apache.org/jira/browse/ARROW-8556) - [R] zstd symbol not found if there are multiple installations of zstd -* 
[ARROW-8566](https://issues.apache.org/jira/browse/ARROW-8566) - [R] error when writing POSIXct to spark -* [ARROW-8568](https://issues.apache.org/jira/browse/ARROW-8568) - [C++][Python] Crash on decimal cast in debug mode -* [ARROW-8577](https://issues.apache.org/jira/browse/ARROW-8577) - [Plasma] PlasmaClient::Connect() of CUDA enabled build is always failed on no CUDA device machine -* [ARROW-8583](https://issues.apache.org/jira/browse/ARROW-8583) - [C++][Doc] Undocumented parameter in Dataset namespace -* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds -* [ARROW-8585](https://issues.apache.org/jira/browse/ARROW-8585) - [Packaging][Python] Windows wheels fail to build because of link error -* [ARROW-8586](https://issues.apache.org/jira/browse/ARROW-8586) - [R] installation failure on CentOS 7 -* [ARROW-8587](https://issues.apache.org/jira/browse/ARROW-8587) - [C++] Compilation error when linking arrow-flight-perf-server -* [ARROW-8592](https://issues.apache.org/jira/browse/ARROW-8592) - [C++] Docs still list LLVM 7 as compiler used -* [ARROW-8593](https://issues.apache.org/jira/browse/ARROW-8593) - [C++] Parquet file\_serialize\_test.cc fails to build with musl libc -* [ARROW-8598](https://issues.apache.org/jira/browse/ARROW-8598) - [Rust] simd\_compare\_op creates buffer of incorrect length when item count is not a multiple of T::lanes() -* [ARROW-8602](https://issues.apache.org/jira/browse/ARROW-8602) - [CMake] Fix ws2\_32 link issue when cross-compiling on Linux -* [ARROW-8603](https://issues.apache.org/jira/browse/ARROW-8603) - [Documentation] Fix Sphinx doxygen comment -* [ARROW-8604](https://issues.apache.org/jira/browse/ARROW-8604) - [R][CI] Update CI to use R 4.0 -* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues -* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer -* [ARROW-8610](https://issues.apache.org/jira/browse/ARROW-8610) - [Rust] DivideByZero when running arrow crate when simd feature is disabled -* [ARROW-8613](https://issues.apache.org/jira/browse/ARROW-8613) - [C++][Dataset] Raise error for unparsable partition value -* [ARROW-8615](https://issues.apache.org/jira/browse/ARROW-8615) - [R] Error better and insist on RandomAccessFile in read\_feather -* [ARROW-8617](https://issues.apache.org/jira/browse/ARROW-8617) - [Rust] simd\_load\_set\_invalid does not exist on aarch64 -* [ARROW-8632](https://issues.apache.org/jira/browse/ARROW-8632) - [C++] Fix conversion error warning in array\_union\_test.cc -* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection -* [ARROW-8643](https://issues.apache.org/jira/browse/ARROW-8643) - [Python] Tests with pandas master failing due to freq assertion -* [ARROW-8644](https://issues.apache.org/jira/browse/ARROW-8644) - [Python] Dask integration tests failing due to change in not including partition columns -* [ARROW-8646](https://issues.apache.org/jira/browse/ARROW-8646) - [Java] Allow UnionListWriter to write null values -* [ARROW-8649](https://issues.apache.org/jira/browse/ARROW-8649) - [Java] [Website] Java documentation on website is hidden -* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0' -* 
[ARROW-8663](https://issues.apache.org/jira/browse/ARROW-8663) - [Documentation] Small correction to building.rst -* [ARROW-8680](https://issues.apache.org/jira/browse/ARROW-8680) - [Rust] ComplexObjectArrayReader incorrect null value shuffling -* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel -* [ARROW-8689](https://issues.apache.org/jira/browse/ARROW-8689) - [C++] S3 benchmarks fail linking -* [ARROW-8693](https://issues.apache.org/jira/browse/ARROW-8693) - [Python] Dataset.get\_fragments is missing an implicit cast when filtering -* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame -* [ARROW-8701](https://issues.apache.org/jira/browse/ARROW-8701) - [Rust] Unresolved import \`crate::compute::util::simd\_load\_set\_invalid\` on Raspberry Pi -* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz) -* [ARROW-8705](https://issues.apache.org/jira/browse/ARROW-8705) - [Java] ComplexCopier is skipping null values -* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit) -* [ARROW-8710](https://issues.apache.org/jira/browse/ARROW-8710) - [Rust] Continuation marker not written correctly in IPC writer, and stream not flushed -* [ARROW-8722](https://issues.apache.org/jira/browse/ARROW-8722) - [Dev] "archery docker run -e" doesn't work -* [ARROW-8726](https://issues.apache.org/jira/browse/ARROW-8726) - [C++][Dataset] Mis-specified DirectoryPartitioning incorrectly uses the file name as value -* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow -* [ARROW-8729](https://issues.apache.org/jira/browse/ARROW-8729) - [C++][Dataset] Only selecting a partition column results in empty table -* [ARROW-8734](https://issues.apache.org/jira/browse/ARROW-8734) - [R] improve nightly build installation -* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 with for the windows wheels -* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available -* [ARROW-8768](https://issues.apache.org/jira/browse/ARROW-8768) - [R][CI] Fix nightly as-cran spurious failure -* [ARROW-8775](https://issues.apache.org/jira/browse/ARROW-8775) - [C++][FlightRPC] Integration client doesn't run integration tests -* [ARROW-8776](https://issues.apache.org/jira/browse/ARROW-8776) - [FlightRPC][C++] Flight/C++ middleware don't receive headers on failed calls to Java servers -* [ARROW-8798](https://issues.apache.org/jira/browse/ARROW-8798) - [C++] Fix Parquet crashes on invalid input (OSS-Fuzz) -* [ARROW-8799](https://issues.apache.org/jira/browse/ARROW-8799) - [C++][Dataset] Reading list column as nested dictionary segfaults -* [ARROW-8801](https://issues.apache.org/jira/browse/ARROW-8801) - [Python] Memory leak on read from parquet file with UTC timestamps using pandas -* [ARROW-8802](https://issues.apache.org/jira/browse/ARROW-8802) - [C++][Dataset] Schema metadata are lost when reading a subset of columns -* [ARROW-8803](https://issues.apache.org/jira/browse/ARROW-8803) - [Java] Row count should be set before loading 
buffers in VectorLoader -* [ARROW-8808](https://issues.apache.org/jira/browse/ARROW-8808) - [Rust] Divide by zero in arrays/builder.rs -* [ARROW-8809](https://issues.apache.org/jira/browse/ARROW-8809) - [Rust] schema mismatch in integration test -* [ARROW-8811](https://issues.apache.org/jira/browse/ARROW-8811) - [Java] Fix build on master -* [ARROW-8820](https://issues.apache.org/jira/browse/ARROW-8820) - [C++][Gandiva] fix date\_trunc functions to return date types -* [ARROW-8821](https://issues.apache.org/jira/browse/ARROW-8821) - [Rust] nested binary expression with Like, NotLike and Not operator results in type cast error -* [ARROW-8825](https://issues.apache.org/jira/browse/ARROW-8825) - [C++] Cannot compiled pass with Wunused-parameter flag -* [ARROW-8826](https://issues.apache.org/jira/browse/ARROW-8826) - [Crossbow] remote URL should always have .git -* [ARROW-8832](https://issues.apache.org/jira/browse/ARROW-8832) - [Python] AttributeError: module 'pyarrow.fs' has no attribute 'S3FileSystem' -* [ARROW-8848](https://issues.apache.org/jira/browse/ARROW-8848) - [CI][C/Glib] MinGW build error -* [ARROW-8848](https://issues.apache.org/jira/browse/ARROW-8848) - [CI][C/Glib] MinGW build error -* [ARROW-8858](https://issues.apache.org/jira/browse/ARROW-8858) - [FlightRPC] Ensure headers are uniformly exposed -* [ARROW-8860](https://issues.apache.org/jira/browse/ARROW-8860) - [C++] IPC/Feather decompression broken for nested arrays -* [ARROW-8862](https://issues.apache.org/jira/browse/ARROW-8862) - [C++] NumericBuilder does not use MemoryPool passed to CTOR -* [ARROW-8863](https://issues.apache.org/jira/browse/ARROW-8863) - [C++] Array subclass constructors must set ArrayData::null\_count to 0 when there is no validity bitmap -* [ARROW-8869](https://issues.apache.org/jira/browse/ARROW-8869) - [Rust] [DataFusion] Type Coercion optimizer rule does not support new scan nodes -* [ARROW-8871](https://issues.apache.org/jira/browse/ARROW-8871) - [C++] Gandiva build failure -* [ARROW-8872](https://issues.apache.org/jira/browse/ARROW-8872) - [CI] Travis-CI jobs fail (can't open file 'ci/detect-changes.py') -* [ARROW-8874](https://issues.apache.org/jira/browse/ARROW-8874) - [C++][Dataset] Scanner::ToTable race when ScanTask exit early with an error -* [ARROW-8878](https://issues.apache.org/jira/browse/ARROW-8878) - [R] try\_download is confused when download.file.method isn't default -* [ARROW-8882](https://issues.apache.org/jira/browse/ARROW-8882) - [C\#] Add .editorconfig to C\# code -* [ARROW-8888](https://issues.apache.org/jira/browse/ARROW-8888) - [Python] Heuristic in dataframe\_to\_arrays that decides to multithread convert cause slow conversions -* [ARROW-8889](https://issues.apache.org/jira/browse/ARROW-8889) - [Python] Python 3.7 SIGSEGV when comparing RecordBatch to None -* [ARROW-8892](https://issues.apache.org/jira/browse/ARROW-8892) - [C++][CI] CI builds for MSVC do not build benchmarks -* [ARROW-8909](https://issues.apache.org/jira/browse/ARROW-8909) - [Java] Out of order writes using setSafe -* [ARROW-8911](https://issues.apache.org/jira/browse/ARROW-8911) - [C++] Slicing a ChunkedArray with zero chunks segfaults -* [ARROW-8924](https://issues.apache.org/jira/browse/ARROW-8924) - [C++][Gandiva] castDATE\_date32() may cause overflow -* [ARROW-8925](https://issues.apache.org/jira/browse/ARROW-8925) - [Rust] [DataFusion] CsvExec::schema() returns incorrect results -* [ARROW-8930](https://issues.apache.org/jira/browse/ARROW-8930) - [C++] libz.so linking error with liborc.a -* 
[ARROW-8932](https://issues.apache.org/jira/browse/ARROW-8932) - [C++] symbol resolution failures with liborc.a -* [ARROW-8946](https://issues.apache.org/jira/browse/ARROW-8946) - [Python] Add tests for parquet.write\_metadata metadata\_collector -* [ARROW-8948](https://issues.apache.org/jira/browse/ARROW-8948) - [Java][Integration] enable duplicate field names integration tests -* [ARROW-8951](https://issues.apache.org/jira/browse/ARROW-8951) - [C++] Fix compiler warning in compute/kernels/scalar\_cast\_temporal.cc -* [ARROW-8954](https://issues.apache.org/jira/browse/ARROW-8954) - [Website] ca-certificates should be listed in installation instructions -* [ARROW-8957](https://issues.apache.org/jira/browse/ARROW-8957) - [FlightRPC][C++] Fail to build due to IpcOptions -* [ARROW-8959](https://issues.apache.org/jira/browse/ARROW-8959) - [Rust] Broken build due to new benchmark crate using old API -* [ARROW-8962](https://issues.apache.org/jira/browse/ARROW-8962) - [C++] Linking failure with clang-4.0 -* [ARROW-8968](https://issues.apache.org/jira/browse/ARROW-8968) - [C++][Gandiva] Show link warning message on s390x -* [ARROW-8975](https://issues.apache.org/jira/browse/ARROW-8975) - [FlightRPC][C++] Fix flaky MacOS tests -* [ARROW-8977](https://issues.apache.org/jira/browse/ARROW-8977) - [R] Table$create with schema crashes with some dictionary index types -* [ARROW-8978](https://issues.apache.org/jira/browse/ARROW-8978) - [C++][Compute] "Conditional jump or move depends on uninitialised value(s)" Valgrind warning -* [ARROW-8980](https://issues.apache.org/jira/browse/ARROW-8980) - [Python] Metadata grows exponentially when using schema from disk -* [ARROW-8982](https://issues.apache.org/jira/browse/ARROW-8982) - [CI] Remove allow\_failures for s390x in TravisCI -* [ARROW-8986](https://issues.apache.org/jira/browse/ARROW-8986) - [Archery][ursabot] Fix benchmark diff checkout of origin/master -* [ARROW-9000](https://issues.apache.org/jira/browse/ARROW-9000) - [Java] build crashes with JDK14 -* [ARROW-9009](https://issues.apache.org/jira/browse/ARROW-9009) - [C++][Dataset] ARROW:schema should be removed from schema's metadata when reading Parquet files -* [ARROW-9013](https://issues.apache.org/jira/browse/ARROW-9013) - [C++] Validate enum-style CMake options -* [ARROW-9020](https://issues.apache.org/jira/browse/ARROW-9020) - [Python] read\_json won't respect explicit\_schema in parse\_options -* [ARROW-9024](https://issues.apache.org/jira/browse/ARROW-9024) - [C++/Python] Install anaconda-client in conda-clean job -* [ARROW-9026](https://issues.apache.org/jira/browse/ARROW-9026) - [C++/Python] Force package removal from arrow-nightlies conda repository -* [ARROW-9037](https://issues.apache.org/jira/browse/ARROW-9037) - [C++][C] unable to import array with null count == -1 (which could be exported) -* [ARROW-9057](https://issues.apache.org/jira/browse/ARROW-9057) - [Rust] Projection should work on InMemoryScan without error -* [ARROW-9059](https://issues.apache.org/jira/browse/ARROW-9059) - [Rust] Documentation for slicing array data has the wrong sign -* [ARROW-9066](https://issues.apache.org/jira/browse/ARROW-9066) - [Python] Raise correct error in isnull() -* [ARROW-9071](https://issues.apache.org/jira/browse/ARROW-9071) - [C++] MakeArrayOfNull makes invalid ListArray -* [ARROW-9077](https://issues.apache.org/jira/browse/ARROW-9077) - [C++] Fix aggregate/scalar-compare benchmark null\_percent calculation -* [ARROW-9080](https://issues.apache.org/jira/browse/ARROW-9080) - [C++] 
arrow::AllocateBuffer returns a Result\> -* [ARROW-9082](https://issues.apache.org/jira/browse/ARROW-9082) - [Rust] - Stream reader fail when steam not ended with (optional) 0xFFFFFFFF 0x00000000" -* [ARROW-9084](https://issues.apache.org/jira/browse/ARROW-9084) - [C++] CMake is unable to find zstd target when ZSTD\_SOURCE=SYSTEM -* [ARROW-9085](https://issues.apache.org/jira/browse/ARROW-9085) - [C++][CI] Appveyor CI test failures -* [ARROW-9087](https://issues.apache.org/jira/browse/ARROW-9087) - [C++] Missing HDFS options parsing -* [ARROW-9098](https://issues.apache.org/jira/browse/ARROW-9098) - RecordBatch::ToStructArray cannot handle record batches with 0 column -* [ARROW-9105](https://issues.apache.org/jira/browse/ARROW-9105) - [C++] ParquetFileFragment scanning doesn't handle filter on partition field -* [ARROW-9120](https://issues.apache.org/jira/browse/ARROW-9120) - [C++] Lint and Format C++ files with "codegen" in file name -* [ARROW-9121](https://issues.apache.org/jira/browse/ARROW-9121) - [C++] Do not wipe the filesystem when path is empty -* [ARROW-9122](https://issues.apache.org/jira/browse/ARROW-9122) - [C++] Adapt ascii\_lower/ascii\_upper bulk transforms to work on sliced arrays -* [ARROW-9126](https://issues.apache.org/jira/browse/ARROW-9126) - [C++] Trimmed Boost bundle fails to build on Windows -* [ARROW-9127](https://issues.apache.org/jira/browse/ARROW-9127) - [Rust] Update thrift library dependencies -* [ARROW-9134](https://issues.apache.org/jira/browse/ARROW-9134) - [Python] Parquet partitioning degrades Int32 to float64 -* [ARROW-9141](https://issues.apache.org/jira/browse/ARROW-9141) - [R] Update cross-package documentation links -* [ARROW-9142](https://issues.apache.org/jira/browse/ARROW-9142) - [C++] random::RandomArrayGenerator::Boolean "probability" misdocumented / incorrect -* [ARROW-9143](https://issues.apache.org/jira/browse/ARROW-9143) - [C++] RecordBatch::Slice erroneously sets non-nullable field's internal null\_count to unknown -* [ARROW-9146](https://issues.apache.org/jira/browse/ARROW-9146) - [C++][Dataset] Scanning a Fragment with a filter + mismatching schema shouldn't abort -* [ARROW-9151](https://issues.apache.org/jira/browse/ARROW-9151) - [R][CI] Fix Rtools 4.0 build: pacman sync -* [ARROW-9160](https://issues.apache.org/jira/browse/ARROW-9160) - [C++] Implement string/binary contains for exact matches -* [ARROW-9174](https://issues.apache.org/jira/browse/ARROW-9174) - [Go] Tests panic with 386 (x86) builds -* [ARROW-9183](https://issues.apache.org/jira/browse/ARROW-9183) - [C++] Failed to build arrow-cpp with gcc 4.9.2 -* [ARROW-9184](https://issues.apache.org/jira/browse/ARROW-9184) - [Rust][Datafusion] table scan without projection should return all columns -* [ARROW-9194](https://issues.apache.org/jira/browse/ARROW-9194) - [C++] Array::GetScalar not implemented for decimal type -* [ARROW-9195](https://issues.apache.org/jira/browse/ARROW-9195) - [Java] Wrong usage of Unsafe.get from bytearray in ByteFunctionsHelper class -* [ARROW-9209](https://issues.apache.org/jira/browse/ARROW-9209) - [C++] Benchmarks fail to build ARROW\_IPC=OFF and ARROW\_BUILD\_TESTS=OFF -* [ARROW-9219](https://issues.apache.org/jira/browse/ARROW-9219) - [R] coerce\_timestamps in Parquet write options does not work -* [ARROW-9221](https://issues.apache.org/jira/browse/ARROW-9221) - ArrowBuf\#setBytes(int, ByteBuffer) doesn't check the byte buffer's endianness -* [ARROW-9223](https://issues.apache.org/jira/browse/ARROW-9223) - [Python] Fix to\_pandas() export for 
timestamps within structs -* [ARROW-9230](https://issues.apache.org/jira/browse/ARROW-9230) - [FlightRPC][Python] flight.connect() doesn't pass through all arguments -* [ARROW-9233](https://issues.apache.org/jira/browse/ARROW-9233) - [C++] is\_null on NullArray should be true for all values -* [ARROW-9236](https://issues.apache.org/jira/browse/ARROW-9236) - [Rust] CSV WriterBuilder never writes header -* [ARROW-9237](https://issues.apache.org/jira/browse/ARROW-9237) - [R] 0.17 install on Arch Linux -* [ARROW-9238](https://issues.apache.org/jira/browse/ARROW-9238) - [C++][CI] A few test coverages of round-robin in ipc and flight -* [ARROW-9252](https://issues.apache.org/jira/browse/ARROW-9252) - [Integration] GitHub Actions integration test job does not test against "gold" 0.14.1 files in apache/arrow-testing -* [ARROW-9260](https://issues.apache.org/jira/browse/ARROW-9260) - [CI] "ARM64v8 Ubuntu 20.04 C++" fails -* [ARROW-9260](https://issues.apache.org/jira/browse/ARROW-9260) - [CI] "ARM64v8 Ubuntu 20.04 C++" fails -* [ARROW-9261](https://issues.apache.org/jira/browse/ARROW-9261) - [Python][Packaging] S3FileSystem curl errors in manylinux wheels -* [ARROW-9274](https://issues.apache.org/jira/browse/ARROW-9274) - [Rust] [Integration Testing] Read i64 from json files as strings -* [ARROW-9282](https://issues.apache.org/jira/browse/ARROW-9282) - [R] Remove usage of \_EXTPTR\_PTR -* [ARROW-9284](https://issues.apache.org/jira/browse/ARROW-9284) - [Java] getMinorTypeForArrowType returns sparse minor type for dense union types -* [ARROW-9288](https://issues.apache.org/jira/browse/ARROW-9288) - [C++][Dataset] Discovery of partition field as dictionary type segfaulting with HivePartitioning -* [ARROW-9297](https://issues.apache.org/jira/browse/ARROW-9297) - [C++][Dataset] Dataset scanner cannot handle large binary column (\> 2 GB) -* [ARROW-9298](https://issues.apache.org/jira/browse/ARROW-9298) - [C++] Fix crashes on invalid input (OSS-Fuzz) -* [ARROW-9303](https://issues.apache.org/jira/browse/ARROW-9303) - [R] Linux static build should always bundle dependencies -* [ARROW-9305](https://issues.apache.org/jira/browse/ARROW-9305) - [Python] Dependency load failure in Windows wheel build -* [ARROW-9315](https://issues.apache.org/jira/browse/ARROW-9315) - [Java] Fix the failure of testAllocationManagerType -* [ARROW-9317](https://issues.apache.org/jira/browse/ARROW-9317) - [Java] A few testcases for arrow-memory -* [ARROW-9326](https://issues.apache.org/jira/browse/ARROW-9326) - [Python] Setuptools 49.1.0 appears to break our Python 3.6 builds -* [ARROW-9326](https://issues.apache.org/jira/browse/ARROW-9326) - [Python] Setuptools 49.1.0 appears to break our Python 3.6 builds -* [ARROW-9326](https://issues.apache.org/jira/browse/ARROW-9326) - [Python] Setuptools 49.1.0 appears to break our Python 3.6 builds -* [ARROW-9330](https://issues.apache.org/jira/browse/ARROW-9330) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz) -* [ARROW-9334](https://issues.apache.org/jira/browse/ARROW-9334) - [Dev][Archery] Push ancestor docker images -* [ARROW-9336](https://issues.apache.org/jira/browse/ARROW-9336) - [Ruby] Creating RecordBatch with structs missing keys results in a malformed table -* [ARROW-9343](https://issues.apache.org/jira/browse/ARROW-9343) - [C++][Gandiva] CastINT/Float functions from string should handle leading/trailing white spaces -* [ARROW-9347](https://issues.apache.org/jira/browse/ARROW-9347) - [Python] Tests fail with latest fsspec -* 
[ARROW-9350](https://issues.apache.org/jira/browse/ARROW-9350) - [C++][CI] Nightly valgrind job failures -* [ARROW-9351](https://issues.apache.org/jira/browse/ARROW-9351) - [C++][CI] Nightly test-ubuntu-18.04-cpp-cmake32 fails -* [ARROW-9353](https://issues.apache.org/jira/browse/ARROW-9353) - [Python][CI] Nightly dask integration jobs fail -* [ARROW-9354](https://issues.apache.org/jira/browse/ARROW-9354) - [C++] Turbodbc latest fails to build in the integration tests -* [ARROW-9355](https://issues.apache.org/jira/browse/ARROW-9355) - [R] Fix -Wimplicit-int-float-conversion -* [ARROW-9360](https://issues.apache.org/jira/browse/ARROW-9360) - [CI][Crossbow] Nightly homebrew-cpp job times out -* [ARROW-9363](https://issues.apache.org/jira/browse/ARROW-9363) - [C++][Dataset] ParquetDatasetFactory schema: pandas metadata is lost -* [ARROW-9368](https://issues.apache.org/jira/browse/ARROW-9368) - [Python] Rename predicate argument to filter in split\_by\_row\_group() -* [ARROW-9373](https://issues.apache.org/jira/browse/ARROW-9373) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz) -* [ARROW-9380](https://issues.apache.org/jira/browse/ARROW-9380) - [C++] Segfaults in compute::CallFunction -* [ARROW-9384](https://issues.apache.org/jira/browse/ARROW-9384) - [C++] Out-of-memory on invalid IPC input (OSS-Fuzz) -* [ARROW-9385](https://issues.apache.org/jira/browse/ARROW-9385) - [Python] [CI] jpype integration failure -* [ARROW-9389](https://issues.apache.org/jira/browse/ARROW-9389) - [C++] Can't call isin/match through CallFunction -* [ARROW-9397](https://issues.apache.org/jira/browse/ARROW-9397) - [R] Pass CC/CXX to cmake when building libarrow in Linux build -* [ARROW-9408](https://issues.apache.org/jira/browse/ARROW-9408) - [Integration] Tests do not run in Windows due to numpy 64-bit errors -* [ARROW-9409](https://issues.apache.org/jira/browse/ARROW-9409) - [CI][Crossbow] Nightly conda-r fails -* [ARROW-9410](https://issues.apache.org/jira/browse/ARROW-9410) - [CI][Crossbow] Fix homebrew-cpp again -* [ARROW-9413](https://issues.apache.org/jira/browse/ARROW-9413) - [Rust] Fix clippy lint on master -* [ARROW-9415](https://issues.apache.org/jira/browse/ARROW-9415) - [C++] Arrow does not compile on Power9 -* [ARROW-9416](https://issues.apache.org/jira/browse/ARROW-9416) - [Go] Add test cases for some datatypes -* [ARROW-9417](https://issues.apache.org/jira/browse/ARROW-9417) - [C++][IPC] size in message written in native endian -* [ARROW-9418](https://issues.apache.org/jira/browse/ARROW-9418) - [R] nyc-taxi Parquet files not downloaded in binary mode on Windows -* [ARROW-9419](https://issues.apache.org/jira/browse/ARROW-9419) - [C++] Test that "fill\_null" function works with sliced inputs, expand tests -* [ARROW-9428](https://issues.apache.org/jira/browse/ARROW-9428) - [C++] Update documentation for buffer allocation functions -* [ARROW-9436](https://issues.apache.org/jira/browse/ARROW-9436) - [C++][CI] Valgrind errors in fill\_null kernel tests -* [ARROW-9438](https://issues.apache.org/jira/browse/ARROW-9438) - [CI] Spark integration tests are failing -* [ARROW-9439](https://issues.apache.org/jira/browse/ARROW-9439) - [C++] Fix crash on invalid IPC input (OSS-Fuzz) -* [ARROW-9440](https://issues.apache.org/jira/browse/ARROW-9440) - [Python] Expose Fill Null Compute Kernel in PyArrow -* [ARROW-9443](https://issues.apache.org/jira/browse/ARROW-9443) - [C++] Bundled bz2 build should only build libbz2 -* [ARROW-9448](https://issues.apache.org/jira/browse/ARROW-9448) - [Java] Circular initialization 
between ArrowBuf and BaseAllocator leads to null HistoricalLog for empty buffer -* [ARROW-9449](https://issues.apache.org/jira/browse/ARROW-9449) - [R] Strip arrow.so -* [ARROW-9450](https://issues.apache.org/jira/browse/ARROW-9450) - [Python] "pytest pyarrow" takes over 10 seconds to collect tests and start executing -* [ARROW-9456](https://issues.apache.org/jira/browse/ARROW-9456) - [Python] Dataset segfault when not importing pyarrow.parquet -* [ARROW-9458](https://issues.apache.org/jira/browse/ARROW-9458) - [Python] Dataset Scanner is single-threaded only -* [ARROW-9460](https://issues.apache.org/jira/browse/ARROW-9460) - [C++] BinaryContainsExact doesn't cope with double characters in the pattern -* [ARROW-9461](https://issues.apache.org/jira/browse/ARROW-9461) - [Rust] Reading Date32 and Date64 errors - they are incorrectly converted to RecordBatch -* [ARROW-9476](https://issues.apache.org/jira/browse/ARROW-9476) - [C++][Dataset] HivePartitioning discovery with dictionary types fails for multiple fields -* [ARROW-9486](https://issues.apache.org/jira/browse/ARROW-9486) - [C++][Dataset] Support implicit casting InExpression::set\_ to dict -* [ARROW-9497](https://issues.apache.org/jira/browse/ARROW-9497) - [C++][Parquet] Fix failure caused by malformed repetition/definition levels -* [ARROW-9499](https://issues.apache.org/jira/browse/ARROW-9499) - [C++] AdaptiveIntBuilder::AppendNull does not increment the null count -* [ARROW-9500](https://issues.apache.org/jira/browse/ARROW-9500) - [C++] Fix segfault with std::to\_string in -O3 builds on gcc 7.5.0 -* [ARROW-9501](https://issues.apache.org/jira/browse/ARROW-9501) - [C++][Gandiva] Add logic in timestampdiff() when end date is last day of a month -* [ARROW-9503](https://issues.apache.org/jira/browse/ARROW-9503) - [Rust] Comparison sliced arrays is wrong -* [ARROW-9504](https://issues.apache.org/jira/browse/ARROW-9504) - [Python] Segmentation fault on ChunkedArray.take -* [ARROW-9506](https://issues.apache.org/jira/browse/ARROW-9506) - [Packaging][Python] Fix macOS wheel build failures -* [ARROW-9512](https://issues.apache.org/jira/browse/ARROW-9512) - [C++] Variadic template unpack inside lambda doesn't compile with gcc -* [ARROW-9524](https://issues.apache.org/jira/browse/ARROW-9524) - [CI][Gandiva] C++ unit test arrow-ipc-read-write failing in gandiva nightly build -* [ARROW-9527](https://issues.apache.org/jira/browse/ARROW-9527) - [Rust] Remove un-needed dev-dependencies -* [ARROW-9528](https://issues.apache.org/jira/browse/ARROW-9528) - [Python] Honor tzinfo information when converting from datetime to pyarrow -* [PARQUET-1839](https://issues.apache.org/jira/browse/PARQUET-1839) - [C++] values\_read not updated in ReadBatchSpaced -* [PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups -* [PARQUET-1865](https://issues.apache.org/jira/browse/PARQUET-1865) - [C++] Failure from C++17 feature used in parquet/encoding\_benchmark.cc -* [PARQUET-1877](https://issues.apache.org/jira/browse/PARQUET-1877) - [C++] Reconcile container size with string size for memory issues -* [PARQUET-1882](https://issues.apache.org/jira/browse/PARQUET-1882) - [C++] Writing an all-null column and then reading it with buffered\_stream aborts the process - - -## New Features and Improvements - -* [ARROW-300](https://issues.apache.org/jira/browse/ARROW-300) - [Format] Add body buffer compression option to IPC message protocol using LZ4 or ZSTD -* 
[ARROW-842](https://issues.apache.org/jira/browse/ARROW-842) - [Python] Handle more kinds of null sentinel objects from pandas 0.x -* [ARROW-971](https://issues.apache.org/jira/browse/ARROW-971) - [C++/Python] Implement Array.isvalid/notnull/isnull as scalar functions -* [ARROW-974](https://issues.apache.org/jira/browse/ARROW-974) - [Website] Add Use Cases section to the website -* [ARROW-1277](https://issues.apache.org/jira/browse/ARROW-1277) - Completing integration tests for major implemented data types -* [ARROW-1567](https://issues.apache.org/jira/browse/ARROW-1567) - [C++] Implement "fill null" kernels that replace null values with some scalar replacement value -* [ARROW-1570](https://issues.apache.org/jira/browse/ARROW-1570) - [C++] Define API for creating a kernel instance from function of scalar input and output with a particular signature -* [ARROW-1682](https://issues.apache.org/jira/browse/ARROW-1682) - [Python] Add documentation / example for reading a directory of Parquet files on S3 -* [ARROW-1796](https://issues.apache.org/jira/browse/ARROW-1796) - [Python] RowGroup filtering on file level -* [ARROW-2260](https://issues.apache.org/jira/browse/ARROW-2260) - [C++][Plasma] plasma\_store should show usage -* [ARROW-2444](https://issues.apache.org/jira/browse/ARROW-2444) - [Python][C++] Better handle reading empty parquet files -* [ARROW-2702](https://issues.apache.org/jira/browse/ARROW-2702) - [Python] Examine usages of Invalid and TypeError errors in numpy\_to\_arrow.cc to see if we are using the right error type in each instance -* [ARROW-2714](https://issues.apache.org/jira/browse/ARROW-2714) - [C++/Python] Variable step size slicing for arrays -* [ARROW-2912](https://issues.apache.org/jira/browse/ARROW-2912) - [Website] Build more detailed Community landing page a la Apache Spark -* [ARROW-3089](https://issues.apache.org/jira/browse/ARROW-3089) - [Rust] Add ArrayBuilder for different Arrow arrays -* [ARROW-3134](https://issues.apache.org/jira/browse/ARROW-3134) - [C++] Implement n-ary iterator for a collection of chunked arrays with possibly different chunking layouts -* [ARROW-3154](https://issues.apache.org/jira/browse/ARROW-3154) - [Python][C++] Document how to write \_metadata, \_common\_metadata files with Parquet datasets -* [ARROW-3244](https://issues.apache.org/jira/browse/ARROW-3244) - [Python] Multi-file parquet loading without scan -* [ARROW-3275](https://issues.apache.org/jira/browse/ARROW-3275) - [Python] Add documentation about inspecting Parquet file metadata -* [ARROW-3308](https://issues.apache.org/jira/browse/ARROW-3308) - [R] Convert R character vector with data exceeding 2GB to Large type -* [ARROW-3317](https://issues.apache.org/jira/browse/ARROW-3317) - [R] Test/support conversions from data.frame with a single character column exceeding 2GB capacity of BinaryArray -* [ARROW-3446](https://issues.apache.org/jira/browse/ARROW-3446) - [R] Document mapping of Arrow <-\> R types -* [ARROW-3509](https://issues.apache.org/jira/browse/ARROW-3509) - [C++] Inconsistent child accessor naming -* [ARROW-3520](https://issues.apache.org/jira/browse/ARROW-3520) - [C++] Implement List Flatten kernel -* [ARROW-3688](https://issues.apache.org/jira/browse/ARROW-3688) - [Rust] Implement PrimitiveArrayBuilder.push\_values -* [ARROW-3827](https://issues.apache.org/jira/browse/ARROW-3827) - [Rust] Implement UnionArray -* [ARROW-4022](https://issues.apache.org/jira/browse/ARROW-4022) - [C++] Promote Datum variant out of compute namespace -* 
[ARROW-4221](https://issues.apache.org/jira/browse/ARROW-4221) - [Format] Add canonical flag in COO sparse index -* [ARROW-4390](https://issues.apache.org/jira/browse/ARROW-4390) - [R] Serialize "labeled" metadata in Feather files, IPC messages -* [ARROW-4412](https://issues.apache.org/jira/browse/ARROW-4412) - [DOCUMENTATION] Add explicit version numbers to the arrow specification documents. -* [ARROW-4427](https://issues.apache.org/jira/browse/ARROW-4427) - [Doc] Move Confluence Wiki pages to the Sphinx docs -* [ARROW-4429](https://issues.apache.org/jira/browse/ARROW-4429) - [Doc] Add git rebase tips to the 'Contributing' page in the developer docs -* [ARROW-5035](https://issues.apache.org/jira/browse/ARROW-5035) - [C\#] ArrowBuffer.Builder is broken -* [ARROW-5082](https://issues.apache.org/jira/browse/ARROW-5082) - [Python][Packaging] Reduce size of macOS and manylinux1 wheels -* [ARROW-5143](https://issues.apache.org/jira/browse/ARROW-5143) - [Flight] Enable integration testing of batches with dictionaries -* [ARROW-5279](https://issues.apache.org/jira/browse/ARROW-5279) - [C++] Support reading delta dictionaries in IPC streams -* [ARROW-5377](https://issues.apache.org/jira/browse/ARROW-5377) - [C++] Make IpcPayload public and add GetPayloadSize -* [ARROW-5489](https://issues.apache.org/jira/browse/ARROW-5489) - [C++] Normalize kernels and ChunkedArray behavior -* [ARROW-5548](https://issues.apache.org/jira/browse/ARROW-5548) - [Documentation] http://arrow.apache.org/docs/latest/ is not latest -* [ARROW-5649](https://issues.apache.org/jira/browse/ARROW-5649) - [Integration][C++] Create round trip integration test for extension types -* [ARROW-5708](https://issues.apache.org/jira/browse/ARROW-5708) - [C\#] Null support for BooleanArray -* [ARROW-5760](https://issues.apache.org/jira/browse/ARROW-5760) - [C++] Optimize Take implementation -* [ARROW-5854](https://issues.apache.org/jira/browse/ARROW-5854) - [Python] Expose compare kernels on Array class -* [ARROW-6052](https://issues.apache.org/jira/browse/ARROW-6052) - [C++] Divide up arrow/array.h,cc into files in arrow/array/ similar to builder files -* [ARROW-6110](https://issues.apache.org/jira/browse/ARROW-6110) - [Java] Support LargeList Type and add integration test with C++ -* [ARROW-6111](https://issues.apache.org/jira/browse/ARROW-6111) - [Java] Support LargeVarChar and LargeBinary types and add integration test with C++ -* [ARROW-6439](https://issues.apache.org/jira/browse/ARROW-6439) - [R] Implement S3 file-system interface in R -* [ARROW-6456](https://issues.apache.org/jira/browse/ARROW-6456) - [C++] Possible to reduce object code generated in compute/kernels/take.cc? 
-* [ARROW-6501](https://issues.apache.org/jira/browse/ARROW-6501) - [C++] Remove non\_zero\_length field from SparseIndex -* [ARROW-6521](https://issues.apache.org/jira/browse/ARROW-6521) - [C++] Add function to arrow:: namespace that returns the current ABI version -* [ARROW-6543](https://issues.apache.org/jira/browse/ARROW-6543) - [R] Support LargeBinary and LargeString types -* [ARROW-6602](https://issues.apache.org/jira/browse/ARROW-6602) - [Doc] Add feature / implementation matrix -* [ARROW-6603](https://issues.apache.org/jira/browse/ARROW-6603) - [C\#] ArrayBuilder API to support writing nulls -* [ARROW-6645](https://issues.apache.org/jira/browse/ARROW-6645) - [Python] Faster boundschecking of dictionary indices when converting to Categorical -* [ARROW-6689](https://issues.apache.org/jira/browse/ARROW-6689) - [Rust] [DataFusion] Query execution enhancements for 1.0.0 release -* [ARROW-6691](https://issues.apache.org/jira/browse/ARROW-6691) - [Rust] [DataFusion] Use tokio and Futures instead of spawning threads -* [ARROW-6775](https://issues.apache.org/jira/browse/ARROW-6775) - [C++] [Python] Proposal for several Array utility functions -* [ARROW-6776](https://issues.apache.org/jira/browse/ARROW-6776) - [Python] Need a lite version of pyarrow -* [ARROW-6800](https://issues.apache.org/jira/browse/ARROW-6800) - [C++] Add CMake option to build libraries targeting a C++14 or C++17 toolchain environment -* [ARROW-6839](https://issues.apache.org/jira/browse/ARROW-6839) - [Java] Add APIs to read and write "custom\_metadata" field of IPC file footer -* [ARROW-6856](https://issues.apache.org/jira/browse/ARROW-6856) - [C++] Use ArrayData instead of Array for ArrayData::dictionary -* [ARROW-6917](https://issues.apache.org/jira/browse/ARROW-6917) - ARROW-6917: [Archery][Release] Add support for JIRA curation, changelog generation and commit cherry-picking for maintenance releases -* [ARROW-6945](https://issues.apache.org/jira/browse/ARROW-6945) - [Rust] Enable integration tests -* [ARROW-6959](https://issues.apache.org/jira/browse/ARROW-6959) - [C++] Clarify what signatures are preferred for compute kernels -* [ARROW-6978](https://issues.apache.org/jira/browse/ARROW-6978) - [R] Add bindings for sum and mean compute kernels -* [ARROW-6979](https://issues.apache.org/jira/browse/ARROW-6979) - [R] Enable jemalloc in autobrew formula -* [ARROW-7009](https://issues.apache.org/jira/browse/ARROW-7009) - [C++] Refactor filter/take kernels to use Datum instead of overloads -* [ARROW-7010](https://issues.apache.org/jira/browse/ARROW-7010) - [C++] Support lossy casts from decimal128 to float32 and float64/double -* [ARROW-7011](https://issues.apache.org/jira/browse/ARROW-7011) - [C++] Implement casts from float/double to decimal128 -* [ARROW-7012](https://issues.apache.org/jira/browse/ARROW-7012) - [C++] Clarify ChunkedArray chunking strategy and policy -* [ARROW-7068](https://issues.apache.org/jira/browse/ARROW-7068) - [C++] Expose the offsets of a ListArray as a Int32Array -* [ARROW-7075](https://issues.apache.org/jira/browse/ARROW-7075) - [C++] Boolean kernels should not allocate in Call() -* [ARROW-7175](https://issues.apache.org/jira/browse/ARROW-7175) - [Website] Add a security page to track when vulnerabilities are patched -* [ARROW-7229](https://issues.apache.org/jira/browse/ARROW-7229) - [C++] Unify ConcatenateTables APIs -* [ARROW-7230](https://issues.apache.org/jira/browse/ARROW-7230) - [C++] Use vendored std::optional instead of boost::optional in Gandiva -* 
[ARROW-7237](https://issues.apache.org/jira/browse/ARROW-7237) - [C++] Add Result to APIs to arrow/json -* [ARROW-7243](https://issues.apache.org/jira/browse/ARROW-7243) - [Docs] Add common "implementation status" table to the README of each native language implementation, as well as top level README -* [ARROW-7285](https://issues.apache.org/jira/browse/ARROW-7285) - [C++] ensure C++ implementation meets clarified dictionary spec -* [ARROW-7300](https://issues.apache.org/jira/browse/ARROW-7300) - [C++][Gandiva] Implement functions to cast from strings to integers/floats -* [ARROW-7313](https://issues.apache.org/jira/browse/ARROW-7313) - [C++] Add function for retrieving a scalar from an array slot -* [ARROW-7371](https://issues.apache.org/jira/browse/ARROW-7371) - [GLib] Add Datasets binding -* [ARROW-7375](https://issues.apache.org/jira/browse/ARROW-7375) - [Python] Expose C++ MakeArrayOfNull -* [ARROW-7391](https://issues.apache.org/jira/browse/ARROW-7391) - [Python] Remove unnecessary classes from the binding layer -* [ARROW-7495](https://issues.apache.org/jira/browse/ARROW-7495) - [Java] Remove "empty" concept from ArrowBuf, replace with custom referencemanager -* [ARROW-7605](https://issues.apache.org/jira/browse/ARROW-7605) - [C++] Create and install static library containing all dependencies built by Arrow -* [ARROW-7607](https://issues.apache.org/jira/browse/ARROW-7607) - [C++] Add to cpp/examples minimal examples of using Arrow as a dependency of another CMake project -* [ARROW-7673](https://issues.apache.org/jira/browse/ARROW-7673) - [C++][Dataset] Revisit File discovery failure mode -* [ARROW-7676](https://issues.apache.org/jira/browse/ARROW-7676) - [Packaging][Python] Ensure that the static libraries are not built in the wheel scripts -* [ARROW-7699](https://issues.apache.org/jira/browse/ARROW-7699) - [Java] Support concating dense union vectors in batch -* [ARROW-7705](https://issues.apache.org/jira/browse/ARROW-7705) - [Rust] Initial sort implementation -* [ARROW-7717](https://issues.apache.org/jira/browse/ARROW-7717) - [CI] Have nightly integration test for Spark's latest release -* [ARROW-7759](https://issues.apache.org/jira/browse/ARROW-7759) - [C++][Dataset] Add CsvFileFormat for CSV support -* [ARROW-7778](https://issues.apache.org/jira/browse/ARROW-7778) - [C++] Support nested dictionaries in JSON integration format -* [ARROW-7784](https://issues.apache.org/jira/browse/ARROW-7784) - [C++] diff.cc is extremely slow to compile -* [ARROW-7801](https://issues.apache.org/jira/browse/ARROW-7801) - [Developer] Add issue\_comment workflow to fix lint/style/codegen -* [ARROW-7803](https://issues.apache.org/jira/browse/ARROW-7803) - [R][CI] Autobrew/homebrew tests should not always install from master -* [ARROW-7831](https://issues.apache.org/jira/browse/ARROW-7831) - [Java] unnecessary buffer allocation when calling splitAndTransferTo on variable width vectors -* [ARROW-7831](https://issues.apache.org/jira/browse/ARROW-7831) - [Java] unnecessary buffer allocation when calling splitAndTransferTo on variable width vectors -* [ARROW-7902](https://issues.apache.org/jira/browse/ARROW-7902) - [Integration] Unskip nested dictionary integration tests -* [ARROW-7910](https://issues.apache.org/jira/browse/ARROW-7910) - [C++] Provide function to query page size portably -* [ARROW-7924](https://issues.apache.org/jira/browse/ARROW-7924) - [Rust] Add sort for float types -* [ARROW-7950](https://issues.apache.org/jira/browse/ARROW-7950) - [Python] When initializing pandas API shim, inform user 
if their installed pandas version is too old -* [ARROW-7955](https://issues.apache.org/jira/browse/ARROW-7955) - [Java] Support large buffer for file/stream IPC -* [ARROW-8020](https://issues.apache.org/jira/browse/ARROW-8020) - [Java] Implement vector validate functionality -* [ARROW-8023](https://issues.apache.org/jira/browse/ARROW-8023) - [Website] Write a blog post about the C data interface -* [ARROW-8025](https://issues.apache.org/jira/browse/ARROW-8025) - [C++] Implement cast to Binary and FixedSizeBinary -* [ARROW-8046](https://issues.apache.org/jira/browse/ARROW-8046) - [Developer][Integration] Makefile.docker's target names are broken -* [ARROW-8062](https://issues.apache.org/jira/browse/ARROW-8062) - [C++][Dataset] Parquet Dataset factory from a \_metadata/\_common\_metadata file -* [ARROW-8065](https://issues.apache.org/jira/browse/ARROW-8065) - [C++][Dataset] Untangle Dataset, Fragment and ScanOptions -* [ARROW-8074](https://issues.apache.org/jira/browse/ARROW-8074) - [C++][Dataset] Support for file-like objects (buffers) in FileSystemDataset? -* [ARROW-8108](https://issues.apache.org/jira/browse/ARROW-8108) - [Java] Extract a common interface for dictionary encoders -* [ARROW-8111](https://issues.apache.org/jira/browse/ARROW-8111) - [C++][CSV] Support MM/DD/YYYY date format -* [ARROW-8114](https://issues.apache.org/jira/browse/ARROW-8114) - [Java][Integration] Enable custom\_metadata integration test -* [ARROW-8121](https://issues.apache.org/jira/browse/ARROW-8121) - [Java] Enhance code style checking for Java code (add space after commas, semi-colons and type casts) -* [ARROW-8149](https://issues.apache.org/jira/browse/ARROW-8149) - [C++/Python] Enable CUDA Support in conda recipes -* [ARROW-8157](https://issues.apache.org/jira/browse/ARROW-8157) - [C++][Gandiva] Support building with LLVM 9 -* [ARROW-8162](https://issues.apache.org/jira/browse/ARROW-8162) - [Format][Python] Add serialization for CSF sparse tensors -* [ARROW-8169](https://issues.apache.org/jira/browse/ARROW-8169) - [Java] Improve the performance of JDBC adapter by allocating memory proactively -* [ARROW-8171](https://issues.apache.org/jira/browse/ARROW-8171) - Consider pre-allocating memory for fix-width vector in Avro adapter iterator -* [ARROW-8190](https://issues.apache.org/jira/browse/ARROW-8190) - [C++][Flight] Allow setting IpcWriteOptions and IpcReadOptions in Flight IPC message reader and writer classes -* [ARROW-8229](https://issues.apache.org/jira/browse/ARROW-8229) - [Java] Move ArrowBuf into the Arrow package -* [ARROW-8230](https://issues.apache.org/jira/browse/ARROW-8230) - [Java] Move Netty memory manager into a separate module -* [ARROW-8261](https://issues.apache.org/jira/browse/ARROW-8261) - [Rust] [DataFusion] LogicalPlanBuilder.limit() should take a literal argument -* [ARROW-8263](https://issues.apache.org/jira/browse/ARROW-8263) - [Rust] [DataFusion] Add documentation for supported SQL functions -* [ARROW-8281](https://issues.apache.org/jira/browse/ARROW-8281) - [R] Name collision of arrow.dll on Windows conda -* [ARROW-8283](https://issues.apache.org/jira/browse/ARROW-8283) - [Python][Dataset] Non-existent files are silently dropped in pa.dataset.FileSystemDataset -* [ARROW-8287](https://issues.apache.org/jira/browse/ARROW-8287) - [Rust] Arrow examples should use utility to print results -* [ARROW-8293](https://issues.apache.org/jira/browse/ARROW-8293) - [Python] Run
flake8 on python/examples also -* [ARROW-8297](https://issues.apache.org/jira/browse/ARROW-8297) - [FlightRPC][C++] Implement Flight DoExchange for C++ -* [ARROW-8301](https://issues.apache.org/jira/browse/ARROW-8301) - [R] Handle ChunkedArray and Table in C data interface -* [ARROW-8312](https://issues.apache.org/jira/browse/ARROW-8312) - [Java][Gandiva] improve IN expression support -* [ARROW-8314](https://issues.apache.org/jira/browse/ARROW-8314) - [Python] Provide a method to select a subset of columns of a Table -* [ARROW-8318](https://issues.apache.org/jira/browse/ARROW-8318) - [C++][Dataset] Dataset should instantiate Fragment -* [ARROW-8399](https://issues.apache.org/jira/browse/ARROW-8399) - [Rust] Extend memory alignments to include other architectures -* [ARROW-8413](https://issues.apache.org/jira/browse/ARROW-8413) - [C++] Refactor DefLevelsToBitmap -* [ARROW-8422](https://issues.apache.org/jira/browse/ARROW-8422) - [Rust] [Parquet] Implement function to convert Arrow schema to Parquet schema -* [ARROW-8430](https://issues.apache.org/jira/browse/ARROW-8430) - [CI] Configure self-hosted runners for Github Actions -* [ARROW-8434](https://issues.apache.org/jira/browse/ARROW-8434) - [C++] Ipc RecordBatchFileReader deserializes the Schema multiple times -* [ARROW-8440](https://issues.apache.org/jira/browse/ARROW-8440) - [C++] Refine simd header files -* [ARROW-8443](https://issues.apache.org/jira/browse/ARROW-8443) - [Gandiva][C++] Fix round/truncate to no-op for special cases -* [ARROW-8447](https://issues.apache.org/jira/browse/ARROW-8447) - [C++][Dataset] Ensure Scanner::ToTable preserve ordering of ScanTasks -* [ARROW-8467](https://issues.apache.org/jira/browse/ARROW-8467) - [C++] Test cases using ArrayFromJSON assume only a little-endian platform -* [ARROW-8474](https://issues.apache.org/jira/browse/ARROW-8474) - [CI][Crossbow] Skip some nightlies we don't need to run -* [ARROW-8477](https://issues.apache.org/jira/browse/ARROW-8477) - [C++] Enable reading and writing of long filenames for Windows -* [ARROW-8481](https://issues.apache.org/jira/browse/ARROW-8481) - [Java] Provide an allocation manager based on Unsafe API -* [ARROW-8483](https://issues.apache.org/jira/browse/ARROW-8483) - [Ruby] Arrow::Table documentation improvement -* [ARROW-8485](https://issues.apache.org/jira/browse/ARROW-8485) - [Integration][Java] Implement extension types integration -* [ARROW-8486](https://issues.apache.org/jira/browse/ARROW-8486) - [C++] arrow-utility-test causes failures on a big-endian platform -* [ARROW-8487](https://issues.apache.org/jira/browse/ARROW-8487) - [FlightRPC][C++] Make it possible to target a specific payload size -* [ARROW-8488](https://issues.apache.org/jira/browse/ARROW-8488) - [R] Replace VALUE\_OR\_STOP with ValueOrStop -* [ARROW-8496](https://issues.apache.org/jira/browse/ARROW-8496) - [C++] Refine ByteStreamSplitDecodeScalar -* [ARROW-8497](https://issues.apache.org/jira/browse/ARROW-8497) - [Archery] Add missing component to builds -* [ARROW-8499](https://issues.apache.org/jira/browse/ARROW-8499) - [C++][Dataset] In ScannerBuilder, batch\_size will not work if projecter is not empty -* [ARROW-8500](https://issues.apache.org/jira/browse/ARROW-8500) - [C++] Use selection vectors in Filter implementation for record batches, tables -* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6 -* [ARROW-8502](https://issues.apache.org/jira/browse/ARROW-8502) - [Release][APT][Yum] Ignore all arm64 verifications 
-* [ARROW-8504](https://issues.apache.org/jira/browse/ARROW-8504) - [C++] Add Run Length Reader -* [ARROW-8506](https://issues.apache.org/jira/browse/ARROW-8506) - [c++] Miss tests to verify expected\_buffer with bit\_width \> 8 in RLE -* [ARROW-8507](https://issues.apache.org/jira/browse/ARROW-8507) - [Release] Detect .git directory automatically in changelog.py -* [ARROW-8509](https://issues.apache.org/jira/browse/ARROW-8509) - [GLib] Add low level record batch read/write functions -* [ARROW-8512](https://issues.apache.org/jira/browse/ARROW-8512) - [C++] Delete unused compute expr prototype code -* [ARROW-8513](https://issues.apache.org/jira/browse/ARROW-8513) - [Python] Expose Take with Table input in Python -* [ARROW-8515](https://issues.apache.org/jira/browse/ARROW-8515) - [C++] Bitmap ToString should have an option of grouping by bytes -* [ARROW-8516](https://issues.apache.org/jira/browse/ARROW-8516) - [Rust] Slow BufferBuilder inserts within PrimitiveBuilder::append\_slice -* [ARROW-8517](https://issues.apache.org/jira/browse/ARROW-8517) - [Developer][Release] Update Crossbow RC verification setup for changes since 0.16.0 -* [ARROW-8520](https://issues.apache.org/jira/browse/ARROW-8520) - [Developer] Use .asf.yaml to direct GitHub notifications to e-mail lists and JIRA -* [ARROW-8521](https://issues.apache.org/jira/browse/ARROW-8521) - [Developer] Group Sub-task, Task, Test, and Wish issue types as "Improvement" in Changelog -* [ARROW-8522](https://issues.apache.org/jira/browse/ARROW-8522) - [Developer] Add environment variable option to toggle whether ephemeral NodeJS is installed in release verification script -* [ARROW-8524](https://issues.apache.org/jira/browse/ARROW-8524) - [CI] Free up space on github actions -* [ARROW-8526](https://issues.apache.org/jira/browse/ARROW-8526) - [Python] Fix non-deterministic row order failure in dataset tests -* [ARROW-8531](https://issues.apache.org/jira/browse/ARROW-8531) - [C++] Deprecate ARROW\_USE\_SIMD CMake option -* [ARROW-8538](https://issues.apache.org/jira/browse/ARROW-8538) - [Packaging] Remove boost from homebrew formula -* [ARROW-8540](https://issues.apache.org/jira/browse/ARROW-8540) - [C++] Create memory allocation benchmark -* [ARROW-8541](https://issues.apache.org/jira/browse/ARROW-8541) - [Release] Don't remove previous source releases automatically -* [ARROW-8542](https://issues.apache.org/jira/browse/ARROW-8542) - [Release] Fix checksum url in the website post release script -* [ARROW-8543](https://issues.apache.org/jira/browse/ARROW-8543) - [C++] IO: single pass coalescing algorithm -* [ARROW-8544](https://issues.apache.org/jira/browse/ARROW-8544) - [CI][Crossbow] Add a status.json to the gh-pages summary of nightly builds to get around rate limiting -* [ARROW-8548](https://issues.apache.org/jira/browse/ARROW-8548) - [Website] 0.17 release post -* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups -* [ARROW-8550](https://issues.apache.org/jira/browse/ARROW-8550) - [CI] Don't run cron GHA jobs on forks -* [ARROW-8551](https://issues.apache.org/jira/browse/ARROW-8551) - [CI][Gandiva] Use LLVM 8 to build gandiva linux jar -* [ARROW-8552](https://issues.apache.org/jira/browse/ARROW-8552) - [Rust] support column iteration for parquet row -* [ARROW-8553](https://issues.apache.org/jira/browse/ARROW-8553) - [C++] Optimize unaligned bitmap operations -* [ARROW-8555](https://issues.apache.org/jira/browse/ARROW-8555) - [FlightRPC][Java] Implement Flight DoExchange for Java -* 
[ARROW-8558](https://issues.apache.org/jira/browse/ARROW-8558) - [Rust] GitHub Actions missing rustfmt -* [ARROW-8559](https://issues.apache.org/jira/browse/ARROW-8559) - [Rust] Consolidate Record Batch reader traits in main arrow crate -* [ARROW-8560](https://issues.apache.org/jira/browse/ARROW-8560) - [Rust] Docs for MutableBuffer resize are incorrect -* [ARROW-8561](https://issues.apache.org/jira/browse/ARROW-8561) - [C++][Gandiva] Stop using deprecated google::protobuf::MessageLite::ByteSize() -* [ARROW-8562](https://issues.apache.org/jira/browse/ARROW-8562) - [C++] IO: Parameterize I/O coalescing using S3 storage metrics -* [ARROW-8563](https://issues.apache.org/jira/browse/ARROW-8563) - [Go] Minor change to make newBuilder public -* [ARROW-8564](https://issues.apache.org/jira/browse/ARROW-8564) - [Website] Add Ubuntu 20.04 LTS to supported package list -* [ARROW-8569](https://issues.apache.org/jira/browse/ARROW-8569) - [CI] Upgrade xcode version for testing homebrew formulae -* [ARROW-8571](https://issues.apache.org/jira/browse/ARROW-8571) - [C++] Switch AppVeyor image to VS 2017 -* [ARROW-8572](https://issues.apache.org/jira/browse/ARROW-8572) - [Python] Expose UnionArray.array and other fields -* [ARROW-8573](https://issues.apache.org/jira/browse/ARROW-8573) - [Rust] Upgrade to Rust 1.44 nightly -* [ARROW-8574](https://issues.apache.org/jira/browse/ARROW-8574) - [Rust] Implement Debug for all plain types -* [ARROW-8575](https://issues.apache.org/jira/browse/ARROW-8575) - [Developer] Add issue\_comment workflow to rebase a PR -* [ARROW-8590](https://issues.apache.org/jira/browse/ARROW-8590) - [Rust] Use Arrow pretty print utility in DataFusion -* [ARROW-8591](https://issues.apache.org/jira/browse/ARROW-8591) - [Rust] Reverse lookup for a key in DictionaryArray -* [ARROW-8597](https://issues.apache.org/jira/browse/ARROW-8597) - [Rust] arrow crate lint and readability improvements -* [ARROW-8606](https://issues.apache.org/jira/browse/ARROW-8606) - [CI] Don't trigger all builds on a change to any file in ci/ -* [ARROW-8607](https://issues.apache.org/jira/browse/ARROW-8607) - [R][CI] Unbreak builds following R 4.0 release -* [ARROW-8611](https://issues.apache.org/jira/browse/ARROW-8611) - [R] Can't install arrow 0.17 on Ubuntu 18.04 R 3.6.3 -* [ARROW-8612](https://issues.apache.org/jira/browse/ARROW-8612) - [GLib] Add GArrowReadOptions and GArrowWriteOptions -* [ARROW-8616](https://issues.apache.org/jira/browse/ARROW-8616) - [Rust] Turn explicit SIMD off by default -* [ARROW-8619](https://issues.apache.org/jira/browse/ARROW-8619) - [C++] Use distinct Type::type values for interval types -* [ARROW-8622](https://issues.apache.org/jira/browse/ARROW-8622) - [Rust] Parquet crate does not compile on aarch64 -* [ARROW-8623](https://issues.apache.org/jira/browse/ARROW-8623) - [C++][Gandiva] Reduce use of Boost, remove Boost headers from header files -* [ARROW-8624](https://issues.apache.org/jira/browse/ARROW-8624) - [Website] Install page should mention arrow-dataset packages -* [ARROW-8628](https://issues.apache.org/jira/browse/ARROW-8628) - [CI][Dev] Wrap docker-compose commands with archery -* [ARROW-8629](https://issues.apache.org/jira/browse/ARROW-8629) - [Rust] Eliminate indirection of ZST allocations -* [ARROW-8633](https://issues.apache.org/jira/browse/ARROW-8633) - [C++] Add ValidateAscii function -* [ARROW-8634](https://issues.apache.org/jira/browse/ARROW-8634) - [Java] Create an example -* [ARROW-8639](https://issues.apache.org/jira/browse/ARROW-8639) - [C++][Plasma] Require gflags 
-* [ARROW-8645](https://issues.apache.org/jira/browse/ARROW-8645) - [C++] Missing gflags dependency for plasma -* [ARROW-8647](https://issues.apache.org/jira/browse/ARROW-8647) - [C++][Dataset] Optionally encode partition field values as dictionary type -* [ARROW-8648](https://issues.apache.org/jira/browse/ARROW-8648) - [Rust] Optimize Rust CI Build Times -* [ARROW-8650](https://issues.apache.org/jira/browse/ARROW-8650) - [Rust] [Website] Add documentation to Arrow website -* [ARROW-8651](https://issues.apache.org/jira/browse/ARROW-8651) - [Python][Dataset] Support pickling of Dataset objects -* [ARROW-8655](https://issues.apache.org/jira/browse/ARROW-8655) - [C++][Dataset][Python][R] Preserve partitioning information for a discovered Dataset -* [ARROW-8656](https://issues.apache.org/jira/browse/ARROW-8656) - [Python] Switch to VS2017 in the windows wheel builds -* [ARROW-8659](https://issues.apache.org/jira/browse/ARROW-8659) - [Rust] ListBuilder and FixedSizeListBuilder capacity -* [ARROW-8660](https://issues.apache.org/jira/browse/ARROW-8660) - [C++][Gandiva] Reduce dependence on Boost -* [ARROW-8662](https://issues.apache.org/jira/browse/ARROW-8662) - [CI] Consolidate appveyor scripts -* [ARROW-8664](https://issues.apache.org/jira/browse/ARROW-8664) - [Java] Add skip null check to all Vector types -* [ARROW-8668](https://issues.apache.org/jira/browse/ARROW-8668) - [Packaging][APT][Yum][ARM] Use Travis CI's ARM machine to build packages -* [ARROW-8669](https://issues.apache.org/jira/browse/ARROW-8669) - [C++] Add IpcWriteOptions argument to GetRecordBatchSize() -* [ARROW-8671](https://issues.apache.org/jira/browse/ARROW-8671) - [C++] Use IPC body compression metadata approved in ARROW-300 -* [ARROW-8682](https://issues.apache.org/jira/browse/ARROW-8682) - [Ruby][Parquet] Add support for column level compression -* [ARROW-8687](https://issues.apache.org/jira/browse/ARROW-8687) - [Java] Finish move of io.netty.buffer.ArrowBuf -* [ARROW-8690](https://issues.apache.org/jira/browse/ARROW-8690) - [Python] Clean-up dataset+parquet tests now order is deterministic -* [ARROW-8692](https://issues.apache.org/jira/browse/ARROW-8692) - [C++] Avoid memory copies when downloading from S3 -* [ARROW-8695](https://issues.apache.org/jira/browse/ARROW-8695) - [Java] remove references to PlatformDependent in memory module -* [ARROW-8696](https://issues.apache.org/jira/browse/ARROW-8696) - [Java] Convert tests to integration tests -* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion -* [ARROW-8702](https://issues.apache.org/jira/browse/ARROW-8702) - [Packaging][C\#] Build NuGet packages in release process -* [ARROW-8703](https://issues.apache.org/jira/browse/ARROW-8703) - [R] schema$metadata should be properly typed -* [ARROW-8707](https://issues.apache.org/jira/browse/ARROW-8707) - [CI] Docker push fails because of wrong dockerhub credentials -* [ARROW-8708](https://issues.apache.org/jira/browse/ARROW-8708) - [CI] Utilize github actions cache for docker-compose volumes -* [ARROW-8711](https://issues.apache.org/jira/browse/ARROW-8711) - [Python] Expose strptime timestamp parsing in read\_csv conversion options -* [ARROW-8717](https://issues.apache.org/jira/browse/ARROW-8717) - [CI][Packaging] Add build dependency on boost to homebrew -* [ARROW-8720](https://issues.apache.org/jira/browse/ARROW-8720) - [C++] Fix
checked\_pointer\_cast -* [ARROW-8721](https://issues.apache.org/jira/browse/ARROW-8721) - [CI] Fix R build matrix -* [ARROW-8723](https://issues.apache.org/jira/browse/ARROW-8723) - [Rust] Remove SIMD specific benchmark code -* [ARROW-8724](https://issues.apache.org/jira/browse/ARROW-8724) - [Packaging][deb][RPM] Use directory in host as build directory -* [ARROW-8725](https://issues.apache.org/jira/browse/ARROW-8725) - [Rust] redundant directory walk in rust parquet datasource code -* [ARROW-8727](https://issues.apache.org/jira/browse/ARROW-8727) - [C++] Do not require struct-initialization of StringConverter to parse strings to other types -* [ARROW-8730](https://issues.apache.org/jira/browse/ARROW-8730) - [Rust] Use slice instead of &Vec for function arguments -* [ARROW-8733](https://issues.apache.org/jira/browse/ARROW-8733) - [C++][Dataset][Python] ParquetFileFragment should provide access to parquet FileMetadata -* [ARROW-8736](https://issues.apache.org/jira/browse/ARROW-8736) - [Rust] [DataFusion] Table API should provide a schema() method -* [ARROW-8740](https://issues.apache.org/jira/browse/ARROW-8740) - [CI] Fix archery option in pandas master cron test -* [ARROW-8742](https://issues.apache.org/jira/browse/ARROW-8742) - [C++][Python] Add flight client support for Mutual TLS -* [ARROW-8743](https://issues.apache.org/jira/browse/ARROW-8743) - [C++][CI] Add a test job on s390x -* [ARROW-8744](https://issues.apache.org/jira/browse/ARROW-8744) - [Rust] ParquetIterator's next method should be safe to call even after reached end of iteration -* [ARROW-8745](https://issues.apache.org/jira/browse/ARROW-8745) - [C++] Bitmap.ToString causes failures on a big-endian platform -* [ARROW-8747](https://issues.apache.org/jira/browse/ARROW-8747) - [C++] Feather tests with compression cause failure on big-endian platforms -* [ARROW-8751](https://issues.apache.org/jira/browse/ARROW-8751) - [Rust] ParquetFileArrowReader should be able to read empty parquet file without error -* [ARROW-8752](https://issues.apache.org/jira/browse/ARROW-8752) - [Rust] Remove unused hashmap -* [ARROW-8753](https://issues.apache.org/jira/browse/ARROW-8753) - [C++][CI] Add a test job on ARM -* [ARROW-8754](https://issues.apache.org/jira/browse/ARROW-8754) - [C++][CI] enable tests for additional components on big-endian platforms -* [ARROW-8756](https://issues.apache.org/jira/browse/ARROW-8756) - [C++] Bitmap word tests cause failures on a big-endian platform -* [ARROW-8757](https://issues.apache.org/jira/browse/ARROW-8757) - [C++] Plasma header is written in native endian -* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0 -* [ARROW-8759](https://issues.apache.org/jira/browse/ARROW-8759) - [C++] TestPlasmaSerialization.DeleteReply tests failure on big-endian platforms -* [ARROW-8762](https://issues.apache.org/jira/browse/ARROW-8762) - [C++][Gandiva] Replace Gandiva's BitmapAnd with common implementation -* [ARROW-8763](https://issues.apache.org/jira/browse/ARROW-8763) - [C++] Create RandomAccessFile::WillNeed-like API -* [ARROW-8764](https://issues.apache.org/jira/browse/ARROW-8764) - [C++] Make ThreadPool configurable in ReadRangeCache -* [ARROW-8766](https://issues.apache.org/jira/browse/ARROW-8766) - [Python] A FileSystem implementation based on Python callbacks -* [ARROW-8769](https://issues.apache.org/jira/browse/ARROW-8769) - [C++] Add convenience methods to access fields by name in StructScalar -* 
[ARROW-8770](https://issues.apache.org/jira/browse/ARROW-8770) - [C++][CI] enable arrow-csv-test on s390x -* [ARROW-8772](https://issues.apache.org/jira/browse/ARROW-8772) - [C++] Expand SumKernel benchmark to more types -* [ARROW-8777](https://issues.apache.org/jira/browse/ARROW-8777) - [Rust] Parquet.rs does not support reading fixed-size binary fields. -* [ARROW-8778](https://issues.apache.org/jira/browse/ARROW-8778) - [C++][Gandiva] SelectionVector related test failed on big-endian platforms -* [ARROW-8779](https://issues.apache.org/jira/browse/ARROW-8779) - [R] Implement conversion to List -* [ARROW-8781](https://issues.apache.org/jira/browse/ARROW-8781) - [CI][C++] Enable ccache on GHA MinGW jobs -* [ARROW-8782](https://issues.apache.org/jira/browse/ARROW-8782) - [Rust] [DataFusion] Add benchmarks based on NYC Taxi data set -* [ARROW-8783](https://issues.apache.org/jira/browse/ARROW-8783) - [Rust] [DataFusion] Logical plan should have ParquetScan and CsvScan entries -* [ARROW-8784](https://issues.apache.org/jira/browse/ARROW-8784) - [Rust] [DataFusion] Remove use of Arc from LogicalPlan -* [ARROW-8785](https://issues.apache.org/jira/browse/ARROW-8785) - [Python][Packaging] Build the windows wheels with MIMALLOC enabled -* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build -* [ARROW-8788](https://issues.apache.org/jira/browse/ARROW-8788) - [C\#] Array builders to use bit-packed buffer builder rather than boolean array builder for validity map -* [ARROW-8789](https://issues.apache.org/jira/browse/ARROW-8789) - [Rust] Add separate crate for integration test binaries -* [ARROW-8790](https://issues.apache.org/jira/browse/ARROW-8790) - [C++][CI] Enable arrow-flight-test on s390x -* [ARROW-8791](https://issues.apache.org/jira/browse/ARROW-8791) - [Rust] Creating StringDictionaryBuilder with existing dictionary values -* [ARROW-8792](https://issues.apache.org/jira/browse/ARROW-8792) - [C++] Improved declarative compute function / kernel development framework, normalize calling conventions -* [ARROW-8793](https://issues.apache.org/jira/browse/ARROW-8793) - [C++] BitUtil::SetBitsTo probably doesn't need to be inline -* [ARROW-8794](https://issues.apache.org/jira/browse/ARROW-8794) - [C++] Expand benchmark coverage for arrow from parquet reading -* [ARROW-8795](https://issues.apache.org/jira/browse/ARROW-8795) - [C++] Limited iOS support -* [ARROW-8800](https://issues.apache.org/jira/browse/ARROW-8800) - [C++] Split arrow::ChunkedArray into arrow/chunked\_array.h -* [ARROW-8804](https://issues.apache.org/jira/browse/ARROW-8804) - [R][CI] Followup to Rtools40 upgrade -* [ARROW-8814](https://issues.apache.org/jira/browse/ARROW-8814) - [Dev][Release] Binary upload script keeps raising locale warnings -* [ARROW-8815](https://issues.apache.org/jira/browse/ARROW-8815) - [Dev][Release] Binary upload script should retry on unexpected bintray request error -* [ARROW-8818](https://issues.apache.org/jira/browse/ARROW-8818) - [Rust] Failing to build on master due to Flatbuffers/Union issues -* [ARROW-8822](https://issues.apache.org/jira/browse/ARROW-8822) - [Rust] [DataFusion] Add MemoryScan variant to LogicalPlan -* [ARROW-8827](https://issues.apache.org/jira/browse/ARROW-8827) - [Integration Testing] Initial skeleton for Rust integration tests -* [ARROW-8830](https://issues.apache.org/jira/browse/ARROW-8830) - [GLib] Add support for Tell against not seekable GIO output stream -*
[ARROW-8831](https://issues.apache.org/jira/browse/ARROW-8831) - [Rust] incomplete SIMD implementation in simd\_compare\_op -* [ARROW-8833](https://issues.apache.org/jira/browse/ARROW-8833) - [Rust] Implement VALIDATE mode in integration test binary -* [ARROW-8834](https://issues.apache.org/jira/browse/ARROW-8834) - [Rust] Implement arrow-file-to-stream for integration testing -* [ARROW-8835](https://issues.apache.org/jira/browse/ARROW-8835) - [Rust] Implement arrow-stream-to-file for integration testing -* [ARROW-8836](https://issues.apache.org/jira/browse/ARROW-8836) - [Website] Update copyright end year automatically -* [ARROW-8837](https://issues.apache.org/jira/browse/ARROW-8837) - [Rust] Add Null type -* [ARROW-8838](https://issues.apache.org/jira/browse/ARROW-8838) - [Rust] File reader fails to read header from valid files -* [ARROW-8839](https://issues.apache.org/jira/browse/ARROW-8839) - [Rust] datafusion logical plan should support scanning csv without provided schema -* [ARROW-8840](https://issues.apache.org/jira/browse/ARROW-8840) - [Rust] datafusion ExecutionError should implement std::error::Error trait -* [ARROW-8841](https://issues.apache.org/jira/browse/ARROW-8841) - [C++] Add benchmark and unittest for PLAIN spaced -* [ARROW-8843](https://issues.apache.org/jira/browse/ARROW-8843) - [C++] Optimize BitmapEquals unaligned case -* [ARROW-8844](https://issues.apache.org/jira/browse/ARROW-8844) - [C++] Optimize TransferBitmap unaligned case -* [ARROW-8846](https://issues.apache.org/jira/browse/ARROW-8846) - [Dev][Python] Autoformat Python sources with Archery -* [ARROW-8847](https://issues.apache.org/jira/browse/ARROW-8847) - [C++] Pass task size / metrics in Executor API -* [ARROW-8851](https://issues.apache.org/jira/browse/ARROW-8851) - [Python][Documentation] Fix FutureWarnings in Python Plasma docs -* [ARROW-8852](https://issues.apache.org/jira/browse/ARROW-8852) - [R] Post-0.17.1 adjustments -* [ARROW-8854](https://issues.apache.org/jira/browse/ARROW-8854) - [Rust] [Integration Testing] Show output from arrow-json-integration-test -* [ARROW-8855](https://issues.apache.org/jira/browse/ARROW-8855) - [Rust] [Integration Testing] data type Date32(Day) not supported -* [ARROW-8856](https://issues.apache.org/jira/browse/ARROW-8856) - [Rust] [Integration Testing] Return empty batch if MessageHeader is NONE -* [ARROW-8864](https://issues.apache.org/jira/browse/ARROW-8864) - [R] Add methods to Table/RecordBatch for consistency with data.frame -* [ARROW-8866](https://issues.apache.org/jira/browse/ARROW-8866) - [C++] Split Type::UNION into Type::SPARSE\_UNION and Type::DENSE\_UNION -* [ARROW-8867](https://issues.apache.org/jira/browse/ARROW-8867) - [R] Support converting POSIXlt type -* [ARROW-8875](https://issues.apache.org/jira/browse/ARROW-8875) - [C++] use AWS SDK SetResponseStreamFactory to avoid a copy of bytes -* [ARROW-8877](https://issues.apache.org/jira/browse/ARROW-8877) - [Rust] add CSV read option struct to simplify datafusion interface -* [ARROW-8880](https://issues.apache.org/jira/browse/ARROW-8880) - [R][Linux] Make R Binary Install Friendlier -* [ARROW-8881](https://issues.apache.org/jira/browse/ARROW-8881) - [Rust] Add large list and binary support -* [ARROW-8885](https://issues.apache.org/jira/browse/ARROW-8885) - [R] Don't include everything everywhere -* [ARROW-8886](https://issues.apache.org/jira/browse/ARROW-8886) - [C\#] Decide and implement appropriate behaviour for Array builder resize to negative size -*
[ARROW-8887](https://issues.apache.org/jira/browse/ARROW-8887) - [Java] Buffer size for complex vectors increases rapidly in case of clear/write loop -* [ARROW-8890](https://issues.apache.org/jira/browse/ARROW-8890) - [R] Fix C++ lint issue -* [ARROW-8895](https://issues.apache.org/jira/browse/ARROW-8895) - [C++] Add C++ unit tests for filter and take functions on temporal type inputs, including timestamps -* [ARROW-8896](https://issues.apache.org/jira/browse/ARROW-8896) - [C++] Reimplement dictionary unpacking in Cast kernels using Take -* [ARROW-8899](https://issues.apache.org/jira/browse/ARROW-8899) - [R] Add R metadata like pandas metadata for round-trip fidelity -* [ARROW-8901](https://issues.apache.org/jira/browse/ARROW-8901) - [C++] Reduce number of take kernels -* [ARROW-8903](https://issues.apache.org/jira/browse/ARROW-8903) - [C++] Implement optimized "unsafe take" for use with selection vectors for kernel execution -* [ARROW-8904](https://issues.apache.org/jira/browse/ARROW-8904) - [Python] Fix usages of deprecated C++ APIs related to child/field -* [ARROW-8906](https://issues.apache.org/jira/browse/ARROW-8906) - [Rust] Support reading multiple CSV files for schema inference -* [ARROW-8907](https://issues.apache.org/jira/browse/ARROW-8907) - [Rust] implement scalar comparison operations -* [ARROW-8912](https://issues.apache.org/jira/browse/ARROW-8912) - [Ruby] Keep reference of Arrow::Buffer's data for GC -* [ARROW-8913](https://issues.apache.org/jira/browse/ARROW-8913) - [Ruby] Use "field" instead of "child" -* [ARROW-8914](https://issues.apache.org/jira/browse/ARROW-8914) - [C++][Gandiva] Decimal128 related test failed on big-endian platforms -* [ARROW-8915](https://issues.apache.org/jira/browse/ARROW-8915) - [Dev][Archery] Require Click 7 -* [ARROW-8917](https://issues.apache.org/jira/browse/ARROW-8917) - [C++][Compute] Formalize "metafunction" concept -* [ARROW-8918](https://issues.apache.org/jira/browse/ARROW-8918) - [C++] Add cast "metafunction" to FunctionRegistry that addresses dispatching to appropriate type-specific CastFunction -* [ARROW-8922](https://issues.apache.org/jira/browse/ARROW-8922) - [C++] Implement example string scalar kernel function to assist with string kernels buildout per ARROW-555 -* [ARROW-8923](https://issues.apache.org/jira/browse/ARROW-8923) - [C++] Improve usability of arrow::compute::CallFunction by moving ExecContext\* argument to end and adding default -* [ARROW-8926](https://issues.apache.org/jira/browse/ARROW-8926) - [C++] Improve docstrings in new public APIs in arrow/compute and fix miscellaneous typos -* [ARROW-8927](https://issues.apache.org/jira/browse/ARROW-8927) - [C++] Support dictionary memos when reading/writing record batches using cuda IPC -* [ARROW-8929](https://issues.apache.org/jira/browse/ARROW-8929) - [C++] Change compute::Arity:VarArgs min\_args default to 0 -* [ARROW-8931](https://issues.apache.org/jira/browse/ARROW-8931) - [Rust] Support lexical sort in arrow compute kernel -* [ARROW-8933](https://issues.apache.org/jira/browse/ARROW-8933) - [C++] Reduce generated code in vector\_hash.cc -* [ARROW-8934](https://issues.apache.org/jira/browse/ARROW-8934) - [C++] Add timestamp subtract kernel aliased to int64 subtract implementation -* [ARROW-8937](https://issues.apache.org/jira/browse/ARROW-8937) - [C++] Add "parse\_strptime" function for string to timestamp conversions using the kernels framework -* [ARROW-8938](https://issues.apache.org/jira/browse/ARROW-8938) - [R] Provide binding for arrow::compute::CallFunction -* 
[ARROW-8940](https://issues.apache.org/jira/browse/ARROW-8940) - [Java] Fix the performance degradation of integration tests -* [ARROW-8941](https://issues.apache.org/jira/browse/ARROW-8941) - [C++/Python] arrow-nightlies conda repository is full -* [ARROW-8942](https://issues.apache.org/jira/browse/ARROW-8942) - [R] Detect compression in reading CSV/JSON -* [ARROW-8943](https://issues.apache.org/jira/browse/ARROW-8943) - [C++][Dataset] Add support for Partitioning to ParquetDatasetFactory -* [ARROW-8950](https://issues.apache.org/jira/browse/ARROW-8950) - [C++] Make head optional in s3fs -* [ARROW-8958](https://issues.apache.org/jira/browse/ARROW-8958) - [FlightRPC][Python] Implement Flight DoExchange for Python -* [ARROW-8960](https://issues.apache.org/jira/browse/ARROW-8960) - [MINOR] [FORMAT] Fix typos in comments -* [ARROW-8961](https://issues.apache.org/jira/browse/ARROW-8961) - [C++] Add utf8proc library to toolchain -* [ARROW-8963](https://issues.apache.org/jira/browse/ARROW-8963) - [C++][Parquet] Parquet cpp optimize allocate memory -* [ARROW-8965](https://issues.apache.org/jira/browse/ARROW-8965) - [Python][Documentation] Pyarrow documentation for pip nightlies references 404'd location -* [ARROW-8966](https://issues.apache.org/jira/browse/ARROW-8966) - [C++] Move arrow::ArrayData to a separate header file -* [ARROW-8969](https://issues.apache.org/jira/browse/ARROW-8969) - [C++] Reduce generated code in compute/kernels/scalar\_compare.cc -* [ARROW-8970](https://issues.apache.org/jira/browse/ARROW-8970) - [C++] Reduce shared library / binary code size (umbrella issue) -* [ARROW-8972](https://issues.apache.org/jira/browse/ARROW-8972) - [Java] Support range value comparison for large varchar/varbinary vectors -* [ARROW-8973](https://issues.apache.org/jira/browse/ARROW-8973) - [Java] Support batch value appending for large varchar/varbinary vectors -* [ARROW-8974](https://issues.apache.org/jira/browse/ARROW-8974) - [C++] Refine TransferBitmap template parameters -* [ARROW-8976](https://issues.apache.org/jira/browse/ARROW-8976) - [C++] compute::CallFunction can't Filter/Take with ChunkedArray -* [ARROW-8979](https://issues.apache.org/jira/browse/ARROW-8979) - [C++] Implement bitmap word reader and writer -* [ARROW-8984](https://issues.apache.org/jira/browse/ARROW-8984) - [R] Revise install guides now that Windows conda package exists -* [ARROW-8985](https://issues.apache.org/jira/browse/ARROW-8985) - [Format] Add "byte width" field with default of 16 to Decimal Flatbuffers type for forward compatibility -* [ARROW-8989](https://issues.apache.org/jira/browse/ARROW-8989) - [C++] Document available functions in compute::FunctionRegistry -* [ARROW-8993](https://issues.apache.org/jira/browse/ARROW-8993) - [Rust] Support reading non-seekable sources in text readers -* [ARROW-8994](https://issues.apache.org/jira/browse/ARROW-8994) - [C++] Disable include-what-you-use cpplint lint checks -* [ARROW-8996](https://issues.apache.org/jira/browse/ARROW-8996) - [C++] Runtime SIMD path for Aggregate Sum/Mean kernel -* [ARROW-8997](https://issues.apache.org/jira/browse/ARROW-8997) - [Archery] Benchmark formatter should have friendly units -* [ARROW-9004](https://issues.apache.org/jira/browse/ARROW-9004) - [C++][Gandiva] Support building with LLVM 10 -* [ARROW-9005](https://issues.apache.org/jira/browse/ARROW-9005) - [Rust] [DataFusion] Support sort expression -* [ARROW-9007](https://issues.apache.org/jira/browse/ARROW-9007) - [Rust] Support appending arrays by merging array data -* 
[ARROW-9014](https://issues.apache.org/jira/browse/ARROW-9014) - [Packaging] Bump the minor part of the automatically generated version in crossbow -* [ARROW-9015](https://issues.apache.org/jira/browse/ARROW-9015) - [Java] Make BaseAllocator package private -* [ARROW-9016](https://issues.apache.org/jira/browse/ARROW-9016) - [Java] Remove direct references to Netty/Unsafe Allocators -* [ARROW-9017](https://issues.apache.org/jira/browse/ARROW-9017) - [Python] Refactor the Scalar classes -* [ARROW-9018](https://issues.apache.org/jira/browse/ARROW-9018) - [C++] Remove APIs that were deprecated in 0.17.x and prior -* [ARROW-9021](https://issues.apache.org/jira/browse/ARROW-9021) - [Python] The filesystem keyword in parquet.read\_table is not documented -* [ARROW-9022](https://issues.apache.org/jira/browse/ARROW-9022) - [C++] Add/Sub/Mul arithmetic kernels with overflow check -* [ARROW-9029](https://issues.apache.org/jira/browse/ARROW-9029) - [C++] Implement BitBlockCounter interface for blockwise popcounts of validity bitmaps -* [ARROW-9030](https://issues.apache.org/jira/browse/ARROW-9030) - [Python] Clean up some usages of pyarrow.compat, move some common functions/symbols to lib.pyx -* [ARROW-9031](https://issues.apache.org/jira/browse/ARROW-9031) - [R] Implement conversion from Type::UINT64 to R vector -* [ARROW-9032](https://issues.apache.org/jira/browse/ARROW-9032) - [C++] Split arrow/util/bit\_util.h into multiple header files -* [ARROW-9034](https://issues.apache.org/jira/browse/ARROW-9034) - [C++] Implement binary (two bitmap) version of BitBlockCounter -* [ARROW-9042](https://issues.apache.org/jira/browse/ARROW-9042) - [C++] Add Subtract and Multiply arithmetic kernels with wrap-around behavior -* [ARROW-9043](https://issues.apache.org/jira/browse/ARROW-9043) - [Go] Temporarily copy LICENSE.txt to go/ -* [ARROW-9045](https://issues.apache.org/jira/browse/ARROW-9045) - [C++] Improve and expand Take/Filter benchmarks -* [ARROW-9046](https://issues.apache.org/jira/browse/ARROW-9046) - [C++][R] Put more things in type\_fwds -* [ARROW-9047](https://issues.apache.org/jira/browse/ARROW-9047) - [Rust] Setting 0-bits of a 0-length bitset segfaults -* [ARROW-9050](https://issues.apache.org/jira/browse/ARROW-9050) - [Release] Use 1.0.0 as the next version -* [ARROW-9051](https://issues.apache.org/jira/browse/ARROW-9051) - [GLib] Refer Array related objects from Array -* [ARROW-9052](https://issues.apache.org/jira/browse/ARROW-9052) - [CI][MinGW] Enable Gandiva -* [ARROW-9055](https://issues.apache.org/jira/browse/ARROW-9055) - [C++] Add sum/mean kernels for Boolean type -* [ARROW-9058](https://issues.apache.org/jira/browse/ARROW-9058) - [Packaging][wheel] Boost download is failed -* [ARROW-9060](https://issues.apache.org/jira/browse/ARROW-9060) - [GLib] Add support for building Apache Arrow Datasets GLib with non-installed Apache Arrow Datasets -* [ARROW-9061](https://issues.apache.org/jira/browse/ARROW-9061) - [Packaging][APT][Yum][GLib] Add Apache Arrow Datasets GLib -* [ARROW-9062](https://issues.apache.org/jira/browse/ARROW-9062) - [Rust] Support to read JSON into dictionary type -* [ARROW-9067](https://issues.apache.org/jira/browse/ARROW-9067) - [C++] Create reusable branchless / vectorized index boundschecking functions -* [ARROW-9070](https://issues.apache.org/jira/browse/ARROW-9070) - [C++] StructScalar needs field accessor methods -*
[ARROW-9073](https://issues.apache.org/jira/browse/ARROW-9073) - [C++] RapidJSON include directory detection doesn't work with RapidJSONConfig.cmake -* [ARROW-9074](https://issues.apache.org/jira/browse/ARROW-9074) - [GLib] Add missing arrow-json check -* [ARROW-9075](https://issues.apache.org/jira/browse/ARROW-9075) - [C++] Optimize Filter implementation -* [ARROW-9079](https://issues.apache.org/jira/browse/ARROW-9079) - [C++] Write benchmark for arithmetic kernels -* [ARROW-9083](https://issues.apache.org/jira/browse/ARROW-9083) - [R] collect int64, uint32, uint64 as R integer type if not out of bounds -* [ARROW-9086](https://issues.apache.org/jira/browse/ARROW-9086) - [CI][Homebrew] Enable Gandiva -* [ARROW-9088](https://issues.apache.org/jira/browse/ARROW-9088) - [Rust] Recent version of arrow crate does not compile into wasm target -* [ARROW-9089](https://issues.apache.org/jira/browse/ARROW-9089) - [Python] A PyFileSystem handler for fsspec-based filesystems -* [ARROW-9090](https://issues.apache.org/jira/browse/ARROW-9090) - [C++] Bump versions of bundled libraries -* [ARROW-9091](https://issues.apache.org/jira/browse/ARROW-9091) - [C++] Utilize function's default options when passing no options to CallFunction for a function that requires them -* [ARROW-9093](https://issues.apache.org/jira/browse/ARROW-9093) - [FlightRPC][C++][Python] Allow setting gRPC client options -* [ARROW-9094](https://issues.apache.org/jira/browse/ARROW-9094) - [Python] Bump versions of compiled dependencies in manylinux wheels -* [ARROW-9095](https://issues.apache.org/jira/browse/ARROW-9095) - [Rust] Fix NullArray to comply with spec -* [ARROW-9099](https://issues.apache.org/jira/browse/ARROW-9099) - [C++][Gandiva] Add TRIM function for string -* [ARROW-9100](https://issues.apache.org/jira/browse/ARROW-9100) - [C++] Add ascii\_lower kernel -* [ARROW-9101](https://issues.apache.org/jira/browse/ARROW-9101) - [Doc][C++][Python] Document encoding expected by CSV and JSON readers -* [ARROW-9102](https://issues.apache.org/jira/browse/ARROW-9102) - [Packaging] Upload built manylinux docker images -* [ARROW-9106](https://issues.apache.org/jira/browse/ARROW-9106) - [C++] Add C++ foundation to ease file transcoding -* [ARROW-9108](https://issues.apache.org/jira/browse/ARROW-9108) - [C++][Dataset] Add Parquet Statistics conversion for timestamp columns -* [ARROW-9109](https://issues.apache.org/jira/browse/ARROW-9109) - [Python][Packaging] Enable S3 support in manylinux wheels -* [ARROW-9110](https://issues.apache.org/jira/browse/ARROW-9110) - [C++] Fix CPU cache size detection on macOS -* [ARROW-9112](https://issues.apache.org/jira/browse/ARROW-9112) - [R] Update autobrew script location -* [ARROW-9115](https://issues.apache.org/jira/browse/ARROW-9115) - [C++] Process data buffers in batch in ascii\_lower / ascii\_upper kernels rather than using string\_view value iteration -* [ARROW-9116](https://issues.apache.org/jira/browse/ARROW-9116) - [C++] Add BinaryArray::total\_values\_length() -* [ARROW-9118](https://issues.apache.org/jira/browse/ARROW-9118) - [C++] Add more general BoundsCheck function that also checks for arbitrary lower limits in integer arrays -* [ARROW-9119](https://issues.apache.org/jira/browse/ARROW-9119) - [C++] Add support for building with system static gRPC -* [ARROW-9123](https://issues.apache.org/jira/browse/ARROW-9123) - [Python][wheel] Use libzstd.a explicitly -*
[ARROW-9124](https://issues.apache.org/jira/browse/ARROW-9124) - [Rust][Datafusion] DFParser should consume sql query as &str instead of String -* [ARROW-9125](https://issues.apache.org/jira/browse/ARROW-9125) - [C++] Add missing include for arrow::internal::ZeroMemory() for Valgrind -* [ARROW-9129](https://issues.apache.org/jira/browse/ARROW-9129) - [Python][JPype] Test is failed with JPype 0.7.5 -* [ARROW-9130](https://issues.apache.org/jira/browse/ARROW-9130) - [Python] Add deprecated wrappers functions to a pyarrow/compat.py module for 1.0.0 that will be removed later -* [ARROW-9131](https://issues.apache.org/jira/browse/ARROW-9131) - [C++] Faster ascii\_lower and ascii\_upper -* [ARROW-9132](https://issues.apache.org/jira/browse/ARROW-9132) - [C++] Implement hash kernels for dictionary data with constant dictionaries -* [ARROW-9133](https://issues.apache.org/jira/browse/ARROW-9133) - [C++] Add utf8\_upper and utf8\_lower -* [ARROW-9137](https://issues.apache.org/jira/browse/ARROW-9137) - [GLib][Ruby] Allow to read Parquet files in chunks (by RowGroup) -* [ARROW-9138](https://issues.apache.org/jira/browse/ARROW-9138) - [Docs][Format] Make sure format version is hard coded in the docs -* [ARROW-9139](https://issues.apache.org/jira/browse/ARROW-9139) - [Python] parquet read\_table should not use\_legacy\_dataset -* [ARROW-9144](https://issues.apache.org/jira/browse/ARROW-9144) - [CI] OSS-Fuzz build fails because recent changes in the google repository -* [ARROW-9145](https://issues.apache.org/jira/browse/ARROW-9145) - [C++] Add true\_count / false\_count methods to BooleanArray -* [ARROW-9152](https://issues.apache.org/jira/browse/ARROW-9152) - [C++] Create specialized filter implementation for varbinary types -* [ARROW-9153](https://issues.apache.org/jira/browse/ARROW-9153) - [Python] Add bindings for StructScalar -* [ARROW-9154](https://issues.apache.org/jira/browse/ARROW-9154) - [Developer] Use GitHub issue templates better -* [ARROW-9155](https://issues.apache.org/jira/browse/ARROW-9155) - [Archery] Less precise but faster default settings for "archery benchmark diff" -* [ARROW-9156](https://issues.apache.org/jira/browse/ARROW-9156) - [C++] Reducing the code size of the tensor module -* [ARROW-9157](https://issues.apache.org/jira/browse/ARROW-9157) - [Rust][Datafusion] execution context's create\_physical\_plan should take self as immutable reference -* [ARROW-9158](https://issues.apache.org/jira/browse/ARROW-9158) - [Rust][Datafusion] Projection physical plan compilation should preserve nullability -* [ARROW-9159](https://issues.apache.org/jira/browse/ARROW-9159) - [Python] Expose the isnull/isvalid kernels -* [ARROW-9162](https://issues.apache.org/jira/browse/ARROW-9162) - [Python] Expose Add/Subtract/Multiply arithmetic kernels -* [ARROW-9163](https://issues.apache.org/jira/browse/ARROW-9163) - [C++] Add methods to StringArray, LargeStringArray, to validate whether its values are all UTF-8 -* [ARROW-9166](https://issues.apache.org/jira/browse/ARROW-9166) - [Website] Add overview page -* [ARROW-9167](https://issues.apache.org/jira/browse/ARROW-9167) - [Doc][Website] /docs/c\_glib/index.html is overwritten -* [ARROW-9168](https://issues.apache.org/jira/browse/ARROW-9168) - [C++][Flight] allow flight benchmark to use separated TCP connections -* [ARROW-9173](https://issues.apache.org/jira/browse/ARROW-9173) - [C++] Document how to use Arrow from a third-party CMake project -* [ARROW-9175](https://issues.apache.org/jira/browse/ARROW-9175) - [FlightRPC][C++][Python] Expose connected 
peer -* [ARROW-9176](https://issues.apache.org/jira/browse/ARROW-9176) - [Rust] Fix for memory leaks in Arrow allocator -* [ARROW-9178](https://issues.apache.org/jira/browse/ARROW-9178) - [R] Improve documentation about CSV reader -* [ARROW-9179](https://issues.apache.org/jira/browse/ARROW-9179) - [R] Replace usage of iris dataset in tests -* [ARROW-9180](https://issues.apache.org/jira/browse/ARROW-9180) - [Developer] Remove usage of whitelist, blacklist, slave, etc. -* [ARROW-9181](https://issues.apache.org/jira/browse/ARROW-9181) - [C++] Instantiate fewer templates in Cast kernel implementation -* [ARROW-9182](https://issues.apache.org/jira/browse/ARROW-9182) - [C++] Use "applicator" namespace for kernel operator-to-kernel functors, streamline argument unboxing -* [ARROW-9185](https://issues.apache.org/jira/browse/ARROW-9185) - [C++] [Java][Gandiva] Make llvm build optimisation configurable from java -* [ARROW-9188](https://issues.apache.org/jira/browse/ARROW-9188) - [C++] Do not always statically link Brotli libraries -* [ARROW-9189](https://issues.apache.org/jira/browse/ARROW-9189) - [Website] Improve contributor guide -* [ARROW-9190](https://issues.apache.org/jira/browse/ARROW-9190) - [Website][C++] Add blog post on efforts to make building lighter and easier -* [ARROW-9191](https://issues.apache.org/jira/browse/ARROW-9191) - [Rust] Do not panic when int96 milliseconds are negative -* [ARROW-9192](https://issues.apache.org/jira/browse/ARROW-9192) - [CI][Rust] Add support for running clippy -* [ARROW-9193](https://issues.apache.org/jira/browse/ARROW-9193) - [C++] Add method to parse date from null-terminated string -* [ARROW-9197](https://issues.apache.org/jira/browse/ARROW-9197) - [C++] Revamp numeric casts: faster performance and reduced binary size -* [ARROW-9201](https://issues.apache.org/jira/browse/ARROW-9201) - [Archery] Render human-readable table when using "archery benchmark diff" -* [ARROW-9202](https://issues.apache.org/jira/browse/ARROW-9202) - [GLib] Add GArrowDatum -* [ARROW-9203](https://issues.apache.org/jira/browse/ARROW-9203) - [Packaging][deb] Add missing gir1.2-arrow-dataset-1.0.install -* [ARROW-9204](https://issues.apache.org/jira/browse/ARROW-9204) - [C++][Flight] change records\_per\_stream to int64 in flight benchmark -* [ARROW-9205](https://issues.apache.org/jira/browse/ARROW-9205) - [Documentation] Fix typos in Columnar.rst -* [ARROW-9206](https://issues.apache.org/jira/browse/ARROW-9206) - [C++][Flight] measure latency in flight benchmark -* [ARROW-9207](https://issues.apache.org/jira/browse/ARROW-9207) - [Python][Dataset] Clean-up internal FileSource class -* [ARROW-9210](https://issues.apache.org/jira/browse/ARROW-9210) - [C++] Use OptionalBitBlockCounter in ArrayDataInlineVisitor -* [ARROW-9214](https://issues.apache.org/jira/browse/ARROW-9214) - [C++] Avoid util::optional in favor of separate inlineable functions in arrow/visitor\_inline.h -* [ARROW-9216](https://issues.apache.org/jira/browse/ARROW-9216) - [C++][Parquet] Use BitBlockCounter for plain spaced encoding/decoding -* [ARROW-9217](https://issues.apache.org/jira/browse/ARROW-9217) - [C++][Parquet] Cover 0.01% null for the plain spaced encoding/decoding benchmark -* [ARROW-9220](https://issues.apache.org/jira/browse/ARROW-9220) - [C++] Disable relevant compute kernels if ARROW\_WITH\_UTF8PROC=OFF -* [ARROW-9222](https://issues.apache.org/jira/browse/ARROW-9222) - [Format][Proposal] Remove validity bitmap from Union types -* [ARROW-9224](https://issues.apache.org/jira/browse/ARROW-9224) -
[Dev][Archery] Copy local repo on clone failure -* [ARROW-9225](https://issues.apache.org/jira/browse/ARROW-9225) - [C++][Compute] Improve counting sort -* [ARROW-9231](https://issues.apache.org/jira/browse/ARROW-9231) - [Format] Increment MetadataVersion from V4 to V5 -* [ARROW-9234](https://issues.apache.org/jira/browse/ARROW-9234) - [GLib][CUDA] Add support for dictionary memo on reading record batch from buffer -* [ARROW-9241](https://issues.apache.org/jira/browse/ARROW-9241) - [C++] Add forward compatibility checks for Decimal::bitWidth -* [ARROW-9242](https://issues.apache.org/jira/browse/ARROW-9242) - [Java] Add forward compatibility checks for Decimal::bitWidth -* [ARROW-9247](https://issues.apache.org/jira/browse/ARROW-9247) - [Python] Expose BinaryArray::total\_values\_length in bindings -* [ARROW-9248](https://issues.apache.org/jira/browse/ARROW-9248) - [C++] Add "list\_size" function that returns Int32Array/Int64Array giving list cell sizes -* [ARROW-9249](https://issues.apache.org/jira/browse/ARROW-9249) - [C++] Implement "list\_parent\_indices" vector function -* [ARROW-9250](https://issues.apache.org/jira/browse/ARROW-9250) - [C++] Compact generated code in compute/kernels/scalar\_set\_lookup.cc using same method as vector\_hash.cc -* [ARROW-9251](https://issues.apache.org/jira/browse/ARROW-9251) - [C++] Move JSON testing code for integration tests to libarrow\_testing -* [ARROW-9254](https://issues.apache.org/jira/browse/ARROW-9254) - [C++] Factor out some integer casting internals so it can be reused with temporal casts -* [ARROW-9255](https://issues.apache.org/jira/browse/ARROW-9255) - [C++] Use CMake to build bundled Protobuf with CMake \>= 3.7 -* [ARROW-9256](https://issues.apache.org/jira/browse/ARROW-9256) - [C++] Incorrect variable name ARROW\_CXX\_FLAGS -* [ARROW-9258](https://issues.apache.org/jira/browse/ARROW-9258) - [Format] Add V5 MetadataVersion -* [ARROW-9259](https://issues.apache.org/jira/browse/ARROW-9259) - [Format] Permit unsigned dictionary indices in Columnar.rst -* [ARROW-9262](https://issues.apache.org/jira/browse/ARROW-9262) - [Packaging][Linux][CI] Use Ubuntu 18.04 to build ARM64 packages on Travis CI -* [ARROW-9263](https://issues.apache.org/jira/browse/ARROW-9263) - [C++] Benchmark: promote RegressionSetArgs size to L2 -* [ARROW-9264](https://issues.apache.org/jira/browse/ARROW-9264) - [C++] Cleanup Parquet Arrow Schema code -* [ARROW-9265](https://issues.apache.org/jira/browse/ARROW-9265) - [C++] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1 -* [ARROW-9268](https://issues.apache.org/jira/browse/ARROW-9268) - [C++] Add is{alnum,alpha,...} kernels for strings -* [ARROW-9272](https://issues.apache.org/jira/browse/ARROW-9272) - [C++][Python] Reduce complexity in python to arrow conversion -* [ARROW-9276](https://issues.apache.org/jira/browse/ARROW-9276) - [Dev] Enable ARROW\_CUDA when generating API documentations -* [ARROW-9277](https://issues.apache.org/jira/browse/ARROW-9277) - [C++] Fix documentation of Reading CSV files -* [ARROW-9278](https://issues.apache.org/jira/browse/ARROW-9278) - [C++] Implement Union validity bitmap changes from ARROW-9222 -* [ARROW-9280](https://issues.apache.org/jira/browse/ARROW-9280) - [Rust] Write statistics to Parquet files -* [ARROW-9281](https://issues.apache.org/jira/browse/ARROW-9281) - [R] Turn off utf8proc in R builds -* [ARROW-9283](https://issues.apache.org/jira/browse/ARROW-9283) - [Python] Expose C++ build info -* 
[ARROW-9287](https://issues.apache.org/jira/browse/ARROW-9287) - [C++] Implement support for unsigned dictionary indices -* [ARROW-9289](https://issues.apache.org/jira/browse/ARROW-9289) - [R] Remove deprecated functions -* [ARROW-9290](https://issues.apache.org/jira/browse/ARROW-9290) - [Rust] [Parquet] Add features to allow opting out of dependencies -* [ARROW-9291](https://issues.apache.org/jira/browse/ARROW-9291) - [R] Support fixed size binary/list types -* [ARROW-9292](https://issues.apache.org/jira/browse/ARROW-9292) - [Rust] Update feature matrix with passing tests -* [ARROW-9294](https://issues.apache.org/jira/browse/ARROW-9294) - [GLib] Add GArrowFunction -* [ARROW-9300](https://issues.apache.org/jira/browse/ARROW-9300) - [Java] Separate Netty Memory to its own module -* [ARROW-9306](https://issues.apache.org/jira/browse/ARROW-9306) - [Ruby] Add support for Arrow::RecordBatch.new(raw\_table) -* [ARROW-9307](https://issues.apache.org/jira/browse/ARROW-9307) - [Ruby] Add Arrow::RecordBatchIterator\#to\_a -* [ARROW-9308](https://issues.apache.org/jira/browse/ARROW-9308) - [Format] Add Feature enum to schema.fbs for forward compatibility -* [ARROW-9316](https://issues.apache.org/jira/browse/ARROW-9316) - [C++] Use "Dataset" instead of "Datasets" -* [ARROW-9321](https://issues.apache.org/jira/browse/ARROW-9321) - [C++][Dataset] Allow to "collect" statistics for ParquetFragment row groups if not constructed from \_metadata -* [ARROW-9322](https://issues.apache.org/jira/browse/ARROW-9322) - [R] Dataset documentation polishing -* [ARROW-9323](https://issues.apache.org/jira/browse/ARROW-9323) - [Ruby] Add Red Arrow Dataset -* [ARROW-9327](https://issues.apache.org/jira/browse/ARROW-9327) - Fix all clippy errors for arrow crate -* [ARROW-9329](https://issues.apache.org/jira/browse/ARROW-9329) - [C++][Gandiva] Implement castTimestampToDate function -* [ARROW-9331](https://issues.apache.org/jira/browse/ARROW-9331) - [C++] Improve the performance of Tensor-to-SparseTensor conversion -* [ARROW-9333](https://issues.apache.org/jira/browse/ARROW-9333) - [Python] Expose more IPC write options in Python -* [ARROW-9335](https://issues.apache.org/jira/browse/ARROW-9335) - [Website] Update website for 1.0 -* [ARROW-9337](https://issues.apache.org/jira/browse/ARROW-9337) - [R] On C++ library build failure, give an unambiguous message -* [ARROW-9339](https://issues.apache.org/jira/browse/ARROW-9339) - [Rust] Comments on SIMD in Arrow README are incorrect -* [ARROW-9340](https://issues.apache.org/jira/browse/ARROW-9340) - [R] Use CRAN version of decor package -* [ARROW-9341](https://issues.apache.org/jira/browse/ARROW-9341) - [GLib] Use arrow::Datum version Take() -* [ARROW-9345](https://issues.apache.org/jira/browse/ARROW-9345) - [C++][Dataset] Expression with dictionary type should work with operand of value type -* [ARROW-9346](https://issues.apache.org/jira/browse/ARROW-9346) - [C++][Python][Dataset] Add total\_byte\_size metadata to RowGroupInfo -* [ARROW-9362](https://issues.apache.org/jira/browse/ARROW-9362) - [Java] Add support for writing MetadataVersion::V4-compatible IPC messages for compatibility with library versions <= 0.17.1 -* [ARROW-9365](https://issues.apache.org/jira/browse/ARROW-9365) - [Go] Implement the rest of the typed array builders in NewBuilder -* [ARROW-9370](https://issues.apache.org/jira/browse/ARROW-9370) - [Java] Bump Netty version -* [ARROW-9374](https://issues.apache.org/jira/browse/ARROW-9374) - [C++][Python] Expose MakeArrayFromScalar -*
[ARROW-9379](https://issues.apache.org/jira/browse/ARROW-9379) - [Rust] Support unsigned dictionary indices -* [ARROW-9383](https://issues.apache.org/jira/browse/ARROW-9383) - [Python] Support fsspec filesystems in Dataset API through fs handler -* [ARROW-9386](https://issues.apache.org/jira/browse/ARROW-9386) - [Rust] RecordBatch.schema() should not return &Arc -* [ARROW-9390](https://issues.apache.org/jira/browse/ARROW-9390) - [C++] Review compute function names -* [ARROW-9391](https://issues.apache.org/jira/browse/ARROW-9391) - [Rust] Float32 values interpreted as zero when record batch has one row -* [ARROW-9393](https://issues.apache.org/jira/browse/ARROW-9393) - [Doc] update supported types documentation for Java -* [ARROW-9395](https://issues.apache.org/jira/browse/ARROW-9395) - [Python] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed -* [ARROW-9399](https://issues.apache.org/jira/browse/ARROW-9399) - [C++] Add forward compatibility checks for unrecognized future MetadataVersion -* [ARROW-9403](https://issues.apache.org/jira/browse/ARROW-9403) - [Python] add .tolist as alias of .to\_pylist -* [ARROW-9407](https://issues.apache.org/jira/browse/ARROW-9407) - [Python] Accept pd.NA as missing value in array constructor -* [ARROW-9411](https://issues.apache.org/jira/browse/ARROW-9411) - [Rust] Update dependencies -* [ARROW-9424](https://issues.apache.org/jira/browse/ARROW-9424) - [C++][Parquet] Disable writing files with LZ4 codec -* [ARROW-9425](https://issues.apache.org/jira/browse/ARROW-9425) - [Rust][DataFusion] Make ExecutionContext sharable between threads -* [ARROW-9427](https://issues.apache.org/jira/browse/ARROW-9427) - [Rust][DataFusion] Add pub fn ExecutionContext.tables() -* [ARROW-9437](https://issues.apache.org/jira/browse/ARROW-9437) - [Python][Packaging] Homebrew fails to install build dependencies in the macOS wheel builds -* [ARROW-9442](https://issues.apache.org/jira/browse/ARROW-9442) - [Python] Do not force Validate() to be called in pyarrow\_wrap\_table -* [ARROW-9445](https://issues.apache.org/jira/browse/ARROW-9445) - [Python] Revert Array.equals changes + expose comparison ops in compute -* [ARROW-9446](https://issues.apache.org/jira/browse/ARROW-9446) - [C++] Export compiler information in BuildInfo -* [ARROW-9447](https://issues.apache.org/jira/browse/ARROW-9447) - [Rust][DataFusion] Allow closures as ScalarUDFs -* [ARROW-9452](https://issues.apache.org/jira/browse/ARROW-9452) - [Rust] [DataFusion] Improve performance of parquet scan -* [ARROW-9470](https://issues.apache.org/jira/browse/ARROW-9470) - [CI][Java] Run Maven in parallel -* [ARROW-9472](https://issues.apache.org/jira/browse/ARROW-9472) - [R] Provide configurable MetadataVersion in IPC API and environment variable to set default to V4 when needed -* [ARROW-9473](https://issues.apache.org/jira/browse/ARROW-9473) - [Doc] Polishing for 1.0 -* [ARROW-9478](https://issues.apache.org/jira/browse/ARROW-9478) - [C++] Improve error message on unsupported cast types -* [ARROW-9484](https://issues.apache.org/jira/browse/ARROW-9484) - [Docs] Update is\* functions to be is\_\* in the compute docs -* [ARROW-9485](https://issues.apache.org/jira/browse/ARROW-9485) - [R] Better shared library stripping -*
[ARROW-9493](https://issues.apache.org/jira/browse/ARROW-9493) - [Python][Dataset] Dictionary encode string partition columns by default -* [ARROW-9509](https://issues.apache.org/jira/browse/ARROW-9509) - [Release] Don't test Gandiva in the windows wheel verification script -* [ARROW-9511](https://issues.apache.org/jira/browse/ARROW-9511) - [Packaging][Release] Set conda packages' build number to 0 -* [ARROW-9519](https://issues.apache.org/jira/browse/ARROW-9519) - [Rust] Improve error message when getting a field by name from schema -* [ARROW-9523](https://issues.apache.org/jira/browse/ARROW-9523) - [Rust] improve performance of filter kernel -* [ARROW-9529](https://issues.apache.org/jira/browse/ARROW-9529) - [Dev][Release] Improvements to release verification scripts -* [ARROW-9531](https://issues.apache.org/jira/browse/ARROW-9531) - [Packaging][Release] Update conda forge dependency pins -* [PARQUET-1820](https://issues.apache.org/jira/browse/PARQUET-1820) - [C++] Use a column filter hint to inform read prefetching in Arrow reads -* [PARQUET-1843](https://issues.apache.org/jira/browse/PARQUET-1843) - [C++] Unnecessary assignment in DictDecoderImpl::Decode -* [PARQUET-1855](https://issues.apache.org/jira/browse/PARQUET-1855) - [C++] Improve documentation on MetaData ownership -* [PARQUET-1861](https://issues.apache.org/jira/browse/PARQUET-1861) - [Documentation][C++] Explain ReaderProperties.buffer\_stream\* - - - -# Apache Arrow 0.17.1 (2020-05-18) - -## Bug Fixes - -* [ARROW-8503](https://issues.apache.org/jira/browse/ARROW-8503) - [Packaging][deb] Can't build apache-arrow-archive-keyring for RC -* [ARROW-8505](https://issues.apache.org/jira/browse/ARROW-8505) - [Release][C\#] "sourcelink test" is failed by Apache.Arrow.AssemblyInfo.cs -* [ARROW-8584](https://issues.apache.org/jira/browse/ARROW-8584) - [Packaging][C++] Protobuf link error in deb builds -* [ARROW-8608](https://issues.apache.org/jira/browse/ARROW-8608) - [C++] Update vendored mpark/variant.h to latest to fix NVCC compilation issues -* [ARROW-8609](https://issues.apache.org/jira/browse/ARROW-8609) - [C++] ORC JNI bridge crashed on null arrow buffer -* [ARROW-8641](https://issues.apache.org/jira/browse/ARROW-8641) - [Python] Regression in feather: no longer supports permutation in column selection -* [ARROW-8657](https://issues.apache.org/jira/browse/ARROW-8657) - [Python][C++][Parquet] Forward compatibility issue from 0.16 to 0.17 when using version='2.0' -* [ARROW-8684](https://issues.apache.org/jira/browse/ARROW-8684) - [Python] "SystemError: Bad call flags in \_PyMethodDef\_RawFastCallDict" in Python 3.7.7 on macOS when using pyarrow wheel -* [ARROW-8694](https://issues.apache.org/jira/browse/ARROW-8694) - [Python][Parquet] parquet.read\_schema() fails when loading wide table created from Pandas DataFrame -* [ARROW-8704](https://issues.apache.org/jira/browse/ARROW-8704) - [C++] Fix Parquet crash on invalid input (OSS-Fuzz) -* [ARROW-8706](https://issues.apache.org/jira/browse/ARROW-8706) - [C++][Parquet] Tracking JIRA for PARQUET-1857 (unencrypted INT16\_MAX Parquet row group limit) -* [ARROW-8728](https://issues.apache.org/jira/browse/ARROW-8728) - [C++] Bitmap operation may cause buffer overflow -* [ARROW-8741](https://issues.apache.org/jira/browse/ARROW-8741) - [Python][Packaging] Keep VS2015 for the windows wheels -* [ARROW-8750](https://issues.apache.org/jira/browse/ARROW-8750) - [Python] pyarrow.feather.write\_feather does not default to lz4 compression if it's available -* 
[PARQUET-1857](https://issues.apache.org/jira/browse/PARQUET-1857) - [C++][Parquet] ParquetFileReader unable to read files with more than 32767 row groups - - -## New Features and Improvements - -* [ARROW-8501](https://issues.apache.org/jira/browse/ARROW-8501) - [Packaging][RPM] Upgrade devtoolset to 8 on CentOS 6 -* [ARROW-8549](https://issues.apache.org/jira/browse/ARROW-8549) - [R] Assorted post-0.17 release cleanups -* [ARROW-8699](https://issues.apache.org/jira/browse/ARROW-8699) - [R] Fix automatic r\_to\_py conversion -* [ARROW-8758](https://issues.apache.org/jira/browse/ARROW-8758) - [R] Updates for compatibility with dplyr 1.0 -* [ARROW-8786](https://issues.apache.org/jira/browse/ARROW-8786) - [Packaging][rpm] Use bundled zstd in the CentOS 8 build - - - -# Apache Arrow 0.17.0 (2020-04-20) - -## Bug Fixes - -* [ARROW-1907](https://issues.apache.org/jira/browse/ARROW-1907) - [C++/Python] Feather format cannot accommodate string columns containing more than a total of 2GB of data -* [ARROW-2255](https://issues.apache.org/jira/browse/ARROW-2255) - [Developer][Integration] Serialize schema- and field-level custom metadata in integration test JSON format -* [ARROW-2587](https://issues.apache.org/jira/browse/ARROW-2587) - [Python] Unable to write StructArrays with multiple children to parquet -* [ARROW-3004](https://issues.apache.org/jira/browse/ARROW-3004) - [Documentation] Builds docs for master rather than a pinned commit -* [ARROW-3543](https://issues.apache.org/jira/browse/ARROW-3543) - [R] Better support for timestamp format and time zones in R -* [ARROW-5265](https://issues.apache.org/jira/browse/ARROW-5265) - [Python/CI] Add integration test with kartothek -* [ARROW-5473](https://issues.apache.org/jira/browse/ARROW-5473) - [C++] Build failure on googletest\_ep on Windows when using Ninja -* [ARROW-5981](https://issues.apache.org/jira/browse/ARROW-5981) - [C++] DictionaryBuilder initialization with Array can fail silently -* [ARROW-6528](https://issues.apache.org/jira/browse/ARROW-6528) - [C++] Spurious Flight test failures (port allocation failure) -* [ARROW-6547](https://issues.apache.org/jira/browse/ARROW-6547) - [C++] valgrind errors in diff-test -* [ARROW-6738](https://issues.apache.org/jira/browse/ARROW-6738) - [Java] Fix problems with current union comparison logic -* [ARROW-6757](https://issues.apache.org/jira/browse/ARROW-6757) - [Python] Creating csv.ParseOptions() causes "Windows fatal exception: access violation" with Visual Studio 2017 -* [ARROW-6871](https://issues.apache.org/jira/browse/ARROW-6871) - [Java] Enhance TransferPair related parameters check and tests -* [ARROW-6872](https://issues.apache.org/jira/browse/ARROW-6872) - [C++][Python] Empty table with dictionary-columns raises ArrowNotImplementedError -* [ARROW-6890](https://issues.apache.org/jira/browse/ARROW-6890) - [Rust] [Parquet] ArrowReader fails with seg fault -* [ARROW-6895](https://issues.apache.org/jira/browse/ARROW-6895) - [C++][Parquet] parquet::arrow::ColumnReader: ByteArrayDictionaryRecordReader repeats returned values when calling \`NextBatch()\` -* [ARROW-7008](https://issues.apache.org/jira/browse/ARROW-7008) - [Python] pyarrow.chunked\_array([array]) fails on array with all-None buffers -* [ARROW-7049](https://issues.apache.org/jira/browse/ARROW-7049) - [C++] warnings building on mingw-w64 -* [ARROW-7301](https://issues.apache.org/jira/browse/ARROW-7301) - [Java] Sql type DATE should correspond to DateDayVector -* [ARROW-7335](https://issues.apache.org/jira/browse/ARROW-7335) - 
[C++][Gandiva] Add castBIGINT, extractDay interval\_day functions in Gandiva -* [ARROW-7390](https://issues.apache.org/jira/browse/ARROW-7390) - [C++][Dataset] Concurrency race in Projector::Project -* [ARROW-7405](https://issues.apache.org/jira/browse/ARROW-7405) - [Java] ListVector isEmpty API is incorrect -* [ARROW-7466](https://issues.apache.org/jira/browse/ARROW-7466) - [CI][Java] Fix gandiva-jar-osx nightly build failure -* [ARROW-7467](https://issues.apache.org/jira/browse/ARROW-7467) - [Java] ComplexCopier does incorrect copy for Map nullable info -* [ARROW-7507](https://issues.apache.org/jira/browse/ARROW-7507) - [Rust] Bump Thrift version to 0.13 in parquet-format and parquet -* [ARROW-7520](https://issues.apache.org/jira/browse/ARROW-7520) - [R] Writing many batches causes a crash -* [ARROW-7546](https://issues.apache.org/jira/browse/ARROW-7546) - [Java] Use new implementation to concat vectors values in batch -* [ARROW-7624](https://issues.apache.org/jira/browse/ARROW-7624) - [Rust] Soundness issues via \`Buffer\` methods -* [ARROW-7628](https://issues.apache.org/jira/browse/ARROW-7628) - [Python] Better document some read\_csv corner cases -* [ARROW-7631](https://issues.apache.org/jira/browse/ARROW-7631) - [C++][Gandiva] return zero if there is an overflow while converting a decimal to a lower precision/scale -* [ARROW-7672](https://issues.apache.org/jira/browse/ARROW-7672) - [C++] NULL pointer dereference bug -* [ARROW-7680](https://issues.apache.org/jira/browse/ARROW-7680) - [C++][Dataset] Partition discovery is not working with windows path -* [ARROW-7701](https://issues.apache.org/jira/browse/ARROW-7701) - [C++] [CI] Flight test error on macOS -* [ARROW-7713](https://issues.apache.org/jira/browse/ARROW-7713) - [Java] TastLeak was put at the wrong location -* [ARROW-7722](https://issues.apache.org/jira/browse/ARROW-7722) - [Java][FlightRPC] Memory leak -* [ARROW-7734](https://issues.apache.org/jira/browse/ARROW-7734) - [C++] Segfault when comparing status with and without detail -* [ARROW-7740](https://issues.apache.org/jira/browse/ARROW-7740) - [C++] Array internals corruption in StructArray::Flatten -* [ARROW-7755](https://issues.apache.org/jira/browse/ARROW-7755) - [Python] Windows wheel cannot be installed on Python 3.8 -* [ARROW-7758](https://issues.apache.org/jira/browse/ARROW-7758) - [Python] Wrong conversion of timestamps that are out of bounds for pandas (eg 0000-01-01) -* [ARROW-7760](https://issues.apache.org/jira/browse/ARROW-7760) - [Release] Fix verify-release-candidate.sh since pip3 seems to no longer be in miniconda -* [ARROW-7762](https://issues.apache.org/jira/browse/ARROW-7762) - [Python] Exceptions in ParquetWriter get ignored -* [ARROW-7766](https://issues.apache.org/jira/browse/ARROW-7766) - [Python][Packaging] Windows py38 wheels are built with wrong ABI tag -* [ARROW-7772](https://issues.apache.org/jira/browse/ARROW-7772) - [R][C++][Dataset] Unable to filter on date32 object with date64 scalar -* [ARROW-7775](https://issues.apache.org/jira/browse/ARROW-7775) - [Rust] Don't let safe code arbitrarily transmute readers and writers -* [ARROW-7777](https://issues.apache.org/jira/browse/ARROW-7777) - [Go] StructBuilder/ListBuilder index out of range panic -* [ARROW-7780](https://issues.apache.org/jira/browse/ARROW-7780) - [Release] Fix Windows wheel RC verification script given lack of "m" ABI tag in Python 3.8 -* [ARROW-7781](https://issues.apache.org/jira/browse/ARROW-7781) - [C++][Dataset] Filtering on a non-existent column gives a segfault -* 
[ARROW-7783](https://issues.apache.org/jira/browse/ARROW-7783) - [C++] ARROW\_DATASET should enable ARROW\_COMPUTE -* [ARROW-7785](https://issues.apache.org/jira/browse/ARROW-7785) - [C++] sparse\_tensor.cc is extremely slow to compile -* [ARROW-7786](https://issues.apache.org/jira/browse/ARROW-7786) - [R] Wire up check\_metadata in Table.Equals method -* [ARROW-7789](https://issues.apache.org/jira/browse/ARROW-7789) - [R] Can't initialize arrow objects when R.oo package is loaded -* [ARROW-7791](https://issues.apache.org/jira/browse/ARROW-7791) - [C++][Parquet] Fix building error "cannot bind lvalue" -* [ARROW-7792](https://issues.apache.org/jira/browse/ARROW-7792) - [R] read\_\* functions should close connection to file -* [ARROW-7793](https://issues.apache.org/jira/browse/ARROW-7793) - [Java] If there is a leak the base allocator should release the excess memory to parent before throwing exception -* [ARROW-7794](https://issues.apache.org/jira/browse/ARROW-7794) - [Rust] cargo publish fails for arrow-flight due to relative path to Flight.proto -* [ARROW-7794](https://issues.apache.org/jira/browse/ARROW-7794) - [Rust] cargo publish fails for arrow-flight due to relative path to Flight.proto -* [ARROW-7797](https://issues.apache.org/jira/browse/ARROW-7797) - [Release][Rust] Fix arrow-flight's version in datafusion crate -* [ARROW-7802](https://issues.apache.org/jira/browse/ARROW-7802) - [C++] Support for LargeBinary and LargeString in the hash kernel -* [ARROW-7806](https://issues.apache.org/jira/browse/ARROW-7806) - [Python] Implement to\_pandas for lists of LargeBinary/String -* [ARROW-7807](https://issues.apache.org/jira/browse/ARROW-7807) - [R] Installation on RHEL 7 Cannot call io\_\_\_MemoryMappedFile\_\_Open() -* [ARROW-7809](https://issues.apache.org/jira/browse/ARROW-7809) - [R] vignette does not run on Win 10 nor ubuntu -* [ARROW-7813](https://issues.apache.org/jira/browse/ARROW-7813) - [Rust] Fix undefined behaviour and remove unsafe -* [ARROW-7815](https://issues.apache.org/jira/browse/ARROW-7815) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz) -* [ARROW-7827](https://issues.apache.org/jira/browse/ARROW-7827) - [Python] conda-forge pyarrow package does not have s3 enabled -* [ARROW-7832](https://issues.apache.org/jira/browse/ARROW-7832) - [R] Patches to 0.16.0 release -* [ARROW-7836](https://issues.apache.org/jira/browse/ARROW-7836) - [Rust] "allocate\_aligned"/"reallocate" need to initialize memory to avoid UB -* [ARROW-7837](https://issues.apache.org/jira/browse/ARROW-7837) - [Java] bug in BaseVariableWidthVector.copyFromSafe results with an index out of bounds exception -* [ARROW-7838](https://issues.apache.org/jira/browse/ARROW-7838) - [C++] Installed plasma-store-server fails finding Boost -* [ARROW-7841](https://issues.apache.org/jira/browse/ARROW-7841) - [C++] HADOOP\_HOME doesn't work to find libhdfs.so -* [ARROW-7844](https://issues.apache.org/jira/browse/ARROW-7844) - [R] array\_to\_vector is not thread safe -* [ARROW-7848](https://issues.apache.org/jira/browse/ARROW-7848) - Add doc for MapType -* [ARROW-7852](https://issues.apache.org/jira/browse/ARROW-7852) - [Python] 0.16.0 wheels not compatible with older numpy -* [ARROW-7857](https://issues.apache.org/jira/browse/ARROW-7857) - [Python] Failing test with pandas master for extension type conversion -* [ARROW-7861](https://issues.apache.org/jira/browse/ARROW-7861) - [C++][Parquet] Add fuzz regression corpus for parquet reader -* [ARROW-7884](https://issues.apache.org/jira/browse/ARROW-7884) - [C++][Python] 
Crash in pq.read\_table() -* [ARROW-7887](https://issues.apache.org/jira/browse/ARROW-7887) - [Rust] Filter kernel does not support temporal types -* [ARROW-7889](https://issues.apache.org/jira/browse/ARROW-7889) - [Rust] Datafusion CLI does not support registering Parquet files -* [ARROW-7899](https://issues.apache.org/jira/browse/ARROW-7899) - [Integration][Java] null type integration test -* [ARROW-7908](https://issues.apache.org/jira/browse/ARROW-7908) - [R] Can't install package without setting LIBARROW\_DOWNLOAD=true -* [ARROW-7922](https://issues.apache.org/jira/browse/ARROW-7922) - [CI][Crossbow] Nightly macOS wheel builds fail (brew bundle edition) -* [ARROW-7923](https://issues.apache.org/jira/browse/ARROW-7923) - [CI][Crossbow] macOS autobrew fails on homebrew-versions -* [ARROW-7926](https://issues.apache.org/jira/browse/ARROW-7926) - [Developer] "archery lint" target is not ergonomic for running a single check like IWYU -* [ARROW-7928](https://issues.apache.org/jira/browse/ARROW-7928) - [Python] Example of flight server and client not working -* [ARROW-7931](https://issues.apache.org/jira/browse/ARROW-7931) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz) -* [ARROW-7936](https://issues.apache.org/jira/browse/ARROW-7936) - [Python] FileSystem.from\_uri test fails on python 3.5 -* [ARROW-7940](https://issues.apache.org/jira/browse/ARROW-7940) - [C++] Unable to generate cmake build with settings other than default -* [ARROW-7944](https://issues.apache.org/jira/browse/ARROW-7944) - [Python] Test failures without Pandas -* [ARROW-7956](https://issues.apache.org/jira/browse/ARROW-7956) - [Python] Memory leak in pyarrow functions .ipc.serialize\_pandas/deserialize\_pandas -* [ARROW-7958](https://issues.apache.org/jira/browse/ARROW-7958) - [Java] Update Avro to version 1.9.2 -* [ARROW-7962](https://issues.apache.org/jira/browse/ARROW-7962) - [R][Dataset] Followup to "Consolidate Source and Dataset classes" -* [ARROW-7968](https://issues.apache.org/jira/browse/ARROW-7968) - [C++] orc\_ep build fails on 64-bit Raspbian -* [ARROW-7973](https://issues.apache.org/jira/browse/ARROW-7973) - [Developer][C++] ResourceWarnings in run\_cpplint.py -* [ARROW-7974](https://issues.apache.org/jira/browse/ARROW-7974) - [Developer][C++] ResourceWarning in "make check-format" -* [ARROW-7975](https://issues.apache.org/jira/browse/ARROW-7975) - [C++] Do not include padding bytes in "Buffer" IPC metadata accounting -* [ARROW-7978](https://issues.apache.org/jira/browse/ARROW-7978) - [Developer] GitHub Actions "lint" task is running include-what-you-use and failing -* [ARROW-7980](https://issues.apache.org/jira/browse/ARROW-7980) - [Python] Deserialization with pyarrow fails for certain Timestamp-based data frame -* [ARROW-7981](https://issues.apache.org/jira/browse/ARROW-7981) - [C++][Dataset] Fails to compile on gcc 5.4 -* [ARROW-7985](https://issues.apache.org/jira/browse/ARROW-7985) - [C++] ListBuilder.Finish fails if underlying value builder is empty and .Reserve'd -* [ARROW-7990](https://issues.apache.org/jira/browse/ARROW-7990) - [C++][Developer] Add "archery lint" option for running "iwyu.sh all" -* [ARROW-7992](https://issues.apache.org/jira/browse/ARROW-7992) - [C++] MSVC warning causing Appveyor failure in sort\_to\_indices.cc -* [ARROW-7996](https://issues.apache.org/jira/browse/ARROW-7996) - [Python] Error serializing empty pandas DataFrame with pyarrow -* [ARROW-7997](https://issues.apache.org/jira/browse/ARROW-7997) - [Python] Schema equals method with inconsistent docs in pyarrow -* 
[ARROW-7999](https://issues.apache.org/jira/browse/ARROW-7999) - [C++] Fix crash on corrupt Map array input (OSS-Fuzz) -* [ARROW-8000](https://issues.apache.org/jira/browse/ARROW-8000) - [C++] gcc 4.8 build failures -* [ARROW-8003](https://issues.apache.org/jira/browse/ARROW-8003) - [C++] -DBZip2\_SOURCE=BUNDLED fails when building with clang -* [ARROW-8006](https://issues.apache.org/jira/browse/ARROW-8006) - [C++] Unsafe arrow dictionary recovered from parquet -* [ARROW-8007](https://issues.apache.org/jira/browse/ARROW-8007) - [Python] Remove unused and defunct assert\_get\_object\_equal in plasma tests -* [ARROW-8008](https://issues.apache.org/jira/browse/ARROW-8008) - [C++/Python] Framework Python is preferred even though not the activated one -* [ARROW-8009](https://issues.apache.org/jira/browse/ARROW-8009) - [Java] Fix the hash code methods for BitVector -* [ARROW-8011](https://issues.apache.org/jira/browse/ARROW-8011) - [C++] Some buffers not resized when reading from Parquet -* [ARROW-8013](https://issues.apache.org/jira/browse/ARROW-8013) - [Python][Packaging] Fix manylinux wheels -* [ARROW-8021](https://issues.apache.org/jira/browse/ARROW-8021) - [Python] Appveyor does not appear to be including pandas in test runs -* [ARROW-8029](https://issues.apache.org/jira/browse/ARROW-8029) - [R] rstudio/r-base:3.6-centos7 GHA build failing on master -* [ARROW-8036](https://issues.apache.org/jira/browse/ARROW-8036) - [C++] Compilation failure with gtest 1.10.0 -* [ARROW-8042](https://issues.apache.org/jira/browse/ARROW-8042) - [Python] pyarrow.ChunkedArray docstring is incorrect regarding zero-length ChunkedArray having no chunks -* [ARROW-8057](https://issues.apache.org/jira/browse/ARROW-8057) - [Python] Don't check Schema metadata in \_\_eq\_\_ and \_\_ne\_\_ -* [ARROW-8070](https://issues.apache.org/jira/browse/ARROW-8070) - [C++] Cast segfaults on unsupported cast from list to utf8 -* [ARROW-8071](https://issues.apache.org/jira/browse/ARROW-8071) - [GLib] Build error with configure -* [ARROW-8075](https://issues.apache.org/jira/browse/ARROW-8075) - [R] Loading R.utils after arrow breaks some arrow functions -* [ARROW-8088](https://issues.apache.org/jira/browse/ARROW-8088) - [C++][Dataset] Partition columns with specified dictionary type result in all nulls -* [ARROW-8091](https://issues.apache.org/jira/browse/ARROW-8091) - [CI][Crossbow] Fix nightly homebrew and R failures -* [ARROW-8092](https://issues.apache.org/jira/browse/ARROW-8092) - [CI][Crossbow] OSX wheels fail on bundled bzip2 -* [ARROW-8094](https://issues.apache.org/jira/browse/ARROW-8094) - [CI][Crossbow] Nightly valgrind test fails -* [ARROW-8095](https://issues.apache.org/jira/browse/ARROW-8095) - [CI][Crossbow] Nightly turbodbc job fails -* [ARROW-8098](https://issues.apache.org/jira/browse/ARROW-8098) - [go] Checkptr Failures on Go 1.14 -* [ARROW-8099](https://issues.apache.org/jira/browse/ARROW-8099) - [Integration] archery integration --with-LANG flags don't work -* [ARROW-8101](https://issues.apache.org/jira/browse/ARROW-8101) - [FlightRPC][Java] Can't read/write only an empty null array -* [ARROW-8102](https://issues.apache.org/jira/browse/ARROW-8102) - [Dev] Crossbow's version detection doesn't work in the comment bot's scenario -* [ARROW-8105](https://issues.apache.org/jira/browse/ARROW-8105) - [Python] pyarrow.array segfaults when passed masked array with shrunken mask -* [ARROW-8106](https://issues.apache.org/jira/browse/ARROW-8106) - [Python] Builds on master broken by pandas 1.0.2 release -* 
[ARROW-8110](https://issues.apache.org/jira/browse/ARROW-8110) - [C\#] BuildArrays fails if NestedType is included -* [ARROW-8112](https://issues.apache.org/jira/browse/ARROW-8112) - [FlightRPC][C++] Some status codes don't round-trip through gRPC -* [ARROW-8119](https://issues.apache.org/jira/browse/ARROW-8119) - [Dev] Make Yaml optional dependency for archery -* [ARROW-8122](https://issues.apache.org/jira/browse/ARROW-8122) - [Python] Empty numpy arrays with shape cannot be deserialized -* [ARROW-8125](https://issues.apache.org/jira/browse/ARROW-8125) - [C++] "arrow-tests" target broken with ninja build -* [ARROW-8127](https://issues.apache.org/jira/browse/ARROW-8127) - [C++] [Parquet] Incorrect column chunk metadata for multipage batch writes -* [ARROW-8128](https://issues.apache.org/jira/browse/ARROW-8128) - [C\#] NestedType children serialized on wrong length -* [ARROW-8132](https://issues.apache.org/jira/browse/ARROW-8132) - [C++] arrow-s3fs-test failing on master -* [ARROW-8133](https://issues.apache.org/jira/browse/ARROW-8133) - [CI] Github Actions sometimes fail to checkout Arrow -* [ARROW-8136](https://issues.apache.org/jira/browse/ARROW-8136) - [C++][Python] Creating dataset from relative path no longer working -* [ARROW-8136](https://issues.apache.org/jira/browse/ARROW-8136) - [C++][Python] Creating dataset from relative path no longer working -* [ARROW-8138](https://issues.apache.org/jira/browse/ARROW-8138) - [C++] parquet::arrow::FileReader cannot read multiple RowGroup -* [ARROW-8139](https://issues.apache.org/jira/browse/ARROW-8139) - [C++] FileSystem enum causes attributes warning -* [ARROW-8142](https://issues.apache.org/jira/browse/ARROW-8142) - [C++] Casting a chunked array with 0 chunks critical failure -* [ARROW-8144](https://issues.apache.org/jira/browse/ARROW-8144) - [CI] Cmake 3.2 nightly build fails -* [ARROW-8154](https://issues.apache.org/jira/browse/ARROW-8154) - [Python] HDFS Filesystem does not set environment variables in pyarrow 0.16.0 release -* [ARROW-8159](https://issues.apache.org/jira/browse/ARROW-8159) - [Python] pyarrow.Schema.from\_pandas doesn't support ExtensionDtype -* [ARROW-8166](https://issues.apache.org/jira/browse/ARROW-8166) - [C++] AVX512 intrinsics fail to compile with clang-8 on Ubuntu 18.04 -* [ARROW-8176](https://issues.apache.org/jira/browse/ARROW-8176) - [FlightRPC][Integration] Have Flight services bind to port 0 in integration -* [ARROW-8186](https://issues.apache.org/jira/browse/ARROW-8186) - [Python] Dataset expression != returns bool instead of expression for invalid value -* [ARROW-8188](https://issues.apache.org/jira/browse/ARROW-8188) - [R] Adapt to latest checks in R-devel -* [ARROW-8193](https://issues.apache.org/jira/browse/ARROW-8193) - [C++] arrow-future-test fails to compile on gcc 4.8 -* [ARROW-8197](https://issues.apache.org/jira/browse/ARROW-8197) - [Rust] DataFusion "create\_physical\_plan" returns incorrect schema? 
-* [ARROW-8206](https://issues.apache.org/jira/browse/ARROW-8206) - [R] Minor fix for backwards compatibility on Linux installation -* [ARROW-8209](https://issues.apache.org/jira/browse/ARROW-8209) - [Python] Accessing duplicate column of Table by name gives wrong error -* [ARROW-8213](https://issues.apache.org/jira/browse/ARROW-8213) - [Python][Dataset] Opening a dataset with a local incorrect path gives confusing error message -* [ARROW-8216](https://issues.apache.org/jira/browse/ARROW-8216) - [R][C++][Dataset] Filtering returns all-missing rows where the filtering column is missing -* [ARROW-8217](https://issues.apache.org/jira/browse/ARROW-8217) - [R][C++] Fix crashing test in test-dataset.R on 32-bit Windows from ARROW-7979 -* [ARROW-8219](https://issues.apache.org/jira/browse/ARROW-8219) - [Rust] sqlparser crate needs to be bumped to version 0.2.5 -* [ARROW-8223](https://issues.apache.org/jira/browse/ARROW-8223) - [Python] Schema.from\_pandas breaks with pandas nullable integer dtype -* [ARROW-8233](https://issues.apache.org/jira/browse/ARROW-8233) - [CI] Build timeouts on "AMD64 Windows MinGW 64 GLib & Ruby " -* [ARROW-8234](https://issues.apache.org/jira/browse/ARROW-8234) - [CI] Build timeouts on "AMD64 Windows RTools 35" -* [ARROW-8236](https://issues.apache.org/jira/browse/ARROW-8236) - [Rust] Linting GitHub Actions task failing -* [ARROW-8237](https://issues.apache.org/jira/browse/ARROW-8237) - [Python] Review Developer build instructions for conda and non-conda users -* [ARROW-8237](https://issues.apache.org/jira/browse/ARROW-8237) - [Python] Review Developer build instructions for conda and non-conda users -* [ARROW-8238](https://issues.apache.org/jira/browse/ARROW-8238) - [C++][Compute] Failed to build compute tests on windows with msvc2015 -* [ARROW-8239](https://issues.apache.org/jira/browse/ARROW-8239) - [Java] fix param checks in splitAndTransfer method -* [ARROW-8245](https://issues.apache.org/jira/browse/ARROW-8245) - [Python][Parquet] Skip hidden directories when reading partitioned parquet files -* [ARROW-8254](https://issues.apache.org/jira/browse/ARROW-8254) - [Rust] [DataFusion] CLI is not working as expected -* [ARROW-8255](https://issues.apache.org/jira/browse/ARROW-8255) - [Rust] [DataFusion] COUNT(\*) results in confusing error -* [ARROW-8259](https://issues.apache.org/jira/browse/ARROW-8259) - [Rust] [DataFusion] ProjectionPushDownRule does not rewrite LIMIT -* [ARROW-8268](https://issues.apache.org/jira/browse/ARROW-8268) - [Ruby] Test failure due to lack of built ZSTD support -* [ARROW-8269](https://issues.apache.org/jira/browse/ARROW-8269) - [Python] Failure in "nopandas" build in test\_parquet\_row\_group\_fragments -* [ARROW-8270](https://issues.apache.org/jira/browse/ARROW-8270) - [Python][Flight] Example Flight server with TLS's certificate and key is not working -* [ARROW-8272](https://issues.apache.org/jira/browse/ARROW-8272) - [CI][Python] Test failure on Ubuntu 16.04 -* [ARROW-8274](https://issues.apache.org/jira/browse/ARROW-8274) - [C++] Use LZ4 frame format for "LZ4" compression in IPC write -* [ARROW-8276](https://issues.apache.org/jira/browse/ARROW-8276) - [C++][Dataset] Scanning a Fragment does not take into account the partition columns -* [ARROW-8280](https://issues.apache.org/jira/browse/ARROW-8280) - [C++] MinGW builds failing due to CARES-related toolchain issue -* [ARROW-8286](https://issues.apache.org/jira/browse/ARROW-8286) - [Python] Creating dataset from pathlib results in UnionDataset instead of FileSystemDataset -* 
[ARROW-8298](https://issues.apache.org/jira/browse/ARROW-8298) - [C++][CI] MinGW builds fail building grpc -* [ARROW-8303](https://issues.apache.org/jira/browse/ARROW-8303) - [Python] Fix test failure caused by non-deterministic dict key ordering on Python 3.5 -* [ARROW-8304](https://issues.apache.org/jira/browse/ARROW-8304) - [Flight][Python] Flight client with TLS root certificate is reporting error on do\_get() -* [ARROW-8305](https://issues.apache.org/jira/browse/ARROW-8305) - [Java] ExtensionTypeVector should make sure underlyingVector not null -* [ARROW-8310](https://issues.apache.org/jira/browse/ARROW-8310) - [C++] Minio's exceptions not recognized by IsConnectError() -* [ARROW-8315](https://issues.apache.org/jira/browse/ARROW-8315) - [Python][Dataset] Don't rely on ordered dict keys in test\_dataset.py -* [ARROW-8323](https://issues.apache.org/jira/browse/ARROW-8323) - [C++] Pin gRPC at v1.27 to avoid compilation error in its headers -* [ARROW-8326](https://issues.apache.org/jira/browse/ARROW-8326) - [C++] Don't use deprecated TYPED\_TEST\_CASE -* [ARROW-8327](https://issues.apache.org/jira/browse/ARROW-8327) - [FlightRPC][Java] gRPC trailers may be null -* [ARROW-8331](https://issues.apache.org/jira/browse/ARROW-8331) - [C++] arrow-compute-filter-benchmark fails to compile -* [ARROW-8333](https://issues.apache.org/jira/browse/ARROW-8333) - [C++][CI] Always compile benchmarks in some C++ CI entry -* [ARROW-8334](https://issues.apache.org/jira/browse/ARROW-8334) - [C++] [Gandiva] Missing DATE32 in LLVM Types / Simple D32 Compute Functions -* [ARROW-8342](https://issues.apache.org/jira/browse/ARROW-8342) - [Python] dask and kartothek integration tests are failing -* [ARROW-8345](https://issues.apache.org/jira/browse/ARROW-8345) - [Python] feather.read\_table should not require pandas -* [ARROW-8346](https://issues.apache.org/jira/browse/ARROW-8346) - [CI][Ruby] GLib/Ruby macOS build fails on zlib -* [ARROW-8349](https://issues.apache.org/jira/browse/ARROW-8349) - [CI][NIGHTLY:gandiva-jar-osx] Use latest pygit2 -* [ARROW-8353](https://issues.apache.org/jira/browse/ARROW-8353) - [C++] is\_nullable maybe not initialized in parquet writer -* [ARROW-8354](https://issues.apache.org/jira/browse/ARROW-8354) - [R] Fix segfault in Table to Array conversion -* [ARROW-8357](https://issues.apache.org/jira/browse/ARROW-8357) - [Rust] [DataFusion] Dockerfile for CLI is missing format dir -* [ARROW-8358](https://issues.apache.org/jira/browse/ARROW-8358) - [C++] Fix -Wrange-loop-construct warnings in clang-11 -* [ARROW-8365](https://issues.apache.org/jira/browse/ARROW-8365) - [C++] Error when writing files to S3 larger than 5 GB -* [ARROW-8366](https://issues.apache.org/jira/browse/ARROW-8366) - [Rust] Need to revert recent arrow-flight build change -* [ARROW-8369](https://issues.apache.org/jira/browse/ARROW-8369) - [CI] Fix crossbow wildcard groups -* [ARROW-8373](https://issues.apache.org/jira/browse/ARROW-8373) - [GLib] Problems resolving gobject-introspection, arrow in Meson builds -* [ARROW-8380](https://issues.apache.org/jira/browse/ARROW-8380) - [RUST] StringDictionaryBuilder not publicly exported from arrow::array -* [ARROW-8384](https://issues.apache.org/jira/browse/ARROW-8384) - [C++][Python] arrow/filesystem/hdfs.h and Python wrapper does not have an option for setting a path to a Kerberos ticket -* [ARROW-8386](https://issues.apache.org/jira/browse/ARROW-8386) - [Python] pyarrow.jvm raises error for empty Arrays -* [ARROW-8388](https://issues.apache.org/jira/browse/ARROW-8388) - [C++] GCC 
4.8 fails to move on return -* [ARROW-8397](https://issues.apache.org/jira/browse/ARROW-8397) - [C++] Fail to compile aggregate\_test.cc on Ubuntu 16.04 -* [ARROW-8406](https://issues.apache.org/jira/browse/ARROW-8406) - [Python] test\_fs fails when run from a different drive on Windows -* [ARROW-8410](https://issues.apache.org/jira/browse/ARROW-8410) - [C++] CMake fails on aarch64 systems that do not support -march=armv8-a+crc+crypto -* [ARROW-8414](https://issues.apache.org/jira/browse/ARROW-8414) - [Python] Non-deterministic row order failure in test\_parquet.py -* [ARROW-8414](https://issues.apache.org/jira/browse/ARROW-8414) - [Python] Non-deterministic row order failure in test\_parquet.py -* [ARROW-8414](https://issues.apache.org/jira/browse/ARROW-8414) - [Python] Non-deterministic row order failure in test\_parquet.py -* [ARROW-8415](https://issues.apache.org/jira/browse/ARROW-8415) - [C++][Packaging] fix gandiva linux job -* [ARROW-8416](https://issues.apache.org/jira/browse/ARROW-8416) - [Python] Provide a "feather" alias in the dataset API -* [ARROW-8420](https://issues.apache.org/jira/browse/ARROW-8420) - [C++] CMake fails to configure on armv7l platform (e.g. Raspberry Pi 3) -* [ARROW-8427](https://issues.apache.org/jira/browse/ARROW-8427) - [C++][Dataset] Do not ignore file paths with underscore/dot when full path was specified -* [ARROW-8428](https://issues.apache.org/jira/browse/ARROW-8428) - [C++][NIGHTLY:gandiva-jar-trusty] GCC 4.8 failures in C++ unit tests -* [ARROW-8429](https://issues.apache.org/jira/browse/ARROW-8429) - [C++] Fix Buffer::CopySlice on 0-sized buffer -* [ARROW-8432](https://issues.apache.org/jira/browse/ARROW-8432) - [Python][CI] Failure to download Hadoop -* [ARROW-8437](https://issues.apache.org/jira/browse/ARROW-8437) - [C++] Remove std::move return value from MakeRandomNullBitmap test utility -* [ARROW-8438](https://issues.apache.org/jira/browse/ARROW-8438) - [C++] arrow-io-memory-benchmark crashes -* [ARROW-8439](https://issues.apache.org/jira/browse/ARROW-8439) - [Python] Filesystem docs are outdated -* [ARROW-8441](https://issues.apache.org/jira/browse/ARROW-8441) - [C++] Fix crashes on invalid input (OSS-Fuzz) -* [ARROW-8442](https://issues.apache.org/jira/browse/ARROW-8442) - [Python] NullType.to\_pandas\_dtype inconsistent with dtype returned in to\_pandas/to\_numpy -* [ARROW-8460](https://issues.apache.org/jira/browse/ARROW-8460) - [Packaging][deb] Ubuntu Focal build is failed -* [ARROW-8465](https://issues.apache.org/jira/browse/ARROW-8465) - [Packaging][Python] Windows py35 wheel build fails because of boost -* [ARROW-8466](https://issues.apache.org/jira/browse/ARROW-8466) - [Packaging] The python unittests are not running in the windows wheel builds -* [ARROW-8468](https://issues.apache.org/jira/browse/ARROW-8468) - [Document] Fix the incorrect null bits description -* [ARROW-8469](https://issues.apache.org/jira/browse/ARROW-8469) - [Dev] Fix nightly docker tests on azure -* [ARROW-8478](https://issues.apache.org/jira/browse/ARROW-8478) - [Java] Rollback contrib package changes. 
-* [ARROW-8498](https://issues.apache.org/jira/browse/ARROW-8498) - [Python] Schema.from\_pandas fails on extension type, while Table.from\_pandas works -* [PARQUET-1780](https://issues.apache.org/jira/browse/PARQUET-1780) - [C++] Set ColumnMetadata.encoding\_stats field -* [PARQUET-1788](https://issues.apache.org/jira/browse/PARQUET-1788) - [C++] ColumnWriter has undefined behavior when writing arrow chunks -* [PARQUET-1797](https://issues.apache.org/jira/browse/PARQUET-1797) - [C++] Fix fuzzing errors -* [PARQUET-1799](https://issues.apache.org/jira/browse/PARQUET-1799) - [C++] Stream API: Relax schema checking when reading -* [PARQUET-1810](https://issues.apache.org/jira/browse/PARQUET-1810) - [C++] Fix undefined behaviour on invalid enum values (OSS-Fuzz) -* [PARQUET-1813](https://issues.apache.org/jira/browse/PARQUET-1813) - [C++] Remove logging statement in unit test -* [PARQUET-1819](https://issues.apache.org/jira/browse/PARQUET-1819) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz) -* [PARQUET-1819](https://issues.apache.org/jira/browse/PARQUET-1819) - [C++] Fix crashes on corrupt IPC input (OSS-Fuzz) -* [PARQUET-1823](https://issues.apache.org/jira/browse/PARQUET-1823) - [C++] Invalid RowGroup returned when reading with parquet::arrow::FileReader-\>RowGroup(i)-\>Column(j) -* [PARQUET-1824](https://issues.apache.org/jira/browse/PARQUET-1824) - [C++] Fix crashes on invalid input (OSS-Fuzz) -* [PARQUET-1829](https://issues.apache.org/jira/browse/PARQUET-1829) - [C++] Fix crashes on invalid input (OSS-Fuzz) -* [PARQUET-1831](https://issues.apache.org/jira/browse/PARQUET-1831) - [C++] Fix crashes on invalid input (OSS-Fuzz) -* [PARQUET-1835](https://issues.apache.org/jira/browse/PARQUET-1835) - [C++] Fix crashes on invalid input (OSS-Fuzz) - - -## New Features and Improvements - -* [ARROW-590](https://issues.apache.org/jira/browse/ARROW-590) - [Integration] Add integration tests for Union types -* [ARROW-1470](https://issues.apache.org/jira/browse/ARROW-1470) - [C++] Add BufferAllocator abstract interface -* [ARROW-1560](https://issues.apache.org/jira/browse/ARROW-1560) - [C++] Kernel implementations for "match" function -* [ARROW-1571](https://issues.apache.org/jira/browse/ARROW-1571) - [C++] Implement argsort kernels (sort indices) for integers using O(n) counting sort -* [ARROW-1581](https://issues.apache.org/jira/browse/ARROW-1581) - [Packaging] Tooling to make nightly wheels available for install -* [ARROW-1582](https://issues.apache.org/jira/browse/ARROW-1582) - [Python] Set up + document nightly conda builds for macOS -* [ARROW-1636](https://issues.apache.org/jira/browse/ARROW-1636) - [Format] Integration tests for null type -* [ARROW-2447](https://issues.apache.org/jira/browse/ARROW-2447) - [C++] Create a device abstraction -* [ARROW-2882](https://issues.apache.org/jira/browse/ARROW-2882) - [C++][Python] Support AWS Firehose partition\_scheme implementation for Parquet datasets -* [ARROW-3054](https://issues.apache.org/jira/browse/ARROW-3054) - [Packaging] Tooling to enable nightly conda packages to be updated to some anaconda.org channel -* [ARROW-3410](https://issues.apache.org/jira/browse/ARROW-3410) - [C++][Dataset] Streaming CSV reader interface for memory-constrained environments -* [ARROW-3750](https://issues.apache.org/jira/browse/ARROW-3750) - [R] Pass various wrapped Arrow objects created in Python into R with zero copy via reticulate -* [ARROW-4120](https://issues.apache.org/jira/browse/ARROW-4120) - [Python] Define process for testing procedures that check for 
no macro-level memory leaks -* [ARROW-4226](https://issues.apache.org/jira/browse/ARROW-4226) - [Format][C++] Add CSF sparse tensor support -* [ARROW-4286](https://issues.apache.org/jira/browse/ARROW-4286) - [C++/R] Namespace vendored Boost -* [ARROW-4304](https://issues.apache.org/jira/browse/ARROW-4304) - [Rust] Enhance documentation for arrow -* [ARROW-4428](https://issues.apache.org/jira/browse/ARROW-4428) - [R] Feature flags for R build -* [ARROW-4482](https://issues.apache.org/jira/browse/ARROW-4482) - [Website] Add blog archive page -* [ARROW-4815](https://issues.apache.org/jira/browse/ARROW-4815) - [Rust] [DataFusion] Add support for \* in SQL projection -* [ARROW-5357](https://issues.apache.org/jira/browse/ARROW-5357) - [Rust] Add capacity field in Buffer -* [ARROW-5405](https://issues.apache.org/jira/browse/ARROW-5405) - [Documentation] Move integration testing documentation to Sphinx docs, add instructions for JavaScript -* [ARROW-5497](https://issues.apache.org/jira/browse/ARROW-5497) - [Release] Build and publish R/Java/JS docs -* [ARROW-5501](https://issues.apache.org/jira/browse/ARROW-5501) - [R] Reorganize read/write file/stream functions -* [ARROW-5510](https://issues.apache.org/jira/browse/ARROW-5510) - [Format] Feather V2 based on Arrow IPC file format, with compression support -* [ARROW-5563](https://issues.apache.org/jira/browse/ARROW-5563) - [Format] Update integration test JSON format documentation -* [ARROW-5585](https://issues.apache.org/jira/browse/ARROW-5585) - [Go] rename arrow.TypeEquals into arrow.TypeEqual -* [ARROW-5742](https://issues.apache.org/jira/browse/ARROW-5742) - [CI] Add daily / weekly Valgrind build -* [ARROW-5757](https://issues.apache.org/jira/browse/ARROW-5757) - [Python] Stop supporting Python 2.7 -* [ARROW-5949](https://issues.apache.org/jira/browse/ARROW-5949) - [Rust] Implement DictionaryArray -* [ARROW-6165](https://issues.apache.org/jira/browse/ARROW-6165) - [Integration] Use multiprocessing to run integration tests on multiple CPU cores -* [ARROW-6176](https://issues.apache.org/jira/browse/ARROW-6176) - [Python] Allow to subclass ExtensionArray to attach to custom extension type -* [ARROW-6275](https://issues.apache.org/jira/browse/ARROW-6275) - [C++] Deprecate RecordBatchReader::ReadNext -* [ARROW-6393](https://issues.apache.org/jira/browse/ARROW-6393) - [C++] Add EqualOptions support in SparseTensor::Equals -* [ARROW-6479](https://issues.apache.org/jira/browse/ARROW-6479) - [C++] inline errors from external projects' build logs -* [ARROW-6510](https://issues.apache.org/jira/browse/ARROW-6510) - [Python][Filesystem] Expose nanosecond resolution mtime -* [ARROW-6666](https://issues.apache.org/jira/browse/ARROW-6666) - [Rust] [DataFusion] Implement string literal expression -* [ARROW-6724](https://issues.apache.org/jira/browse/ARROW-6724) - [C++] Add simpler static ctor for BufferOutputStream than the current Create function -* [ARROW-6821](https://issues.apache.org/jira/browse/ARROW-6821) - [C++][Parquet] Do not require Thrift compiler when building (but still require library) -* [ARROW-6823](https://issues.apache.org/jira/browse/ARROW-6823) - [C++][Python][R] Support metadata in the feather format? 
-* [ARROW-6829](https://issues.apache.org/jira/browse/ARROW-6829) - [Docs] Migrate integration test docs to Sphinx, fix instructions after ARROW-6466 -* [ARROW-6837](https://issues.apache.org/jira/browse/ARROW-6837) - [C++/Python] access File Footer custom\_metadata -* [ARROW-6841](https://issues.apache.org/jira/browse/ARROW-6841) - [C++] Upgrade to LLVM 8 -* [ARROW-6875](https://issues.apache.org/jira/browse/ARROW-6875) - [FlightRPC] Implement Criteria for ListFlights RPC / list\_flights method -* [ARROW-6915](https://issues.apache.org/jira/browse/ARROW-6915) - [Developer] Do not overwrite minor release version with merge script, even if not specified by committer -* [ARROW-6947](https://issues.apache.org/jira/browse/ARROW-6947) - [Rust] [DataFusion] Add support for scalar UDFs -* [ARROW-6996](https://issues.apache.org/jira/browse/ARROW-6996) - [Python] Expose boolean filter kernel on Table -* [ARROW-7044](https://issues.apache.org/jira/browse/ARROW-7044) - [Release] Create a post release script for the home-brew formulas -* [ARROW-7048](https://issues.apache.org/jira/browse/ARROW-7048) - [Java] Support for combining multiple vectors under VectorSchemaRoot -* [ARROW-7063](https://issues.apache.org/jira/browse/ARROW-7063) - [C++] Schema print method prints too much metadata -* [ARROW-7073](https://issues.apache.org/jira/browse/ARROW-7073) - [Java] Support concating vectors values in batch -* [ARROW-7080](https://issues.apache.org/jira/browse/ARROW-7080) - [Python][Parquet][C++] Expose parquet field\_id in Schema objects -* [ARROW-7091](https://issues.apache.org/jira/browse/ARROW-7091) - [C++] Move all factories to type\_fwd.h -* [ARROW-7119](https://issues.apache.org/jira/browse/ARROW-7119) - [C++][CI] Use scripts/util\_coredump.sh to show automatic backtraces -* [ARROW-7201](https://issues.apache.org/jira/browse/ARROW-7201) - [GLib][Gandiva] Add support for BooleanNode -* [ARROW-7202](https://issues.apache.org/jira/browse/ARROW-7202) - [R][CI] Improve rwinlib building on CI to stop re-downloading dependencies -* [ARROW-7222](https://issues.apache.org/jira/browse/ARROW-7222) - [Python][Release] Wipe any existing generated Python API documentation when updating website -* [ARROW-7233](https://issues.apache.org/jira/browse/ARROW-7233) - [C++] Add Result APIs to IPC module -* [ARROW-7256](https://issues.apache.org/jira/browse/ARROW-7256) - [C++] Remove ARROW\_MEMORY\_POOL\_DEFAULT macro -* [ARROW-7330](https://issues.apache.org/jira/browse/ARROW-7330) - [C++] Add Result to APIs to arrow/gpu -* [ARROW-7332](https://issues.apache.org/jira/browse/ARROW-7332) - [C++][Parquet] Explicitly catch status exceptions in PARQUET\_CATCH\_NOT\_OK -* [ARROW-7336](https://issues.apache.org/jira/browse/ARROW-7336) - [C++] Implement MinMax options to not skip nulls -* [ARROW-7338](https://issues.apache.org/jira/browse/ARROW-7338) - [C++] Improve InMemoryDataSource to support generator instead of static list -* [ARROW-7365](https://issues.apache.org/jira/browse/ARROW-7365) - [Python] Support FixedSizeList type in conversion to numpy/pandas -* [ARROW-7373](https://issues.apache.org/jira/browse/ARROW-7373) - [C++][Dataset] Remove FileSource -* [ARROW-7400](https://issues.apache.org/jira/browse/ARROW-7400) - [Java] Avoids the worst case for quick sort -* [ARROW-7412](https://issues.apache.org/jira/browse/ARROW-7412) - [C++][Dataset] Ensure that dataset code is robust to schemas with duplicate field names -* [ARROW-7419](https://issues.apache.org/jira/browse/ARROW-7419) - [Python] Support SparseCSCMatrix -* 
[ARROW-7427](https://issues.apache.org/jira/browse/ARROW-7427) - [Python] Support SparseCSFTensor -* [ARROW-7428](https://issues.apache.org/jira/browse/ARROW-7428) - [Format][C++] Add serialization for CSF sparse tensors -* [ARROW-7444](https://issues.apache.org/jira/browse/ARROW-7444) - [GLib] Add LocalFileSystem support -* [ARROW-7462](https://issues.apache.org/jira/browse/ARROW-7462) - [C++] Add CpuInfo detection for Arm64 Architecture -* [ARROW-7491](https://issues.apache.org/jira/browse/ARROW-7491) - [Java] Improve the performance of aligning -* [ARROW-7499](https://issues.apache.org/jira/browse/ARROW-7499) - [C++] CMake should collect libs when making static build -* [ARROW-7501](https://issues.apache.org/jira/browse/ARROW-7501) - [C++] CMake build\_thrift should build flex and bison if necessary -* [ARROW-7515](https://issues.apache.org/jira/browse/ARROW-7515) - [C++] Rename nonexistent and non\_existent to not\_found -* [ARROW-7524](https://issues.apache.org/jira/browse/ARROW-7524) - [C++][CI] Build parquet support in the VS2019 GitHub Actions job -* [ARROW-7530](https://issues.apache.org/jira/browse/ARROW-7530) - [Developer] Do not include list of commits from PR in squashed summary message -* [ARROW-7534](https://issues.apache.org/jira/browse/ARROW-7534) - [Java] Create a new java/contrib module -* [ARROW-7547](https://issues.apache.org/jira/browse/ARROW-7547) - [C++] [Python] [Dataset] Additional reader options in ParquetFileFormat -* [ARROW-7555](https://issues.apache.org/jira/browse/ARROW-7555) - [Python] Drop support for python 2.7 -* [ARROW-7587](https://issues.apache.org/jira/browse/ARROW-7587) - [C++][Compute] Add Top-k kernel -* [ARROW-7608](https://issues.apache.org/jira/browse/ARROW-7608) - [C++][Dataset] Expose more informational properties -* [ARROW-7615](https://issues.apache.org/jira/browse/ARROW-7615) - [CI][Gandiva] Ensure that the gandiva\_jni library has only a whitelisted set of shared dependencies as part of Travis CI job -* [ARROW-7616](https://issues.apache.org/jira/browse/ARROW-7616) - [Java] Support comparing value ranges for dense union vector -* [ARROW-7625](https://issues.apache.org/jira/browse/ARROW-7625) - [GLib] Parquet GLib and Red Parquet (Ruby) do not allow specifying compression type -* [ARROW-7641](https://issues.apache.org/jira/browse/ARROW-7641) - [R] Make dataset vignette have executable code -* [ARROW-7662](https://issues.apache.org/jira/browse/ARROW-7662) - [R] Support creating ListArray from R list -* [ARROW-7664](https://issues.apache.org/jira/browse/ARROW-7664) - [C++] Extract localfs default from FileSystemFromUri -* [ARROW-7675](https://issues.apache.org/jira/browse/ARROW-7675) - [R][CI] Move Windows CI from Appveyor to GHA -* [ARROW-7679](https://issues.apache.org/jira/browse/ARROW-7679) - [R] Cleaner interface for creating UnionDataset -* [ARROW-7684](https://issues.apache.org/jira/browse/ARROW-7684) - [Rust] Provide example of Flight server for DataFusion -* [ARROW-7685](https://issues.apache.org/jira/browse/ARROW-7685) - [Developer] Add support for GitHub Actions to Crossbow -* [ARROW-7691](https://issues.apache.org/jira/browse/ARROW-7691) - [C++] Verify missing fields when walking Flatbuffers data -* [ARROW-7708](https://issues.apache.org/jira/browse/ARROW-7708) - [Release] Include PARQUET commits from git changelog in release changelogs -* [ARROW-7712](https://issues.apache.org/jira/browse/ARROW-7712) - [CI][Crossbow] Fix or delete fuzzit jobs -* [ARROW-7720](https://issues.apache.org/jira/browse/ARROW-7720) - [C++][Python] Add 
check\_metadata argument to Table.equals -* [ARROW-7725](https://issues.apache.org/jira/browse/ARROW-7725) - [C++] Add infrastructure for unity builds and precompiled headers -* [ARROW-7726](https://issues.apache.org/jira/browse/ARROW-7726) - [CI] [C++] Use boost binaries on Windows GHA build -* [ARROW-7729](https://issues.apache.org/jira/browse/ARROW-7729) - [Python][CI] Pin pandas version to 0.25 in the dask integration test -* [ARROW-7733](https://issues.apache.org/jira/browse/ARROW-7733) - [Developer] Install locally a new enough version of Go for release verification script -* [ARROW-7735](https://issues.apache.org/jira/browse/ARROW-7735) - [Release] conda-forge channel is missing for verifying wheels -* [ARROW-7736](https://issues.apache.org/jira/browse/ARROW-7736) - [Release] Binary verification sometimes fails with transient error -* [ARROW-7739](https://issues.apache.org/jira/browse/ARROW-7739) - [GLib] Use placement new to initialize shared\_ptr object in private structs -* [ARROW-7741](https://issues.apache.org/jira/browse/ARROW-7741) - [C++][Parquet] Incorporate new level generation logic in parquet write path with a flag to revert back to old logic -* [ARROW-7742](https://issues.apache.org/jira/browse/ARROW-7742) - [GLib] Add support for MapArray -* [ARROW-7745](https://issues.apache.org/jira/browse/ARROW-7745) - [Doc] [C++] Update Parquet documentation -* [ARROW-7749](https://issues.apache.org/jira/browse/ARROW-7749) - [C++] Link some more tests together -* [ARROW-7750](https://issues.apache.org/jira/browse/ARROW-7750) - [Release] Make the source release verification script restartable -* [ARROW-7751](https://issues.apache.org/jira/browse/ARROW-7751) - [Release] macOS wheel verification also needs arrow-testing -* [ARROW-7752](https://issues.apache.org/jira/browse/ARROW-7752) - [Release] Enable and test dataset in the verification script -* [ARROW-7754](https://issues.apache.org/jira/browse/ARROW-7754) - [C++] Result is slow -* [ARROW-7761](https://issues.apache.org/jira/browse/ARROW-7761) - [C++] Add S3 support to fs::FileSystemFromUri -* [ARROW-7764](https://issues.apache.org/jira/browse/ARROW-7764) - [C++] Builders allocate a null bitmap buffer even if there is no nulls -* [ARROW-7771](https://issues.apache.org/jira/browse/ARROW-7771) - [Developer] Use ARROW\_TMPDIR environment variable in the verification scripts instead of TMPDIR -* [ARROW-7774](https://issues.apache.org/jira/browse/ARROW-7774) - [Packaging][Python] Update macos and windows wheel filenames -* [ARROW-7787](https://issues.apache.org/jira/browse/ARROW-7787) - [Rust] Add collect to Table API -* [ARROW-7788](https://issues.apache.org/jira/browse/ARROW-7788) - [C++] Add schema conversion support for map type -* [ARROW-7790](https://issues.apache.org/jira/browse/ARROW-7790) - [Website] Update how to install Linux packages -* [ARROW-7795](https://issues.apache.org/jira/browse/ARROW-7795) - [Rust - DataFusion] Support boolean negation (NOT) -* [ARROW-7796](https://issues.apache.org/jira/browse/ARROW-7796) - [R] write\_\* functions should invisibly return their inputs -* [ARROW-7799](https://issues.apache.org/jira/browse/ARROW-7799) - [R][CI] Remove flatbuffers from homebrew formulae -* [ARROW-7804](https://issues.apache.org/jira/browse/ARROW-7804) - [C++][R] Compile error on macOS 10.11 -* [ARROW-7812](https://issues.apache.org/jira/browse/ARROW-7812) - [Packaging][Python] Upgrade LLVM in manylinux1 docker image -* [ARROW-7817](https://issues.apache.org/jira/browse/ARROW-7817) - [CI] macOS R autobrew nightly 
failed on installing dependency from source -* [ARROW-7819](https://issues.apache.org/jira/browse/ARROW-7819) - [C++][Gandiva] Add DumpIR to Filter/Projector classes -* [ARROW-7824](https://issues.apache.org/jira/browse/ARROW-7824) - [C++][Dataset] Provide Dataset writing to IPC format -* [ARROW-7828](https://issues.apache.org/jira/browse/ARROW-7828) - [Release] Remove SSH keys for internal use -* [ARROW-7829](https://issues.apache.org/jira/browse/ARROW-7829) - [R] Test R bindings on clang -* [ARROW-7833](https://issues.apache.org/jira/browse/ARROW-7833) - [R] Make install\_arrow() actually install arrow -* [ARROW-7834](https://issues.apache.org/jira/browse/ARROW-7834) - [Release] Post release task for updating the documentations -* [ARROW-7839](https://issues.apache.org/jira/browse/ARROW-7839) - [Python][Dataset] Add IPC format to python bindings -* [ARROW-7846](https://issues.apache.org/jira/browse/ARROW-7846) - [Python][Dev] Remove last dependencies on six -* [ARROW-7847](https://issues.apache.org/jira/browse/ARROW-7847) - [Website] Write a blog post about fuzzing -* [ARROW-7849](https://issues.apache.org/jira/browse/ARROW-7849) - [Packaging][Python] Remove the remaining py27 crossbow wheel tasks from the nightlies -* [ARROW-7858](https://issues.apache.org/jira/browse/ARROW-7858) - [C++][Python] Support casting an Extension type to its storage type -* [ARROW-7859](https://issues.apache.org/jira/browse/ARROW-7859) - [R] Minor patches for CRAN submission 0.16.0.2 -* [ARROW-7860](https://issues.apache.org/jira/browse/ARROW-7860) - [C++] Support cast to/from halffloat -* [ARROW-7862](https://issues.apache.org/jira/browse/ARROW-7862) - [R] Linux installation should run quieter by default -* [ARROW-7863](https://issues.apache.org/jira/browse/ARROW-7863) - [C++][Python][CI] Ensure running HDFS related tests -* [ARROW-7864](https://issues.apache.org/jira/browse/ARROW-7864) - [R] Make sure bundled installation works even if there are system packages -* [ARROW-7865](https://issues.apache.org/jira/browse/ARROW-7865) - [R] Test builds on latest Linux versions -* [ARROW-7868](https://issues.apache.org/jira/browse/ARROW-7868) - [Crossbow] Reduce GitHub API query parallelism -* [ARROW-7869](https://issues.apache.org/jira/browse/ARROW-7869) - [Python] Boost::system and boost::filesystem not necessary anymore in Python wheels -* [ARROW-7872](https://issues.apache.org/jira/browse/ARROW-7872) - [Python] Support conversion of list-of-struct in Array/Table.to\_pandas -* [ARROW-7874](https://issues.apache.org/jira/browse/ARROW-7874) - [Python][Archery] Validate docstrings with numpydoc -* [ARROW-7876](https://issues.apache.org/jira/browse/ARROW-7876) - [R] Installation fails in the documentation generation image -* [ARROW-7877](https://issues.apache.org/jira/browse/ARROW-7877) - [Packaging] Fix crossbow deployment to github artifacts -* [ARROW-7879](https://issues.apache.org/jira/browse/ARROW-7879) - [C++][Doc] Add doc for the Device API -* [ARROW-7880](https://issues.apache.org/jira/browse/ARROW-7880) - [CI][R] R sanitizer job is not really working -* [ARROW-7881](https://issues.apache.org/jira/browse/ARROW-7881) - [C++] Fix pedantic warnings -* [ARROW-7882](https://issues.apache.org/jira/browse/ARROW-7882) - [C++][Gandiva] Optimise like function for substring pattern -* [ARROW-7886](https://issues.apache.org/jira/browse/ARROW-7886) - [C++][Dataset] Consolidate Source and Dataset -* [ARROW-7888](https://issues.apache.org/jira/browse/ARROW-7888) - [Python] Allow using a more modern version of jpype in 
pyarrow.jvm -* [ARROW-7890](https://issues.apache.org/jira/browse/ARROW-7890) - [C++] Add Promise / Future implementation -* [ARROW-7891](https://issues.apache.org/jira/browse/ARROW-7891) - [C++] RecordBatch-\>Equals should also have a check\_metadata argument -* [ARROW-7892](https://issues.apache.org/jira/browse/ARROW-7892) - [Python] Expose FilesystemSource.format attribute -* [ARROW-7895](https://issues.apache.org/jira/browse/ARROW-7895) - [Python] Remove more python 2.7 cruft -* [ARROW-7896](https://issues.apache.org/jira/browse/ARROW-7896) - [C++] Refactor from \#include guards to \#pragma once -* [ARROW-7897](https://issues.apache.org/jira/browse/ARROW-7897) - [Packaging] Temporarily disable artifact uploading until we fix the deployment issues -* [ARROW-7898](https://issues.apache.org/jira/browse/ARROW-7898) - [Python] Reduce the number docstring violations using numpydoc -* [ARROW-7904](https://issues.apache.org/jira/browse/ARROW-7904) - [C++] Decide about Field/Schema metadata printing parameters and how much to show by default -* [ARROW-7907](https://issues.apache.org/jira/browse/ARROW-7907) - [Python] Conversion to pandas of empty table with timestamp type aborts -* [ARROW-7912](https://issues.apache.org/jira/browse/ARROW-7912) - [Format] C data interface -* [ARROW-7913](https://issues.apache.org/jira/browse/ARROW-7913) - [C++][Python][R] C++ implementation of C data interface -* [ARROW-7915](https://issues.apache.org/jira/browse/ARROW-7915) - [CI] [Python] Run tests with Python development mode enabled -* [ARROW-7916](https://issues.apache.org/jira/browse/ARROW-7916) - [C++][Dataset] Project IPC record batches to materialized fields -* [ARROW-7917](https://issues.apache.org/jira/browse/ARROW-7917) - [CMake] FindPythonInterp should check for python3 -* [ARROW-7919](https://issues.apache.org/jira/browse/ARROW-7919) - [R] install\_arrow() should conda install if appropriate -* [ARROW-7920](https://issues.apache.org/jira/browse/ARROW-7920) - [R] Fill in some missing input validation -* [ARROW-7921](https://issues.apache.org/jira/browse/ARROW-7921) - [Go] Add Reset method to various components and clean up comments -* [ARROW-7927](https://issues.apache.org/jira/browse/ARROW-7927) - [C++] Fix 'cpu\_info.cc' compilation warning -* [ARROW-7929](https://issues.apache.org/jira/browse/ARROW-7929) - [C++] CMake target names differ from upstream provided names -* [ARROW-7930](https://issues.apache.org/jira/browse/ARROW-7930) - [Python][CI] Test jpype integration in CI -* [ARROW-7932](https://issues.apache.org/jira/browse/ARROW-7932) - [Rust] [Parquet] Implement array reader for temporal types -* [ARROW-7934](https://issues.apache.org/jira/browse/ARROW-7934) - [C++] Fix UriEscape for empty string -* [ARROW-7935](https://issues.apache.org/jira/browse/ARROW-7935) - [Java] Remove Netty dependency for BufferAllocator and ReferenceManager -* [ARROW-7937](https://issues.apache.org/jira/browse/ARROW-7937) - [Python][Packaging] Remove boost from the macos wheels -* [ARROW-7941](https://issues.apache.org/jira/browse/ARROW-7941) - [Rust] [DataFusion] Logical plan should support unresolved column references -* [ARROW-7943](https://issues.apache.org/jira/browse/ARROW-7943) - [C++][Parquet] Add a new level builder capable of handling nested data -* [ARROW-7947](https://issues.apache.org/jira/browse/ARROW-7947) - [Rust] [Flight] [DataFusion] Implement example for get\_schema -* [ARROW-7949](https://issues.apache.org/jira/browse/ARROW-7949) - [Developer] Update to '.gitignore' to not track user specific 
'cpp/Brewfile.lock.json' file -* [ARROW-7951](https://issues.apache.org/jira/browse/ARROW-7951) - [Python][Parquet] Expose BYTE\_STREAM\_SPLIT to pyarrow -* [ARROW-7959](https://issues.apache.org/jira/browse/ARROW-7959) - [Ruby] Add support for Ruby 2.3 again -* [ARROW-7963](https://issues.apache.org/jira/browse/ARROW-7963) - [C++][Python][Dataset] Expose listing fragments -* [ARROW-7965](https://issues.apache.org/jira/browse/ARROW-7965) - [Python] Refine higher level dataset API -* [ARROW-7966](https://issues.apache.org/jira/browse/ARROW-7966) - [Integration][Flight][C++] Client should verify each batch independently -* [ARROW-7969](https://issues.apache.org/jira/browse/ARROW-7969) - [Packaging] Use cURL to upload artifacts -* [ARROW-7970](https://issues.apache.org/jira/browse/ARROW-7970) - [Packaging][Python] Use system boost to build the macos wheels -* [ARROW-7971](https://issues.apache.org/jira/browse/ARROW-7971) - [Rust] Create rowcount utility -* [ARROW-7977](https://issues.apache.org/jira/browse/ARROW-7977) - [C++] Rename fs::FileStats to fs::FileInfo -* [ARROW-7979](https://issues.apache.org/jira/browse/ARROW-7979) - [C++] Implement experimental buffer compression in IPC messages -* [ARROW-7982](https://issues.apache.org/jira/browse/ARROW-7982) - [C++] Let ArrayDataVisitor accept void-returning functions -* [ARROW-7983](https://issues.apache.org/jira/browse/ARROW-7983) - [CI][R] Nightly builds should be more verbose when they fail -* [ARROW-7984](https://issues.apache.org/jira/browse/ARROW-7984) - [R] Check for valid inputs in more places -* [ARROW-7986](https://issues.apache.org/jira/browse/ARROW-7986) - [Python] pa.Array.from\_pandas cannot convert pandas.Series containing pyspark.ml.linalg.SparseVector -* [ARROW-7987](https://issues.apache.org/jira/browse/ARROW-7987) - [CI][R] Fix for verbose nightly builds -* [ARROW-7988](https://issues.apache.org/jira/browse/ARROW-7988) - [R] Fix on.exit calls in reticulate bindings -* [ARROW-7991](https://issues.apache.org/jira/browse/ARROW-7991) - [C++][Plasma] Allow option for evicting if full when creating an object -* [ARROW-7993](https://issues.apache.org/jira/browse/ARROW-7993) - [Java] Support decimal type in ComplexCopier -* [ARROW-7994](https://issues.apache.org/jira/browse/ARROW-7994) - [CI][C++] Move AppVeyor MinGW builds to GitHub Actions -* [ARROW-7995](https://issues.apache.org/jira/browse/ARROW-7995) - [C++] IO: coalescing and caching read ranges -* [ARROW-7998](https://issues.apache.org/jira/browse/ARROW-7998) - [C++][Plasma] Make Seal requests synchronous -* [ARROW-8005](https://issues.apache.org/jira/browse/ARROW-8005) - [Website] Review and adjust any usages of Apache dist system from website / tools -* [ARROW-8014](https://issues.apache.org/jira/browse/ARROW-8014) - [C++] Provide CMake targets to test only within a given label -* [ARROW-8016](https://issues.apache.org/jira/browse/ARROW-8016) - [Developer] Fix deprecation warning in PR merge tool -* [ARROW-8018](https://issues.apache.org/jira/browse/ARROW-8018) - [C++][Parquet]Parquet Modular Encryption -* [ARROW-8024](https://issues.apache.org/jira/browse/ARROW-8024) - [R] Bindings for BinaryType and FixedBinaryType -* [ARROW-8026](https://issues.apache.org/jira/browse/ARROW-8026) - [Python] Support memoryview in addition to string value types for constructing string and binary type arrays -* [ARROW-8027](https://issues.apache.org/jira/browse/ARROW-8027) - [Developer][Integration] Add integration tests for duplicate field names -* 
[ARROW-8028](https://issues.apache.org/jira/browse/ARROW-8028) - [Go] Allow duplicate field names in schemas and nested types -* [ARROW-8030](https://issues.apache.org/jira/browse/ARROW-8030) - [C++][Plasma] Fix inconsistent comment style -* [ARROW-8035](https://issues.apache.org/jira/browse/ARROW-8035) - [Developer][Integration] Add integration tests for extension types -* [ARROW-8039](https://issues.apache.org/jira/browse/ARROW-8039) - [Python][Dataset] Support using dataset API in pyarrow.parquet with a minimal ParquetDataset shim -* [ARROW-8044](https://issues.apache.org/jira/browse/ARROW-8044) - [CI][NIGHTLY:gandiva-jar-osx] pygit2 needs libgit2 v1.0.x -* [ARROW-8055](https://issues.apache.org/jira/browse/ARROW-8055) - [GLib][Ruby] Add some metadata bindings to GArrowSchema -* [ARROW-8058](https://issues.apache.org/jira/browse/ARROW-8058) - [C++][Python][Dataset] Provide an option to toggle validation and schema inference in FileSystemDatasetFactoryOptions -* [ARROW-8059](https://issues.apache.org/jira/browse/ARROW-8059) - [Python] Make FileSystem objects serializable -* [ARROW-8060](https://issues.apache.org/jira/browse/ARROW-8060) - [Python] Make dataset Expression objects serializable -* [ARROW-8061](https://issues.apache.org/jira/browse/ARROW-8061) - [C++][Dataset] Ability to specify granularity of ParquetFileFragment (support row groups) -* [ARROW-8063](https://issues.apache.org/jira/browse/ARROW-8063) - [Python] Add user guide documentation for Datasets API -* [ARROW-8064](https://issues.apache.org/jira/browse/ARROW-8064) - [Dev] Implement Comment bot via Github actions -* [ARROW-8069](https://issues.apache.org/jira/browse/ARROW-8069) - [C++] Should the default value of "check\_metadata" arguments of Equals methods be "true"? -* [ARROW-8072](https://issues.apache.org/jira/browse/ARROW-8072) - [C++][Plasma] Add const constraint when parsing data -* [ARROW-8077](https://issues.apache.org/jira/browse/ARROW-8077) - [Python] Add wheel build script and Crossbow configuration for Windows on Python 3.5 -* [ARROW-8079](https://issues.apache.org/jira/browse/ARROW-8079) - [Python] Implement a wrapper for KeyValueMetadata, duck-typing dict where relevant -* [ARROW-8080](https://issues.apache.org/jira/browse/ARROW-8080) - [C++] Add AVX512 build option -* [ARROW-8082](https://issues.apache.org/jira/browse/ARROW-8082) - [Java][Plasma] Add JNI list() interface -* [ARROW-8083](https://issues.apache.org/jira/browse/ARROW-8083) - [GLib] Add support for Peek() to GIOInputStream -* [ARROW-8086](https://issues.apache.org/jira/browse/ARROW-8086) - [Java] Support writing decimal from big endian byte array in UnionListWriter -* [ARROW-8087](https://issues.apache.org/jira/browse/ARROW-8087) - [C++][Dataset] Order of keys with HivePartitioning is lost in resulting schema -* [ARROW-8096](https://issues.apache.org/jira/browse/ARROW-8096) - [C++][Gandiva] Create null node of Interval type -* [ARROW-8097](https://issues.apache.org/jira/browse/ARROW-8097) - [Dev] Comment bot's crossbow command acts on the master branch -* [ARROW-8103](https://issues.apache.org/jira/browse/ARROW-8103) - [R] Make default Linux build more minimal -* [ARROW-8104](https://issues.apache.org/jira/browse/ARROW-8104) - [C++] Don't install bundled Thrift -* [ARROW-8107](https://issues.apache.org/jira/browse/ARROW-8107) - [Packaging][APT] Use HTTPS for LLVM APT repository for Debian GNU/Linux stretch -* [ARROW-8109](https://issues.apache.org/jira/browse/ARROW-8109) - [Packaging][APT] Drop support for Ubuntu Disco -* 
[ARROW-8117](https://issues.apache.org/jira/browse/ARROW-8117) - [Rust] [Datafusion] Allow CAST from number to timestamp -* [ARROW-8118](https://issues.apache.org/jira/browse/ARROW-8118) - [R] dim method for FileSystemDataset -* [ARROW-8120](https://issues.apache.org/jira/browse/ARROW-8120) - [Packaging][APT] Add support for Ubuntu Focal -* [ARROW-8123](https://issues.apache.org/jira/browse/ARROW-8123) - [Rust] [DataFusion] Create LogicalPlanBuilder -* [ARROW-8124](https://issues.apache.org/jira/browse/ARROW-8124) - [Rust] Update library dependencies -* [ARROW-8126](https://issues.apache.org/jira/browse/ARROW-8126) - [C++][Compute] Add Top-K kernel benchmark -* [ARROW-8129](https://issues.apache.org/jira/browse/ARROW-8129) - [C++][Compute] Refine compare sorting kernel -* [ARROW-8130](https://issues.apache.org/jira/browse/ARROW-8130) - [C++][Gandiva] Fix Dex visitor in llvm\_generator to handle interval type -* [ARROW-8140](https://issues.apache.org/jira/browse/ARROW-8140) - [Developer] Follow NullType -\> NullField change -* [ARROW-8141](https://issues.apache.org/jira/browse/ARROW-8141) - [C++] Optimize BM\_PlainDecodingBoolean performance using AVX512 Intrinsics API -* [ARROW-8145](https://issues.apache.org/jira/browse/ARROW-8145) - [C++] Rename GetTargetInfos -* [ARROW-8146](https://issues.apache.org/jira/browse/ARROW-8146) - [C++] Add per-filesystem facility to sanitize a path -* [ARROW-8150](https://issues.apache.org/jira/browse/ARROW-8150) - [Rust] Allow writing custom FileMetaData k/v pairs -* [ARROW-8151](https://issues.apache.org/jira/browse/ARROW-8151) - [Benchmarking][Dataset] Benchmark Parquet read performance with S3File -* [ARROW-8153](https://issues.apache.org/jira/browse/ARROW-8153) - [Packaging] Update the conda feedstock files and upload artifacts to Anaconda -* [ARROW-8158](https://issues.apache.org/jira/browse/ARROW-8158) - [Java] Getting length of data buffer and base variable width vector -* [ARROW-8164](https://issues.apache.org/jira/browse/ARROW-8164) - [C++][Dataset] Let datasets be viewable with non-identical schema -* [ARROW-8165](https://issues.apache.org/jira/browse/ARROW-8165) - [Packaging] Make nightly wheels available on a PyPI server -* [ARROW-8167](https://issues.apache.org/jira/browse/ARROW-8167) - [CI] Add support for skipping builds with skip pattern in pull request title -* [ARROW-8168](https://issues.apache.org/jira/browse/ARROW-8168) - [Java][Plasma] Improve Java Plasma client off-heap memory usage -* [ARROW-8177](https://issues.apache.org/jira/browse/ARROW-8177) - [Rust] Make schema\_to\_fb\_offset public -* [ARROW-8178](https://issues.apache.org/jira/browse/ARROW-8178) - [C++] Upgrade to Flatbuffers 1.12 -* [ARROW-8179](https://issues.apache.org/jira/browse/ARROW-8179) - [R] Windows build script tweaking for nightly packaging on GHA -* [ARROW-8181](https://issues.apache.org/jira/browse/ARROW-8181) - [Java][FlightRPC] Expose transport error metadata -* [ARROW-8182](https://issues.apache.org/jira/browse/ARROW-8182) - [Packaging] Increment the version number detected from the latest git tag -* [ARROW-8183](https://issues.apache.org/jira/browse/ARROW-8183) - [c++][FlightRPC] Expose transport error metadata -* [ARROW-8184](https://issues.apache.org/jira/browse/ARROW-8184) - [Packaging] Use arrow-nightlies organization name on Anaconda and Gemfury to host the nightlies -* [ARROW-8185](https://issues.apache.org/jira/browse/ARROW-8185) - [Packaging] Document the available nightly wheels and conda packages -* 
[ARROW-8187](https://issues.apache.org/jira/browse/ARROW-8187) - [R] Make test assertions robust to i18n -* [ARROW-8191](https://issues.apache.org/jira/browse/ARROW-8191) - [Packaging][APT] Fix cmake removal in Debian GNU/Linux Stretch -* [ARROW-8192](https://issues.apache.org/jira/browse/ARROW-8192) - [C++] script for unpack avx512 intrinsics code -* [ARROW-8194](https://issues.apache.org/jira/browse/ARROW-8194) - [CI] Github Actions Windows job should run tests in parallel -* [ARROW-8195](https://issues.apache.org/jira/browse/ARROW-8195) - [CI] Remove Boost download step in Github Actions -* [ARROW-8198](https://issues.apache.org/jira/browse/ARROW-8198) - [C++] Diffing should handle null arrays -* [ARROW-8200](https://issues.apache.org/jira/browse/ARROW-8200) - [GLib] Rename garrow\_file\_system\_target\_info{,s}() to ...\_file\_info{,s}() -* [ARROW-8203](https://issues.apache.org/jira/browse/ARROW-8203) - [C\#] "dotnet pack" fails -* [ARROW-8204](https://issues.apache.org/jira/browse/ARROW-8204) - [Rust] [DataFusion] Add support for aliased expressions in SQL -* [ARROW-8207](https://issues.apache.org/jira/browse/ARROW-8207) - [Packaging][wheel] Use LLVM 8 in manylinux2010 and manylinux2014 -* [ARROW-8215](https://issues.apache.org/jira/browse/ARROW-8215) - [CI][GLib] Meson install fails in the macOS build -* [ARROW-8218](https://issues.apache.org/jira/browse/ARROW-8218) - [C++] Parallelize decompression at field level in experimental IPC compression code -* [ARROW-8220](https://issues.apache.org/jira/browse/ARROW-8220) - [Python] Make dataset FileFormat objects serializable -* [ARROW-8222](https://issues.apache.org/jira/browse/ARROW-8222) - [C++] Use bcp to make a slim boost for bundled build -* [ARROW-8224](https://issues.apache.org/jira/browse/ARROW-8224) - [C++] Remove APIs deprecated prior to 0.16.0 -* [ARROW-8225](https://issues.apache.org/jira/browse/ARROW-8225) - [Rust] IPC reader must respect continuation markers -* [ARROW-8227](https://issues.apache.org/jira/browse/ARROW-8227) - [C++] Refine SIMD feature definitions -* [ARROW-8231](https://issues.apache.org/jira/browse/ARROW-8231) - [Rust] Parse key\_value\_metadata from parquet FileMetaData into arrow schema metadata -* [ARROW-8232](https://issues.apache.org/jira/browse/ARROW-8232) - [Python] Deprecate pa.open\_file and pa.open\_stream in favor of pa.ipc.open\_file/open\_stream -* [ARROW-8235](https://issues.apache.org/jira/browse/ARROW-8235) - [C++][Compute] Filter out nulls by default -* [ARROW-8241](https://issues.apache.org/jira/browse/ARROW-8241) - [Rust] Add convenience methods to Schema -* [ARROW-8242](https://issues.apache.org/jira/browse/ARROW-8242) - [C++] Flight fails to compile on GCC 4.8 -* [ARROW-8243](https://issues.apache.org/jira/browse/ARROW-8243) - [Rust] [DataFusion] Fix inconsistent API in LogicalPlanBuilder -* [ARROW-8244](https://issues.apache.org/jira/browse/ARROW-8244) - [Python][Parquet] Add \`write\_to\_dataset\` option to populate the "file\_path" metadata fields -* [ARROW-8246](https://issues.apache.org/jira/browse/ARROW-8246) - [C++] Add -Wa,-mbig-obj when compiling with MinGW to avoid linking errors -* [ARROW-8247](https://issues.apache.org/jira/browse/ARROW-8247) - [Python] Expose Parquet writing "engine" setting in pyarrow.parquet.write\_table -* [ARROW-8249](https://issues.apache.org/jira/browse/ARROW-8249) - [Rust] [DataFusion] Make Table and LogicalPlanBuilder APIs more
consistent -* [ARROW-8252](https://issues.apache.org/jira/browse/ARROW-8252) - [CI][Ruby] Add Ubuntu 20.04 -* [ARROW-8256](https://issues.apache.org/jira/browse/ARROW-8256) - [Rust] [DataFusion] Update CLI documentation for 0.17.0 release -* [ARROW-8264](https://issues.apache.org/jira/browse/ARROW-8264) - [Rust] [DataFusion] Create utility for printing record batches -* [ARROW-8266](https://issues.apache.org/jira/browse/ARROW-8266) - [C++] Add backup mirrors for external project source downloads -* [ARROW-8267](https://issues.apache.org/jira/browse/ARROW-8267) - [CI][GLib] Failed to build on Ubuntu 16.04 -* [ARROW-8271](https://issues.apache.org/jira/browse/ARROW-8271) - [Packaging] Allow wheel upload failures to gemfury -* [ARROW-8275](https://issues.apache.org/jira/browse/ARROW-8275) - [Python][Docs] Review Feather + IPC file documentation per "Feather V2" changes -* [ARROW-8277](https://issues.apache.org/jira/browse/ARROW-8277) - [Python] RecordBatch interface improvements -* [ARROW-8279](https://issues.apache.org/jira/browse/ARROW-8279) - [C++] Do not export symbols from Codec implementations, remove need for PIMPL pattern -* [ARROW-8288](https://issues.apache.org/jira/browse/ARROW-8288) - [Python] Expose with\_ modifiers on DataType -* [ARROW-8290](https://issues.apache.org/jira/browse/ARROW-8290) - [Python][Dataset] Improve ergonomy of the FileSystemDataset constructor -* [ARROW-8291](https://issues.apache.org/jira/browse/ARROW-8291) - [Packaging] Conda nightly builds can't locate Numpy -* [ARROW-8292](https://issues.apache.org/jira/browse/ARROW-8292) - [Python][Dataset] Passthrough schema to Factory.finish() in dataset() function -* [ARROW-8294](https://issues.apache.org/jira/browse/ARROW-8294) - [Format][Flight] Add DoExchange RPC to Flight protocol -* [ARROW-8295](https://issues.apache.org/jira/browse/ARROW-8295) - [C++][Dataset] IpcFileFormat should explicitly push down column projection -* [ARROW-8299](https://issues.apache.org/jira/browse/ARROW-8299) - [C++] Reusable "optional ParallelFor" function for optional use of multithreading -* [ARROW-8300](https://issues.apache.org/jira/browse/ARROW-8300) - [R] Documentation and changelog updates for 0.17 -* [ARROW-8307](https://issues.apache.org/jira/browse/ARROW-8307) - [Python] Expose use\_memory\_map option in pyarrow.feather APIs -* [ARROW-8308](https://issues.apache.org/jira/browse/ARROW-8308) - [Rust] [Flight] Implement DoExchange on examples -* [ARROW-8309](https://issues.apache.org/jira/browse/ARROW-8309) - [CI] C++/Java/Rust workflows should trigger on changes to Flight.proto -* [ARROW-8311](https://issues.apache.org/jira/browse/ARROW-8311) - [C++] Add push style stream format reader -* [ARROW-8316](https://issues.apache.org/jira/browse/ARROW-8316) - [CI] Set docker-compose to use docker-cli instead of docker-py for building images -* [ARROW-8319](https://issues.apache.org/jira/browse/ARROW-8319) - [CI] Install thrift compiler in the debian build -* [ARROW-8320](https://issues.apache.org/jira/browse/ARROW-8320) - [Documentation][Format] Clarify (lack of) alignment requirements in C data interface -* [ARROW-8321](https://issues.apache.org/jira/browse/ARROW-8321) - [CI] Use bundled thrift in Fedora 30 build -* [ARROW-8322](https://issues.apache.org/jira/browse/ARROW-8322) - [CI] Fix C\# workflow file syntax -* [ARROW-8325](https://issues.apache.org/jira/browse/ARROW-8325) - [R][CI] Stop including boost in R windows bundle -* [ARROW-8329](https://issues.apache.org/jira/browse/ARROW-8329) - [Documentation][C++] Undocumented
FilterOptions argument in Filter kernel -* [ARROW-8330](https://issues.apache.org/jira/browse/ARROW-8330) - [Documentation] The post release script generates the documentation with a development version -* [ARROW-8332](https://issues.apache.org/jira/browse/ARROW-8332) - [C++] Require Thrift compiler to use system libthrift for Parquet build -* [ARROW-8335](https://issues.apache.org/jira/browse/ARROW-8335) - [Release] Add crossbow jobs to run release verification -* [ARROW-8336](https://issues.apache.org/jira/browse/ARROW-8336) - [Packaging][deb] Use libthrift-dev on Debian 10 and Ubuntu 19.10 or later -* [ARROW-8341](https://issues.apache.org/jira/browse/ARROW-8341) - [Packaging][deb] Fail to build by no disk space -* [ARROW-8343](https://issues.apache.org/jira/browse/ARROW-8343) - [GLib] Add GArrowRecordBatchIterator -* [ARROW-8347](https://issues.apache.org/jira/browse/ARROW-8347) - [C++] Add Result APIs to Array methods -* [ARROW-8351](https://issues.apache.org/jira/browse/ARROW-8351) - [R][CI] Store the Rtools-built Arrow C++ library as a build artifact -* [ARROW-8352](https://issues.apache.org/jira/browse/ARROW-8352) - [R] Add install\_pyarrow() -* [ARROW-8356](https://issues.apache.org/jira/browse/ARROW-8356) - [Developer] Support \* wildcards with "crossbow submit" via GitHub actions -* [ARROW-8361](https://issues.apache.org/jira/browse/ARROW-8361) - [C++] Add Result APIs to Buffer methods and functions -* [ARROW-8362](https://issues.apache.org/jira/browse/ARROW-8362) - [Crossbow] Ensure that the locally generated version is used in the docker tasks -* [ARROW-8367](https://issues.apache.org/jira/browse/ARROW-8367) - [C++] Deprecate Buffer::FromString(..., pool) -* [ARROW-8368](https://issues.apache.org/jira/browse/ARROW-8368) - [Format] In C interface, clarify resource management for consumers needing only a subset of child fields in ArrowArray -* [ARROW-8370](https://issues.apache.org/jira/browse/ARROW-8370) - [C++] Add Result to type / schema APIs -* [ARROW-8371](https://issues.apache.org/jira/browse/ARROW-8371) - [Crossbow] Implement and exercise sanity checks for tasks.yml -* [ARROW-8372](https://issues.apache.org/jira/browse/ARROW-8372) - [C++] Add Result to table / record batch APIs -* [ARROW-8375](https://issues.apache.org/jira/browse/ARROW-8375) - [CI][R] Make Windows tests more verbose in case of segfault -* [ARROW-8376](https://issues.apache.org/jira/browse/ARROW-8376) - [R] Add experimental interface to ScanTask/RecordBatch iterators -* [ARROW-8387](https://issues.apache.org/jira/browse/ARROW-8387) - [Rust] Make schema\_to\_fb public -* [ARROW-8389](https://issues.apache.org/jira/browse/ARROW-8389) - [Integration] Run tests in parallel -* [ARROW-8390](https://issues.apache.org/jira/browse/ARROW-8390) - [R] Expose schema unification features -* [ARROW-8393](https://issues.apache.org/jira/browse/ARROW-8393) - [C++][Gandiva] Make gandiva function registry case-insensitive -* [ARROW-8396](https://issues.apache.org/jira/browse/ARROW-8396) - [Rust] Remove libc from dependencies -* [ARROW-8398](https://issues.apache.org/jira/browse/ARROW-8398) - [Python] Remove deprecation warnings originating from python tests -* [ARROW-8401](https://issues.apache.org/jira/browse/ARROW-8401) - [C++] Add AVX2/AVX512 version of ByteStreamSplitDecode/ByteStreamSplitEncode -* [ARROW-8403](https://issues.apache.org/jira/browse/ARROW-8403) - [C++] Add ToString() to ChunkedArray, Table and RecordBatch -* [ARROW-8407](https://issues.apache.org/jira/browse/ARROW-8407) - [Rust] Add rustdoc for Dictionary 
type -* [ARROW-8408](https://issues.apache.org/jira/browse/ARROW-8408) - [Python] Add memory\_map= toggle to pyarrow.feather.read\_feather -* [ARROW-8409](https://issues.apache.org/jira/browse/ARROW-8409) - [R] Add arrow::cpu\_count, arrow::set\_cpu\_count wrapper functions a la Python -* [ARROW-8412](https://issues.apache.org/jira/browse/ARROW-8412) - [C++][Gandiva] Fix gandiva date\_diff function definitions -* [ARROW-8433](https://issues.apache.org/jira/browse/ARROW-8433) - [R] Add feather alias for ipc format in dataset API -* [ARROW-8444](https://issues.apache.org/jira/browse/ARROW-8444) - [Documentation] Fix spelling errors across the codebase -* [ARROW-8449](https://issues.apache.org/jira/browse/ARROW-8449) - [R] Use CMAKE\_UNITY\_BUILD everywhere -* [ARROW-8450](https://issues.apache.org/jira/browse/ARROW-8450) - [Integration][C++] Implement large list/binary/utf8 integration -* [ARROW-8457](https://issues.apache.org/jira/browse/ARROW-8457) - [C++] bridge test does not take care of endianness -* [ARROW-8458](https://issues.apache.org/jira/browse/ARROW-8458) - [C++] Prefer the original mirrors for the bundled thirdparty dependencies -* [ARROW-8461](https://issues.apache.org/jira/browse/ARROW-8461) - [Packaging][deb] Use zstd package for Ubuntu Xenial -* [ARROW-8463](https://issues.apache.org/jira/browse/ARROW-8463) - [CI] Balance the nightly test builds between CircleCI, Azure and Github -* [ARROW-8679](https://issues.apache.org/jira/browse/ARROW-8679) - [Python] supporting pandas sparse series in pyarrow -* [PARQUET-458](https://issues.apache.org/jira/browse/PARQUET-458) - [C++] Implement support for DataPageV2 -* [PARQUET-1663](https://issues.apache.org/jira/browse/PARQUET-1663) - [C++] Provide API to check the presence of complex data types -* [PARQUET-1716](https://issues.apache.org/jira/browse/PARQUET-1716) - [C++] Add support for BYTE\_STREAM\_SPLIT encoding -* [PARQUET-1770](https://issues.apache.org/jira/browse/PARQUET-1770) - [C++][CI] Add fuzz target for reading Parquet files -* [PARQUET-1785](https://issues.apache.org/jira/browse/PARQUET-1785) - [C++] Improve code reusability in encoding-test.cc -* [PARQUET-1786](https://issues.apache.org/jira/browse/PARQUET-1786) - [C++] Use simd to improve BYTE\_STREAM\_SPLIT decoding performance -* [PARQUET-1806](https://issues.apache.org/jira/browse/PARQUET-1806) - [C++] [CI] Improve fuzzing seed corpus -* [PARQUET-1825](https://issues.apache.org/jira/browse/PARQUET-1825) - [C++] Fix compilation error in column\_io\_benchmark.cc -* [PARQUET-1828](https://issues.apache.org/jira/browse/PARQUET-1828) - [C++] Add a SSE2 path for the ByteStreamSplit encoder implementation -* [PARQUET-1840](https://issues.apache.org/jira/browse/PARQUET-1840) - [C++] DecodeSpaced copies more values than necessary - - - -# Apache Arrow 0.16.0 (2020-02-07) - -## Bug Fixes - -* [ARROW-3783](https://issues.apache.org/jira/browse/ARROW-3783) - [R] Incorrect collection of float type -* [ARROW-3962](https://issues.apache.org/jira/browse/ARROW-3962) - [Go] Support null values while reading a CSV file.
-* [ARROW-4470](https://issues.apache.org/jira/browse/ARROW-4470) - [Python] Pyarrow using considerable more memory when reading partitioned Parquet file -* [ARROW-4998](https://issues.apache.org/jira/browse/ARROW-4998) - [R] R package fails to install on OSX -* [ARROW-5575](https://issues.apache.org/jira/browse/ARROW-5575) - [C++] arrowConfig.cmake includes uninstalled targets -* [ARROW-5655](https://issues.apache.org/jira/browse/ARROW-5655) - [Python] Table.from\_pydict/from\_arrays not using types in specified schema correctly -* [ARROW-5680](https://issues.apache.org/jira/browse/ARROW-5680) - [Rust] datafusion group-by tests depends on result set order -* [ARROW-6157](https://issues.apache.org/jira/browse/ARROW-6157) - [Python][C++] UnionArray with invalid data passes validation / leads to segfaults -* [ARROW-6195](https://issues.apache.org/jira/browse/ARROW-6195) - [C++] CMake fails with file not found error while bundling thrift if python is not installed -* [ARROW-6298](https://issues.apache.org/jira/browse/ARROW-6298) - [Rust] [CI] Examples are not being tested in CI -* [ARROW-6320](https://issues.apache.org/jira/browse/ARROW-6320) - [C++] Arrow utilities are linked statically -* [ARROW-6429](https://issues.apache.org/jira/browse/ARROW-6429) - [CI][Crossbow] Nightly spark integration job fails -* [ARROW-6445](https://issues.apache.org/jira/browse/ARROW-6445) - [CI][Crossbow] Nightly Gandiva jar trusty job fails -* [ARROW-6567](https://issues.apache.org/jira/browse/ARROW-6567) - [Rust] [DataFusion] SQL aggregate query execution assume grouping expressions precede aggregate expressions -* [ARROW-6581](https://issues.apache.org/jira/browse/ARROW-6581) - [C++] Fix fuzzit job submission -* [ARROW-6704](https://issues.apache.org/jira/browse/ARROW-6704) - [C++] Cast from timestamp to higher resolution does not check out of bounds timestamps -* [ARROW-6708](https://issues.apache.org/jira/browse/ARROW-6708) - [C++] "cannot find -lboost\_filesystem\_static" -* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays -* [ARROW-6736](https://issues.apache.org/jira/browse/ARROW-6736) - [Rust] [DataFusion] Aggregate expressions get evaluated repeatedly -* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows -* [ARROW-6745](https://issues.apache.org/jira/browse/ARROW-6745) - [Rust] Fix a variety of typos -* [ARROW-6749](https://issues.apache.org/jira/browse/ARROW-6749) - [Python] Conversion of non-ns timestamp array to numpy gives wrong values -* [ARROW-6750](https://issues.apache.org/jira/browse/ARROW-6750) - [Python] Silence S3 error logs by default -* [ARROW-6761](https://issues.apache.org/jira/browse/ARROW-6761) - [Rust] Travis CI builds not respecting rust-toolchain -* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline -* [ARROW-6785](https://issues.apache.org/jira/browse/ARROW-6785) - [JS] Remove superfluous child assignment -* [ARROW-6786](https://issues.apache.org/jira/browse/ARROW-6786) - [C++] arrow-dataset-file-parquet-test is slow -* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception -* [ARROW-6798](https://issues.apache.org/jira/browse/ARROW-6798) - [CI] [Rust] Improve build times by caching dependencies in the Docker image -* [ARROW-6801](https://issues.apache.org/jira/browse/ARROW-6801) - [Rust] Arrow source release 
tarball is missing benchmarks -* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list -* [ARROW-6808](https://issues.apache.org/jira/browse/ARROW-6808) - [Ruby] Ensure requiring suitable MSYS2 package -* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [RUBY] Gem does not install on macOS due to glib2 3.3.7 compilation failure -* [ARROW-6812](https://issues.apache.org/jira/browse/ARROW-6812) - [Java] Remove Dremio Corp. from License Header -* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15 -* [ARROW-6820](https://issues.apache.org/jira/browse/ARROW-6820) - [C++] [Doc] [Format] Map specification and implementation inconsistent -* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build -* [ARROW-6835](https://issues.apache.org/jira/browse/ARROW-6835) - [Archery][CMake] Restore ARROW\_LINT\_ONLY -* [ARROW-6842](https://issues.apache.org/jira/browse/ARROW-6842) - [Website] Jekyll error building website -* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] List columns read broken with 0.15.0 -* [ARROW-6846](https://issues.apache.org/jira/browse/ARROW-6846) - [C++] Build failures with glog enabled -* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case) -* [ARROW-6859](https://issues.apache.org/jira/browse/ARROW-6859) - [CI][Nightly] Disable docker layer caching for CircleCI tasks -* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight -* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize -* [ARROW-6864](https://issues.apache.org/jira/browse/ARROW-6864) - [C++] bz2 / zstd tests not enabled -* [ARROW-6867](https://issues.apache.org/jira/browse/ARROW-6867) - [FlightRPC][Java] Flight server can hang JVM on shutdown -* [ARROW-6868](https://issues.apache.org/jira/browse/ARROW-6868) - [Go] slicing Struct array does not slice child fields -* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays -* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference break Cython cimport pyarrow -* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when conversion to object dtype -* [ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0 -* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment -* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3 -* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] cannot create a chunked\_array from dictionary\_encoding result -* [ARROW-6885](https://issues.apache.org/jira/browse/ARROW-6885) - [Python] Remove superfluous skipped timedelta test -* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) 
- [C++] arrow::io header nvcc compiler warnings -* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes -* [ARROW-6899](https://issues.apache.org/jira/browse/ARROW-6899) - [Python] to\_pandas() not implemented on list -* [ARROW-6901](https://issues.apache.org/jira/browse/ARROW-6901) - [Rust][Parquet] SerializedFileWriter writes total\_num\_rows as zero -* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes -* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on MacOS are failing because of brew compile timeouts -* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits -* [ARROW-6913](https://issues.apache.org/jira/browse/ARROW-6913) - [R] Potential bug in compute.cc -* [ARROW-6914](https://issues.apache.org/jira/browse/ARROW-6914) - [CI] docker-clang-format nightly failing -* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change) -* [ARROW-6925](https://issues.apache.org/jira/browse/ARROW-6925) - [C++] Arrow fails to build on MacOS 10.13.6 using brew gcc 7 and 8 -* [ARROW-6929](https://issues.apache.org/jira/browse/ARROW-6929) - [C++] ValidateArray is out of sync with the ListArray IPC specification -* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds -* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled -* [ARROW-6948](https://issues.apache.org/jira/browse/ARROW-6948) - [Rust] [Parquet] Fix bool array support in arrow reader.
-* [ARROW-6950](https://issues.apache.org/jira/browse/ARROW-6950) - [C++][Dataset] Add example/benchmark for reading parquet files with dataset -* [ARROW-6957](https://issues.apache.org/jira/browse/ARROW-6957) - [CI][Crossbow] Nightly R with sanitizers build fails installing dependencies -* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything -* [ARROW-6966](https://issues.apache.org/jira/browse/ARROW-6966) - [Go] 32bit memset is null -* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported -* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes -* [ARROW-6989](https://issues.apache.org/jira/browse/ARROW-6989) - [Python][C++] Assert is triggered when decimal type inference occurs on a value with out of range precision -* [ARROW-6992](https://issues.apache.org/jira/browse/ARROW-6992) - [C++]: Undefined Behavior sanitizer build option fails with GCC -* [ARROW-6999](https://issues.apache.org/jira/browse/ARROW-6999) - [Python] KeyError: '\_\_index\_level\_0\_\_' passing Table.from\_pandas its own schema -* [ARROW-7013](https://issues.apache.org/jira/browse/ARROW-7013) - [C++] arrow-dataset pkgconfig is incomplete -* [ARROW-7020](https://issues.apache.org/jira/browse/ARROW-7020) - [Java] Fix the bugs when calculating vector hash code -* [ARROW-7021](https://issues.apache.org/jira/browse/ARROW-7021) - [Java] UnionFixedSizeListWriter decimal type should check writer index -* [ARROW-7022](https://issues.apache.org/jira/browse/ARROW-7022) - [Python] \_\_arrow\_array\_\_ does not work for ExtensionTypes in Table.from\_pandas -* [ARROW-7023](https://issues.apache.org/jira/browse/ARROW-7023) - [Python] pa.array does not use "from\_pandas" semantics for pd.Index -* [ARROW-7024](https://issues.apache.org/jira/browse/ARROW-7024) - [CI][R] Update R dependencies for Conda build -* [ARROW-7027](https://issues.apache.org/jira/browse/ARROW-7027) - [Python] pa.table(..) 
returns instead of raises error if passing invalid object -* [ARROW-7033](https://issues.apache.org/jira/browse/ARROW-7033) - [C++] Error in ./configure step for jemalloc when building on OSX 10.14.6 -* [ARROW-7045](https://issues.apache.org/jira/browse/ARROW-7045) - [R] Factor type not preserved in Parquet roundtrip -* [ARROW-7050](https://issues.apache.org/jira/browse/ARROW-7050) - [R] Fix compiler warnings in R bindings -* [ARROW-7053](https://issues.apache.org/jira/browse/ARROW-7053) - [Python] setuptools-scm produces incorrect version at apache-arrow-0.15.1 tag -* [ARROW-7056](https://issues.apache.org/jira/browse/ARROW-7056) - [Python] Test errors without S3 -* [ARROW-7059](https://issues.apache.org/jira/browse/ARROW-7059) - [Python] Reading parquet file with many columns is much slower in 0.15.x versus 0.14.x -* [ARROW-7074](https://issues.apache.org/jira/browse/ARROW-7074) - [C++] ASSERT\_OK\_AND\_ASSIGN crashes when failing -* [ARROW-7077](https://issues.apache.org/jira/browse/ARROW-7077) - [C++] Unsupported Dict-\>T cast crashes instead of returning error -* [ARROW-7087](https://issues.apache.org/jira/browse/ARROW-7087) - [Python] Table Metadata disappears when we write a partitioned dataset -* [ARROW-7097](https://issues.apache.org/jira/browse/ARROW-7097) - [Rust][CI] Builds failing due to rust nightly formatting -* [ARROW-7100](https://issues.apache.org/jira/browse/ARROW-7100) - [C++] libjvm.so not found on ubuntu 19.04 with openjdk-11 -* [ARROW-7105](https://issues.apache.org/jira/browse/ARROW-7105) - [CI][Crossbow] Nightly homebrew-cpp job fails -* [ARROW-7106](https://issues.apache.org/jira/browse/ARROW-7106) - [Java] Fix the problem that flight perf test hangs endlessly -* [ARROW-7117](https://issues.apache.org/jira/browse/ARROW-7117) - [C++][CI] Fix the hanging C++ tests in Windows 2019 -* [ARROW-7128](https://issues.apache.org/jira/browse/ARROW-7128) - [CI] Fedora cron jobs are failing because of wrong fedora version -* [ARROW-7133](https://issues.apache.org/jira/browse/ARROW-7133) - [CI] Allow GH Actions to run on all branches -* [ARROW-7142](https://issues.apache.org/jira/browse/ARROW-7142) - [C++] Compile error with GCC 5.4.0 -* [ARROW-7152](https://issues.apache.org/jira/browse/ARROW-7152) - [Java] Delete useless class DiffFunction -* [ARROW-7157](https://issues.apache.org/jira/browse/ARROW-7157) - [R] Add validation, helpful error message to Object$new() -* [ARROW-7158](https://issues.apache.org/jira/browse/ARROW-7158) - [C++][Visual Studio] Build config error on non-English versions of Visual Studio.
-* [ARROW-7163](https://issues.apache.org/jira/browse/ARROW-7163) - [Doc] Fix double-and typos -* [ARROW-7164](https://issues.apache.org/jira/browse/ARROW-7164) - [CI] Dev cron github action is failing every 15 minutes -* [ARROW-7167](https://issues.apache.org/jira/browse/ARROW-7167) - [CI][Python] Add nightly tests for older pandas versions to Github Actions -* [ARROW-7168](https://issues.apache.org/jira/browse/ARROW-7168) - [Python] pa.array() doesn't respect specified dictionary type -* [ARROW-7170](https://issues.apache.org/jira/browse/ARROW-7170) - [C++] Bundled ORC fails linking -* [ARROW-7180](https://issues.apache.org/jira/browse/ARROW-7180) - [CI] Java builds are not triggered on the master branch -* [ARROW-7181](https://issues.apache.org/jira/browse/ARROW-7181) - [Python][Nightly] Wheel builds could NOT find ArrowPython -* [ARROW-7183](https://issues.apache.org/jira/browse/ARROW-7183) - [CI][Crossbow] Re-skip r-sanitizer nightly tests -* [ARROW-7187](https://issues.apache.org/jira/browse/ARROW-7187) - [C++][Doc] doxygen broken on master because of @ -* [ARROW-7188](https://issues.apache.org/jira/browse/ARROW-7188) - [C++][Doc] doxygen broken on master: missing param implicit\_casts -* [ARROW-7189](https://issues.apache.org/jira/browse/ARROW-7189) - [CI][Crossbow] Nightly conda osx builds fail -* [ARROW-7194](https://issues.apache.org/jira/browse/ARROW-7194) - [Rust] CSV Writer causing recursion errors -* [ARROW-7199](https://issues.apache.org/jira/browse/ARROW-7199) - [Java] ConcurrentModificationException in BaseAllocator::getChildAllocators -* [ARROW-7200](https://issues.apache.org/jira/browse/ARROW-7200) - [C++][Flight] Running Arrow Flight benchmark on two hosts doesn't work -* [ARROW-7209](https://issues.apache.org/jira/browse/ARROW-7209) - [Python] tests with pandas master are failing now that \_\_from\_arrow\_\_ support landed in pandas -* [ARROW-7212](https://issues.apache.org/jira/browse/ARROW-7212) - "go test -bench=8192 -run=. ./math" fails -* [ARROW-7214](https://issues.apache.org/jira/browse/ARROW-7214) - [Python] unpickling a pyarrow table with dictionary fields crashes -* [ARROW-7217](https://issues.apache.org/jira/browse/ARROW-7217) - [CI][Python] Use correct python version in Github Actions -* [ARROW-7225](https://issues.apache.org/jira/browse/ARROW-7225) - [C++] \`\*std::move(Result)\` calls T copy constructor -* [ARROW-7249](https://issues.apache.org/jira/browse/ARROW-7249) - [CI] Release test fails in master due to new arrow-flight Rust crate -* [ARROW-7250](https://issues.apache.org/jira/browse/ARROW-7250) - [C++] Undefined symbols for StringToFloatConverter::Impl with clang 4.x -* [ARROW-7253](https://issues.apache.org/jira/browse/ARROW-7253) - [CI] Fix master failure with release test -* [ARROW-7254](https://issues.apache.org/jira/browse/ARROW-7254) - BaseVariableWidthVector\#setSafe appears to make value offsets inconsistent -* [ARROW-7264](https://issues.apache.org/jira/browse/ARROW-7264) - [Java] RangeEqualsVisitor type check is not correct -* [ARROW-7266](https://issues.apache.org/jira/browse/ARROW-7266) - [Python] dictionary\_encode() of a slice gives wrong result -* [ARROW-7271](https://issues.apache.org/jira/browse/ARROW-7271) - [C++][Flight] Use the single parameter version of SetTotalBytesLimit -* [ARROW-7281](https://issues.apache.org/jira/browse/ARROW-7281) - [C++] AdaptiveIntBuilder::length() does not consider pending\_pos\_.
-* [ARROW-7282](https://issues.apache.org/jira/browse/ARROW-7282) - [Python] IO functions should raise FileNotFoundError when appropriate -* [ARROW-7291](https://issues.apache.org/jira/browse/ARROW-7291) - [Dev] Fix FORMAT\_DIR in update-flatbuffers.sh -* [ARROW-7294](https://issues.apache.org/jira/browse/ARROW-7294) - [Python] converted\_type\_name\_from\_enum(): Incorrect name for INT\_64 -* [ARROW-7295](https://issues.apache.org/jira/browse/ARROW-7295) - [R] Fix bad test that causes failure on R < 3.5 -* [ARROW-7298](https://issues.apache.org/jira/browse/ARROW-7298) - [C++] cpp/thirdparty/download-dependencies.sh is broken -* [ARROW-7314](https://issues.apache.org/jira/browse/ARROW-7314) - [Python] Compiler warning in pyarrow -* [ARROW-7318](https://issues.apache.org/jira/browse/ARROW-7318) - [C\#] TimestampArray serialization failure -* [ARROW-7320](https://issues.apache.org/jira/browse/ARROW-7320) - [C++] Target arrow-type-benchmark failed to be built on bullx Linux -* [ARROW-7327](https://issues.apache.org/jira/browse/ARROW-7327) - [CI] Failing C GLib and R buildbot builders -* [ARROW-7328](https://issues.apache.org/jira/browse/ARROW-7328) - [CI] GitHub Actions should trigger on changes to GitHub Actions configuration -* [ARROW-7341](https://issues.apache.org/jira/browse/ARROW-7341) - [CI] Unbreak nightly Conda R job -* [ARROW-7343](https://issues.apache.org/jira/browse/ARROW-7343) - [Java] Memory leak in Flight DoGet when client cancels -* [ARROW-7349](https://issues.apache.org/jira/browse/ARROW-7349) - [C++] Fix the bug of parsing string hex values -* [ARROW-7353](https://issues.apache.org/jira/browse/ARROW-7353) - [C++] Disable -Wmissing-braces when building with clang -* [ARROW-7354](https://issues.apache.org/jira/browse/ARROW-7354) - [C++] TestHadoopFileSystem::ThreadSafety fails with sigabort -* [ARROW-7355](https://issues.apache.org/jira/browse/ARROW-7355) - [CI] Environment variables are defined twice for the fuzzit builds -* [ARROW-7358](https://issues.apache.org/jira/browse/ARROW-7358) - [CI] [Dev] [C++] ccache disabled on conda-python-hdfs -* [ARROW-7359](https://issues.apache.org/jira/browse/ARROW-7359) - [C++][Gandiva] Don't throw error for locate function with start position exceeding string length, return 0 instead -* [ARROW-7360](https://issues.apache.org/jira/browse/ARROW-7360) - [R] Can't use dplyr filter() with variables defined in parent scope -* [ARROW-7361](https://issues.apache.org/jira/browse/ARROW-7361) - [Rust] Build directory is not passed to ci/scripts/rust\_test.sh -* [ARROW-7362](https://issues.apache.org/jira/browse/ARROW-7362) - [Python] ListArray.flatten() should take care of slicing offsets -* [ARROW-7374](https://issues.apache.org/jira/browse/ARROW-7374) - [Dev] [C++] cuda-cpp docker image fails compiling Arrow -* [ARROW-7381](https://issues.apache.org/jira/browse/ARROW-7381) - [C++][Packaging] Iterator change broke manylinux1 wheels -* [ARROW-7386](https://issues.apache.org/jira/browse/ARROW-7386) - [C\#] Array offset does not work properly -* [ARROW-7388](https://issues.apache.org/jira/browse/ARROW-7388) - [Python] Skip HDFS tests if libhdfs cannot be located -* [ARROW-7389](https://issues.apache.org/jira/browse/ARROW-7389) - [Python][Packaging] Remove pyarrow.s3fs import check from the recipe -* [ARROW-7393](https://issues.apache.org/jira/browse/ARROW-7393) - [Plasma] Fix plasma executable name in build for Java -* [ARROW-7395](https://issues.apache.org/jira/browse/ARROW-7395) - [C++] Logical "or" with constants is a Clang warning -* 
[ARROW-7397](https://issues.apache.org/jira/browse/ARROW-7397) - [C++] Json white space length detection error -* [ARROW-7404](https://issues.apache.org/jira/browse/ARROW-7404) - [C++][Gandiva] Fix utf8 char length error on Arm64 -* [ARROW-7406](https://issues.apache.org/jira/browse/ARROW-7406) - [Java] NonNullableStructVector\#hashCode should pass hasher to child vectors -* [ARROW-7407](https://issues.apache.org/jira/browse/ARROW-7407) - [Python] Failed to install pyarrow 0.15.1 on Python 3.8 -* [ARROW-7408](https://issues.apache.org/jira/browse/ARROW-7408) - [C++] Reference benchmarks fail compiling -* [ARROW-7435](https://issues.apache.org/jira/browse/ARROW-7435) - Security issue: ValidateOffsets() does not prevent buffer over-read -* [ARROW-7436](https://issues.apache.org/jira/browse/ARROW-7436) - [Archery] Fix benchmark default configuration -* [ARROW-7437](https://issues.apache.org/jira/browse/ARROW-7437) - [Java] ReadChannel\#readFully does not set writer index correctly -* [ARROW-7442](https://issues.apache.org/jira/browse/ARROW-7442) - [Ruby] Specifying column type as time causes segmentation fault -* [ARROW-7447](https://issues.apache.org/jira/browse/ARROW-7447) - [Java] ComplexCopier does incorrect copy in some cases -* [ARROW-7450](https://issues.apache.org/jira/browse/ARROW-7450) - [CI][C++] test-ubuntu-18.04-cpp-static failing with linking error in arrow-io-hdfs-test -* [ARROW-7458](https://issues.apache.org/jira/browse/ARROW-7458) - [GLib] incorrect build dependency in Makefile -* [ARROW-7471](https://issues.apache.org/jira/browse/ARROW-7471) - [Python] Cython flake8 failures -* [ARROW-7472](https://issues.apache.org/jira/browse/ARROW-7472) - [Java] Fix some incorrect behavior in UnionListWriter -* [ARROW-7478](https://issues.apache.org/jira/browse/ARROW-7478) - [Rust] [DataFusion] Group by expression ignored unless paired with aggregate expression -* [ARROW-7492](https://issues.apache.org/jira/browse/ARROW-7492) - [CI][Crossbow] Nightly homebrew-cpp job fails on Python installation -* [ARROW-7497](https://issues.apache.org/jira/browse/ARROW-7497) - [Python] Test asserts: pandas.util.testing is deprecated, use pandas.testing instead -* [ARROW-7500](https://issues.apache.org/jira/browse/ARROW-7500) - [C++][Dataset] regex\_error in hive partition on centos7 and opensuse42 -* [ARROW-7503](https://issues.apache.org/jira/browse/ARROW-7503) - [Rust] Rust builds are failing on master -* [ARROW-7506](https://issues.apache.org/jira/browse/ARROW-7506) - [Java] JMH benchmarks should be called from main methods -* [ARROW-7508](https://issues.apache.org/jira/browse/ARROW-7508) - [C\#] DateTime32 Reading is Broken -* [ARROW-7510](https://issues.apache.org/jira/browse/ARROW-7510) - [C++] Array::null\_count() is not thread-compatible -* [ARROW-7516](https://issues.apache.org/jira/browse/ARROW-7516) - [C\#] .NET Benchmarks are broken -* [ARROW-7518](https://issues.apache.org/jira/browse/ARROW-7518) - [Python] Use PYARROW\_WITH\_HDFS when building wheels, conda packages -* [ARROW-7527](https://issues.apache.org/jira/browse/ARROW-7527) - [Python] pandas/feather tests failing on pandas master -* [ARROW-7528](https://issues.apache.org/jira/browse/ARROW-7528) - [Python] The pandas.datetime class (import of datetime.datetime) and pandas.np are deprecated -* [ARROW-7535](https://issues.apache.org/jira/browse/ARROW-7535) - [C++] ASAN failure in validation -* [ARROW-7543](https://issues.apache.org/jira/browse/ARROW-7543) - [R] arrow::write\_parquet() code examples do not work -* 
[ARROW-7545](https://issues.apache.org/jira/browse/ARROW-7545) - [C++] [Dataset] Scanning dataset with dictionary type hangs -* [ARROW-7551](https://issues.apache.org/jira/browse/ARROW-7551) - [FlightRPC][C++] Flight test on macOS fails due to Homebrew gRPC -* [ARROW-7552](https://issues.apache.org/jira/browse/ARROW-7552) - [C++] TestSlowInputStream is flaky -* [ARROW-7554](https://issues.apache.org/jira/browse/ARROW-7554) - [C++] Unknown CMake command "externalproject\_add". -* [ARROW-7559](https://issues.apache.org/jira/browse/ARROW-7559) - [Rust] Possibly incorrect index check assertion in StringArray and BinaryArray -* [ARROW-7561](https://issues.apache.org/jira/browse/ARROW-7561) - [Doc][Python] fix conda environment command -* [ARROW-7563](https://issues.apache.org/jira/browse/ARROW-7563) - [Rust] failed to select a version for \`byteorder\` -* [ARROW-7582](https://issues.apache.org/jira/browse/ARROW-7582) - [Rust][Flight] Unable to compile arrow.flight.protocol.rs -* [ARROW-7583](https://issues.apache.org/jira/browse/ARROW-7583) - [C++][Flight] Auth handler tests fragile on Windows -* [ARROW-7591](https://issues.apache.org/jira/browse/ARROW-7591) - [Python] DictionaryArray.to\_numpy returns dict of parts instead of numpy array -* [ARROW-7592](https://issues.apache.org/jira/browse/ARROW-7592) - [C++] Fix crashes on corrupt IPC input -* [ARROW-7593](https://issues.apache.org/jira/browse/ARROW-7593) - [CI][Python] Python datasets failing on master / not run on CI -* [ARROW-7595](https://issues.apache.org/jira/browse/ARROW-7595) - [R][CI] R appveyor job fails due to pacman compression change -* [ARROW-7596](https://issues.apache.org/jira/browse/ARROW-7596) - [Python] Only apply zero-copy DataFrame block optimizations when split\_blocks=True -* [ARROW-7599](https://issues.apache.org/jira/browse/ARROW-7599) - [Java] Fix build break due to change in RangeEqualsVisitor -* [ARROW-7603](https://issues.apache.org/jira/browse/ARROW-7603) - [CI][Crossbow] Nightly centos 8 job fails -* [ARROW-7611](https://issues.apache.org/jira/browse/ARROW-7611) - [Packaging][Python] Artifacts patterns for wheel are wrong -* [ARROW-7612](https://issues.apache.org/jira/browse/ARROW-7612) - [Packaging][Python] Artifact paths for Conda on Windows are wrong -* [ARROW-7614](https://issues.apache.org/jira/browse/ARROW-7614) - [Python] Slow performance in test\_parquet.py::test\_set\_data\_page\_size -* [ARROW-7618](https://issues.apache.org/jira/browse/ARROW-7618) - [C++] Fix crashes or undefined behaviour on corrupt IPC input -* [ARROW-7620](https://issues.apache.org/jira/browse/ARROW-7620) - [Rust] Windows builds failing due to flatbuffer compile error -* [ARROW-7621](https://issues.apache.org/jira/browse/ARROW-7621) - [Doc] Doc build fails -* [ARROW-7634](https://issues.apache.org/jira/browse/ARROW-7634) - [Python] Dataset tests failing on Windows to parse file path -* [ARROW-7638](https://issues.apache.org/jira/browse/ARROW-7638) - [Python] Segfault when inspecting dataset.Source with invalid file/partitioning -* [ARROW-7639](https://issues.apache.org/jira/browse/ARROW-7639) - [R] Cannot convert Dictionary Array to R when values aren't strings -* [ARROW-7640](https://issues.apache.org/jira/browse/ARROW-7640) - [C++][Dataset] segfault when reading compressed Parquet files if build didn't include support for codec -* [ARROW-7647](https://issues.apache.org/jira/browse/ARROW-7647) - [C++] JSON reader fails to read arrays with few values -* [ARROW-7650](https://issues.apache.org/jira/browse/ARROW-7650) - [C++] Dataset
tests not built on Windows -* [ARROW-7651](https://issues.apache.org/jira/browse/ARROW-7651) - [CI][Crossbow] Nightly macOS wheel builds fail -* [ARROW-7652](https://issues.apache.org/jira/browse/ARROW-7652) - [Python][Dataset] Insert implicit cast in ScannerBuilder.filter -* [ARROW-7661](https://issues.apache.org/jira/browse/ARROW-7661) - [Python] Non-optimal CSV chunking when no newline at end -* [ARROW-7689](https://issues.apache.org/jira/browse/ARROW-7689) - [C++] Sporadic Flight test crash on macOS -* [ARROW-7690](https://issues.apache.org/jira/browse/ARROW-7690) - [R] Cannot write parquet to OutputStream -* [ARROW-7693](https://issues.apache.org/jira/browse/ARROW-7693) - [CI] Fix test-conda-python-3.7-spark-master nightly errors -* [ARROW-7709](https://issues.apache.org/jira/browse/ARROW-7709) - [Python] Conversion from Table Column to Pandas loses name for Timestamps -* [ARROW-7714](https://issues.apache.org/jira/browse/ARROW-7714) - [Release] Variable expansion is missing -* [ARROW-7718](https://issues.apache.org/jira/browse/ARROW-7718) - [Release] Fix auto-retry in the binary release script -* [ARROW-7723](https://issues.apache.org/jira/browse/ARROW-7723) - [Python] StructArray timestamp type with timezone to\_pandas convert error -* [ARROW-7727](https://issues.apache.org/jira/browse/ARROW-7727) - [Python] Unable to read a ParquetDataset when schema validation is on. -* [ARROW-8135](https://issues.apache.org/jira/browse/ARROW-8135) - [Python] Problem importing PyArrow on a cluster -* [ARROW-8638](https://issues.apache.org/jira/browse/ARROW-8638) - Arrow Cython API Usage Gives an error when calling CTable API Endpoints -* [PARQUET-1692](https://issues.apache.org/jira/browse/PARQUET-1692) - [C++] LogicalType::FromThrift error on Centos 7 RPM -* [PARQUET-1693](https://issues.apache.org/jira/browse/PARQUET-1693) - [C++] Build examples don't account for CMAKE compression feature flags -* [PARQUET-1702](https://issues.apache.org/jira/browse/PARQUET-1702) - [C++] Make BufferedRowGroupWriter compatible with parquet encryption -* [PARQUET-1706](https://issues.apache.org/jira/browse/PARQUET-1706) - [C++] Wrong dictionary\_page\_offset when writing only data pages via BufferedPageWriter -* [PARQUET-1707](https://issues.apache.org/jira/browse/PARQUET-1707) - [C++] parquet-arrow-test fails with undefined behaviour sanitizer -* [PARQUET-1709](https://issues.apache.org/jira/browse/PARQUET-1709) - [C++] Avoid unnecessary temporary std::shared\_ptr copies -* [PARQUET-1715](https://issues.apache.org/jira/browse/PARQUET-1715) - [C++] Add the Parquet code samples to CI + Refactor Parquet Encryption Samples -* [PARQUET-1720](https://issues.apache.org/jira/browse/PARQUET-1720) - [C++] Parquet JSONPrint not showing version correctly -* [PARQUET-1747](https://issues.apache.org/jira/browse/PARQUET-1747) - [C++] Access to ColumnChunkMetaData fails when encryption is on -* [PARQUET-1766](https://issues.apache.org/jira/browse/PARQUET-1766) - [C++] parquet NaN/null double statistics can result in endless loop -* [PARQUET-1772](https://issues.apache.org/jira/browse/PARQUET-1772) - [C++] ParquetFileWriter: Data overwritten when output stream opened in append mode - - -## New Features and Improvements - -* [ARROW-412](https://issues.apache.org/jira/browse/ARROW-412) - [Format] Handling of buffer padding in the IPC metadata -* [ARROW-501](https://issues.apache.org/jira/browse/ARROW-501) -
[C++] Implement concurrent / buffering InputStream for streaming data use cases -* [ARROW-772](https://issues.apache.org/jira/browse/ARROW-772) - [C++] Implement take kernel functions -* [ARROW-843](https://issues.apache.org/jira/browse/ARROW-843) - [C++] Implement Schema unification, merging unequal but equivalent schemas -* [ARROW-976](https://issues.apache.org/jira/browse/ARROW-976) - [C++][Python] Provide API for defining and reading Parquet datasets with more ad hoc partition schemes -* [ARROW-1036](https://issues.apache.org/jira/browse/ARROW-1036) - [C++] Define abstract API for filtering Arrow streams (e.g. predicate evaluation) -* [ARROW-1119](https://issues.apache.org/jira/browse/ARROW-1119) - [Python/C++] Implement NativeFile interfaces for Amazon S3 -* [ARROW-1175](https://issues.apache.org/jira/browse/ARROW-1175) - [Java] Implement/test dictionary-encoded subfields -* [ARROW-1456](https://issues.apache.org/jira/browse/ARROW-1456) - [Python] Run s3fs unit tests in Travis CI -* [ARROW-1562](https://issues.apache.org/jira/browse/ARROW-1562) - [C++] Numeric kernel implementations for add (+) -* [ARROW-1638](https://issues.apache.org/jira/browse/ARROW-1638) - [Java] IPC roundtrip for null type -* [ARROW-1900](https://issues.apache.org/jira/browse/ARROW-1900) - [C++] Add kernel functions for determining value range (maximum and minimum) of integer arrays -* [ARROW-2428](https://issues.apache.org/jira/browse/ARROW-2428) - [Python] Add API to map Arrow types (including extension types) to pandas ExtensionArray instances for to\_pandas conversions -* [ARROW-2602](https://issues.apache.org/jira/browse/ARROW-2602) - [Packaging] Automate build of development docker containers -* [ARROW-2863](https://issues.apache.org/jira/browse/ARROW-2863) - [Python] Add context manager APIs to RecordBatch\*Writer/Reader classes -* [ARROW-3085](https://issues.apache.org/jira/browse/ARROW-3085) - [Rust] Add an adapter for parquet. -* [ARROW-3408](https://issues.apache.org/jira/browse/ARROW-3408) - [C++] Add option to CSV reader to dictionary encode individual columns or all string / binary columns -* [ARROW-3444](https://issues.apache.org/jira/browse/ARROW-3444) - [Python] Table.nbytes attribute -* [ARROW-3706](https://issues.apache.org/jira/browse/ARROW-3706) - [Rust] Add record batch reader trait. 
-* [ARROW-3789](https://issues.apache.org/jira/browse/ARROW-3789) - [Python] Enable calling object in Table.to\_pandas to "self-destruct" for improved memory use -* [ARROW-3808](https://issues.apache.org/jira/browse/ARROW-3808) - [R] Implement [.arrow::Array -* [ARROW-3813](https://issues.apache.org/jira/browse/ARROW-3813) - [R] lower level construction of Dictionary Arrays -* [ARROW-4059](https://issues.apache.org/jira/browse/ARROW-4059) - [Rust] Parquet/Arrow Integration -* [ARROW-4091](https://issues.apache.org/jira/browse/ARROW-4091) - [C++] Curate default list of CSV null spellings -* [ARROW-4208](https://issues.apache.org/jira/browse/ARROW-4208) - [CI/Python] Have automatized tests for S3 -* [ARROW-4219](https://issues.apache.org/jira/browse/ARROW-4219) - [Rust] [Parquet] Implement ArrowReader -* [ARROW-4223](https://issues.apache.org/jira/browse/ARROW-4223) - [Python] Support scipy.sparse integration -* [ARROW-4224](https://issues.apache.org/jira/browse/ARROW-4224) - [Python] Support integration with pydata/sparse library -* [ARROW-4225](https://issues.apache.org/jira/browse/ARROW-4225) - [Format][C++] Add CSC sparse matrix support -* [ARROW-4722](https://issues.apache.org/jira/browse/ARROW-4722) - [C++] Implement Bitmap class to modularize handling of bitmaps -* [ARROW-4748](https://issues.apache.org/jira/browse/ARROW-4748) - [Rust] [DataFusion] GROUP BY performance could be optimized -* [ARROW-4930](https://issues.apache.org/jira/browse/ARROW-4930) - [Python] Remove LIBDIR assumptions in Python build -* [ARROW-5180](https://issues.apache.org/jira/browse/ARROW-5180) - [Rust] IPC Support -* [ARROW-5181](https://issues.apache.org/jira/browse/ARROW-5181) - [Rust] Create Arrow File reader -* [ARROW-5182](https://issues.apache.org/jira/browse/ARROW-5182) - [Rust] Create Arrow File writer -* [ARROW-5227](https://issues.apache.org/jira/browse/ARROW-5227) - [Rust] [DataFusion] Re-implement query execution with an extensible physical query plan -* [ARROW-5277](https://issues.apache.org/jira/browse/ARROW-5277) - [C\#] MemoryAllocator.Allocate(length: 0) should not return null -* [ARROW-5333](https://issues.apache.org/jira/browse/ARROW-5333) - [C++] Fit build option summary into narrower console -* [ARROW-5366](https://issues.apache.org/jira/browse/ARROW-5366) - [Rust] Implement Duration and Interval Arrays -* [ARROW-5400](https://issues.apache.org/jira/browse/ARROW-5400) - [Rust] Test/ensure that reader and writer support zero-length record batches -* [ARROW-5445](https://issues.apache.org/jira/browse/ARROW-5445) - [Website] Remove language that encourages pinning a version -* [ARROW-5454](https://issues.apache.org/jira/browse/ARROW-5454) - [C++] Implement Take on ChunkedArray for DataFrame use -* [ARROW-5502](https://issues.apache.org/jira/browse/ARROW-5502) - [R] file readers should mmap -* [ARROW-5508](https://issues.apache.org/jira/browse/ARROW-5508) - [C++] Create reusable Iterator interface -* [ARROW-5523](https://issues.apache.org/jira/browse/ARROW-5523) - [Python] [Packaging] Use HTTPS consistently for downloading dependencies -* [ARROW-5712](https://issues.apache.org/jira/browse/ARROW-5712) - [C++][Parquet] Arrow time32/time64/timestamp ConvertedType not being restored properly -* [ARROW-5767](https://issues.apache.org/jira/browse/ARROW-5767) - [Format] Permit dictionary replacements in IPC protocol -* [ARROW-5801](https://issues.apache.org/jira/browse/ARROW-5801) - [CI] Dockerize (add to docker-compose) all Travis CI Linux tasks -* 
[ARROW-5802](https://issues.apache.org/jira/browse/ARROW-5802) - [CI] Dockerize "lint" Travis CI job -* [ARROW-5804](https://issues.apache.org/jira/browse/ARROW-5804) - [C++] Dockerize C++ CI job with conda-forge toolchain, code coverage from Travis CI -* [ARROW-5805](https://issues.apache.org/jira/browse/ARROW-5805) - [Python] Dockerize (add to docker-compose) Python Travis CI job -* [ARROW-5806](https://issues.apache.org/jira/browse/ARROW-5806) - [CI] Dockerize (add to docker-compose) Integration tests Travis CI entry -* [ARROW-5807](https://issues.apache.org/jira/browse/ARROW-5807) - [JS] Dockerize NodeJS Travis CI entry -* [ARROW-5808](https://issues.apache.org/jira/browse/ARROW-5808) - [GLib][Ruby] Dockerize (add to docker-compose) current GLib + Ruby Travis CI entry -* [ARROW-5809](https://issues.apache.org/jira/browse/ARROW-5809) - [Rust] Dockerize (add to docker-compose) Rust Travis CI build -* [ARROW-5810](https://issues.apache.org/jira/browse/ARROW-5810) - [Go] Dockerize Travis CI Go build -* [ARROW-5831](https://issues.apache.org/jira/browse/ARROW-5831) - [Release] Migrate and improve binary release verification script -* [ARROW-5839](https://issues.apache.org/jira/browse/ARROW-5839) - [Python] Test manylinux2010 in CI -* [ARROW-5855](https://issues.apache.org/jira/browse/ARROW-5855) - [Python] Add support for Duration type -* [ARROW-5859](https://issues.apache.org/jira/browse/ARROW-5859) - [Python] Support ExtentionType on conversion to numpy/pandas -* [ARROW-5971](https://issues.apache.org/jira/browse/ARROW-5971) - [Website] Blog post introducing Arrow Flight -* [ARROW-5994](https://issues.apache.org/jira/browse/ARROW-5994) - [CI] [Rust] Create nightly releases of the Rust implementation -* [ARROW-6003](https://issues.apache.org/jira/browse/ARROW-6003) - [C++] Better input validation and error messaging in CSV reader -* [ARROW-6074](https://issues.apache.org/jira/browse/ARROW-6074) - [FlightRPC] Implement middleware -* [ARROW-6091](https://issues.apache.org/jira/browse/ARROW-6091) - [Rust] [DataFusion] Implement parallel execution for limit -* [ARROW-6109](https://issues.apache.org/jira/browse/ARROW-6109) - [Integration] Docker image for integration testing can't be built on windows -* [ARROW-6112](https://issues.apache.org/jira/browse/ARROW-6112) - [Java] Update APIs to support 64-bit address space -* [ARROW-6184](https://issues.apache.org/jira/browse/ARROW-6184) - [Java] Provide hash table based dictionary encoder -* [ARROW-6251](https://issues.apache.org/jira/browse/ARROW-6251) - [Developer] Add PR merge tool to apache/arrow-site -* [ARROW-6257](https://issues.apache.org/jira/browse/ARROW-6257) - [C++] Add fnmatch compatible globbing function -* [ARROW-6274](https://issues.apache.org/jira/browse/ARROW-6274) - [Rust] [DataFusion] Add support for writing results to CSV -* [ARROW-6277](https://issues.apache.org/jira/browse/ARROW-6277) - [C++][Parquet] Support reading/writing other Parquet primitive types to DictionaryArray -* [ARROW-6283](https://issues.apache.org/jira/browse/ARROW-6283) - [Rust] [DataFusion] Implement operator to write query results to partitioned CSV -* [ARROW-6285](https://issues.apache.org/jira/browse/ARROW-6285) - [GLib] Add support for LargeBinary and LargeString types -* [ARROW-6286](https://issues.apache.org/jira/browse/ARROW-6286) - [GLib] Add support for LargeList type -* [ARROW-6299](https://issues.apache.org/jira/browse/ARROW-6299) - [C++] Simplify FileFormat classes to singletons -* [ARROW-6321](https://issues.apache.org/jira/browse/ARROW-6321) - 
[Python] Ability to create ExtensionBlock on conversion to pandas -* [ARROW-6340](https://issues.apache.org/jira/browse/ARROW-6340) - [R] Implements low-level bindings to Dataset classes -* [ARROW-6341](https://issues.apache.org/jira/browse/ARROW-6341) - [Python] Implement low-level bindings for Dataset -* [ARROW-6352](https://issues.apache.org/jira/browse/ARROW-6352) - [Java] Add implementation of DenseUnionVector. -* [ARROW-6367](https://issues.apache.org/jira/browse/ARROW-6367) - [C++][Gandiva] Implement string reverse -* [ARROW-6378](https://issues.apache.org/jira/browse/ARROW-6378) - [C++][Dataset] Implement TreeDataSource -* [ARROW-6386](https://issues.apache.org/jira/browse/ARROW-6386) - [C++][Documentation] Explicit documentation of null slot interpretation -* [ARROW-6394](https://issues.apache.org/jira/browse/ARROW-6394) - [Java] Support conversions between delta vector and partial sum vector -* [ARROW-6396](https://issues.apache.org/jira/browse/ARROW-6396) - [C++] Add ResolveNullOptions to Logical kernels -* [ARROW-6398](https://issues.apache.org/jira/browse/ARROW-6398) - [C++] Consolidate ScanOptions and ScanContext -* [ARROW-6405](https://issues.apache.org/jira/browse/ARROW-6405) - [Python] Add std::move wrapper for use in Cython -* [ARROW-6452](https://issues.apache.org/jira/browse/ARROW-6452) - [Java] Override ValueVector toString() method -* [ARROW-6463](https://issues.apache.org/jira/browse/ARROW-6463) - [C++][Python] Rename arrow::fs::Selector to FileSelector -* [ARROW-6466](https://issues.apache.org/jira/browse/ARROW-6466) - [Developer] Refactor integration/integration\_test.py into a proper Python package -* [ARROW-6468](https://issues.apache.org/jira/browse/ARROW-6468) - [C++] Remove unused hashing routines -* [ARROW-6473](https://issues.apache.org/jira/browse/ARROW-6473) - [Format] Clarify dictionary encoding edge cases -* [ARROW-6503](https://issues.apache.org/jira/browse/ARROW-6503) - [C++] Add an argument of memory pool object to SparseTensorConverter -* [ARROW-6508](https://issues.apache.org/jira/browse/ARROW-6508) - [C++] Add Tensor and SparseTensor factory function with validations -* [ARROW-6515](https://issues.apache.org/jira/browse/ARROW-6515) - [C++] Clean type\_traits.h definitions -* [ARROW-6578](https://issues.apache.org/jira/browse/ARROW-6578) - [C++] Casting int64 to string columns -* [ARROW-6592](https://issues.apache.org/jira/browse/ARROW-6592) - [Java] Add support for skipping decoding of columns/field in Avro converter -* [ARROW-6594](https://issues.apache.org/jira/browse/ARROW-6594) - [Java] Support logical type encodings from Avro -* [ARROW-6598](https://issues.apache.org/jira/browse/ARROW-6598) - [Java] Sort the code for ApproxEqualsVisitor -* [ARROW-6608](https://issues.apache.org/jira/browse/ARROW-6608) - [C++] Make default for ARROW\_HDFS to be OFF -* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag -* [ARROW-6611](https://issues.apache.org/jira/browse/ARROW-6611) - [C++] Make ARROW\_JSON=OFF the default -* [ARROW-6612](https://issues.apache.org/jira/browse/ARROW-6612) - [C++] Add ARROW\_CSV CMake build flag -* [ARROW-6619](https://issues.apache.org/jira/browse/ARROW-6619) - [Ruby] Add support for building Gandiva::Expression by Arrow::Schema\#build\_expression -* [ARROW-6624](https://issues.apache.org/jira/browse/ARROW-6624) - [C++] Add SparseTensor.ToTensor() method -* [ARROW-6625](https://issues.apache.org/jira/browse/ARROW-6625) - [Python] Allow concat\_tables to 
null or default fill missing columns -* [ARROW-6631](https://issues.apache.org/jira/browse/ARROW-6631) - [C++] Do not build with any compression library dependencies by default -* [ARROW-6632](https://issues.apache.org/jira/browse/ARROW-6632) - [C++] Do not build with ARROW\_COMPUTE=on and ARROW\_DATASET=on by default -* [ARROW-6633](https://issues.apache.org/jira/browse/ARROW-6633) - [C++] Do not require double-conversion for default build -* [ARROW-6634](https://issues.apache.org/jira/browse/ARROW-6634) - [C++] Do not require flatbuffers or flatbuffers\_ep to build -* [ARROW-6634](https://issues.apache.org/jira/browse/ARROW-6634) - [C++] Do not require flatbuffers or flatbuffers\_ep to build -* [ARROW-6635](https://issues.apache.org/jira/browse/ARROW-6635) - [C++] Do not require glog for default build -* [ARROW-6636](https://issues.apache.org/jira/browse/ARROW-6636) - [C++] Do not build C++ command line utilities by default -* [ARROW-6637](https://issues.apache.org/jira/browse/ARROW-6637) - [C++] Zero-dependency default core build -* [ARROW-6637](https://issues.apache.org/jira/browse/ARROW-6637) - [C++] Zero-dependency default core build -* [ARROW-6646](https://issues.apache.org/jira/browse/ARROW-6646) - [Go] Amend NullType IPC implementation to append no buffers in RecordBatch message -* [ARROW-6650](https://issues.apache.org/jira/browse/ARROW-6650) - [Rust] [Integration] Create methods to test Arrow files against Integration JSON -* [ARROW-6656](https://issues.apache.org/jira/browse/ARROW-6656) - [Rust] [DataFusion] Implement MIN and MAX aggregate expressions -* [ARROW-6657](https://issues.apache.org/jira/browse/ARROW-6657) - [Rust] [DataFusion] Implement COUNT aggregate expression -* [ARROW-6658](https://issues.apache.org/jira/browse/ARROW-6658) - [Rust] [DataFusion] Implement AVG aggregate expression -* [ARROW-6659](https://issues.apache.org/jira/browse/ARROW-6659) - [Rust] [DataFusion] Refactor of HashAggregateExec to support custom merge -* [ARROW-6662](https://issues.apache.org/jira/browse/ARROW-6662) - [Java] Implement equals/approxEquals API for VectorSchemaRoot -* [ARROW-6671](https://issues.apache.org/jira/browse/ARROW-6671) - [C++] Sparse tensor naming -* [ARROW-6672](https://issues.apache.org/jira/browse/ARROW-6672) - [Java] Extract a common interface for dictionary builders -* [ARROW-6685](https://issues.apache.org/jira/browse/ARROW-6685) - [C++/Python] S3 FileStat object's base\_path and type depends on trailing slash -* [ARROW-6686](https://issues.apache.org/jira/browse/ARROW-6686) - [CI] Pull and push docker images to speed up the nightly builds -* [ARROW-6688](https://issues.apache.org/jira/browse/ARROW-6688) - [Packaging] Include s3 support in the conda packages -* [ARROW-6690](https://issues.apache.org/jira/browse/ARROW-6690) - [Rust] [DataFusion] HashAggregate without GROUP BY should use SIMD -* [ARROW-6692](https://issues.apache.org/jira/browse/ARROW-6692) - [Rust] [DataFusion] Update examples to use physical query plan -* [ARROW-6693](https://issues.apache.org/jira/browse/ARROW-6693) - [Rust] [DataFusion] Update unit tests to use physical query plan -* [ARROW-6694](https://issues.apache.org/jira/browse/ARROW-6694) - [Rust] [DataFusion] Update integration tests to use physical plan -* [ARROW-6695](https://issues.apache.org/jira/browse/ARROW-6695) - [Rust] [DataFusion] Remove execution of logical plan -* [ARROW-6696](https://issues.apache.org/jira/browse/ARROW-6696) - [Rust] [DataFusion] Implement simple math operations in physical query plan -* 
[ARROW-6700](https://issues.apache.org/jira/browse/ARROW-6700) - [Rust] [DataFusion] Use new parquet arrow reader -* [ARROW-6707](https://issues.apache.org/jira/browse/ARROW-6707) - [Java] Improve the performance of JDBC adapters by using nullable information -* [ARROW-6710](https://issues.apache.org/jira/browse/ARROW-6710) - [Java] Add JDBC adapter test to cover cases which contains some null values -* [ARROW-6711](https://issues.apache.org/jira/browse/ARROW-6711) - [C++] Consolidate Filter and Expression classes -* [ARROW-6721](https://issues.apache.org/jira/browse/ARROW-6721) - [JAVA] Avro adapter benchmark only runs once in JMH -* [ARROW-6722](https://issues.apache.org/jira/browse/ARROW-6722) - [Java] Provide a uniform way to get vector name -* [ARROW-6729](https://issues.apache.org/jira/browse/ARROW-6729) - [C++] StlStringBuffer constructor is not zero-copy -* [ARROW-6730](https://issues.apache.org/jira/browse/ARROW-6730) - [CI] Use GitHub Actions for "C++ with clang 7" docker image -* [ARROW-6731](https://issues.apache.org/jira/browse/ARROW-6731) - [CI] [Rust] Set up Github Action to run Rust tests -* [ARROW-6732](https://issues.apache.org/jira/browse/ARROW-6732) - [Java] Implement quick sort in a non-recursive way to avoid stack overflow -* [ARROW-6741](https://issues.apache.org/jira/browse/ARROW-6741) - [Release] Update changelog.py to use APACHE\_ prefixed JIRA\_USERNAME and JIRA\_PASSWORD environment variables -* [ARROW-6742](https://issues.apache.org/jira/browse/ARROW-6742) - [C++] Remove usage of boost::filesystem::path from arrow/io/hdfs\_internal.cc -* [ARROW-6743](https://issues.apache.org/jira/browse/ARROW-6743) - [C++] Completely remove usage of boost::filesystem (except in hdfs\_internal) -* [ARROW-6744](https://issues.apache.org/jira/browse/ARROW-6744) - [Rust] Export JsonEqual trait in the array module -* [ARROW-6754](https://issues.apache.org/jira/browse/ARROW-6754) - [C++] Merge arrow/allocator.h and arrow/stl.h, or rename allocator.h -* [ARROW-6758](https://issues.apache.org/jira/browse/ARROW-6758) - [Release] Install ephemeral node/npm/npx in release verification script -* [ARROW-6764](https://issues.apache.org/jira/browse/ARROW-6764) - [C++] Add readahead iterator -* [ARROW-6767](https://issues.apache.org/jira/browse/ARROW-6767) - [JS] lazily bind batches in scan/scanReverse -* [ARROW-6768](https://issues.apache.org/jira/browse/ARROW-6768) - [C++][Dataset] Implement dataset::Scan to Table helper function -* [ARROW-6769](https://issues.apache.org/jira/browse/ARROW-6769) - [C++][Dataset] End to End dataset integration test case -* [ARROW-6770](https://issues.apache.org/jira/browse/ARROW-6770) - [CI][Travis] Download Minio quietly -* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem -* [ARROW-6778](https://issues.apache.org/jira/browse/ARROW-6778) - [C++] Support DurationType in Cast kernel -* [ARROW-6782](https://issues.apache.org/jira/browse/ARROW-6782) - [C++] Build minimal core Arrow libraries without any Boost headers -* [ARROW-6784](https://issues.apache.org/jira/browse/ARROW-6784) - [C++][R] Move filter and take code from Rcpp to C++ library -* [ARROW-6787](https://issues.apache.org/jira/browse/ARROW-6787) - [CI] Decommission "C++ with clang 7 and system packages" Travis CI job -* [ARROW-6788](https://issues.apache.org/jira/browse/ARROW-6788) - [CI] Migrate Travis CI lint job to GitHub Actions -* [ARROW-6789](https://issues.apache.org/jira/browse/ARROW-6789) - [Python] Automatically box bytes/buffer-like 
values yielded from \`FlightServerBase.do\_action\` in Result values -* [ARROW-6790](https://issues.apache.org/jira/browse/ARROW-6790) - [Release] Automatically disable integration test cases in release verification -* [ARROW-6793](https://issues.apache.org/jira/browse/ARROW-6793) - [R] Arrow C++ binary packaging for Linux -* [ARROW-6797](https://issues.apache.org/jira/browse/ARROW-6797) - [Release] Use a separately cloned arrow-site repository in the website post release script -* [ARROW-6802](https://issues.apache.org/jira/browse/ARROW-6802) - [Packaging][deb][RPM] Update qemu-user-static package URL -* [ARROW-6803](https://issues.apache.org/jira/browse/ARROW-6803) - [Rust] [DataFusion] Aggregate queries are slower with new physical query plan -* [ARROW-6804](https://issues.apache.org/jira/browse/ARROW-6804) - [CI] [Rust] Migrate Travis Rust job to Github Actions -* [ARROW-6807](https://issues.apache.org/jira/browse/ARROW-6807) - [Java][FlightRPC] Expose gRPC service -* [ARROW-6810](https://issues.apache.org/jira/browse/ARROW-6810) - [Website] Add docs for R package 0.15 release -* [ARROW-6811](https://issues.apache.org/jira/browse/ARROW-6811) - [R] Assorted post-0.15 release cleanups -* [ARROW-6814](https://issues.apache.org/jira/browse/ARROW-6814) - [C++] Resolve compiler warnings occurred on release build -* [ARROW-6822](https://issues.apache.org/jira/browse/ARROW-6822) - [Website] merge\_pr.py is published -* [ARROW-6824](https://issues.apache.org/jira/browse/ARROW-6824) - [Plasma] Support batched create and seal requests for small objects -* [ARROW-6825](https://issues.apache.org/jira/browse/ARROW-6825) - [C++] Rework CSV reader IO around readahead iterator -* [ARROW-6831](https://issues.apache.org/jira/browse/ARROW-6831) - [R] Update R macOS/Windows builds for change in cmake compression defaults -* [ARROW-6832](https://issues.apache.org/jira/browse/ARROW-6832) - [R] Implement Codec::IsAvailable -* [ARROW-6833](https://issues.apache.org/jira/browse/ARROW-6833) - [R][CI] Add crossbow job for full R autobrew macOS build -* [ARROW-6836](https://issues.apache.org/jira/browse/ARROW-6836) - [Format] add a custom\_metadata:[KeyValue] field to the Footer table in File.fbs -* [ARROW-6843](https://issues.apache.org/jira/browse/ARROW-6843) - [Website] Disable deploy on pull request -* [ARROW-6847](https://issues.apache.org/jira/browse/ARROW-6847) - [C++] Add a range\_expression interface to Iterator<\> -* [ARROW-6850](https://issues.apache.org/jira/browse/ARROW-6850) - [Java] Jdbc converter support Null type -* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64 -* [ARROW-6853](https://issues.apache.org/jira/browse/ARROW-6853) - [Java] Support vector and dictionary encoder use different hasher for calculating hashCode -* [ARROW-6855](https://issues.apache.org/jira/browse/ARROW-6855) - [C++][Python][Flight] Implement Flight middleware -* [ARROW-6862](https://issues.apache.org/jira/browse/ARROW-6862) - [Developer] Check pull request title -* [ARROW-6863](https://issues.apache.org/jira/browse/ARROW-6863) - [Java] Provide parallel searcher -* [ARROW-6865](https://issues.apache.org/jira/browse/ARROW-6865) - [Java] Improve the performance of comparing an ArrowBuf against a byte array -* [ARROW-6866](https://issues.apache.org/jira/browse/ARROW-6866) - [Java] Improve the performance of calculating hash code for struct vector -* [ARROW-6879](https://issues.apache.org/jira/browse/ARROW-6879) - [Rust] Add explicit SIMD for sum kernel -* 
[ARROW-6880](https://issues.apache.org/jira/browse/ARROW-6880) - [Rust] Add explicit SIMD for min/max kernel -* [ARROW-6881](https://issues.apache.org/jira/browse/ARROW-6881) - [Rust] Remove "array\_ops" in favor of the "compute" sub-module -* [ARROW-6884](https://issues.apache.org/jira/browse/ARROW-6884) - [Python][Flight] Make server-side RPC exceptions more friendly? -* [ARROW-6887](https://issues.apache.org/jira/browse/ARROW-6887) - [Java] Create prose documentation for using ValueVectors -* [ARROW-6888](https://issues.apache.org/jira/browse/ARROW-6888) - [Java] Support copy operation for vector value comparators -* [ARROW-6889](https://issues.apache.org/jira/browse/ARROW-6889) - [Java] ComplexCopier enable FixedSizeList type & fix RangeEqualsVisitor StackOverflow -* [ARROW-6891](https://issues.apache.org/jira/browse/ARROW-6891) - [Rust] [Parquet] Add Utf8 support to ArrowReader -* [ARROW-6902](https://issues.apache.org/jira/browse/ARROW-6902) - [C++] Add String\*/Binary\* support for Compare kernels -* [ARROW-6904](https://issues.apache.org/jira/browse/ARROW-6904) - [Python] Implement MapArray and MapType -* [ARROW-6907](https://issues.apache.org/jira/browse/ARROW-6907) - [C++][Plasma] Allow Plasma store to batch notifications to clients -* [ARROW-6911](https://issues.apache.org/jira/browse/ARROW-6911) - [Java] Provide composite comparator -* [ARROW-6912](https://issues.apache.org/jira/browse/ARROW-6912) - [Java] Extract a common base class for avro converter consumers -* [ARROW-6916](https://issues.apache.org/jira/browse/ARROW-6916) - [Developer] Alphabetize task names in nightly Crossbow report -* [ARROW-6918](https://issues.apache.org/jira/browse/ARROW-6918) - [R] Make docker-compose setup faster -* [ARROW-6919](https://issues.apache.org/jira/browse/ARROW-6919) - [Python] Expose more builders in Cython -* [ARROW-6920](https://issues.apache.org/jira/browse/ARROW-6920) - [Python] create manylinux wheels for python3.8 -* [ARROW-6926](https://issues.apache.org/jira/browse/ARROW-6926) - [Python] Support \_\_sizeof\_\_ protocol for Python objects -* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check -* [ARROW-6928](https://issues.apache.org/jira/browse/ARROW-6928) - [Rust] Add FixedSizeList type -* [ARROW-6930](https://issues.apache.org/jira/browse/ARROW-6930) - [Java] Create utility class for populating vector values used for test purpose only -* [ARROW-6932](https://issues.apache.org/jira/browse/ARROW-6932) - [Java] incorrect log on known extension type -* [ARROW-6933](https://issues.apache.org/jira/browse/ARROW-6933) - [Java] Support linear dictionary encoder -* [ARROW-6936](https://issues.apache.org/jira/browse/ARROW-6936) - [Python] Improve error message when object of wrong type is given -* [ARROW-6942](https://issues.apache.org/jira/browse/ARROW-6942) - [Developer] Add support for Parquet in pull request check by GitHub Actions -* [ARROW-6943](https://issues.apache.org/jira/browse/ARROW-6943) - [Website] Translate Apache Arrow Flight introduction to Japanese -* [ARROW-6944](https://issues.apache.org/jira/browse/ARROW-6944) - [Rust] Add StringType -* [ARROW-6949](https://issues.apache.org/jira/browse/ARROW-6949) - [Java] Fix promotable write to handle nullvectors -* [ARROW-6951](https://issues.apache.org/jira/browse/ARROW-6951) - [C++][Dataset] Ensure column projection is passed to ParquetDataFragment -* [ARROW-6952](https://issues.apache.org/jira/browse/ARROW-6952) - [C++][Dataset] Ensure expression filter is passed to ParquetDataFragment -* 
[ARROW-6954](https://issues.apache.org/jira/browse/ARROW-6954) - [Python] [CI] Add Python 3.8 to CI matrix -* [ARROW-6960](https://issues.apache.org/jira/browse/ARROW-6960) - [R] Add support for more compression codecs in Windows build -* [ARROW-6961](https://issues.apache.org/jira/browse/ARROW-6961) - [C++][Gandiva] Add lower\_utf8 function in Gandiva -* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds -* [ARROW-6964](https://issues.apache.org/jira/browse/ARROW-6964) - [C++][Dataset] Expose a nested parallel option for Scanner::ToTable -* [ARROW-6965](https://issues.apache.org/jira/browse/ARROW-6965) - [C++][Dataset] Optionally expose partition keys as materialized columns -* [ARROW-6967](https://issues.apache.org/jira/browse/ARROW-6967) - [C++] Add filter expressions for IN, IS\_VALID -* [ARROW-6969](https://issues.apache.org/jira/browse/ARROW-6969) - [C++][Dataset] ParquetScanTask eagerly load file -* [ARROW-6970](https://issues.apache.org/jira/browse/ARROW-6970) - [Packaging][RPM] Add support for CentOS 8 -* [ARROW-6973](https://issues.apache.org/jira/browse/ARROW-6973) - [C++][ThreadPool] Use perfect forwarding in Submit -* [ARROW-6975](https://issues.apache.org/jira/browse/ARROW-6975) - [C++] Put make\_unique in its own header -* [ARROW-6980](https://issues.apache.org/jira/browse/ARROW-6980) - [R] dplyr backend for RecordBatch/Table -* [ARROW-6984](https://issues.apache.org/jira/browse/ARROW-6984) - [C++] Update LZ4 to 1.9.2 for CVE-2019-17543 -* [ARROW-6986](https://issues.apache.org/jira/browse/ARROW-6986) - [R] Add basic Expression class -* [ARROW-6987](https://issues.apache.org/jira/browse/ARROW-6987) - [CI] Travis OSX failing to install sdk headers -* [ARROW-6991](https://issues.apache.org/jira/browse/ARROW-6991) - [Packaging][deb] Add support for Ubuntu 19.10 -* [ARROW-6994](https://issues.apache.org/jira/browse/ARROW-6994) - [C++] Research jemalloc memory page reclamation configuration on macOS when background\_thread option is unavailable -* [ARROW-6997](https://issues.apache.org/jira/browse/ARROW-6997) - [Packaging] Add support for RHEL -* [ARROW-7000](https://issues.apache.org/jira/browse/ARROW-7000) - [C++][Gandiva] Handle empty inputs in string lower, upper functions -* [ARROW-7003](https://issues.apache.org/jira/browse/ARROW-7003) - [Format] [Rust] Generate flatbuffers files in build script -* [ARROW-7004](https://issues.apache.org/jira/browse/ARROW-7004) - [Plasma] Make it possible to bump up object in LRU cache -* [ARROW-7006](https://issues.apache.org/jira/browse/ARROW-7006) - [Rust] Bump flatbuffers version to avoid vulnerability -* [ARROW-7007](https://issues.apache.org/jira/browse/ARROW-7007) - [C++] Enable mmap option for LocalFs -* [ARROW-7014](https://issues.apache.org/jira/browse/ARROW-7014) - [Developer] Write script to verify Linux wheels given local environment with conda or virtualenv -* [ARROW-7015](https://issues.apache.org/jira/browse/ARROW-7015) - [Developer] Write script to verify macOS wheels given local environment with conda or virtualenv -* [ARROW-7016](https://issues.apache.org/jira/browse/ARROW-7016) - [Developer][Python] Write script to verify Windows wheels given local environment with conda -* [ARROW-7019](https://issues.apache.org/jira/browse/ARROW-7019) - [Java] Improve the performance of loading validity buffers -* [ARROW-7026](https://issues.apache.org/jira/browse/ARROW-7026) - [Java] Remove assertions in 
MessageSerializer/vector/writer/reader -* [ARROW-7031](https://issues.apache.org/jira/browse/ARROW-7031) - [Python] Expose the offsets of a ListArray in python -* [ARROW-7031](https://issues.apache.org/jira/browse/ARROW-7031) - [Python] Expose the offsets of a ListArray in python -* [ARROW-7032](https://issues.apache.org/jira/browse/ARROW-7032) - [Release] Run the python unit tests in the release verification script -* [ARROW-7034](https://issues.apache.org/jira/browse/ARROW-7034) - [CI][Crossbow] Skip known nightly failures -* [ARROW-7035](https://issues.apache.org/jira/browse/ARROW-7035) - [R] Default arguments are unclear in write\_parquet docs -* [ARROW-7036](https://issues.apache.org/jira/browse/ARROW-7036) - [C++] Version up ORC to avoid compile errors -* [ARROW-7037](https://issues.apache.org/jira/browse/ARROW-7037) - [C++ ] Compile error on the combination of protobuf \>= 3.9 and clang -* [ARROW-7039](https://issues.apache.org/jira/browse/ARROW-7039) - [Python] Typecheck expects pandas to be installed -* [ARROW-7047](https://issues.apache.org/jira/browse/ARROW-7047) - [C++][Dataset] Filter expressions should not require exact type match -* [ARROW-7052](https://issues.apache.org/jira/browse/ARROW-7052) - [C++] Datasets example fails to build with ARROW\_SHARED=OFF -* [ARROW-7054](https://issues.apache.org/jira/browse/ARROW-7054) - [Docs] Add option to override displayed docs version with an environment variable -* [ARROW-7057](https://issues.apache.org/jira/browse/ARROW-7057) - [C++] Add API to parse URI query strings -* [ARROW-7058](https://issues.apache.org/jira/browse/ARROW-7058) - [C++] FileSystemDataSourceDiscovery should apply partition schemes relative to the base\_dir of its selector -* [ARROW-7060](https://issues.apache.org/jira/browse/ARROW-7060) - [R] Post-0.15.1 cleanup -* [ARROW-7061](https://issues.apache.org/jira/browse/ARROW-7061) - [C++][Dataset] FileSystemDiscovery with ParquetFileFormat should ignore files that aren't Parquet -* [ARROW-7062](https://issues.apache.org/jira/browse/ARROW-7062) - [C++] Parquet file parse error messages should include the file name -* [ARROW-7064](https://issues.apache.org/jira/browse/ARROW-7064) - [R] Implement null type -* [ARROW-7066](https://issues.apache.org/jira/browse/ARROW-7066) - [Python] support returning ChunkedArray from \_\_arrow\_array\_\_ ? 
-* [ARROW-7067](https://issues.apache.org/jira/browse/ARROW-7067) - [CI] Disable code coverage on Travis-CI -* [ARROW-7069](https://issues.apache.org/jira/browse/ARROW-7069) - [C++][Dataset] Replace ConstantPartitionScheme with PrefixDictionaryPartitionScheme -* [ARROW-7070](https://issues.apache.org/jira/browse/ARROW-7070) - [Packaging][deb] Update package names for 1.0.0 -* [ARROW-7072](https://issues.apache.org/jira/browse/ARROW-7072) - [Java] Support concating validity bits efficiently -* [ARROW-7082](https://issues.apache.org/jira/browse/ARROW-7082) - [Packaging][deb] Add apache-arrow-archive-keyring -* [ARROW-7086](https://issues.apache.org/jira/browse/ARROW-7086) - [C++] Provide a wrapper for invoking factories to produce a Result -* [ARROW-7092](https://issues.apache.org/jira/browse/ARROW-7092) - [R] Add vignette for dplyr and datasets -* [ARROW-7093](https://issues.apache.org/jira/browse/ARROW-7093) - [R] Support creating ScalarExpressions for more data types -* [ARROW-7094](https://issues.apache.org/jira/browse/ARROW-7094) - [C++] FileSystemDataSource should use an owning pointer for fs::Filesystem -* [ARROW-7095](https://issues.apache.org/jira/browse/ARROW-7095) - [R] Better handling of unsupported filter and mutate expressions in dplyr methods -* [ARROW-7096](https://issues.apache.org/jira/browse/ARROW-7096) - [C++] Add options structs for concatenation-with-promotion and schema unification -* [ARROW-7098](https://issues.apache.org/jira/browse/ARROW-7098) - [Java] Improve the performance of comparing two memory blocks -* [ARROW-7099](https://issues.apache.org/jira/browse/ARROW-7099) - [C++] Disambiguate function calls in csv parser test -* [ARROW-7101](https://issues.apache.org/jira/browse/ARROW-7101) - [CI] Refactor docker-compose setup and use it with GitHub Actions -* [ARROW-7103](https://issues.apache.org/jira/browse/ARROW-7103) - [R] Various minor cleanups -* [ARROW-7107](https://issues.apache.org/jira/browse/ARROW-7107) - [C++][MinGW] Enable Flight on AppVeyor -* [ARROW-7110](https://issues.apache.org/jira/browse/ARROW-7110) - [GLib] Add filter support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch -* [ARROW-7111](https://issues.apache.org/jira/browse/ARROW-7111) - [GLib] Add take support for GArrowTable, GArrowChunkedArray, and GArrowRecordBatch -* [ARROW-7113](https://issues.apache.org/jira/browse/ARROW-7113) - [Rust] Buffer should accept memory owned by others -* [ARROW-7116](https://issues.apache.org/jira/browse/ARROW-7116) - [CI] Use the docker repository provided by apache organisation -* [ARROW-7120](https://issues.apache.org/jira/browse/ARROW-7120) - [C++][CI] Add .ccache to the docker-compose volume mounts -* [ARROW-7146](https://issues.apache.org/jira/browse/ARROW-7146) - [R][CI] Various fixes and speedups for the R docker-compose setup -* [ARROW-7147](https://issues.apache.org/jira/browse/ARROW-7147) - [C++][Dataset] Refactor dataset's API to use Result -* [ARROW-7148](https://issues.apache.org/jira/browse/ARROW-7148) - [C++][Dataset] API cleanup -* [ARROW-7149](https://issues.apache.org/jira/browse/ARROW-7149) - [C++] Remove experimental status on filesystem APIs -* [ARROW-7155](https://issues.apache.org/jira/browse/ARROW-7155) - [Java][CI] add maven wrapper to make setup process simple -* [ARROW-7159](https://issues.apache.org/jira/browse/ARROW-7159) - [CI] Run HDFS tests as cron task -* [ARROW-7160](https://issues.apache.org/jira/browse/ARROW-7160) - [C++] Update string\_view backport -* 
[ARROW-7161](https://issues.apache.org/jira/browse/ARROW-7161) - [C++] Migrate filesystem layer from Status to Result -* [ARROW-7162](https://issues.apache.org/jira/browse/ARROW-7162) - [C++] Cleanup warnings in cmake\_modules/SetupCxxFlags.cmake -* [ARROW-7166](https://issues.apache.org/jira/browse/ARROW-7166) - [Java] Remove redundant code for Jdbc adapters -* [ARROW-7169](https://issues.apache.org/jira/browse/ARROW-7169) - [C++] Vendor uriparser library -* [ARROW-7171](https://issues.apache.org/jira/browse/ARROW-7171) - [Ruby] Pass Array for Arrow::Table\#filter -* [ARROW-7172](https://issues.apache.org/jira/browse/ARROW-7172) - [C++][Dataset] Improve format of Expression::ToString -* [ARROW-7176](https://issues.apache.org/jira/browse/ARROW-7176) - [C++] Fix arrow::ipc compiler warning -* [ARROW-7178](https://issues.apache.org/jira/browse/ARROW-7178) - [C++] Vendor forward compatible std::optional -* [ARROW-7185](https://issues.apache.org/jira/browse/ARROW-7185) - [R][Dataset] Add bindings for IN, IS\_VALID expressions -* [ARROW-7186](https://issues.apache.org/jira/browse/ARROW-7186) - [R] Add inline comments to document the dplyr code -* [ARROW-7192](https://issues.apache.org/jira/browse/ARROW-7192) - [Rust] Implement Flight crate -* [ARROW-7193](https://issues.apache.org/jira/browse/ARROW-7193) - [Rust] Create Arrow stream reader -* [ARROW-7195](https://issues.apache.org/jira/browse/ARROW-7195) - [Ruby] Improve \#filter, \#take, and \#is\_in -* [ARROW-7196](https://issues.apache.org/jira/browse/ARROW-7196) - [Ruby] Remove needless BinaryArrayBuilder\#append\_values -* [ARROW-7197](https://issues.apache.org/jira/browse/ARROW-7197) - [Ruby] Suppress keyword argument related warnings with Ruby 2.7 -* [ARROW-7204](https://issues.apache.org/jira/browse/ARROW-7204) - [C++][Dataset] In expression should not require exact type match -* [ARROW-7206](https://issues.apache.org/jira/browse/ARROW-7206) - [Java] Avoid string concatenation when calling Preconditions\#checkArgument -* [ARROW-7207](https://issues.apache.org/jira/browse/ARROW-7207) - [Rust] Update Generated Flatbuffer Files -* [ARROW-7210](https://issues.apache.org/jira/browse/ARROW-7210) - [C++] Scalar cast should support time-based types -* [ARROW-7211](https://issues.apache.org/jira/browse/ARROW-7211) - [Rust] [Parquet] Support writing to byte buffers -* [ARROW-7216](https://issues.apache.org/jira/browse/ARROW-7216) - [Java] Improve the performance of setting/clearing individual bits -* [ARROW-7219](https://issues.apache.org/jira/browse/ARROW-7219) - [CI][Python] Install pickle5 in the conda-python docker image for python version 3.6 -* [ARROW-7227](https://issues.apache.org/jira/browse/ARROW-7227) - [Python] Provide wrappers for ConcatenateWithPromotion() -* [ARROW-7228](https://issues.apache.org/jira/browse/ARROW-7228) - [Python] Expose RecordBatch.FromStructArray in Python. 
-* [ARROW-7235](https://issues.apache.org/jira/browse/ARROW-7235) - [C++] Add Result to APIs to arrow/io -* [ARROW-7236](https://issues.apache.org/jira/browse/ARROW-7236) - [C++] Add Result to APIs to arrow/csv -* [ARROW-7240](https://issues.apache.org/jira/browse/ARROW-7240) - [C++] Add Result to APIs to arrow/util -* [ARROW-7246](https://issues.apache.org/jira/browse/ARROW-7246) - [CI][Python] wheel can't be built by SSL\_ST\_INIT error -* [ARROW-7247](https://issues.apache.org/jira/browse/ARROW-7247) - [CI][Python] wheel can't be built by wget and OpenSSL error -* [ARROW-7248](https://issues.apache.org/jira/browse/ARROW-7248) - [Rust] Automatically Regenerate IPC messages from Flatbuffers -* [ARROW-7255](https://issues.apache.org/jira/browse/ARROW-7255) - [CI] Run source release test on pull request -* [ARROW-7257](https://issues.apache.org/jira/browse/ARROW-7257) - [CI] Homebrew formula is failed by openssl formula name update -* [ARROW-7258](https://issues.apache.org/jira/browse/ARROW-7258) - [CI] Fuzzit job is failed by nonexistent directory -* [ARROW-7259](https://issues.apache.org/jira/browse/ARROW-7259) - [Java] Support subfield encoder use different hasher -* [ARROW-7260](https://issues.apache.org/jira/browse/ARROW-7260) - [CI] Ubuntu 14.04 test is failed by user defined literal -* [ARROW-7261](https://issues.apache.org/jira/browse/ARROW-7261) - [Python] Python support for fixed size list type -* [ARROW-7262](https://issues.apache.org/jira/browse/ARROW-7262) - [C++][Gandiva] Implement replace function in Gandiva -* [ARROW-7263](https://issues.apache.org/jira/browse/ARROW-7263) - [C++][Gandiva] Implement locate and position functions -* [ARROW-7268](https://issues.apache.org/jira/browse/ARROW-7268) - [Rust] Propagate \`custom\_metadata\` field from IPC message -* [ARROW-7269](https://issues.apache.org/jira/browse/ARROW-7269) - [C++] Fix arrow::parquet compiler warning -* [ARROW-7270](https://issues.apache.org/jira/browse/ARROW-7270) - [Go] preserve CSV reading behaviour, improve memory usage -* [ARROW-7274](https://issues.apache.org/jira/browse/ARROW-7274) - [C++] Add Result APIs to Decimal class -* [ARROW-7275](https://issues.apache.org/jira/browse/ARROW-7275) - [Ruby] Add support for Arrow::ListDataType.new(data\_type) -* [ARROW-7276](https://issues.apache.org/jira/browse/ARROW-7276) - [Ruby] Add support for building Arrow::ListArray from [[...]] -* [ARROW-7277](https://issues.apache.org/jira/browse/ARROW-7277) - [Document] Add discussion about vector lifecycle -* [ARROW-7279](https://issues.apache.org/jira/browse/ARROW-7279) - [C++] Rename UnionArray::type\_ids to UnionArray::type\_codes -* [ARROW-7284](https://issues.apache.org/jira/browse/ARROW-7284) - [Java] ensure java implementation meets clarified dictionary spec -* [ARROW-7289](https://issues.apache.org/jira/browse/ARROW-7289) - [C\#] ListType constructor argument is redundant -* [ARROW-7290](https://issues.apache.org/jira/browse/ARROW-7290) - [C\#] Implement ListArray Builder -* [ARROW-7292](https://issues.apache.org/jira/browse/ARROW-7292) - [C++] [CI] [Dev] Add ASAN / UBSAN CI run -* [ARROW-7293](https://issues.apache.org/jira/browse/ARROW-7293) - [Dev] [C++] Persist ccache in docker-compose build volumes -* [ARROW-7296](https://issues.apache.org/jira/browse/ARROW-7296) - [Python] Add ORC api documentation -* [ARROW-7299](https://issues.apache.org/jira/browse/ARROW-7299) - [GLib] Use Result instead of Status -* [ARROW-7303](https://issues.apache.org/jira/browse/ARROW-7303) - [C++] Refactor benchmarks to use new Result 
APIs -* [ARROW-7306](https://issues.apache.org/jira/browse/ARROW-7306) - [C++] Add Result-returning version of FileSystemFromUri -* [ARROW-7307](https://issues.apache.org/jira/browse/ARROW-7307) - [CI][GLib] Documentation isn't generated -* [ARROW-7309](https://issues.apache.org/jira/browse/ARROW-7309) - [Python] Support HDFS federation viewfs:// -* [ARROW-7310](https://issues.apache.org/jira/browse/ARROW-7310) - [Python] Expose HDFS implementation for pyarrow.fs -* [ARROW-7311](https://issues.apache.org/jira/browse/ARROW-7311) - [Python] Return filesystem and path from URI -* [ARROW-7312](https://issues.apache.org/jira/browse/ARROW-7312) - [Rust] ArrowError should implement std::error:Error -* [ARROW-7317](https://issues.apache.org/jira/browse/ARROW-7317) - [C++] Migrate Iterator API to Result -* [ARROW-7319](https://issues.apache.org/jira/browse/ARROW-7319) - [C++] Refactor Iterator to yield Result -* [ARROW-7321](https://issues.apache.org/jira/browse/ARROW-7321) - [CI][GLib] Failed to build with GLib warning -* [ARROW-7322](https://issues.apache.org/jira/browse/ARROW-7322) - [CI][Python] Fall back to arrowdev dockerhub organization for manylinux images -* [ARROW-7323](https://issues.apache.org/jira/browse/ARROW-7323) - [CI][Rust] Nightly CI is failed by different toolchain -* [ARROW-7324](https://issues.apache.org/jira/browse/ARROW-7324) - [Rust] Add Timezone to Timestamp -* [ARROW-7325](https://issues.apache.org/jira/browse/ARROW-7325) - [Rust] [Parquet] Update to parquet-format 2.6 and thrift 0.12 -* [ARROW-7329](https://issues.apache.org/jira/browse/ARROW-7329) - [Java] AllocationManager: Allow managing different types of memory other than those are allocated using Netty -* [ARROW-7333](https://issues.apache.org/jira/browse/ARROW-7333) - [CI][Rust] Remove duplicated nightly job -* [ARROW-7334](https://issues.apache.org/jira/browse/ARROW-7334) - [CI][Python] macOS uses Python 2 -* [ARROW-7339](https://issues.apache.org/jira/browse/ARROW-7339) - [CMake] Thrift version not respected in CMake configuration version.txt -* [ARROW-7340](https://issues.apache.org/jira/browse/ARROW-7340) - [CI] Prune defunct appveyor build setup -* [ARROW-7344](https://issues.apache.org/jira/browse/ARROW-7344) - [Packaging][Python] Build manylinux2014 wheels -* [ARROW-7346](https://issues.apache.org/jira/browse/ARROW-7346) - [CI] Explicit usage of ccache across the builds -* [ARROW-7347](https://issues.apache.org/jira/browse/ARROW-7347) - [C++] Update bundled Boost to 1.71.0 -* [ARROW-7348](https://issues.apache.org/jira/browse/ARROW-7348) - [Rust] Add api to return references of buffer of null bitmap. 
-* [ARROW-7351](https://issues.apache.org/jira/browse/ARROW-7351) - [Developer] Only suggest cpp-\* fix versions when merging Parquet patches -* [ARROW-7357](https://issues.apache.org/jira/browse/ARROW-7357) - [Go] migrate from pkg/errors to x/xerrors -* [ARROW-7366](https://issues.apache.org/jira/browse/ARROW-7366) - [C++][Dataset] Use PartitionSchemeDiscovery in DataSourceDiscovery -* [ARROW-7367](https://issues.apache.org/jira/browse/ARROW-7367) - [Python] Use np.full instead of np.array.repeat in ParquetDatasetPiece -* [ARROW-7368](https://issues.apache.org/jira/browse/ARROW-7368) - [Ruby] Use :arrow\_file and :arrow\_streaming for format name -* [ARROW-7369](https://issues.apache.org/jira/browse/ARROW-7369) - [GLib] Add garrow\_table\_combine\_chunks -* [ARROW-7370](https://issues.apache.org/jira/browse/ARROW-7370) - [C++] Old Protobuf with AUTO detection is failed -* [ARROW-7377](https://issues.apache.org/jira/browse/ARROW-7377) - [C++][Dataset] Simplify parquet column projection -* [ARROW-7378](https://issues.apache.org/jira/browse/ARROW-7378) - [C++][Gandiva] Loop vectorization broken in IR optimization -* [ARROW-7379](https://issues.apache.org/jira/browse/ARROW-7379) - [C++] Introduce SchemaBuilder companion class and Field::IsCompatibleWith -* [ARROW-7380](https://issues.apache.org/jira/browse/ARROW-7380) - [C++][Dataset] Implement DatasetFactory -* [ARROW-7382](https://issues.apache.org/jira/browse/ARROW-7382) - [C++][Dataset] Refactor FsDsDiscovery constructors -* [ARROW-7387](https://issues.apache.org/jira/browse/ARROW-7387) - [C\#] Support ListType Serialization -* [ARROW-7392](https://issues.apache.org/jira/browse/ARROW-7392) - [Packaging] Add conda packaging tasks for python 3.8 -* [ARROW-7398](https://issues.apache.org/jira/browse/ARROW-7398) - [Packaging][Python] Conda builds are failing on macOS -* [ARROW-7399](https://issues.apache.org/jira/browse/ARROW-7399) - [C++][Gandiva] Gandiva does not pick runtime cpu features -* [ARROW-7402](https://issues.apache.org/jira/browse/ARROW-7402) - [C++] Add more information on CUDA error -* [ARROW-7403](https://issues.apache.org/jira/browse/ARROW-7403) - [C++][JSON] Enable Rapidjson on Arm64 Neon -* [ARROW-7410](https://issues.apache.org/jira/browse/ARROW-7410) - [Python] [Doc] Document filesystem APIs -* [ARROW-7411](https://issues.apache.org/jira/browse/ARROW-7411) - [C++][Flight] Incorrect Arrow Flight benchmark output -* [ARROW-7413](https://issues.apache.org/jira/browse/ARROW-7413) - [Python][Dataset] Add tests for PartitionSchemeDiscovery -* [ARROW-7414](https://issues.apache.org/jira/browse/ARROW-7414) - [R][Dataset] Implement PartitionSchemeDiscovery -* [ARROW-7415](https://issues.apache.org/jira/browse/ARROW-7415) - [C++][Dataset] Implement IpcFormat for sources composed of ipc files -* [ARROW-7416](https://issues.apache.org/jira/browse/ARROW-7416) - [R][Nightly] Fix macos-r-autobrew build on R 3.6.2 -* [ARROW-7417](https://issues.apache.org/jira/browse/ARROW-7417) - [C++] Add a docker-compose entry for CUDA 10.1 -* [ARROW-7418](https://issues.apache.org/jira/browse/ARROW-7418) - [C++] Can't build with g++ 5.4.0 on Ubuntu 16.04 -* [ARROW-7420](https://issues.apache.org/jira/browse/ARROW-7420) - [C++] Migrate tensor related APIs to Result-returning version -* [ARROW-7429](https://issues.apache.org/jira/browse/ARROW-7429) - [Java] Enhance code style checking for Java code (remove consecutive spaces) -* [ARROW-7430](https://issues.apache.org/jira/browse/ARROW-7430) - [Python] Add more docstrings to dataset bindings -* 
[ARROW-7431](https://issues.apache.org/jira/browse/ARROW-7431) - [Python] Add dataset API to reference docs -* [ARROW-7432](https://issues.apache.org/jira/browse/ARROW-7432) - [Python] Add higher-level datasets functions -* [ARROW-7439](https://issues.apache.org/jira/browse/ARROW-7439) - [C++][Dataset] Remove dataset pointer aliases -* [ARROW-7449](https://issues.apache.org/jira/browse/ARROW-7449) - [GLib] Make GObject Introspection optional -* [ARROW-7452](https://issues.apache.org/jira/browse/ARROW-7452) - [GLib] Make GArrowTimeDataType abstract -* [ARROW-7453](https://issues.apache.org/jira/browse/ARROW-7453) - [Ruby] Add support for Arrow::NullArray\#[] -* [ARROW-7454](https://issues.apache.org/jira/browse/ARROW-7454) - [Ruby] Add support for saving/loading TSV -* [ARROW-7455](https://issues.apache.org/jira/browse/ARROW-7455) - [Ruby] Use Arrow::DataType.resolve for all GArrowDataType input -* [ARROW-7456](https://issues.apache.org/jira/browse/ARROW-7456) - [C++] Add support for YYYY-MM-DDThh and YYYY-MM-DDThh:mm timestamp formats -* [ARROW-7457](https://issues.apache.org/jira/browse/ARROW-7457) - [Doc] Fix typos -* [ARROW-7459](https://issues.apache.org/jira/browse/ARROW-7459) - [Python] Documentation lint is failed -* [ARROW-7460](https://issues.apache.org/jira/browse/ARROW-7460) - [Rust] Improve some kernels with autovectorisation -* [ARROW-7461](https://issues.apache.org/jira/browse/ARROW-7461) - [Java] Fix typos and spelling -* [ARROW-7463](https://issues.apache.org/jira/browse/ARROW-7463) - [Doc] Fix a broken link and typos -* [ARROW-7464](https://issues.apache.org/jira/browse/ARROW-7464) - [C++] Refine CpuInfo singleton with std::call\_once -* [ARROW-7465](https://issues.apache.org/jira/browse/ARROW-7465) - [C++] Add Arrow memory benchmark for Arm64 -* [ARROW-7468](https://issues.apache.org/jira/browse/ARROW-7468) - [Python] Fix typos -* [ARROW-7469](https://issues.apache.org/jira/browse/ARROW-7469) - [C++] Improve division related bit operations -* [ARROW-7470](https://issues.apache.org/jira/browse/ARROW-7470) - [JS] Fix typos -* [ARROW-7474](https://issues.apache.org/jira/browse/ARROW-7474) - [Ruby] Save CSV files faster -* [ARROW-7475](https://issues.apache.org/jira/browse/ARROW-7475) - [Rust] Create Arrow Stream writer -* [ARROW-7477](https://issues.apache.org/jira/browse/ARROW-7477) - [FlightRPC][Java] Flight gRPC service is missing reflection info -* [ARROW-7479](https://issues.apache.org/jira/browse/ARROW-7479) - [Rust][Ruby][R] Fix typos -* [ARROW-7481](https://issues.apache.org/jira/browse/ARROW-7481) - [C\#] Fix typos -* [ARROW-7482](https://issues.apache.org/jira/browse/ARROW-7482) - [C++] Fix typos -* [ARROW-7484](https://issues.apache.org/jira/browse/ARROW-7484) - [C++][Gandiva] Fix typos -* [ARROW-7485](https://issues.apache.org/jira/browse/ARROW-7485) - [C++][Plasma] Fix typos -* [ARROW-7487](https://issues.apache.org/jira/browse/ARROW-7487) - [Developer] Fix typos -* [ARROW-7488](https://issues.apache.org/jira/browse/ARROW-7488) - [GLib] Fix typos and broken links -* [ARROW-7489](https://issues.apache.org/jira/browse/ARROW-7489) - [CI] Fix typos -* [ARROW-7490](https://issues.apache.org/jira/browse/ARROW-7490) - [Java] Avro converter should convert attributes and props to FieldType metadata -* [ARROW-7493](https://issues.apache.org/jira/browse/ARROW-7493) - [Python] Expose sum kernel in pyarrow.compute and support ChunkedArray inputs -* [ARROW-7498](https://issues.apache.org/jira/browse/ARROW-7498) - [C++][Dataset] Rename DataFragment/DataSource/PartitionScheme 
-* [ARROW-7502](https://issues.apache.org/jira/browse/ARROW-7502) - [Integration] Remove Spark Integration patch that not needed anymore -* [ARROW-7513](https://issues.apache.org/jira/browse/ARROW-7513) - [JS] Arrow Tutorial: Common data types -* [ARROW-7514](https://issues.apache.org/jira/browse/ARROW-7514) - [C\#] Make GetValueOffset Obsolete -* [ARROW-7519](https://issues.apache.org/jira/browse/ARROW-7519) - [Python] Build wheels, conda packages with dataset support -* [ARROW-7521](https://issues.apache.org/jira/browse/ARROW-7521) - [Rust] Remove tuple on FixedSizeList datatype -* [ARROW-7523](https://issues.apache.org/jira/browse/ARROW-7523) - [Developer] Relax clang-tidy check -* [ARROW-7526](https://issues.apache.org/jira/browse/ARROW-7526) - [C++][Compute]: Optimize small integer sorting -* [ARROW-7532](https://issues.apache.org/jira/browse/ARROW-7532) - [CI] Unskip brew test after Homebrew fixes it upstream -* [ARROW-7537](https://issues.apache.org/jira/browse/ARROW-7537) - [CI][R] Nightly macOS autobrew job should be more verbose if it fails -* [ARROW-7538](https://issues.apache.org/jira/browse/ARROW-7538) - Clarify actual and desired size in AllocationManager -* [ARROW-7540](https://issues.apache.org/jira/browse/ARROW-7540) - [C++] License files aren't installed -* [ARROW-7541](https://issues.apache.org/jira/browse/ARROW-7541) - [GLib] Install license files -* [ARROW-7542](https://issues.apache.org/jira/browse/ARROW-7542) - [CI][C++] nproc isn't available on macOS -* [ARROW-7549](https://issues.apache.org/jira/browse/ARROW-7549) - [Java] Reorganize Flight modules to keep top level clean/organized -* [ARROW-7550](https://issues.apache.org/jira/browse/ARROW-7550) - [R][CI] Run donttest examples in CI -* [ARROW-7557](https://issues.apache.org/jira/browse/ARROW-7557) - [C++][Compute] Validate sorting stability in random test -* [ARROW-7558](https://issues.apache.org/jira/browse/ARROW-7558) - [Packaging][deb][RPM] Use the host owner and group for artifacts -* [ARROW-7560](https://issues.apache.org/jira/browse/ARROW-7560) - [Rust] Reduce Rc/Refcell usage -* [ARROW-7565](https://issues.apache.org/jira/browse/ARROW-7565) - [Website] Add support for download URL redirect -* [ARROW-7566](https://issues.apache.org/jira/browse/ARROW-7566) - [CI] Use more recent Miniconda on AppVeyor -* [ARROW-7567](https://issues.apache.org/jira/browse/ARROW-7567) - [Java] Bump Checkstyle from 6.19 to 8.18 -* [ARROW-7567](https://issues.apache.org/jira/browse/ARROW-7567) - [Java] Bump Checkstyle from 6.19 to 8.18 -* [ARROW-7568](https://issues.apache.org/jira/browse/ARROW-7568) - [Java] Bump Apache Avro from 1.9.0 to 1.9.1 -* [ARROW-7569](https://issues.apache.org/jira/browse/ARROW-7569) - [Python] Add API to map Arrow types to pandas ExtensionDtypes for to\_pandas conversions -* [ARROW-7570](https://issues.apache.org/jira/browse/ARROW-7570) - [Java] Fix high severity issues reported by LGTM -* [ARROW-7571](https://issues.apache.org/jira/browse/ARROW-7571) - [Java] Correct minimal java version on README -* [ARROW-7572](https://issues.apache.org/jira/browse/ARROW-7572) - [Java] Enfore Maven 3.3+ as mentioned in README -* [ARROW-7573](https://issues.apache.org/jira/browse/ARROW-7573) - [Rust] Reduce boxing and cleanup -* [ARROW-7575](https://issues.apache.org/jira/browse/ARROW-7575) - [R] Linux binary packaging followup -* [ARROW-7576](https://issues.apache.org/jira/browse/ARROW-7576) - [C++][Dev] Improve fuzzing setup -* [ARROW-7577](https://issues.apache.org/jira/browse/ARROW-7577) - [C++][CI] Check fuzzer 
setup in CI -* [ARROW-7578](https://issues.apache.org/jira/browse/ARROW-7578) - [R] Add support for datasets with IPC files and with multiple sources -* [ARROW-7580](https://issues.apache.org/jira/browse/ARROW-7580) - [Website] 0.16 release post -* [ARROW-7581](https://issues.apache.org/jira/browse/ARROW-7581) - [R] Documentation/polishing for 0.16 release -* [ARROW-7590](https://issues.apache.org/jira/browse/ARROW-7590) - [C++] Managed files in thirdparty/ are ignored -* [ARROW-7597](https://issues.apache.org/jira/browse/ARROW-7597) - [C++] Improvements to CMake configuration console summary -* [ARROW-7600](https://issues.apache.org/jira/browse/ARROW-7600) - [C++][Parquet] Add a basic disabled unit test to exercise nesting functionality -* [ARROW-7601](https://issues.apache.org/jira/browse/ARROW-7601) - [Doc] [C++] Update fuzzing documentation -* [ARROW-7602](https://issues.apache.org/jira/browse/ARROW-7602) - [Archery] Add more build options -* [ARROW-7613](https://issues.apache.org/jira/browse/ARROW-7613) - [Rust] Remove redundant \`::\` prefixes -* [ARROW-7622](https://issues.apache.org/jira/browse/ARROW-7622) - [Format] Mark Tensor and SparseTensor fields required -* [ARROW-7623](https://issues.apache.org/jira/browse/ARROW-7623) - [C++] Update generated flatbuffers files -* [ARROW-7626](https://issues.apache.org/jira/browse/ARROW-7626) - [Parquet][GLib] Add support for version macros -* [ARROW-7627](https://issues.apache.org/jira/browse/ARROW-7627) - [C++][Gandiva] Optimize string truncate function -* [ARROW-7629](https://issues.apache.org/jira/browse/ARROW-7629) - [C++][CI] Add fuzz regression files to arrow-testing -* [ARROW-7630](https://issues.apache.org/jira/browse/ARROW-7630) - [C++][CI] Check fuzz crash regressions in CI -* [ARROW-7632](https://issues.apache.org/jira/browse/ARROW-7632) - [C++] [CI] Improve fuzzing seed corpus -* [ARROW-7635](https://issues.apache.org/jira/browse/ARROW-7635) - [C++] Add pkg-config support for each component -* [ARROW-7636](https://issues.apache.org/jira/browse/ARROW-7636) - [Python] Clean-up the pyarrow.dataset.partitioning() API -* [ARROW-7644](https://issues.apache.org/jira/browse/ARROW-7644) - Add vcpkg installation instructions -* [ARROW-7645](https://issues.apache.org/jira/browse/ARROW-7645) - [Packaging][deb][RPM] arm64 build by crossbow is broken -* [ARROW-7648](https://issues.apache.org/jira/browse/ARROW-7648) - [C++] Sanitize local paths on Windows -* [ARROW-7658](https://issues.apache.org/jira/browse/ARROW-7658) - [R] Support dplyr filtering on date/time -* [ARROW-7659](https://issues.apache.org/jira/browse/ARROW-7659) - [Rust] Reduce Rc usage -* [ARROW-7660](https://issues.apache.org/jira/browse/ARROW-7660) - [C++][Gandiva] Optimise castVarchar(string, int) function for single byte characters -* [ARROW-7665](https://issues.apache.org/jira/browse/ARROW-7665) - [R] linuxLibs.R should build in parallel -* [ARROW-7666](https://issues.apache.org/jira/browse/ARROW-7666) - [Packaging][deb] Always use Ninja to reduce build time -* [ARROW-7667](https://issues.apache.org/jira/browse/ARROW-7667) - [Packaging][deb] ubuntu-eoan is missing in nightly jobs -* [ARROW-7668](https://issues.apache.org/jira/browse/ARROW-7668) - [Packaging][RPM] Use Ninja if possible to reduce build time -* [ARROW-7670](https://issues.apache.org/jira/browse/ARROW-7670) - [Python][Dataset] Better ergonomics for the filter expressions -* [ARROW-7671](https://issues.apache.org/jira/browse/ARROW-7671) - [Python][Dataset] Add bindings for the DatasetFactory -* 
[ARROW-7674](https://issues.apache.org/jira/browse/ARROW-7674) - Add helpful message for captcha challenge in merge\_arrow\_pr.py -* [ARROW-7682](https://issues.apache.org/jira/browse/ARROW-7682) - [Packaging][APT][Yum] Add support for arm64 APT/Yum repositories -* [ARROW-7683](https://issues.apache.org/jira/browse/ARROW-7683) - [Packaging] Set 0.16.0 as the next version -* [ARROW-7686](https://issues.apache.org/jira/browse/ARROW-7686) - [Packaging][deb][RPM] Include more arrow-\*.pc -* [ARROW-7687](https://issues.apache.org/jira/browse/ARROW-7687) - [C++] C++ developer document links in README are broken -* [ARROW-7692](https://issues.apache.org/jira/browse/ARROW-7692) - [Rust] Several pattern matches are hard to read -* [ARROW-7694](https://issues.apache.org/jira/browse/ARROW-7694) - [Packaging][deb][RPM] Can't build repository packages for RC -* [ARROW-7695](https://issues.apache.org/jira/browse/ARROW-7695) - [Release] Update java versions to 0.16-SNAPSHOT -* [ARROW-7696](https://issues.apache.org/jira/browse/ARROW-7696) - [Release] Unit test on release branch is failing -* [ARROW-7697](https://issues.apache.org/jira/browse/ARROW-7697) - [Release] Add a test for updating Linux packages by 00-prepare.sh -* [ARROW-7710](https://issues.apache.org/jira/browse/ARROW-7710) - [Release][C\#] .NET download URL is redirected -* [ARROW-7711](https://issues.apache.org/jira/browse/ARROW-7711) - [C\#] Date32 test depends on system timezone -* [ARROW-7715](https://issues.apache.org/jira/browse/ARROW-7715) - [Release][APT] Ignore some arm64 verifications -* [ARROW-7716](https://issues.apache.org/jira/browse/ARROW-7716) - [Packaging][APT] Use the "main" component for Ubuntu 19.10 -* [ARROW-7719](https://issues.apache.org/jira/browse/ARROW-7719) - [Python][Dataset] Table equality check occasionally fails -* [ARROW-7724](https://issues.apache.org/jira/browse/ARROW-7724) - [Release][Yum] Ignore some arm64 verifications -* [ARROW-7743](https://issues.apache.org/jira/browse/ARROW-7743) - [Rust] [Parquet] Support reading timestamp micros -* [ARROW-7768](https://issues.apache.org/jira/browse/ARROW-7768) - [Rust] Implement Length and TryClone traits for Cursor\> in reader.rs -* [ARROW-8015](https://issues.apache.org/jira/browse/ARROW-8015) - [Python] Build 0.16.0 wheel install for Windows + Python 3.5 and publish to PyPI -* [PARQUET-517](https://issues.apache.org/jira/browse/PARQUET-517) - [C++] Use arrow::MemoryPool for all heap allocations -* [PARQUET-1300](https://issues.apache.org/jira/browse/PARQUET-1300) - [C++] Parquet modular encryption -* [PARQUET-1664](https://issues.apache.org/jira/browse/PARQUET-1664) - [C++] Provide API to return metadata string from FileMetadata. 
-* [PARQUET-1678](https://issues.apache.org/jira/browse/PARQUET-1678) - [C++] Provide classes for reading/writing using input/output operators -* [PARQUET-1688](https://issues.apache.org/jira/browse/PARQUET-1688) - [C++] StreamWriter/StreamReader can't be built with g++ 4.8.5 on CentOS 7 -* [PARQUET-1689](https://issues.apache.org/jira/browse/PARQUET-1689) - [C++] Stream API: Allow for columns/rows to be skipped when reading -* [PARQUET-1701](https://issues.apache.org/jira/browse/PARQUET-1701) - [C++] Stream API: Add support for optional fields -* [PARQUET-1704](https://issues.apache.org/jira/browse/PARQUET-1704) - [C++] Add re-usable encryption buffer to SerializedPageWriter -* [PARQUET-1705](https://issues.apache.org/jira/browse/PARQUET-1705) - [C++] Disable shrink-to-fit on the re-usable decryption buffer -* [PARQUET-1712](https://issues.apache.org/jira/browse/PARQUET-1712) - [C++] Stop using deprecated APIs in examples -* [PARQUET-1721](https://issues.apache.org/jira/browse/PARQUET-1721) - [C++] Arrow dependency is missing in parquet.pc -* [PARQUET-1734](https://issues.apache.org/jira/browse/PARQUET-1734) - [C++] Fix typos -* [PARQUET-1769](https://issues.apache.org/jira/browse/PARQUET-1769) - [C++] Update to parquet-format 2.8.0 - - - -# Apache Arrow 0.15.1 (2019-11-01) - -## Bug Fixes - -* [ARROW-6464](https://issues.apache.org/jira/browse/ARROW-6464) - [Java] Refactor FixedSizeListVector\#splitAndTransfer with slice API -* [ARROW-6728](https://issues.apache.org/jira/browse/ARROW-6728) - [C\#] Support reading and writing Date32 and Date64 arrays -* [ARROW-6740](https://issues.apache.org/jira/browse/ARROW-6740) - [Python] Unable to delete closed MemoryMappedFile on Windows -* [ARROW-6762](https://issues.apache.org/jira/browse/ARROW-6762) - [C++] JSON reader segfaults on newline -* [ARROW-6795](https://issues.apache.org/jira/browse/ARROW-6795) - [C\#] Reading large Arrow files in C\# results in an exception -* [ARROW-6806](https://issues.apache.org/jira/browse/ARROW-6806) - [C++] Segfault deserializing ListArray containing null/empty list -* [ARROW-6809](https://issues.apache.org/jira/browse/ARROW-6809) - [RUBY] Gem does not install on macOS due to glib2 3.3.7 compilation failure -* [ARROW-6813](https://issues.apache.org/jira/browse/ARROW-6813) - [Ruby] Arrow::Table.load with headers=true leads to exception in Arrow 0.15 -* [ARROW-6834](https://issues.apache.org/jira/browse/ARROW-6834) - [C++] Pin gtest to 1.8.1 to triage failing Appveyor / MSVC build -* [ARROW-6844](https://issues.apache.org/jira/browse/ARROW-6844) - [C++][Parquet][Python] List columns read broken with 0.15.0 -* [ARROW-6857](https://issues.apache.org/jira/browse/ARROW-6857) - [Python][C++] Segfault for dictionary\_encode on empty chunked\_array (edge case) -* [ARROW-6860](https://issues.apache.org/jira/browse/ARROW-6860) - [Python] Only link libarrow\_flight.so to pyarrow.\_flight -* [ARROW-6861](https://issues.apache.org/jira/browse/ARROW-6861) - [Python] arrow-0.15.0 reading arrow-0.14.1-output Parquet dictionary column: Failure reading column: IOError: Arrow error: Invalid: Resize cannot downsize -* [ARROW-6869](https://issues.apache.org/jira/browse/ARROW-6869) - [C++] Dictionary "delta" building logic in builder\_dict.h produces invalid arrays -* [ARROW-6873](https://issues.apache.org/jira/browse/ARROW-6873) - [Python] Stale CColumn reference break Cython cimport pyarrow -* [ARROW-6874](https://issues.apache.org/jira/browse/ARROW-6874) - [Python] Memory leak in Table.to\_pandas() when conversion to object dtype -* 
[ARROW-6876](https://issues.apache.org/jira/browse/ARROW-6876) - [Python] Reading parquet file with many columns becomes slow for 0.15.0 -* [ARROW-6877](https://issues.apache.org/jira/browse/ARROW-6877) - [C++] Boost not found from the correct environment -* [ARROW-6878](https://issues.apache.org/jira/browse/ARROW-6878) - [Python] pa.array() does not handle list of dicts with bytes keys correctly under python3 -* [ARROW-6882](https://issues.apache.org/jira/browse/ARROW-6882) - [Python] cannot create a chunked\_array from dictionary\_encoding result -* [ARROW-6886](https://issues.apache.org/jira/browse/ARROW-6886) - [C++] arrow::io header nvcc compiler warnings -* [ARROW-6898](https://issues.apache.org/jira/browse/ARROW-6898) - [Java] Fix potential memory leak in ArrowWriter and several test classes -* [ARROW-6903](https://issues.apache.org/jira/browse/ARROW-6903) - [Python] Wheels broken after ARROW-6860 changes -* [ARROW-6905](https://issues.apache.org/jira/browse/ARROW-6905) - [Packaging][OSX] Nightly builds on MacOS are failing because of brew compile timeouts -* [ARROW-6910](https://issues.apache.org/jira/browse/ARROW-6910) - [Python] pyarrow.parquet.read\_table(...) takes up lots of memory which is not released until program exits -* [ARROW-6922](https://issues.apache.org/jira/browse/ARROW-6922) - [Python] Pandas master build is failing (MultiIndex.levels change) -* [ARROW-6937](https://issues.apache.org/jira/browse/ARROW-6937) - [Packaging][Python] Fix conda linux and OSX wheel nightly builds -* [ARROW-6938](https://issues.apache.org/jira/browse/ARROW-6938) - [Python] Windows wheel depends on zstd.dll and libbz2.dll, which are not bundled -* [ARROW-6962](https://issues.apache.org/jira/browse/ARROW-6962) - [C++] [CI] Stop compiling with -Weverything -* [ARROW-6977](https://issues.apache.org/jira/browse/ARROW-6977) - [C++] Only enable jemalloc background\_thread if feature is supported -* [ARROW-6983](https://issues.apache.org/jira/browse/ARROW-6983) - [C++] Threaded task group crashes sometimes -* [ARROW-7422](https://issues.apache.org/jira/browse/ARROW-7422) - [Python] Improper CPU flags failing pyarrow install in ARM devices -* [ARROW-7423](https://issues.apache.org/jira/browse/ARROW-7423) - Pyarrow ARM install fails from source with no clear error -* [ARROW-9349](https://issues.apache.org/jira/browse/ARROW-9349) - [Python] parquet.read\_table causes crashes on Windows Server 2016 w/ Xeon Processor - - -## New Features and Improvements - -* [ARROW-6610](https://issues.apache.org/jira/browse/ARROW-6610) - [C++] Add ARROW\_FILESYSTEM=ON/OFF CMake configuration flag -* [ARROW-6661](https://issues.apache.org/jira/browse/ARROW-6661) - [Java] Implement APIs like slice to enhance VectorSchemaRoot -* [ARROW-6777](https://issues.apache.org/jira/browse/ARROW-6777) - [GLib][CI] Unpin gobject-introspection gem -* [ARROW-6852](https://issues.apache.org/jira/browse/ARROW-6852) - [C++] memory-benchmark build failed on Arm64 -* [ARROW-6927](https://issues.apache.org/jira/browse/ARROW-6927) - [C++] Add gRPC version check -* [ARROW-6963](https://issues.apache.org/jira/browse/ARROW-6963) - [Packaging][Wheel][OSX] Use crossbow's command to deploy artifacts from travis builds - - - -# Apache Arrow 0.15.0 (2019-10-05) - -## New Features and Improvements - -* [ARROW-453](https://issues.apache.org/jira/browse/ARROW-453) - [C++] Add filesystem implementation for Amazon S3 -* [ARROW-517](https://issues.apache.org/jira/browse/ARROW-517) - [C++] Verbose Array::Equals -* 
[ARROW-750](https://issues.apache.org/jira/browse/ARROW-750) - [Format] Add LargeBinary and LargeString types -* [ARROW-1324](https://issues.apache.org/jira/browse/ARROW-1324) - [C++] Support ARROW\_BOOST\_VENDORED on Windows / MSVC -* [ARROW-1561](https://issues.apache.org/jira/browse/ARROW-1561) - [C++] Kernel implementations for "isin" (set containment) -* [ARROW-1566](https://issues.apache.org/jira/browse/ARROW-1566) - [C++] Implement non-materializing sort kernels -* [ARROW-1741](https://issues.apache.org/jira/browse/ARROW-1741) - [C++] Comparison function for DictionaryArray to determine if indices are "compatible" -* [ARROW-1786](https://issues.apache.org/jira/browse/ARROW-1786) - [Format] List expected on-wire buffer layouts for each kind of Arrow physical type in specification -* [ARROW-1789](https://issues.apache.org/jira/browse/ARROW-1789) - [Format] Consolidate specification documents and improve clarity for new implementation authors -* [ARROW-1875](https://issues.apache.org/jira/browse/ARROW-1875) - [Java] Write 64-bit ints as strings in integration test JSON files -* [ARROW-2006](https://issues.apache.org/jira/browse/ARROW-2006) - [C++] Add option to trim excess padding when writing IPC messages -* [ARROW-2431](https://issues.apache.org/jira/browse/ARROW-2431) - [Rust] Schema fidelity -* [ARROW-2769](https://issues.apache.org/jira/browse/ARROW-2769) - [C++][Python] Deprecate and rename add\_metadata methods -* [ARROW-2931](https://issues.apache.org/jira/browse/ARROW-2931) - [Crossbow] Windows builds are attempting to run linux and osx packaging tasks -* [ARROW-3032](https://issues.apache.org/jira/browse/ARROW-3032) - [Python] Clean up NumPy-related C++ headers -* [ARROW-3204](https://issues.apache.org/jira/browse/ARROW-3204) - [R] Enable package to be made available on CRAN -* [ARROW-3243](https://issues.apache.org/jira/browse/ARROW-3243) - [C++] Upgrade jemalloc to version 5 -* [ARROW-3246](https://issues.apache.org/jira/browse/ARROW-3246) - [Python][Parquet] direct reading/writing of pandas categoricals in parquet -* [ARROW-3325](https://issues.apache.org/jira/browse/ARROW-3325) - [Python] Support reading Parquet binary/string columns directly as DictionaryArray -* [ARROW-3531](https://issues.apache.org/jira/browse/ARROW-3531) - [Python] Deprecate Schema.field\_by\_name in favor of \_\_getitem\_\_ -* [ARROW-3538](https://issues.apache.org/jira/browse/ARROW-3538) - [Python] ability to override the automated assignment of uuid for filenames when writing datasets -* [ARROW-3579](https://issues.apache.org/jira/browse/ARROW-3579) - [Crossbow] Unintuitive error message when remote branch has not been pushed -* [ARROW-3643](https://issues.apache.org/jira/browse/ARROW-3643) - [Rust] Optimize \`push\_slice\` of \`BufferBuilder\` -* [ARROW-3710](https://issues.apache.org/jira/browse/ARROW-3710) - [Crossbow][Python] Run nightly tests against pandas master -* [ARROW-3772](https://issues.apache.org/jira/browse/ARROW-3772) - [C++] Read Parquet dictionary encoded ColumnChunks directly into an Arrow DictionaryArray -* [ARROW-3777](https://issues.apache.org/jira/browse/ARROW-3777) - [C++] Implement a mock "high latency" filesystem -* [ARROW-3817](https://issues.apache.org/jira/browse/ARROW-3817) - [R] $ method for RecordBatch -* [ARROW-3829](https://issues.apache.org/jira/browse/ARROW-3829) - [Python] Support protocols to extract Arrow 
objects from third-party classes -* [ARROW-3943](https://issues.apache.org/jira/browse/ARROW-3943) - [R] Write vignette for R package -* [ARROW-4036](https://issues.apache.org/jira/browse/ARROW-4036) - [C++] Make status codes pluggable -* [ARROW-4095](https://issues.apache.org/jira/browse/ARROW-4095) - [C++] Implement optimizations for dictionary unification where dictionaries are prefixes of the unified dictionary -* [ARROW-4111](https://issues.apache.org/jira/browse/ARROW-4111) - [Python] Create time types from Python sequences of integers -* [ARROW-4218](https://issues.apache.org/jira/browse/ARROW-4218) - [Rust] [Parquet] Implement ColumnReader -* [ARROW-4220](https://issues.apache.org/jira/browse/ARROW-4220) - [Python] Add buffered input and output stream ASV benchmarks with simulated high latency IO -* [ARROW-4365](https://issues.apache.org/jira/browse/ARROW-4365) - [Rust] [Parquet] Implement RecordReader -* [ARROW-4398](https://issues.apache.org/jira/browse/ARROW-4398) - [Python] Add benchmarks for Arrow<\>Parquet BYTE\_ARRAY serialization (read and write) -* [ARROW-4473](https://issues.apache.org/jira/browse/ARROW-4473) - [Website] Add instructions to do a test-deploy of Arrow website and fix bugs -* [ARROW-4507](https://issues.apache.org/jira/browse/ARROW-4507) - [Format] Create outline and introduction for new document. -* [ARROW-4508](https://issues.apache.org/jira/browse/ARROW-4508) - [Format] Copy content from Layout.rst to new document. -* [ARROW-4509](https://issues.apache.org/jira/browse/ARROW-4509) - [Format] Copy content from Metadata.rst to new document. -* [ARROW-4510](https://issues.apache.org/jira/browse/ARROW-4510) - [Format] copy content from IPC.rst to new document. -* [ARROW-4511](https://issues.apache.org/jira/browse/ARROW-4511) - [Format] remove individual documents in favor of new document once all content is moved -* [ARROW-4648](https://issues.apache.org/jira/browse/ARROW-4648) - [C++/Question] Naming/organizational inconsistencies in cpp codebase -* [ARROW-4649](https://issues.apache.org/jira/browse/ARROW-4649) - [C++/CI/R] Add (nightly) job that builds \`brew install apache-arrow --HEAD\` -* [ARROW-4752](https://issues.apache.org/jira/browse/ARROW-4752) - [Rust] Add explicit SIMD vectorization for the divide kernel -* [ARROW-4810](https://issues.apache.org/jira/browse/ARROW-4810) - [Format][C++] Add "LargeList" type with 64-bit offsets -* [ARROW-4841](https://issues.apache.org/jira/browse/ARROW-4841) - [C++] Persist CMake options in generated CMake config -* [ARROW-4860](https://issues.apache.org/jira/browse/ARROW-4860) - [C++] Build AWS C++ SDK for Windows in conda-forge -* [ARROW-5134](https://issues.apache.org/jira/browse/ARROW-5134) - [R][CI] Run nightly tests against multiple R versions -* [ARROW-5211](https://issues.apache.org/jira/browse/ARROW-5211) - [Format] Missing documentation under \`Dictionary encoding\` section on MetaData page -* [ARROW-5216](https://issues.apache.org/jira/browse/ARROW-5216) - [CI] Add Appveyor badge to README -* [ARROW-5307](https://issues.apache.org/jira/browse/ARROW-5307) - [CI][GLib] Enable GTK-Doc -* [ARROW-5337](https://issues.apache.org/jira/browse/ARROW-5337) - [C++] Add RecordBatch::field method, possibly deprecate "column" -* [ARROW-5343](https://issues.apache.org/jira/browse/ARROW-5343) - [C++] Consider using Buffer for transpose maps in DictionaryType::Unify instead of 
std::vector -* [ARROW-5344](https://issues.apache.org/jira/browse/ARROW-5344) - [C++] Use ArrayDataVisitor in implementation of dictionary unpacking in compute/kernels/cast.cc -* [ARROW-5351](https://issues.apache.org/jira/browse/ARROW-5351) - [Rust] Add support for take kernel functions -* [ARROW-5358](https://issues.apache.org/jira/browse/ARROW-5358) - [Rust] Implement equality check for ArrayData and Array -* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses. -* [ARROW-5439](https://issues.apache.org/jira/browse/ARROW-5439) - [Java] Utilize stream EOS in File format -* [ARROW-5444](https://issues.apache.org/jira/browse/ARROW-5444) - [Release][Website] After 0.14 release, update what is an "official" release -* [ARROW-5458](https://issues.apache.org/jira/browse/ARROW-5458) - [C++] ARMv8 parallel CRC32c computation optimization -* [ARROW-5480](https://issues.apache.org/jira/browse/ARROW-5480) - [Python] Pandas categorical type doesn't survive a round-trip through parquet -* [ARROW-5483](https://issues.apache.org/jira/browse/ARROW-5483) - [Java] add ValueVector constructors that take a Field object -* [ARROW-5494](https://issues.apache.org/jira/browse/ARROW-5494) - [Python] Create FileSystem bindings -* [ARROW-5505](https://issues.apache.org/jira/browse/ARROW-5505) - [R] Stop masking base R functions/rethink namespacing -* [ARROW-5527](https://issues.apache.org/jira/browse/ARROW-5527) - [C++] HashTable/MemoTable should use Buffer(s)/Builder(s) for heap data -* [ARROW-5558](https://issues.apache.org/jira/browse/ARROW-5558) - [C++] Support Array::View on arrays with non-zero offsets -* [ARROW-5559](https://issues.apache.org/jira/browse/ARROW-5559) - [C++] Introduce IpcOptions struct object for better API-stability when adding new options -* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge -* [ARROW-5579](https://issues.apache.org/jira/browse/ARROW-5579) - [Java] shade flatbuffer dependency -* [ARROW-5580](https://issues.apache.org/jira/browse/ARROW-5580) - [C++][Gandiva] Correct definitions of timestamp functions in Gandiva -* [ARROW-5588](https://issues.apache.org/jira/browse/ARROW-5588) - [C++] Better support for building UnionArrays -* [ARROW-5594](https://issues.apache.org/jira/browse/ARROW-5594) - [C++] add support for UnionArrays to Take and Filter -* [ARROW-5610](https://issues.apache.org/jira/browse/ARROW-5610) - [Python] Define extension type API in Python to "receive" or "send" a foreign extension type -* [ARROW-5646](https://issues.apache.org/jira/browse/ARROW-5646) - [Crossbow][Documentation] Move the user guide to the Sphinx documentation -* [ARROW-5681](https://issues.apache.org/jira/browse/ARROW-5681) - [FlightRPC] Wrap gRPC exceptions/statuses -* [ARROW-5686](https://issues.apache.org/jira/browse/ARROW-5686) - [R] Review R Windows CI build -* [ARROW-5716](https://issues.apache.org/jira/browse/ARROW-5716) - [Developer] Improve merge PR script to acknowledge co-authors -* [ARROW-5717](https://issues.apache.org/jira/browse/ARROW-5717) - [Python] Support dictionary unification when converting variable dictionaries to pandas -* [ARROW-5719](https://issues.apache.org/jira/browse/ARROW-5719) - [Java] Support in-place vector sorting -* [ARROW-5722](https://issues.apache.org/jira/browse/ARROW-5722) - [Rust] Implement std::fmt::Debug for ListArray, BinaryArray and StructArray -* [ARROW-5734](https://issues.apache.org/jira/browse/ARROW-5734) - [Python] Dispatch to 
Table.from\_arrays from pyarrow.table factory function -* [ARROW-5736](https://issues.apache.org/jira/browse/ARROW-5736) - [Format][C++] Support small bit-width indices in sparse tensor -* [ARROW-5741](https://issues.apache.org/jira/browse/ARROW-5741) - [JS] Make numeric vector from functions consistent with TypedArray.from -* [ARROW-5743](https://issues.apache.org/jira/browse/ARROW-5743) - [C++] Add CMake option to enable "large memory" unit tests -* [ARROW-5746](https://issues.apache.org/jira/browse/ARROW-5746) - [Website] Move website source out of apache/arrow -* [ARROW-5747](https://issues.apache.org/jira/browse/ARROW-5747) - [C++] Better column name and header support in CSV reader -* [ARROW-5758](https://issues.apache.org/jira/browse/ARROW-5758) - [C++][Gandiva] Support casting decimals to varchar and vice versa -* [ARROW-5762](https://issues.apache.org/jira/browse/ARROW-5762) - [Integration][JS] Integration Tests for Map Type -* [ARROW-5777](https://issues.apache.org/jira/browse/ARROW-5777) - [C++] BasicDecimal128 is a small object it doesn't always make sense to pass by const ref -* [ARROW-5778](https://issues.apache.org/jira/browse/ARROW-5778) - [Java] Extract the logic for vector data copying to the super classes -* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh -* [ARROW-5786](https://issues.apache.org/jira/browse/ARROW-5786) - [Release] Use arrow-jni profile in dev/release/01-prepare.sh -* [ARROW-5788](https://issues.apache.org/jira/browse/ARROW-5788) - [Rust] Use { version = "...", path = "../..." } for arrow and parquet dependencies -* [ARROW-5789](https://issues.apache.org/jira/browse/ARROW-5789) - [C++] Small Warning/Linkage cleanups -* [ARROW-5792](https://issues.apache.org/jira/browse/ARROW-5792) - [Rust] [Parquet] A visitor trait for parquet types. 
-* [ARROW-5798](https://issues.apache.org/jira/browse/ARROW-5798) - [Packaging][deb] Update doc architecture -* [ARROW-5800](https://issues.apache.org/jira/browse/ARROW-5800) - [R] Dockerize R Travis CI tests so they can be run anywhere via docker-compose -* [ARROW-5803](https://issues.apache.org/jira/browse/ARROW-5803) - [C++] Dockerize C++ with clang 7 Travis CI unit test logic -* [ARROW-5812](https://issues.apache.org/jira/browse/ARROW-5812) - [Java] Refactor method name and param type in BaseIntVector -* [ARROW-5813](https://issues.apache.org/jira/browse/ARROW-5813) - [C++] Support checking the equality of the different contiguous tensors -* [ARROW-5814](https://issues.apache.org/jira/browse/ARROW-5814) - [Java] Implement a HashMap for DictionaryEncoder -* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config -* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check -* [ARROW-5830](https://issues.apache.org/jira/browse/ARROW-5830) - [C++] Stop using memcmp in TensorEquals -* [ARROW-5832](https://issues.apache.org/jira/browse/ARROW-5832) - [Java] Support search operations for vector data -* [ARROW-5833](https://issues.apache.org/jira/browse/ARROW-5833) - [C++] Factor out status copying code from cast.cc -* [ARROW-5834](https://issues.apache.org/jira/browse/ARROW-5834) - [Java] Apply new hash map in DictionaryEncoder -* [ARROW-5835](https://issues.apache.org/jira/browse/ARROW-5835) - [Java] Support Dictionary Encoding for binary type -* [ARROW-5841](https://issues.apache.org/jira/browse/ARROW-5841) - [Website] Add 0.14.0 release note -* [ARROW-5842](https://issues.apache.org/jira/browse/ARROW-5842) - [Java] Revise the semantic of lastSet in ListVector -* [ARROW-5843](https://issues.apache.org/jira/browse/ARROW-5843) - [Java] Improve the readability and performance of BitVectorHelper\#getNullCount -* [ARROW-5844](https://issues.apache.org/jira/browse/ARROW-5844) - [Java] Support comparison & sort for more numeric types -* [ARROW-5846](https://issues.apache.org/jira/browse/ARROW-5846) - [Java] Create Avro adapter module and add dependencies -* [ARROW-5853](https://issues.apache.org/jira/browse/ARROW-5853) - [Python] Expose boolean filter kernel on Array -* [ARROW-5861](https://issues.apache.org/jira/browse/ARROW-5861) - [Java] Initial implement to convert Avro record with primitive types -* [ARROW-5862](https://issues.apache.org/jira/browse/ARROW-5862) - [Java] Provide dictionary builder -* [ARROW-5864](https://issues.apache.org/jira/browse/ARROW-5864) - [Python] simplify cython wrapping of Result -* [ARROW-5865](https://issues.apache.org/jira/browse/ARROW-5865) - [Release] Helper script for rebasing open pull requests on master -* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile -* [ARROW-5867](https://issues.apache.org/jira/browse/ARROW-5867) - [C++][Gandiva] Add support for cast int to decimal -* [ARROW-5872](https://issues.apache.org/jira/browse/ARROW-5872) - Support mod(double, double) method in Gandiva -* [ARROW-5876](https://issues.apache.org/jira/browse/ARROW-5876) - [FlightRPC] Implement basic auth across all languages -* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java -* [ARROW-5880](https://issues.apache.org/jira/browse/ARROW-5880) - [C++] Update arrow parquet writer to use TypedBufferBuilder -* 
[ARROW-5881](https://issues.apache.org/jira/browse/ARROW-5881) - [Java] Provide functionalities to efficiently determine if a validity buffer has completely 1 bits/0 bits -* [ARROW-5883](https://issues.apache.org/jira/browse/ARROW-5883) - [Java] Support dictionary encoding for List and Struct type -* [ARROW-5888](https://issues.apache.org/jira/browse/ARROW-5888) - [Python][C++] Add metadata to store Arrow time zones in Parquet file metadata -* [ARROW-5891](https://issues.apache.org/jira/browse/ARROW-5891) - [C++][Gandiva] Remove duplicates in function registries -* [ARROW-5892](https://issues.apache.org/jira/browse/ARROW-5892) - [C++][Gandiva] Support function aliases -* [ARROW-5893](https://issues.apache.org/jira/browse/ARROW-5893) - [C++] Remove arrow::Column class from C++ library -* [ARROW-5897](https://issues.apache.org/jira/browse/ARROW-5897) - [Java] Remove duplicated logic in MapVector -* [ARROW-5898](https://issues.apache.org/jira/browse/ARROW-5898) - [Java] Provide functionality to efficiently compute hash code for arbitrary memory segment -* [ARROW-5900](https://issues.apache.org/jira/browse/ARROW-5900) - [Gandiva] [Java] Decimal precision,scale bounds check -* [ARROW-5901](https://issues.apache.org/jira/browse/ARROW-5901) - [Rust] Implement PartialEq to compare array and json values -* [ARROW-5902](https://issues.apache.org/jira/browse/ARROW-5902) - [Java] Implement hash table and equals & hashCode API for dictionary encoding -* [ARROW-5903](https://issues.apache.org/jira/browse/ARROW-5903) - [Java] Set methods in DecimalVector are slow -* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client -* [ARROW-5906](https://issues.apache.org/jira/browse/ARROW-5906) - [CI] Set -DARROW\_VERBOSE\_THIRDPARTY\_BUILD=OFF in builds running in Travis CI, maybe all docker-compose builds by default -* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes -* [ARROW-5909](https://issues.apache.org/jira/browse/ARROW-5909) - [Java] Optimize ByteFunctionHelpers equals & compare logic -* [ARROW-5911](https://issues.apache.org/jira/browse/ARROW-5911) - [Java] Make ListVector and MapVector create reader lazily -* [ARROW-5917](https://issues.apache.org/jira/browse/ARROW-5917) - [Java] Redesign the dictionary encoder -* [ARROW-5918](https://issues.apache.org/jira/browse/ARROW-5918) - [Java] Add get to BaseIntVector interface -* [ARROW-5919](https://issues.apache.org/jira/browse/ARROW-5919) - [R] Add nightly tests for building r-arrow with dependencies from conda-forge -* [ARROW-5920](https://issues.apache.org/jira/browse/ARROW-5920) - [Java] Support sort & compare for all variable width vectors -* [ARROW-5924](https://issues.apache.org/jira/browse/ARROW-5924) - [C++][Plasma] It is not convenient to release a GPU object -* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels -* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload -* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically -* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately -* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts -* 
[ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts -* [ARROW-5943](https://issues.apache.org/jira/browse/ARROW-5943) - [GLib][Gandiva] Add support for function aliases -* [ARROW-5944](https://issues.apache.org/jira/browse/ARROW-5944) - [C++][Gandiva] Remove 'div' alias for 'divide' -* [ARROW-5945](https://issues.apache.org/jira/browse/ARROW-5945) - [Rust] [DataFusion] Table trait should support building complete queries -* [ARROW-5947](https://issues.apache.org/jira/browse/ARROW-5947) - [Rust] [DataFusion] Remove serde\_json dependency -* [ARROW-5948](https://issues.apache.org/jira/browse/ARROW-5948) - [Rust] [DataFusion] create\_logical\_plan should not call optimizer -* [ARROW-5955](https://issues.apache.org/jira/browse/ARROW-5955) - [Plasma] Support setting memory quotas per plasma client for better isolation -* [ARROW-5957](https://issues.apache.org/jira/browse/ARROW-5957) - [C++][Gandiva] Implement div function in Gandiva -* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels -* [ARROW-5961](https://issues.apache.org/jira/browse/ARROW-5961) - [R] Be able to run R-only tests even without C++ library -* [ARROW-5962](https://issues.apache.org/jira/browse/ARROW-5962) - [CI][Python] Do not test manylinux1 wheels in Travis CI -* [ARROW-5967](https://issues.apache.org/jira/browse/ARROW-5967) - [Java] DateUtility\#timeZoneList is not correct -* [ARROW-5970](https://issues.apache.org/jira/browse/ARROW-5970) - [Java] Provide pointer to Arrow buffer -* [ARROW-5974](https://issues.apache.org/jira/browse/ARROW-5974) - [Python][C++] Enable CSV reader to read from concatenated gzip stream -* [ARROW-5975](https://issues.apache.org/jira/browse/ARROW-5975) - [C++][Gandiva] Add method to cast Date(in Milliseconds) to timestamp -* [ARROW-5976](https://issues.apache.org/jira/browse/ARROW-5976) - [C++] RETURN\_IF\_ERROR(ctx) should be namespaced -* [ARROW-5977](https://issues.apache.org/jira/browse/ARROW-5977) - [C++] [Python] Method for read\_csv to limit which columns are read? 
-* [ARROW-5979](https://issues.apache.org/jira/browse/ARROW-5979) - [FlightRPC] Expose (de)serialization of protocol types -* [ARROW-5985](https://issues.apache.org/jira/browse/ARROW-5985) - [Developer] Do not suggest setting Fix Version for point releases in dev/merge\_arrow\_pr.py -* [ARROW-5986](https://issues.apache.org/jira/browse/ARROW-5986) - [Java] Code cleanup for dictionary encoding -* [ARROW-5988](https://issues.apache.org/jira/browse/ARROW-5988) - [Java] Avro adapter implement simple Record type -* [ARROW-5997](https://issues.apache.org/jira/browse/ARROW-5997) - [Java] Support dictionary encoding for Union type -* [ARROW-5998](https://issues.apache.org/jira/browse/ARROW-5998) - [Java] Open a document to track the API changes -* [ARROW-6000](https://issues.apache.org/jira/browse/ARROW-6000) - [Python] Expose LargeBinaryType and LargeStringType -* [ARROW-6008](https://issues.apache.org/jira/browse/ARROW-6008) - [Release] Don't parallelize the bintray upload script -* [ARROW-6009](https://issues.apache.org/jira/browse/ARROW-6009) - [Release][JS] Ignore NPM errors in the javascript release script -* [ARROW-6013](https://issues.apache.org/jira/browse/ARROW-6013) - [Java] Support range searcher -* [ARROW-6017](https://issues.apache.org/jira/browse/ARROW-6017) - [FlightRPC] Allow creating Locations with unknown schemes -* [ARROW-6020](https://issues.apache.org/jira/browse/ARROW-6020) - [Java] Refactor ByteFunctionHelper\#hash with new added ArrowBufHasher -* [ARROW-6021](https://issues.apache.org/jira/browse/ARROW-6021) - [Java] Extract copyFrom and copyFromSafe methods to ValueVector interface -* [ARROW-6022](https://issues.apache.org/jira/browse/ARROW-6022) - [Java] Support equals API in ValueVector to compare two vectors equal -* [ARROW-6023](https://issues.apache.org/jira/browse/ARROW-6023) - [C++][Gandiva] Add functions in Gandiva -* [ARROW-6024](https://issues.apache.org/jira/browse/ARROW-6024) - [Java] Provide more hash algorithms -* [ARROW-6026](https://issues.apache.org/jira/browse/ARROW-6026) - [Doc] Add CONTRIBUTING.md -* [ARROW-6030](https://issues.apache.org/jira/browse/ARROW-6030) - [Java] Efficiently compute hash code for ArrowBufPointer -* [ARROW-6031](https://issues.apache.org/jira/browse/ARROW-6031) - [Java] Support iterating a vector by ArrowBufPointer -* [ARROW-6034](https://issues.apache.org/jira/browse/ARROW-6034) - [C++][Gandiva] Add string functions in Gandiva -* [ARROW-6035](https://issues.apache.org/jira/browse/ARROW-6035) - [Java] Avro adapter support convert nullable value -* [ARROW-6036](https://issues.apache.org/jira/browse/ARROW-6036) - [GLib] Add support for skip rows and column\_names CSV read option -* [ARROW-6037](https://issues.apache.org/jira/browse/ARROW-6037) - [GLib] Add a missing version macro -* [ARROW-6039](https://issues.apache.org/jira/browse/ARROW-6039) - [GLib] Add garrow\_array\_filter() -* [ARROW-6041](https://issues.apache.org/jira/browse/ARROW-6041) - [Website] Blog post announcing R package release -* [ARROW-6042](https://issues.apache.org/jira/browse/ARROW-6042) - [C++] Implement alternative DictionaryBuilder that always yields int32 indices -* [ARROW-6045](https://issues.apache.org/jira/browse/ARROW-6045) - [C++] Benchmark for Parquet float and NaN encoding/decoding -* [ARROW-6048](https://issues.apache.org/jira/browse/ARROW-6048) - [C++] Add ChunkedArray::View which calls to Array::View -* [ARROW-6049](https://issues.apache.org/jira/browse/ARROW-6049) - [C++] Support using Array::View from compatible dictionary type to another -* 
[ARROW-6053](https://issues.apache.org/jira/browse/ARROW-6053) - [Python] RecordBatchStreamReader::Open2 cdef type signature doesn't match C++ -* [ARROW-6063](https://issues.apache.org/jira/browse/ARROW-6063) - [FlightRPC] Implement "half-closed" semantics for DoPut -* [ARROW-6065](https://issues.apache.org/jira/browse/ARROW-6065) - [C++] Reorganize parquet/arrow/reader.cc, remove code duplication, improve readability -* [ARROW-6069](https://issues.apache.org/jira/browse/ARROW-6069) - [Rust] [Parquet] Implement Converter to convert record reader to arrow primitive array. -* [ARROW-6070](https://issues.apache.org/jira/browse/ARROW-6070) - [Java] Avoid creating new schema before IPC sending -* [ARROW-6077](https://issues.apache.org/jira/browse/ARROW-6077) - [C++][Parquet] Build logical schema tree mapping Arrow fields to Parquet schema levels -* [ARROW-6078](https://issues.apache.org/jira/browse/ARROW-6078) - [Java] Implement dictionary-encoded subfields for List type -* [ARROW-6079](https://issues.apache.org/jira/browse/ARROW-6079) - [Java] Implement/test UnionFixedSizeListWriter for FixedSizeListVector -* [ARROW-6080](https://issues.apache.org/jira/browse/ARROW-6080) - [Java] Support compare and search operation for BaseRepeatedValueVector -* [ARROW-6083](https://issues.apache.org/jira/browse/ARROW-6083) - [Java] Refactor Jdbc adapter consume logic -* [ARROW-6084](https://issues.apache.org/jira/browse/ARROW-6084) - [Python] Support LargeList -* [ARROW-6085](https://issues.apache.org/jira/browse/ARROW-6085) - [Rust] [DataFusion] Create traits for physical query plan -* [ARROW-6086](https://issues.apache.org/jira/browse/ARROW-6086) - [Rust] [DataFusion] Implement parallel execution for parquet scan -* [ARROW-6087](https://issues.apache.org/jira/browse/ARROW-6087) - [Rust] [DataFusion] Implement parallel execution for CSV scan -* [ARROW-6088](https://issues.apache.org/jira/browse/ARROW-6088) - [Rust] [DataFusion] Implement parallel execution for projection -* [ARROW-6089](https://issues.apache.org/jira/browse/ARROW-6089) - [Rust] [DataFusion] Implement parallel execution for selection -* [ARROW-6090](https://issues.apache.org/jira/browse/ARROW-6090) - [Rust] [DataFusion] Implement parallel execution for hash aggregate -* [ARROW-6093](https://issues.apache.org/jira/browse/ARROW-6093) - [Java] reduce branches in algo for first match in VectorRangeSearcher -* [ARROW-6094](https://issues.apache.org/jira/browse/ARROW-6094) - [Format][Flight] Add GetFlightSchema to Flight RPC -* [ARROW-6096](https://issues.apache.org/jira/browse/ARROW-6096) - [C++] Conditionally depend on boost regex library -* [ARROW-6097](https://issues.apache.org/jira/browse/ARROW-6097) - [Java] Avro adapter implement unions type -* [ARROW-6100](https://issues.apache.org/jira/browse/ARROW-6100) - [Rust] Pin to specific Rust nightly release -* [ARROW-6101](https://issues.apache.org/jira/browse/ARROW-6101) - [Rust] [DataFusion] Create physical plan from logical plan -* [ARROW-6102](https://issues.apache.org/jira/browse/ARROW-6102) - [Testing] Add partitioned CSV file to arrow-testing repo -* [ARROW-6104](https://issues.apache.org/jira/browse/ARROW-6104) - [Rust] [DataFusion] Don't allow bare\_trait\_objects -* [ARROW-6105](https://issues.apache.org/jira/browse/ARROW-6105) - [C++][Parquet][Python] Add test case showing dictionary-encoded subfields in nested type -* [ARROW-6113](https://issues.apache.org/jira/browse/ARROW-6113) - [Java] Support vector deduplicate function -* 
[ARROW-6115](https://issues.apache.org/jira/browse/ARROW-6115) - [Python] support LargeList, LargeString, LargeBinary in conversion to pandas -* [ARROW-6118](https://issues.apache.org/jira/browse/ARROW-6118) - [Java] Replace google Preconditions with Arrow Preconditions -* [ARROW-6121](https://issues.apache.org/jira/browse/ARROW-6121) - [Tools] Improve merge tool cli ergonomic -* [ARROW-6125](https://issues.apache.org/jira/browse/ARROW-6125) - [Python] Remove any APIs deprecated prior to 0.14.x -* [ARROW-6127](https://issues.apache.org/jira/browse/ARROW-6127) - [Website] Add favicons and meta tags -* [ARROW-6128](https://issues.apache.org/jira/browse/ARROW-6128) - [C++] Can't build with g++ 8.3.0 by class-memaccess warning -* [ARROW-6130](https://issues.apache.org/jira/browse/ARROW-6130) - [Release] Use 0.15.0 as the next release -* [ARROW-6134](https://issues.apache.org/jira/browse/ARROW-6134) - [C++][Gandiva] Add concat function in Gandiva -* [ARROW-6137](https://issues.apache.org/jira/browse/ARROW-6137) - [C++][Gandiva] Change output format of castVARCHAR(timestamp) in Gandiva -* [ARROW-6138](https://issues.apache.org/jira/browse/ARROW-6138) - [C++] Add a basic (single RecordBatch) implementation of Dataset -* [ARROW-6139](https://issues.apache.org/jira/browse/ARROW-6139) - [Documentation][R] Build R docs (pkgdown) site and add to arrow-site -* [ARROW-6141](https://issues.apache.org/jira/browse/ARROW-6141) - [C++] Enable memory-mapping a file region that is offset from the beginning of the file -* [ARROW-6142](https://issues.apache.org/jira/browse/ARROW-6142) - [R] Install instructions on linux could be clearer -* [ARROW-6143](https://issues.apache.org/jira/browse/ARROW-6143) - [Java] Unify the copyFrom and copyFromSafe methods for all vectors -* [ARROW-6144](https://issues.apache.org/jira/browse/ARROW-6144) - [C++][Gandiva] Implement random function in Gandiva -* [ARROW-6155](https://issues.apache.org/jira/browse/ARROW-6155) - [Java] Extract a super interface for vectors whose elements reside in continuous memory segments -* [ARROW-6156](https://issues.apache.org/jira/browse/ARROW-6156) - [Java] Support compare semantics for ArrowBufPointer -* [ARROW-6161](https://issues.apache.org/jira/browse/ARROW-6161) - [C++] Implements dataset::ParquetFile and associated Scan structures -* [ARROW-6162](https://issues.apache.org/jira/browse/ARROW-6162) - [C++][Gandiva] Do not truncate string in castVARCHAR\_varchar when out\_len parameter is zero -* [ARROW-6164](https://issues.apache.org/jira/browse/ARROW-6164) - [Docs][Format] Document project versioning schema and forward/backward compatibility policies -* [ARROW-6172](https://issues.apache.org/jira/browse/ARROW-6172) - [Java] Provide benchmarks to set IntVector with different methods -* [ARROW-6177](https://issues.apache.org/jira/browse/ARROW-6177) - [C++] Add Array::Validate() -* [ARROW-6180](https://issues.apache.org/jira/browse/ARROW-6180) - [C++] Create InputStream that is an isolated reader of a segment of a RandomAccessFile -* [ARROW-6181](https://issues.apache.org/jira/browse/ARROW-6181) - [R] Only allow R package to install without libarrow on linux -* [ARROW-6183](https://issues.apache.org/jira/browse/ARROW-6183) - [R] Document that you don't have to use tidyselect if you don't want -* [ARROW-6185](https://issues.apache.org/jira/browse/ARROW-6185) - [Java] Provide hash table based dictionary 
builder -* [ARROW-6187](https://issues.apache.org/jira/browse/ARROW-6187) - [C++] fallback to storage type when writing ExtensionType to Parquet -* [ARROW-6188](https://issues.apache.org/jira/browse/ARROW-6188) - [GLib] Add garrow\_array\_is\_in() -* [ARROW-6192](https://issues.apache.org/jira/browse/ARROW-6192) - [GLib] Use the same SO version as C++ -* [ARROW-6194](https://issues.apache.org/jira/browse/ARROW-6194) - [Java] Add non-static approach in DictionaryEncoder making it easy to extend and reuse -* [ARROW-6196](https://issues.apache.org/jira/browse/ARROW-6196) - [Ruby] Add support for building Arrow::TimeNNArray by .new -* [ARROW-6197](https://issues.apache.org/jira/browse/ARROW-6197) - [GLib] Add garrow\_decimal128\_rescale() -* [ARROW-6199](https://issues.apache.org/jira/browse/ARROW-6199) - [Java] Avro adapter avoid potential resource leak. -* [ARROW-6203](https://issues.apache.org/jira/browse/ARROW-6203) - [GLib] Add garrow\_array\_sort\_to\_indices() -* [ARROW-6204](https://issues.apache.org/jira/browse/ARROW-6204) - [GLib] Add garrow\_array\_is\_in\_chunked\_array() -* [ARROW-6206](https://issues.apache.org/jira/browse/ARROW-6206) - [Java][Docs] Document environment variables/java properties -* [ARROW-6209](https://issues.apache.org/jira/browse/ARROW-6209) - [Java] Extract set null method to the base class for fixed width vectors -* [ARROW-6212](https://issues.apache.org/jira/browse/ARROW-6212) - [Java] Support vector rank operation -* [ARROW-6216](https://issues.apache.org/jira/browse/ARROW-6216) - [C++] Allow user to select the compression level -* [ARROW-6217](https://issues.apache.org/jira/browse/ARROW-6217) - [Website] Remove needless \_site/ directory -* [ARROW-6219](https://issues.apache.org/jira/browse/ARROW-6219) - [Java] Add API for JDBC adapter that can convert less than the full result set at a time. -* [ARROW-6220](https://issues.apache.org/jira/browse/ARROW-6220) - [Java] Add API to avro adapter to limit number of rows returned at a time. 
-* [ARROW-6225](https://issues.apache.org/jira/browse/ARROW-6225) - [Website] Update arrow-site/README and any other places to point website contributors in right direction -* [ARROW-6229](https://issues.apache.org/jira/browse/ARROW-6229) - [C++] Add a DataSource implementation which scans a directory -* [ARROW-6230](https://issues.apache.org/jira/browse/ARROW-6230) - [R] Reading in Parquet files are 20x slower than reading fst files in R -* [ARROW-6231](https://issues.apache.org/jira/browse/ARROW-6231) - [C++][Python] Consider assigning default column names when reading CSV file and header\_rows=0 -* [ARROW-6232](https://issues.apache.org/jira/browse/ARROW-6232) - [C++] Rename Argsort kernel to SortToIndices -* [ARROW-6237](https://issues.apache.org/jira/browse/ARROW-6237) - [R] Add option to set CXXFLAGS when compiling R package with $ARROW\_R\_CXXFLAGS -* [ARROW-6238](https://issues.apache.org/jira/browse/ARROW-6238) - [C++] Implement SimpleDataSource/SimpleDataFragment -* [ARROW-6240](https://issues.apache.org/jira/browse/ARROW-6240) - [Ruby] Arrow::Decimal128Array returns BigDecimal -* [ARROW-6242](https://issues.apache.org/jira/browse/ARROW-6242) - [C++] Implements basic Dataset/Scanner/ScannerBuilder -* [ARROW-6243](https://issues.apache.org/jira/browse/ARROW-6243) - [C++] Implement basic Filter expression classes -* [ARROW-6244](https://issues.apache.org/jira/browse/ARROW-6244) - [C++] Implement Partition DataSource -* [ARROW-6246](https://issues.apache.org/jira/browse/ARROW-6246) - [Website] Add link to R documentation site -* [ARROW-6247](https://issues.apache.org/jira/browse/ARROW-6247) - [Java] Provide a common interface for float4 and float8 vectors -* [ARROW-6249](https://issues.apache.org/jira/browse/ARROW-6249) - [Java] Remove useless class ByteArrayWrapper -* [ARROW-6250](https://issues.apache.org/jira/browse/ARROW-6250) - [Java] Implement ApproxEqualsVisitor comparing approx for floating point -* [ARROW-6252](https://issues.apache.org/jira/browse/ARROW-6252) - [Python] Add pyarrow.Array.diff method that exposes arrow::Diff -* [ARROW-6253](https://issues.apache.org/jira/browse/ARROW-6253) - [Python] Expose "enable\_buffered\_stream" option from parquet::ReaderProperties in pyarrow.parquet.read\_table -* [ARROW-6258](https://issues.apache.org/jira/browse/ARROW-6258) - [R] Add macOS build scripts -* [ARROW-6260](https://issues.apache.org/jira/browse/ARROW-6260) - [Website] Use deploy key on Travis to build and push to asf-site -* [ARROW-6262](https://issues.apache.org/jira/browse/ARROW-6262) - [Developer] Show JIRA issue before merging -* [ARROW-6264](https://issues.apache.org/jira/browse/ARROW-6264) - [Java] There is no need to consider byte order in ArrowBufHasher -* [ARROW-6265](https://issues.apache.org/jira/browse/ARROW-6265) - [Java] Avro adapter implement Array/Map/Fixed type -* [ARROW-6267](https://issues.apache.org/jira/browse/ARROW-6267) - [Ruby] Add Arrow::Time for Arrow::Time{32,64}DataType value -* [ARROW-6271](https://issues.apache.org/jira/browse/ARROW-6271) - [Rust] [DataFusion] Add example for running SQL against Parquet -* [ARROW-6272](https://issues.apache.org/jira/browse/ARROW-6272) - [Rust] [DataFusion] Add register\_parquet convenience method to ExecutionContext -* [ARROW-6278](https://issues.apache.org/jira/browse/ARROW-6278) - [R] Read parquet files from raw vector -* [ARROW-6279](https://issues.apache.org/jira/browse/ARROW-6279) - [Python] Add Table.slice method or allow slices in \_\_getitem\_\_ -* 
[ARROW-6284](https://issues.apache.org/jira/browse/ARROW-6284) - [C++] Allow references in std::tuple when converting tuple to arrow array -* [ARROW-6287](https://issues.apache.org/jira/browse/ARROW-6287) - [Rust] [DataFusion] Refactor TableProvider to return thread-safe BatchIterator -* [ARROW-6288](https://issues.apache.org/jira/browse/ARROW-6288) - [Java] Implement TypeEqualsVisitor comparing vector type equals considering names and metadata -* [ARROW-6289](https://issues.apache.org/jira/browse/ARROW-6289) - [Java] Add empty() in UnionVector to create instance -* [ARROW-6292](https://issues.apache.org/jira/browse/ARROW-6292) - [C++] Add an option to build with mimalloc -* [ARROW-6294](https://issues.apache.org/jira/browse/ARROW-6294) - [C++] Use hyphen for plasma-store-server executable -* [ARROW-6295](https://issues.apache.org/jira/browse/ARROW-6295) - [Rust][DataFusion] ExecutionError Cannot compare Float32 with Float64 -* [ARROW-6296](https://issues.apache.org/jira/browse/ARROW-6296) - [Java] Cleanup JDBC interfaces and eliminate one memcopy for binary/varchar fields -* [ARROW-6297](https://issues.apache.org/jira/browse/ARROW-6297) - [Java] Compare ArrowBufPointers by unsigned integers -* [ARROW-6300](https://issues.apache.org/jira/browse/ARROW-6300) - [C++] Add io::OutputStream::Abort() -* [ARROW-6303](https://issues.apache.org/jira/browse/ARROW-6303) - [Rust] Add a feature to disable SIMD -* [ARROW-6304](https://issues.apache.org/jira/browse/ARROW-6304) - [Java] Add description to each maven artifact -* [ARROW-6306](https://issues.apache.org/jira/browse/ARROW-6306) - [Java] Support stable sort by stable comparators -* [ARROW-6310](https://issues.apache.org/jira/browse/ARROW-6310) - [C++] Write 64-bit integers as strings in JSON integration test files -* [ARROW-6311](https://issues.apache.org/jira/browse/ARROW-6311) - [Java] Make ApproxEqualsVisitor accept DiffFunction to make it more flexible -* [ARROW-6313](https://issues.apache.org/jira/browse/ARROW-6313) - [Format] Tracking for ensuring flatbuffer serialized values are aligned in stream/files. -* [ARROW-6314](https://issues.apache.org/jira/browse/ARROW-6314) - [C++] Implement changes to ensure flatbuffer alignment. 
-* [ARROW-6315](https://issues.apache.org/jira/browse/ARROW-6315) - [Java] Make change to ensure flatbuffer reads are aligned -* [ARROW-6316](https://issues.apache.org/jira/browse/ARROW-6316) - [Go] Make change to ensure flatbuffer reads are aligned -* [ARROW-6317](https://issues.apache.org/jira/browse/ARROW-6317) - [JS] Implement changes to ensure flatbuffer alignment -* [ARROW-6318](https://issues.apache.org/jira/browse/ARROW-6318) - [Integration] Update integration test to use generated binaries to ensure backwards compatibility -* [ARROW-6319](https://issues.apache.org/jira/browse/ARROW-6319) - [C++] Extract the core of NumericTensor::Value as Tensor::Value -* [ARROW-6326](https://issues.apache.org/jira/browse/ARROW-6326) - [C++] Nullable fields when converting std::tuple to Table -* [ARROW-6328](https://issues.apache.org/jira/browse/ARROW-6328) - Click.option-s should have help text -* [ARROW-6329](https://issues.apache.org/jira/browse/ARROW-6329) - [Format] Add 4-byte "stream continuation" to IPC message format to align Flatbuffers -* [ARROW-6331](https://issues.apache.org/jira/browse/ARROW-6331) - [Java] Incorporate ErrorProne into the java build -* [ARROW-6334](https://issues.apache.org/jira/browse/ARROW-6334) - [Java] Improve the dictionary builder API to return the position of the value in the dictionary -* [ARROW-6335](https://issues.apache.org/jira/browse/ARROW-6335) - [Java] Improve the performance of DictionaryHashTable -* [ARROW-6336](https://issues.apache.org/jira/browse/ARROW-6336) - [Python] Clarify pyarrow.serialize/deserialize docstrings viz-a-viz relationship with Arrow IPC protocol -* [ARROW-6337](https://issues.apache.org/jira/browse/ARROW-6337) - [R] as\_tibble in R API is a misnomer -* [ARROW-6338](https://issues.apache.org/jira/browse/ARROW-6338) - [R] Type function names don't match type names -* [ARROW-6342](https://issues.apache.org/jira/browse/ARROW-6342) - [Python] Add pyarrow.record\_batch factory function with same basic API / semantics as pyarrow.table -* [ARROW-6346](https://issues.apache.org/jira/browse/ARROW-6346) - [GLib] Add garrow\_array\_view() -* [ARROW-6347](https://issues.apache.org/jira/browse/ARROW-6347) - [GLib] Add garrow\_array\_diff\_unified() -* [ARROW-6350](https://issues.apache.org/jira/browse/ARROW-6350) - [Ruby] Remove Arrow::Struct and use Hash instead -* [ARROW-6351](https://issues.apache.org/jira/browse/ARROW-6351) - [Ruby] Improve Arrow\#values performance -* [ARROW-6353](https://issues.apache.org/jira/browse/ARROW-6353) - [Python] Allow user to select compression level in pyarrow.parquet.write\_table -* [ARROW-6355](https://issues.apache.org/jira/browse/ARROW-6355) - [Java] Make range equal visitor reusable -* [ARROW-6356](https://issues.apache.org/jira/browse/ARROW-6356) - [Java] Avro adapter implement Enum type and nested Record type -* [ARROW-6357](https://issues.apache.org/jira/browse/ARROW-6357) - [C++] S3: allow for background writes -* [ARROW-6358](https://issues.apache.org/jira/browse/ARROW-6358) - [C++] FileSystem::DeleteDir should make it optional to delete the directory itself -* [ARROW-6360](https://issues.apache.org/jira/browse/ARROW-6360) - [R] Update support for compression -* [ARROW-6362](https://issues.apache.org/jira/browse/ARROW-6362) - [C++] S3: more flexible credential options -* [ARROW-6365](https://issues.apache.org/jira/browse/ARROW-6365) - [R] Should be able to coerce numeric to integer with schema -* [ARROW-6366](https://issues.apache.org/jira/browse/ARROW-6366) - [Java] Make field vectors final 
explicitly -* [ARROW-6368](https://issues.apache.org/jira/browse/ARROW-6368) - [C++] Add RecordBatch projection functionality -* [ARROW-6373](https://issues.apache.org/jira/browse/ARROW-6373) - [C++] Make FixedWidthBinaryBuilder consistent with other primitive fixed width builders -* [ARROW-6375](https://issues.apache.org/jira/browse/ARROW-6375) - [C++] Extend ConversionTraits to allow efficiently appending list values in STL API -* [ARROW-6379](https://issues.apache.org/jira/browse/ARROW-6379) - [C++] Do not append any buffers when serializing NullType for IPC -* [ARROW-6381](https://issues.apache.org/jira/browse/ARROW-6381) - [C++] BufferOutputStream::Write is slow for many small writes -* [ARROW-6383](https://issues.apache.org/jira/browse/ARROW-6383) - [Java] report outstanding child allocators on parent allocator close -* [ARROW-6384](https://issues.apache.org/jira/browse/ARROW-6384) - [C++] Bump dependencies -* [ARROW-6385](https://issues.apache.org/jira/browse/ARROW-6385) - [C++] Investigate xxh3 -* [ARROW-6391](https://issues.apache.org/jira/browse/ARROW-6391) - [Python][Flight] Add built-in methods on FlightServerBase to start server and wait for it to be available -* [ARROW-6397](https://issues.apache.org/jira/browse/ARROW-6397) - [C++][CI] Fix S3 minio failure -* [ARROW-6401](https://issues.apache.org/jira/browse/ARROW-6401) - [Java] Implement dictionary-encoded subfields for Struct type -* [ARROW-6402](https://issues.apache.org/jira/browse/ARROW-6402) - [C++] Suppress sign-compare warning with g++ 9.2.1 -* [ARROW-6403](https://issues.apache.org/jira/browse/ARROW-6403) - [Python] Expose FileReader::ReadRowGroups() to Python -* [ARROW-6408](https://issues.apache.org/jira/browse/ARROW-6408) - [Rust] Use "if cfg!" pattern in SIMD kernel implementations -* [ARROW-6413](https://issues.apache.org/jira/browse/ARROW-6413) - [R] Support autogenerating column names -* [ARROW-6415](https://issues.apache.org/jira/browse/ARROW-6415) - [R] Remove usage of R CMD config CXXCPP -* [ARROW-6416](https://issues.apache.org/jira/browse/ARROW-6416) - [Python] Confusing API & documentation regarding chunksizes -* [ARROW-6417](https://issues.apache.org/jira/browse/ARROW-6417) - [C++][Parquet] Non-dictionary BinaryArray reads from Parquet format have slowed down since 0.11.x -* [ARROW-6419](https://issues.apache.org/jira/browse/ARROW-6419) - [Website] Blog post about Parquet dictionary performance work coming in 0.15.x release -* [ARROW-6422](https://issues.apache.org/jira/browse/ARROW-6422) - [Gandiva] Fix double-conversion linker issue -* [ARROW-6426](https://issues.apache.org/jira/browse/ARROW-6426) - [FlightRPC] Expose gRPC configuration knobs in Flight -* [ARROW-6427](https://issues.apache.org/jira/browse/ARROW-6427) - [GLib] Add support for column names autogeneration CSV read option -* [ARROW-6438](https://issues.apache.org/jira/browse/ARROW-6438) - [R] Add bindings for filesystem API -* [ARROW-6447](https://issues.apache.org/jira/browse/ARROW-6447) - [C++] Builds with ARROW\_JEMALLOC=ON wait until jemalloc\_ep is complete before building any libarrow .cc files -* [ARROW-6450](https://issues.apache.org/jira/browse/ARROW-6450) - [C++] Use 2x reallocation strategy in arrow::BufferBuilder instead of 1.5x -* [ARROW-6451](https://issues.apache.org/jira/browse/ARROW-6451) - [Format] Add clarifications to Columnar.rst about the contents of "null" slots in Varbinary or List arrays -* [ARROW-6453](https://issues.apache.org/jira/browse/ARROW-6453) - [C++] More informative error messages from S3 -* 
[ARROW-6454](https://issues.apache.org/jira/browse/ARROW-6454) - [Developer] Add LLVM license to LICENSE.txt due to binary redistribution in packages -* [ARROW-6458](https://issues.apache.org/jira/browse/ARROW-6458) - [Java] Remove value boxing/unboxing for ApproxEqualsVisitor -* [ARROW-6460](https://issues.apache.org/jira/browse/ARROW-6460) - [Java] Add benchmark and large fake data UT for avro adapter -* [ARROW-6462](https://issues.apache.org/jira/browse/ARROW-6462) - [C++] Can't build with bundled double-conversion on CentOS 6 x86\_64 -* [ARROW-6465](https://issues.apache.org/jira/browse/ARROW-6465) - [Python] Improve Windows build instructions -* [ARROW-6474](https://issues.apache.org/jira/browse/ARROW-6474) - [Python] Provide mechanism for python to write out old format -* [ARROW-6475](https://issues.apache.org/jira/browse/ARROW-6475) - [C++] Don't try to dictionary encode dictionary arrays -* [ARROW-6477](https://issues.apache.org/jira/browse/ARROW-6477) - [Packaging][Crossbow] Use Azure Pipelines to build linux packages -* [ARROW-6480](https://issues.apache.org/jira/browse/ARROW-6480) - [Developer] Add command to generate and send e-mail report for a Crossbow run -* [ARROW-6484](https://issues.apache.org/jira/browse/ARROW-6484) - [Java] Enable create indexType for DictionaryEncoding according to dictionary value count -* [ARROW-6487](https://issues.apache.org/jira/browse/ARROW-6487) - [Rust] [DataFusion] Create test utils module -* [ARROW-6489](https://issues.apache.org/jira/browse/ARROW-6489) - [Developer][Documentation] Fix merge script and readme -* [ARROW-6490](https://issues.apache.org/jira/browse/ARROW-6490) - [Java] log error for leak in allocator close -* [ARROW-6491](https://issues.apache.org/jira/browse/ARROW-6491) - [Java] fix master build failure caused by ErrorProne -* [ARROW-6494](https://issues.apache.org/jira/browse/ARROW-6494) - [C++][Dataset] Implement basic PartitionScheme -* [ARROW-6504](https://issues.apache.org/jira/browse/ARROW-6504) - [Python][Packaging] Add mimalloc to conda packages for better performance -* [ARROW-6505](https://issues.apache.org/jira/browse/ARROW-6505) - [Website] Add new committers -* [ARROW-6518](https://issues.apache.org/jira/browse/ARROW-6518) - [Packaging][Python] Flight failing in OSX Python wheel builds -* [ARROW-6519](https://issues.apache.org/jira/browse/ARROW-6519) - [Java] Use IPC continuation token to mark EOS -* [ARROW-6524](https://issues.apache.org/jira/browse/ARROW-6524) - [Developer][Packaging] Nightly build report's subject should contain Arrow -* [ARROW-6525](https://issues.apache.org/jira/browse/ARROW-6525) - [C++] CloseFromDestructor() should perhaps not crash -* [ARROW-6526](https://issues.apache.org/jira/browse/ARROW-6526) - [C++] Poison data in PoolBuffer destructor -* [ARROW-6527](https://issues.apache.org/jira/browse/ARROW-6527) - [C++] Add OutputStream::Write() variant taking an owned buffer -* [ARROW-6531](https://issues.apache.org/jira/browse/ARROW-6531) - [Python] Add detach() method to buffered streams -* [ARROW-6532](https://issues.apache.org/jira/browse/ARROW-6532) - [R] Write parquet files with compression -* [ARROW-6533](https://issues.apache.org/jira/browse/ARROW-6533) - [R] Compression codec should take a "level" -* [ARROW-6534](https://issues.apache.org/jira/browse/ARROW-6534) - [Java] Fix typos and spelling -* [ARROW-6539](https://issues.apache.org/jira/browse/ARROW-6539) - [R] Provide mechanism to write out old format -* [ARROW-6540](https://issues.apache.org/jira/browse/ARROW-6540) - [R] Add 
Validate() methods -* [ARROW-6541](https://issues.apache.org/jira/browse/ARROW-6541) - [Format][C++] Use two-part EOS and amend Format documentation -* [ARROW-6542](https://issues.apache.org/jira/browse/ARROW-6542) - [R] Add View() method to array types -* [ARROW-6544](https://issues.apache.org/jira/browse/ARROW-6544) - [R] Documentation/polishing for 0.15 release -* [ARROW-6545](https://issues.apache.org/jira/browse/ARROW-6545) - [Go] Update Go IPC writer to use two-part EOS per mailing list discussion -* [ARROW-6546](https://issues.apache.org/jira/browse/ARROW-6546) - [C++] Add missing FlatBuffers source dependency -* [ARROW-6549](https://issues.apache.org/jira/browse/ARROW-6549) - [C++] Switch back to latest jemalloc 5.x -* [ARROW-6556](https://issues.apache.org/jira/browse/ARROW-6556) - [Python] Prepare for pandas release without SparseDataFrame -* [ARROW-6556](https://issues.apache.org/jira/browse/ARROW-6556) - [Python] Prepare for pandas release without SparseDataFrame -* [ARROW-6557](https://issues.apache.org/jira/browse/ARROW-6557) - [Python] Always return pandas.Series from Array/ChunkedArray.to\_pandas, propagate field names to Series from RecordBatch, Table -* [ARROW-6558](https://issues.apache.org/jira/browse/ARROW-6558) - [C++] Refactor Iterator to a type erased handle -* [ARROW-6559](https://issues.apache.org/jira/browse/ARROW-6559) - [Developer][C++] Add "archery" option to specify system toolchain for C++ builds -* [ARROW-6563](https://issues.apache.org/jira/browse/ARROW-6563) - [Rust] [DataFusion] Create "merge" execution plan -* [ARROW-6569](https://issues.apache.org/jira/browse/ARROW-6569) - [Website] Add support for auto deployment by GitHub Actions -* [ARROW-6570](https://issues.apache.org/jira/browse/ARROW-6570) - [Python] Use MemoryPool to allocate memory for NumPy arrays in to\_pandas calls -* [ARROW-6580](https://issues.apache.org/jira/browse/ARROW-6580) - [Java] Support comparison for unsigned integers -* [ARROW-6584](https://issues.apache.org/jira/browse/ARROW-6584) - [Python][Wheel] Bundle zlib again with the windows wheels -* [ARROW-6588](https://issues.apache.org/jira/browse/ARROW-6588) - [C++] Suppress class-memaccess warning with g++ 9.2.1 -* [ARROW-6589](https://issues.apache.org/jira/browse/ARROW-6589) - [C++] Support BinaryType in MakeArrayOfNull -* [ARROW-6590](https://issues.apache.org/jira/browse/ARROW-6590) - [C++] Do not require ARROW\_JSON=ON when ARROW\_IPC=ON -* [ARROW-6591](https://issues.apache.org/jira/browse/ARROW-6591) - [R] Ignore .Rhistory files in source control -* [ARROW-6599](https://issues.apache.org/jira/browse/ARROW-6599) - [Rust] [DataFusion] Implement SUM aggregate expression -* [ARROW-6601](https://issues.apache.org/jira/browse/ARROW-6601) - [Java] Improve JDBC adapter performance & add benchmark -* [ARROW-6605](https://issues.apache.org/jira/browse/ARROW-6605) - [C++] Add recursion depth control to fs::Selector -* [ARROW-6606](https://issues.apache.org/jira/browse/ARROW-6606) - [C++] Construct tree structure from std::vector -* [ARROW-6609](https://issues.apache.org/jira/browse/ARROW-6609) - [C++] Add minimal build Dockerfile example -* [ARROW-6613](https://issues.apache.org/jira/browse/ARROW-6613) - [C++] Remove dependency on boost::filesystem -* [ARROW-6614](https://issues.apache.org/jira/browse/ARROW-6614) - [C++][Dataset] Implement FileSystemDataSourceDiscovery -* [ARROW-6616](https://issues.apache.org/jira/browse/ARROW-6616) - [Website] Release announcement blog post for 0.15 -* 
[ARROW-6621](https://issues.apache.org/jira/browse/ARROW-6621) - [Rust][DataFusion] Examples for DataFusion are not executed in CI
-* [ARROW-6629](https://issues.apache.org/jira/browse/ARROW-6629) - [Doc][C++] Document the FileSystem API
-* [ARROW-6630](https://issues.apache.org/jira/browse/ARROW-6630) - [Doc][C++] Document the file readers (CSV, JSON, Parquet, etc.)
-* [ARROW-6644](https://issues.apache.org/jira/browse/ARROW-6644) - [JS] Amend NullType IPC protocol to append no buffers
-* [ARROW-6647](https://issues.apache.org/jira/browse/ARROW-6647) - [C++] Can't build with g++ 4.8.5 on CentOS 7 by member initializer for shared\_ptr
-* [ARROW-6648](https://issues.apache.org/jira/browse/ARROW-6648) - [Go] Expose the bitutil package
-* [ARROW-6649](https://issues.apache.org/jira/browse/ARROW-6649) - [R] print() methods for Table, RecordBatch, etc.
-* [ARROW-6653](https://issues.apache.org/jira/browse/ARROW-6653) - [Developer] Add support for auto JIRA link on pull request
-* [ARROW-6655](https://issues.apache.org/jira/browse/ARROW-6655) - [Python] Filesystem bindings for S3
-* [ARROW-6664](https://issues.apache.org/jira/browse/ARROW-6664) - [C++] Add option to build without SSE4.2
-* [ARROW-6665](https://issues.apache.org/jira/browse/ARROW-6665) - [Rust] [DataFusion] Implement numeric literal expressions
-* [ARROW-6667](https://issues.apache.org/jira/browse/ARROW-6667) - [Python] Avoid Reference Cycles in pyarrow.parquet
-* [ARROW-6668](https://issues.apache.org/jira/browse/ARROW-6668) - [Rust] [DataFusion] Implement CAST expression
-* [ARROW-6669](https://issues.apache.org/jira/browse/ARROW-6669) - [Rust] [DataFusion] Implement physical expression for binary expressions
-* [ARROW-6675](https://issues.apache.org/jira/browse/ARROW-6675) - [JS] Add scanReverse function to dataFrame and filteredDataframe
-* [ARROW-6683](https://issues.apache.org/jira/browse/ARROW-6683) - [Python] Add unit tests that validate cross-compatibility with pyarrow.parquet when fastparquet is installed
-* [ARROW-6725](https://issues.apache.org/jira/browse/ARROW-6725) - [CI] Disable 3rdparty fuzzit nightly builds
-* [ARROW-6735](https://issues.apache.org/jira/browse/ARROW-6735) - [C++] Suppress sign-compare warning with g++ 9.2.1
-* [ARROW-6752](https://issues.apache.org/jira/browse/ARROW-6752) - [Go] implement Stringer for Null array
-* [ARROW-6755](https://issues.apache.org/jira/browse/ARROW-6755) - [Release] Improvements to Windows release verification script
-* [ARROW-6771](https://issues.apache.org/jira/browse/ARROW-6771) - [Packaging][Python] Missing pytest dependency from conda and wheel builds
-* [PARQUET-1468](https://issues.apache.org/jira/browse/PARQUET-1468) - [C++] Consolidate RecordReader, ColumnReader code paths
-
-
-## Bug Fixes
-
-* [ARROW-1184](https://issues.apache.org/jira/browse/ARROW-1184) - [Java] Dictionary.equals is not working correctly
-* [ARROW-2041](https://issues.apache.org/jira/browse/ARROW-2041) - [Python] pyarrow.serialize has high overhead for list of NumPy arrays
-* [ARROW-2248](https://issues.apache.org/jira/browse/ARROW-2248) - [Python] Nightly or on-demand HDFS test builds
-* [ARROW-2317](https://issues.apache.org/jira/browse/ARROW-2317) - [Python] fix C linkage warning
-* [ARROW-2490](https://issues.apache.org/jira/browse/ARROW-2490) - [C++] input stream locking inconsistent
-* [ARROW-3176](https://issues.apache.org/jira/browse/ARROW-3176) - [Python] Overflow in Date32 column conversion to pandas
-* [ARROW-3203](https://issues.apache.org/jira/browse/ARROW-3203) - [C++] Build
error on Debian Buster -* [ARROW-3651](https://issues.apache.org/jira/browse/ARROW-3651) - [Python] Datetimes from non-DateTimeIndex cannot be deserialized -* [ARROW-3652](https://issues.apache.org/jira/browse/ARROW-3652) - [Python] CategoricalIndex is lost after reading back -* [ARROW-3762](https://issues.apache.org/jira/browse/ARROW-3762) - [C++] Parquet arrow::Table reads error when overflowing capacity of BinaryArray -* [ARROW-3933](https://issues.apache.org/jira/browse/ARROW-3933) - [Python] Segfault reading Parquet files from GNOMAD -* [ARROW-4187](https://issues.apache.org/jira/browse/ARROW-4187) - [C++] file-benchmark uses -* [ARROW-4746](https://issues.apache.org/jira/browse/ARROW-4746) - [C++/Python] PyDataTime\_Date wrongly casted to PyDataTime\_DateTime -* [ARROW-4836](https://issues.apache.org/jira/browse/ARROW-4836) - [Python] "Cannot tell() a compressed stream" when using RecordBatchStreamWriter -* [ARROW-4848](https://issues.apache.org/jira/browse/ARROW-4848) - [C++] Static libparquet not compiled with -DARROW\_STATIC on Windows -* [ARROW-4880](https://issues.apache.org/jira/browse/ARROW-4880) - [Python] python/asv-build.sh is probably broken after CMake refactor -* [ARROW-4883](https://issues.apache.org/jira/browse/ARROW-4883) - [Python] read\_csv() returns garbage if given file object in text mode -* [ARROW-5028](https://issues.apache.org/jira/browse/ARROW-5028) - [Python][C++] Creating list with pyarrow.array can overflow child builder -* [ARROW-5072](https://issues.apache.org/jira/browse/ARROW-5072) - [Python] write\_table fails silently on S3 errors -* [ARROW-5085](https://issues.apache.org/jira/browse/ARROW-5085) - [Python/C++] Conversion of dict encoded null column fails in parquet writing when using RowGroups -* [ARROW-5086](https://issues.apache.org/jira/browse/ARROW-5086) - [Python] Space leak in ParquetFile.read\_row\_group() -* [ARROW-5089](https://issues.apache.org/jira/browse/ARROW-5089) - [C++/Python] Writing dictionary encoded columns to parquet is extremely slow when using chunk size -* [ARROW-5103](https://issues.apache.org/jira/browse/ARROW-5103) - [Python] Segfault when using chunked\_array.to\_pandas on array different types (edge case) -* [ARROW-5125](https://issues.apache.org/jira/browse/ARROW-5125) - [Python] Cannot roundtrip extreme dates through pyarrow -* [ARROW-5161](https://issues.apache.org/jira/browse/ARROW-5161) - [Python] Cannot convert struct type from Pandas object column -* [ARROW-5220](https://issues.apache.org/jira/browse/ARROW-5220) - [Python] index / unknown columns in specified schema in Table.from\_pandas -* [ARROW-5220](https://issues.apache.org/jira/browse/ARROW-5220) - [Python] index / unknown columns in specified schema in Table.from\_pandas -* [ARROW-5292](https://issues.apache.org/jira/browse/ARROW-5292) - [C++] Static libraries are built on AppVeyor -* [ARROW-5300](https://issues.apache.org/jira/browse/ARROW-5300) - [C++] 0.13 FAILED to build with option -DARROW\_NO\_DEFAULT\_MEMORY\_POOL -* [ARROW-5374](https://issues.apache.org/jira/browse/ARROW-5374) - [Python] Misleading error message when calling pyarrow.read\_record\_batch on a complete IPC stream -* [ARROW-5414](https://issues.apache.org/jira/browse/ARROW-5414) - [C++] Using "Ninja" build system generator overrides default Release build type on Windows -* [ARROW-5450](https://issues.apache.org/jira/browse/ARROW-5450) - [Python] TimestampArray.to\_pylist() fails with OverflowError: Python int too large to convert to C long -* 
[ARROW-5471](https://issues.apache.org/jira/browse/ARROW-5471) - [C++][Gandiva]Array offset is ignored in Gandiva projector -* [ARROW-5522](https://issues.apache.org/jira/browse/ARROW-5522) - [Packaging][Documentation] Comments out of date in python/manylinux1/build\_arrow.sh -* [ARROW-5525](https://issues.apache.org/jira/browse/ARROW-5525) - [C++][CI] Enable continuous fuzzing -* [ARROW-5560](https://issues.apache.org/jira/browse/ARROW-5560) - [C++][Plasma] Cannot create Plasma object after OutOfMemory error -* [ARROW-5562](https://issues.apache.org/jira/browse/ARROW-5562) - [C++][Parquet] parquet writer does not handle negative zero correctly -* [ARROW-5630](https://issues.apache.org/jira/browse/ARROW-5630) - [Python][Parquet] Table of nested arrays doesn't round trip -* [ARROW-5638](https://issues.apache.org/jira/browse/ARROW-5638) - [C++] cmake fails to generate Xcode project when Gandiva JNI bindings are enabled -* [ARROW-5651](https://issues.apache.org/jira/browse/ARROW-5651) - [Python] Incorrect conversion from strided Numpy array when other type is specified -* [ARROW-5682](https://issues.apache.org/jira/browse/ARROW-5682) - [Python] from\_pandas conversion casts values to string inconsistently -* [ARROW-5731](https://issues.apache.org/jira/browse/ARROW-5731) - [CI] Turbodbc integration tests are failing -* [ARROW-5753](https://issues.apache.org/jira/browse/ARROW-5753) - [Rust] Fix test failure in CI code coverage -* [ARROW-5772](https://issues.apache.org/jira/browse/ARROW-5772) - [GLib][Plasma][CUDA] Plasma::Client\#refer\_object test is failed -* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe -* [ARROW-5776](https://issues.apache.org/jira/browse/ARROW-5776) - [Gandiva][Crossbow] Revert template to have commit ids. 
-* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault -* [ARROW-5817](https://issues.apache.org/jira/browse/ARROW-5817) - [Python] Use pytest marks for Flight test to avoid silently skipping unit tests due to import failures -* [ARROW-5823](https://issues.apache.org/jira/browse/ARROW-5823) - [Rust] CI scripts miss --all-targets cargo argument -* [ARROW-5824](https://issues.apache.org/jira/browse/ARROW-5824) - [Gandiva] [C++] Fix decimal null -* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use -* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL -* [ARROW-5848](https://issues.apache.org/jira/browse/ARROW-5848) - [C++] SO versioning schema after release 1.0.0 -* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64 -* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release -* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails -* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] linking 3rd party cython modules against pyarrow fails since 0.14.0 -* [ARROW-5860](https://issues.apache.org/jira/browse/ARROW-5860) - [Java] [Vector] Fix decimal byte setter -* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner -* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4 -* [ARROW-5870](https://issues.apache.org/jira/browse/ARROW-5870) - [C++] Development compile instructions need to include "make" -* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None -* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt -* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone -* [ARROW-5884](https://issues.apache.org/jira/browse/ARROW-5884) - [Java] Fix the get method of StructVector -* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz -* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in wrong order -* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken -* [ARROW-5894](https://issues.apache.org/jira/browse/ARROW-5894) - [C++] libgandiva.so.14 is exporting libstdc++ symbols -* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels -* [ARROW-5910](https://issues.apache.org/jira/browse/ARROW-5910) - [Python] read\_tensor() fails on non-seekable streams -* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC -* [ARROW-5923](https://issues.apache.org/jira/browse/ARROW-5923) - [C++] Fix int96 comment -* [ARROW-5925](https://issues.apache.org/jira/browse/ARROW-5925) - [Gandiva][C++] cast decimal to int should round up -* 
[ARROW-5930](https://issues.apache.org/jira/browse/ARROW-5930) - [FlightRPC] [Python] Flight CI tests are failing -* [ARROW-5930](https://issues.apache.org/jira/browse/ARROW-5930) - [FlightRPC] [Python] Flight CI tests are failing -* [ARROW-5935](https://issues.apache.org/jira/browse/ARROW-5935) - [C++] ArrayBuilders with mutable type are not robustly supported -* [ARROW-5946](https://issues.apache.org/jira/browse/ARROW-5946) - [Rust] [DataFusion] Projection push down with aggregate producing incorrect results -* [ARROW-5952](https://issues.apache.org/jira/browse/ARROW-5952) - [Python] Segfault when reading empty table with category as pandas dataframe -* [ARROW-5959](https://issues.apache.org/jira/browse/ARROW-5959) - [C++][CI] Fuzzit does not know about branch + commit hash -* [ARROW-5960](https://issues.apache.org/jira/browse/ARROW-5960) - [C++] Boost dependencies are specified in wrong order -* [ARROW-5963](https://issues.apache.org/jira/browse/ARROW-5963) - [R] R Appveyor job does not test changes in the C++ library -* [ARROW-5964](https://issues.apache.org/jira/browse/ARROW-5964) - [C++][Gandiva] Cast double to decimal with rounding returns 0 -* [ARROW-5965](https://issues.apache.org/jira/browse/ARROW-5965) - [Python] Regression: segfault when reading hive table with v0.14 -* [ARROW-5966](https://issues.apache.org/jira/browse/ARROW-5966) - [Python] Capacity error when converting large UTF32 numpy array to arrow array -* [ARROW-5968](https://issues.apache.org/jira/browse/ARROW-5968) - [Java] Remove duplicate Preconditions check in JDBC adapter -* [ARROW-5969](https://issues.apache.org/jira/browse/ARROW-5969) - [CI] [R] Lint failures -* [ARROW-5973](https://issues.apache.org/jira/browse/ARROW-5973) - [Java] Variable width vectors' get methods should return null when the underlying data is null -* [ARROW-5978](https://issues.apache.org/jira/browse/ARROW-5978) - [FlightRPC] [Java] Integration test client doesn't close buffers -* [ARROW-5989](https://issues.apache.org/jira/browse/ARROW-5989) - [C++][Python] pyarrow.lib.ArrowIOError: Unable to load libjvm when using openjdk-8 -* [ARROW-5990](https://issues.apache.org/jira/browse/ARROW-5990) - [Python] RowGroupMetaData.column misses bounds check -* [ARROW-5992](https://issues.apache.org/jira/browse/ARROW-5992) - [C++] Array::View fails for string/utf8 as binary -* [ARROW-5993](https://issues.apache.org/jira/browse/ARROW-5993) - [Python] Reading a dictionary column from Parquet results in disproportionate memory usage -* [ARROW-5996](https://issues.apache.org/jira/browse/ARROW-5996) - [Java] Avoid resource leak in flight service -* [ARROW-5999](https://issues.apache.org/jira/browse/ARROW-5999) - [C++] Required header files missing when built with -DARROW\_DATASET=OFF -* [ARROW-6002](https://issues.apache.org/jira/browse/ARROW-6002) - [C++][Gandiva] TestCastFunctions does not test int64 casting\` -* [ARROW-6004](https://issues.apache.org/jira/browse/ARROW-6004) - [C++] CSV reader ignore\_empty\_lines option doesn't handle empty lines -* [ARROW-6005](https://issues.apache.org/jira/browse/ARROW-6005) - [C++] parquet::arrow::FileReader::GetRecordBatchReader() does not behave as documented since ARROW-1012 -* [ARROW-6006](https://issues.apache.org/jira/browse/ARROW-6006) - [C++] Empty IPC streams containing a dictionary are corrupt -* [ARROW-6012](https://issues.apache.org/jira/browse/ARROW-6012) - [C++] Fall back on known Apache mirror for Thrift downloads -* [ARROW-6015](https://issues.apache.org/jira/browse/ARROW-6015) - [Python] pyarrow 
wheel: \`DLL load failed\` when importing on windows -* [ARROW-6016](https://issues.apache.org/jira/browse/ARROW-6016) - [Python] pyarrow get\_library\_dirs assertion error -* [ARROW-6029](https://issues.apache.org/jira/browse/ARROW-6029) - [R] Improve R docs on how to fix library version mismatch -* [ARROW-6032](https://issues.apache.org/jira/browse/ARROW-6032) - [C++] CountSetBits doesn't ensure 64-bit aligned accesses -* [ARROW-6038](https://issues.apache.org/jira/browse/ARROW-6038) - [Python] pyarrow.Table.from\_batches produces corrupted table if any of the batches were empty -* [ARROW-6040](https://issues.apache.org/jira/browse/ARROW-6040) - [Java] Dictionary entries are required in IPC streams even when empty -* [ARROW-6046](https://issues.apache.org/jira/browse/ARROW-6046) - [C++] Slice RecordBatch of String array with offset 0 returns whole batch -* [ARROW-6047](https://issues.apache.org/jira/browse/ARROW-6047) - [Rust] Rust nightly 1.38.0 builds failing -* [ARROW-6050](https://issues.apache.org/jira/browse/ARROW-6050) - [Java] Update out-of-date java/flight/README.md -* [ARROW-6054](https://issues.apache.org/jira/browse/ARROW-6054) - pyarrow.serialize should respect the value of structured dtype of numpy -* [ARROW-6058](https://issues.apache.org/jira/browse/ARROW-6058) - [Python][Parquet] Failure when reading Parquet file from S3 with s3fs -* [ARROW-6059](https://issues.apache.org/jira/browse/ARROW-6059) - [Python] Regression memory issue when calling pandas.read\_parquet -* [ARROW-6060](https://issues.apache.org/jira/browse/ARROW-6060) - [Python] too large memory cost using pyarrow.parquet.read\_table with use\_threads=True -* [ARROW-6061](https://issues.apache.org/jira/browse/ARROW-6061) - [C++] Cannot build libarrow without rapidjson -* [ARROW-6066](https://issues.apache.org/jira/browse/ARROW-6066) - [Website] Fix blog post author header -* [ARROW-6067](https://issues.apache.org/jira/browse/ARROW-6067) - [Python] Large memory test failures -* [ARROW-6068](https://issues.apache.org/jira/browse/ARROW-6068) - [Python] Hypothesis test failure, Add StructType::Make that accepts vector of fields -* [ARROW-6073](https://issues.apache.org/jira/browse/ARROW-6073) - [C++] Decimal128Builder is not reset in Finish() -* [ARROW-6082](https://issues.apache.org/jira/browse/ARROW-6082) - [Python] create pa.dictionary() type with non-integer indices type crashes -* [ARROW-6092](https://issues.apache.org/jira/browse/ARROW-6092) - [C++] Python 2.7: arrow\_python\_test failure -* [ARROW-6095](https://issues.apache.org/jira/browse/ARROW-6095) - [C++] Python subproject ignores ARROW\_TEST\_LINKAGE -* [ARROW-6108](https://issues.apache.org/jira/browse/ARROW-6108) - [C++] Appveyor Build\_Debug configuration is hanging in C++ unit tests -* [ARROW-6116](https://issues.apache.org/jira/browse/ARROW-6116) - [C++][Gandiva] Fix bug in TimedTestFilterAdd2 -* [ARROW-6117](https://issues.apache.org/jira/browse/ARROW-6117) - [Java] Fix the set method of FixedSizeBinaryVector -* [ARROW-6119](https://issues.apache.org/jira/browse/ARROW-6119) - [Python] PyArrow wheel import fails on Windows Python 3.7 -* [ARROW-6120](https://issues.apache.org/jira/browse/ARROW-6120) - [C++][Gandiva] including some headers causes decimal\_test to fail -* [ARROW-6126](https://issues.apache.org/jira/browse/ARROW-6126) - [C++] IPC stream reader handling of empty streams potentially not robust -* [ARROW-6132](https://issues.apache.org/jira/browse/ARROW-6132) - [Python] ListArray.from\_arrays does not check validity of input arrays -* 
[ARROW-6135](https://issues.apache.org/jira/browse/ARROW-6135) - [C++] KeyValueMetadata::Equals should not be order-sensitive -* [ARROW-6136](https://issues.apache.org/jira/browse/ARROW-6136) - [FlightRPC][Java] Don't double-close response stream -* [ARROW-6145](https://issues.apache.org/jira/browse/ARROW-6145) - [Java] UnionVector created by MinorType\#getNewVector could not keep field type info properly -* [ARROW-6148](https://issues.apache.org/jira/browse/ARROW-6148) - [C++][Packaging] Improve aarch64 support -* [ARROW-6152](https://issues.apache.org/jira/browse/ARROW-6152) - [C++][Parquet] Write arrow::Array directly into parquet::TypedColumnWriter -* [ARROW-6153](https://issues.apache.org/jira/browse/ARROW-6153) - [R] Address parquet deprecation warning -* [ARROW-6158](https://issues.apache.org/jira/browse/ARROW-6158) - [Python] possible to create StructArray with type that conflicts with child array's types -* [ARROW-6159](https://issues.apache.org/jira/browse/ARROW-6159) - [C++] PrettyPrint of arrow::Schema missing identation for first line -* [ARROW-6160](https://issues.apache.org/jira/browse/ARROW-6160) - [Java] AbstractStructVector\#getPrimitiveVectors fails to work with complex child vectors -* [ARROW-6166](https://issues.apache.org/jira/browse/ARROW-6166) - [Go] Slice of slice causes index out of range panic -* [ARROW-6167](https://issues.apache.org/jira/browse/ARROW-6167) - [R] macOS binary R packages on CRAN don't have arrow\_available -* [ARROW-6168](https://issues.apache.org/jira/browse/ARROW-6168) - [C++] IWYU docker-compose job is broken -* [ARROW-6170](https://issues.apache.org/jira/browse/ARROW-6170) - [R] "docker-compose build r" is slow -* [ARROW-6171](https://issues.apache.org/jira/browse/ARROW-6171) - [R] "docker-compose run r" fails -* [ARROW-6174](https://issues.apache.org/jira/browse/ARROW-6174) - [C++] Validate chunks in ChunkedArray::Validate -* [ARROW-6175](https://issues.apache.org/jira/browse/ARROW-6175) - [Java] Fix MapVector\#getMinorType and extend AbstractContainerVector addOrGet complex vector API -* [ARROW-6178](https://issues.apache.org/jira/browse/ARROW-6178) - [Developer] Don't fail in merge script on bad primary author input in multi-author PRs -* [ARROW-6182](https://issues.apache.org/jira/browse/ARROW-6182) - [R] Add note to README about r-arrow conda installation -* [ARROW-6186](https://issues.apache.org/jira/browse/ARROW-6186) - [Packaging][C++] Plasma headers not included for ubuntu-xenial libplasma-dev debian package -* [ARROW-6190](https://issues.apache.org/jira/browse/ARROW-6190) - [C++] Define and declare functions regardless of NDEBUG -* [ARROW-6193](https://issues.apache.org/jira/browse/ARROW-6193) - [GLib] Add missing require in test -* [ARROW-6200](https://issues.apache.org/jira/browse/ARROW-6200) - [Java] Method getBufferSizeFor in BaseRepeatedValueVector/ListVector not correct -* [ARROW-6202](https://issues.apache.org/jira/browse/ARROW-6202) - [Java] Exception in thread "main" org.apache.arrow.memory.OutOfMemoryException: Unable to allocate buffer of size 4 due to memory limit. 
Current allocation: 2147483646 -* [ARROW-6205](https://issues.apache.org/jira/browse/ARROW-6205) - [C++] ARROW\_DEPRECATED warning when including io/interfaces.h from CUDA (.cu) source -* [ARROW-6208](https://issues.apache.org/jira/browse/ARROW-6208) - [Java] Correct byte order before comparing in ByteFunctionHelpers -* [ARROW-6210](https://issues.apache.org/jira/browse/ARROW-6210) - [Java] remove equals API from ValueVector -* [ARROW-6211](https://issues.apache.org/jira/browse/ARROW-6211) - [Java] Remove dependency on RangeEqualsVisitor from ValueVector interface -* [ARROW-6214](https://issues.apache.org/jira/browse/ARROW-6214) - [R] Sanitizer errors triggered via R bindings -* [ARROW-6215](https://issues.apache.org/jira/browse/ARROW-6215) - [Java] RangeEqualVisitor does not properly compare ZeroVector -* [ARROW-6218](https://issues.apache.org/jira/browse/ARROW-6218) - [Java] Add UINT type test in integration to avoid potential overflow -* [ARROW-6223](https://issues.apache.org/jira/browse/ARROW-6223) - [C++] Configuration error with Anaconda Python 3.7.4 -* [ARROW-6224](https://issues.apache.org/jira/browse/ARROW-6224) - [Python] remaining usages of the 'data' attribute (from previous Column) cause warnings -* [ARROW-6227](https://issues.apache.org/jira/browse/ARROW-6227) - [Python] pyarrow.array() shouldn't coerce np.nan to string -* [ARROW-6234](https://issues.apache.org/jira/browse/ARROW-6234) - [Java] ListVector hashCode() is not correct -* [ARROW-6241](https://issues.apache.org/jira/browse/ARROW-6241) - [Java] Failures on master -* [ARROW-6255](https://issues.apache.org/jira/browse/ARROW-6255) - [Rust] [Parquet] Cannot use any published parquet crate due to parquet-format breaking change -* [ARROW-6259](https://issues.apache.org/jira/browse/ARROW-6259) - [C++][CI] Flatbuffers-related failures in CI on macOS -* [ARROW-6263](https://issues.apache.org/jira/browse/ARROW-6263) - [Python] RecordBatch.from\_arrays does not check array types against a passed schema -* [ARROW-6266](https://issues.apache.org/jira/browse/ARROW-6266) - [Java] Resolve the ambiguous method overload in RangeEqualsVisitor -* [ARROW-6268](https://issues.apache.org/jira/browse/ARROW-6268) - Empty buffer should have a valid address -* [ARROW-6269](https://issues.apache.org/jira/browse/ARROW-6269) - [C++][Fuzzing] IPC reads do not check decimal precision -* [ARROW-6270](https://issues.apache.org/jira/browse/ARROW-6270) - [C++][Fuzzing] IPC reads do not check buffer indices -* [ARROW-6290](https://issues.apache.org/jira/browse/ARROW-6290) - [Rust] [DataFusion] sql\_csv example errors when running -* [ARROW-6291](https://issues.apache.org/jira/browse/ARROW-6291) - [C++] CMake ignores ARROW\_PARQUET -* [ARROW-6293](https://issues.apache.org/jira/browse/ARROW-6293) - [Rust] datafusion 0.15.0-SNAPSHOT error -* [ARROW-6301](https://issues.apache.org/jira/browse/ARROW-6301) - [Python] atexit: pyarrow.lib.ArrowKeyError: 'No type extension with name arrow.py\_extension\_type found' -* [ARROW-6302](https://issues.apache.org/jira/browse/ARROW-6302) - [Python][Parquet] Reading dictionary type with serialized Arrow schema does not restore "ordered" type property -* [ARROW-6309](https://issues.apache.org/jira/browse/ARROW-6309) - [C++] Parquet tests and executables are linked statically -* [ARROW-6323](https://issues.apache.org/jira/browse/ARROW-6323) - [R] Expand file paths when passing to readers -* [ARROW-6325](https://issues.apache.org/jira/browse/ARROW-6325) - [Python] wrong conversion of DataFrame with boolean values -* 
[ARROW-6330](https://issues.apache.org/jira/browse/ARROW-6330) - [C++] Include missing headers in api.h -* [ARROW-6332](https://issues.apache.org/jira/browse/ARROW-6332) - [Java][C++][Gandiva] Handle size of varchar vectors correctly -* [ARROW-6339](https://issues.apache.org/jira/browse/ARROW-6339) - [Python][C++] Rowgroup statistics for pd.NaT array ill defined -* [ARROW-6343](https://issues.apache.org/jira/browse/ARROW-6343) - [Java] [Vector] Fix allocation helper -* [ARROW-6344](https://issues.apache.org/jira/browse/ARROW-6344) - [C++][Gandiva] substring does not handle multibyte characters -* [ARROW-6345](https://issues.apache.org/jira/browse/ARROW-6345) - [C++][Python] "ordered" flag seemingly not taken into account when comparing DictionaryType values for equality -* [ARROW-6348](https://issues.apache.org/jira/browse/ARROW-6348) - [R] arrow::read\_csv\_arrow namespace error when package not loaded -* [ARROW-6354](https://issues.apache.org/jira/browse/ARROW-6354) - [C++] Building without Parquet fails -* [ARROW-6363](https://issues.apache.org/jira/browse/ARROW-6363) - [R] segfault in Table\_\_from\_dots with unexpected schema -* [ARROW-6364](https://issues.apache.org/jira/browse/ARROW-6364) - [R] Handling unexpected input to time64() et al -* [ARROW-6369](https://issues.apache.org/jira/browse/ARROW-6369) - [Python] Support list-of-boolean in Array.to\_pandas conversion -* [ARROW-6371](https://issues.apache.org/jira/browse/ARROW-6371) - [Doc] Row to columnar conversion example mentions arrow::Column in comments -* [ARROW-6372](https://issues.apache.org/jira/browse/ARROW-6372) - [Rust][Datafusion] Casting from Un-signed to Signed Integers not supported -* [ARROW-6376](https://issues.apache.org/jira/browse/ARROW-6376) - [Developer] PR merge script has "master" target ref hard-coded -* [ARROW-6387](https://issues.apache.org/jira/browse/ARROW-6387) - [Archery] Errors with make -* [ARROW-6392](https://issues.apache.org/jira/browse/ARROW-6392) - [Python][Flight] list\_actions Server RPC is not tested in test\_flight.py, nor is return value validated -* [ARROW-6395](https://issues.apache.org/jira/browse/ARROW-6395) - [Python] Bug when using bool arrays with stride greater than 1 -* [ARROW-6406](https://issues.apache.org/jira/browse/ARROW-6406) - [C++] jemalloc\_ep fails for offline build -* [ARROW-6411](https://issues.apache.org/jira/browse/ARROW-6411) - [C++][Parquet] DictEncoderImpl::PutIndicesTyped has bad performance on some systems -* [ARROW-6412](https://issues.apache.org/jira/browse/ARROW-6412) - [C++] arrow-flight-test can crash because of port allocation -* [ARROW-6418](https://issues.apache.org/jira/browse/ARROW-6418) - [C++] Plasma cmake targets are not exported -* [ARROW-6423](https://issues.apache.org/jira/browse/ARROW-6423) - [Python] pyarrow.CompressedOutputStream() never completes with compression='snappy' -* [ARROW-6424](https://issues.apache.org/jira/browse/ARROW-6424) - [C++][Fuzzing] Fuzzit nightly is broken -* [ARROW-6425](https://issues.apache.org/jira/browse/ARROW-6425) - [C++] ValidateArray fail for slice of list array -* [ARROW-6428](https://issues.apache.org/jira/browse/ARROW-6428) - [CI][Crossbow] Nightly turbodbc job fails -* [ARROW-6430](https://issues.apache.org/jira/browse/ARROW-6430) - [CI][Crossbow] Nightly R docker job fails -* [ARROW-6431](https://issues.apache.org/jira/browse/ARROW-6431) - [Python] Test suite fails without pandas installed -* [ARROW-6432](https://issues.apache.org/jira/browse/ARROW-6432) - [CI][Crossbow] Remove alpine crossbow jobs -* 
[ARROW-6433](https://issues.apache.org/jira/browse/ARROW-6433) - [CI][Crossbow] Nightly java docker job fails -* [ARROW-6434](https://issues.apache.org/jira/browse/ARROW-6434) - [CI][Crossbow] Nightly HDFS integration job fails -* [ARROW-6435](https://issues.apache.org/jira/browse/ARROW-6435) - [CI][Crossbow] Nightly dask integration job fails -* [ARROW-6440](https://issues.apache.org/jira/browse/ARROW-6440) - [CI][Crossbow] Nightly ubuntu, debian, and centos package builds fail -* [ARROW-6441](https://issues.apache.org/jira/browse/ARROW-6441) - [CI][Crossbow] Nightly Centos 6 job fails -* [ARROW-6442](https://issues.apache.org/jira/browse/ARROW-6442) - [CI][Crossbow] Nightly gandiva jar osx build fails -* [ARROW-6443](https://issues.apache.org/jira/browse/ARROW-6443) - [CI][Crossbow] Nightly conda osx builds fail -* [ARROW-6444](https://issues.apache.org/jira/browse/ARROW-6444) - [CI][Crossbow] Nightly conda Windows builds fail (time out) -* [ARROW-6446](https://issues.apache.org/jira/browse/ARROW-6446) - [OSX][Python][Wheel] Turn off ORC feature in the wheel building scripts -* [ARROW-6449](https://issues.apache.org/jira/browse/ARROW-6449) - [R] io "tell()" methods are inconsistently named and untested -* [ARROW-6457](https://issues.apache.org/jira/browse/ARROW-6457) - [C++] CMake build locally fails with MSVC 2015 build generator -* [ARROW-6461](https://issues.apache.org/jira/browse/ARROW-6461) - [Java] EchoServer can close socket before client has finished reading -* [ARROW-6472](https://issues.apache.org/jira/browse/ARROW-6472) - [Java] ValueVector\#accept may has potential cast exception -* [ARROW-6476](https://issues.apache.org/jira/browse/ARROW-6476) - [Java][CI] Travis java all-jdks job is broken -* [ARROW-6478](https://issues.apache.org/jira/browse/ARROW-6478) - [C++] Roll back to jemalloc stable-4 branch until performance issues in 5.2.x addressed -* [ARROW-6481](https://issues.apache.org/jira/browse/ARROW-6481) - [Python][C++] Bad performance of read\_csv() with column\_types -* [ARROW-6488](https://issues.apache.org/jira/browse/ARROW-6488) - [Python] pyarrow.NULL equals to itself -* [ARROW-6492](https://issues.apache.org/jira/browse/ARROW-6492) - [Python] file written with latest fastparquet cannot be read with latest pyarrow -* [ARROW-6502](https://issues.apache.org/jira/browse/ARROW-6502) - [GLib][CI] MinGW failure in CI -* [ARROW-6506](https://issues.apache.org/jira/browse/ARROW-6506) - [C++] Validation of ExtensionType with nested type fails -* [ARROW-6509](https://issues.apache.org/jira/browse/ARROW-6509) - [C++][Gandiva] Re-enable Gandiva JNI tests and fix Travis CI failure -* [ARROW-6509](https://issues.apache.org/jira/browse/ARROW-6509) - [C++][Gandiva] Re-enable Gandiva JNI tests and fix Travis CI failure -* [ARROW-6520](https://issues.apache.org/jira/browse/ARROW-6520) - [Python] Segmentation fault on writing tables with fixed size binary fields -* [ARROW-6522](https://issues.apache.org/jira/browse/ARROW-6522) - [Python] Test suite fails with pandas 0.23.4, pytest 3.8.1 -* [ARROW-6530](https://issues.apache.org/jira/browse/ARROW-6530) - [CI][Crossbow][R] Nightly R job doesn't install all dependencies -* [ARROW-6550](https://issues.apache.org/jira/browse/ARROW-6550) - [C++] Filter expressions PR failing manylinux package builds -* [ARROW-6551](https://issues.apache.org/jira/browse/ARROW-6551) - [Python] Dask Parquet integration test failure -* [ARROW-6552](https://issues.apache.org/jira/browse/ARROW-6552) - [C++] boost::optional in STL test fails compiling in gcc 
4.8.2 -* [ARROW-6560](https://issues.apache.org/jira/browse/ARROW-6560) - [Python] Failures in \*-nopandas integration tests -* [ARROW-6561](https://issues.apache.org/jira/browse/ARROW-6561) - [Python] pandas-master integration test failure -* [ARROW-6562](https://issues.apache.org/jira/browse/ARROW-6562) - [GLib] Fix wrong sliced data of GArrowBuffer -* [ARROW-6564](https://issues.apache.org/jira/browse/ARROW-6564) - [Python] Do not require pandas for invoking Array.\_\_array\_\_ -* [ARROW-6565](https://issues.apache.org/jira/browse/ARROW-6565) - [Rust] [DataFusion] Intermittent test failure due to temp dir already existing -* [ARROW-6568](https://issues.apache.org/jira/browse/ARROW-6568) - [C++][Python][Parquet] pyarrow.parquet crash writing zero-chunk dictionary-type column -* [ARROW-6572](https://issues.apache.org/jira/browse/ARROW-6572) - [C++] Reading some Parquet data can return uninitialized memory -* [ARROW-6573](https://issues.apache.org/jira/browse/ARROW-6573) - [Python] Segfault when writing to parquet -* [ARROW-6576](https://issues.apache.org/jira/browse/ARROW-6576) - [R] Fix sparklyr integration tests -* [ARROW-6586](https://issues.apache.org/jira/browse/ARROW-6586) - [Python][Packaging] Windows wheel builds failing with "DLL load failure" -* [ARROW-6597](https://issues.apache.org/jira/browse/ARROW-6597) - [Python] Segfault in test\_pandas with Python 2.7 -* [ARROW-6618](https://issues.apache.org/jira/browse/ARROW-6618) - [Python] Reading a zero-size buffer can segfault -* [ARROW-6620](https://issues.apache.org/jira/browse/ARROW-6620) - [Python][CI] pandas-master build failing due to removal of "to\_sparse" method -* [ARROW-6622](https://issues.apache.org/jira/browse/ARROW-6622) - [C++][R] SubTreeFileSystem path error on Windows -* [ARROW-6623](https://issues.apache.org/jira/browse/ARROW-6623) - [CI][Python] Dask docker integration test broken perhaps by statistics-related change -* [ARROW-6639](https://issues.apache.org/jira/browse/ARROW-6639) - [Packaging][RPM] Add support for CentOS 7 on aarch64 -* [ARROW-6640](https://issues.apache.org/jira/browse/ARROW-6640) - [C++] Error when BufferedInputStream Peek more than bytes buffered -* [ARROW-6641](https://issues.apache.org/jira/browse/ARROW-6641) - [C++] Remove Deprecated WriteableFile warning -* [ARROW-6642](https://issues.apache.org/jira/browse/ARROW-6642) - [Python] chained access of ParquetDataset's metadata segfaults -* [ARROW-6651](https://issues.apache.org/jira/browse/ARROW-6651) - [R] Fix R conda job -* [ARROW-6652](https://issues.apache.org/jira/browse/ARROW-6652) - [Python] to\_pandas conversion removes timezone from type -* [ARROW-6652](https://issues.apache.org/jira/browse/ARROW-6652) - [Python] to\_pandas conversion removes timezone from type -* [ARROW-6660](https://issues.apache.org/jira/browse/ARROW-6660) - [Rust] [DataFusion] Minor docs update for 0.15.0 release -* [ARROW-6670](https://issues.apache.org/jira/browse/ARROW-6670) - [CI][R] Fix fix for R nightly jobs -* [ARROW-6674](https://issues.apache.org/jira/browse/ARROW-6674) - [Python] Fix or ignore the test warnings -* [ARROW-6677](https://issues.apache.org/jira/browse/ARROW-6677) - [FlightRPC][C++] Document using Flight in C++ -* [ARROW-6678](https://issues.apache.org/jira/browse/ARROW-6678) - [C++] Regression in Parquet file compatibility introduced by ARROW-3246 -* [ARROW-6679](https://issues.apache.org/jira/browse/ARROW-6679) - [RELEASE] autobrew license in LICENSE.txt is not acceptable -* [ARROW-6682](https://issues.apache.org/jira/browse/ARROW-6682) - 
[C\#] Arrow R/C++ hangs reading binary file generated by C\#
-* [ARROW-6687](https://issues.apache.org/jira/browse/ARROW-6687) - [Rust] [DataFusion] Query returns incorrect row count
-* [ARROW-6687](https://issues.apache.org/jira/browse/ARROW-6687) - [Rust] [DataFusion] Query returns incorrect row count
-* [ARROW-6701](https://issues.apache.org/jira/browse/ARROW-6701) - [C++][R] Lint failing on R cpp code
-* [ARROW-6703](https://issues.apache.org/jira/browse/ARROW-6703) - [Packaging][Linux] Restore ARROW\_VERSION environment variable
-* [ARROW-6705](https://issues.apache.org/jira/browse/ARROW-6705) - [Rust] [DataFusion] README has invalid github URL
-* [ARROW-6709](https://issues.apache.org/jira/browse/ARROW-6709) - [JAVA] Jdbc adapter currentIndex should increment when value is null
-* [ARROW-6714](https://issues.apache.org/jira/browse/ARROW-6714) - [R] Fix untested RecordBatchWriter case
-* [ARROW-6716](https://issues.apache.org/jira/browse/ARROW-6716) - [CI] [Rust] New 1.40.0 nightly causing builds to fail
-* [ARROW-6748](https://issues.apache.org/jira/browse/ARROW-6748) - [RUBY] gem compilation error
-* [ARROW-6751](https://issues.apache.org/jira/browse/ARROW-6751) - [CI] ccache doesn't cache on Travis-CI
-* [ARROW-6760](https://issues.apache.org/jira/browse/ARROW-6760) - [C++] JSON: improve error message when column changed type
-* [ARROW-6773](https://issues.apache.org/jira/browse/ARROW-6773) - [C++] Filter kernel returns invalid data when filtering with an Array slice
-* [ARROW-6796](https://issues.apache.org/jira/browse/ARROW-6796) - Certain moderately-sized (\~100MB) default-Snappy-compressed Parquet files take enormous memory and long time to load by pyarrow.parquet.read\_table
-* [ARROW-7112](https://issues.apache.org/jira/browse/ARROW-7112) - Wrong contents when initializinga pyarrow.Table from boolean DataFrame
-* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
-* [PARQUET-1631](https://issues.apache.org/jira/browse/PARQUET-1631) - [C++] ParquetInputWrapper::GetSize always returns 0
-* [PARQUET-1640](https://issues.apache.org/jira/browse/PARQUET-1640) - [C++] parquet-encoding-benchmark crashes
-
-
-
-# Apache Arrow 0.14.1 (2019-07-22)
-
-## Bug Fixes
-
-* [ARROW-5775](https://issues.apache.org/jira/browse/ARROW-5775) - [C++] StructArray : cached boxed fields not thread-safe
-* [ARROW-5790](https://issues.apache.org/jira/browse/ARROW-5790) - [Python] Passing zero-dim numpy array to pa.array causes segfault
-* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs + eats all RAM
-* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate-sh
-* [ARROW-5836](https://issues.apache.org/jira/browse/ARROW-5836) - [Java][OSX] Flight tests are failing: address already in use
-* [ARROW-5838](https://issues.apache.org/jira/browse/ARROW-5838) - [C++][Flight][OSX] Building 3rdparty grpc cannot find OpenSSL
-* [ARROW-5849](https://issues.apache.org/jira/browse/ARROW-5849) - [C++] Compiler warnings on mingw-w64
-* [ARROW-5850](https://issues.apache.org/jira/browse/ARROW-5850) - [CI][R] R appveyor job is broken after release
-* [ARROW-5851](https://issues.apache.org/jira/browse/ARROW-5851) - [C++] Compilation of reference benchmarks fails
-* [ARROW-5856](https://issues.apache.org/jira/browse/ARROW-5856) - [Python] linking 3rd party cython modules against pyarrow fails since 0.14.0
-* [ARROW-5863](https://issues.apache.org/jira/browse/ARROW-5863) - [Python] Segmentation Fault via pytest-runner
-* [ARROW-5868](https://issues.apache.org/jira/browse/ARROW-5868) - [Python] manylinux2010 wheels have shared library dependency on liblz4
-* [ARROW-5873](https://issues.apache.org/jira/browse/ARROW-5873) - [Python] Segmentation fault when comparing schema with None
-* [ARROW-5874](https://issues.apache.org/jira/browse/ARROW-5874) - [Python] pyarrow 0.14.0 macOS wheels depend on shared libs under /usr/local/opt
-* [ARROW-5878](https://issues.apache.org/jira/browse/ARROW-5878) - [Python][C++] Parquet reader not forward compatible for timestamps without timezone
-* [ARROW-5886](https://issues.apache.org/jira/browse/ARROW-5886) - [Python][Packaging] Manylinux1/2010 compliance issue with libz
-* [ARROW-5887](https://issues.apache.org/jira/browse/ARROW-5887) - [C\#] ArrowStreamWriter writes FieldNodes in wrong order
-* [ARROW-5889](https://issues.apache.org/jira/browse/ARROW-5889) - [Python][C++] Parquet backwards compat for timestamps without timezone broken
-* [ARROW-5899](https://issues.apache.org/jira/browse/ARROW-5899) - [Python][Packaging] Bundle uriparser.dll in windows wheels
-* [ARROW-5921](https://issues.apache.org/jira/browse/ARROW-5921) - [C++][Fuzzing] Missing nullptr checks in IPC
-* [PARQUET-1623](https://issues.apache.org/jira/browse/PARQUET-1623) - [C++] Invalid memory access with a magic number of records
-
-
-## New Features and Improvements
-
-* [ARROW-5101](https://issues.apache.org/jira/browse/ARROW-5101) - [Packaging] Avoid bundling static libraries in Windows conda packages
-* [ARROW-5380](https://issues.apache.org/jira/browse/ARROW-5380) - [C++] Fix and enable UBSan for unaligned accesses.
-* [ARROW-5564](https://issues.apache.org/jira/browse/ARROW-5564) - [C++] Add uriparser to conda-forge
-* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings
-* [ARROW-5784](https://issues.apache.org/jira/browse/ARROW-5784) - [Release][GLib] Replace c\_glib/ after running c\_glib/autogen.sh in dev/release/02-source.sh
-* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust datafusion implementation should not depend on rustyline
-* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
-* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
-* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
-* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
-* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
-* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
-* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
-* [ARROW-5827](https://issues.apache.org/jira/browse/ARROW-5827) - [C++] Require c-ares CMake config
-* [ARROW-5828](https://issues.apache.org/jira/browse/ARROW-5828) - [C++] Add Protocol Buffers version check
-* [ARROW-5866](https://issues.apache.org/jira/browse/ARROW-5866) - [C++] Remove duplicate library in cpp/Brewfile
-* [ARROW-5877](https://issues.apache.org/jira/browse/ARROW-5877) - [FlightRPC] Fix auth incompatibilities between Python/Java
-* [ARROW-5904](https://issues.apache.org/jira/browse/ARROW-5904) - [Java] [Plasma] Fix compilation of Plasma Java client
-* [ARROW-5908](https://issues.apache.org/jira/browse/ARROW-5908) - [C\#] ArrowStreamWriter doesn't align buffers to 8 bytes
-* [ARROW-5934](https://issues.apache.org/jira/browse/ARROW-5934) - [Python] Bundle arrow's LICENSE with the wheels
-* [ARROW-5937](https://issues.apache.org/jira/browse/ARROW-5937) - [Release] Stop parallel binary upload
-* [ARROW-5938](https://issues.apache.org/jira/browse/ARROW-5938) - [Release] Create branch for adding release note automatically
-* [ARROW-5939](https://issues.apache.org/jira/browse/ARROW-5939) - [Release] Add support for generating vote email template separately
-* [ARROW-5940](https://issues.apache.org/jira/browse/ARROW-5940) - [Release] Add support for re-uploading sign/checksum for binary artifacts
-* [ARROW-5941](https://issues.apache.org/jira/browse/ARROW-5941) - [Release] Avoid re-uploading already uploaded binary artifacts
-* [ARROW-5958](https://issues.apache.org/jira/browse/ARROW-5958) - [Python] Link zlib statically in the wheels
-
-
-
-# Apache Arrow 0.14.0 (2019-07-04)
-
-## New Features and Improvements
-
-* [ARROW-258](https://issues.apache.org/jira/browse/ARROW-258) - [Format] clarify definition of Buffer in context of RPC, IPC, File
-* [ARROW-653](https://issues.apache.org/jira/browse/ARROW-653) - [Python / C++] Add debugging function to print an array's buffer contents in hexadecimal
-* [ARROW-767](https://issues.apache.org/jira/browse/ARROW-767) - [C++] Adopt FileSystem abstraction
-* [ARROW-835](https://issues.apache.org/jira/browse/ARROW-835) - [Format] Add Timedelta type to describe time intervals
-* [ARROW-840](https://issues.apache.org/jira/browse/ARROW-840) - [Python] Provide Python API for creating user-defined data types that can survive Arrow IPC
-* [ARROW-973](https://issues.apache.org/jira/browse/ARROW-973) - [Website] Add FAQ page about project
-* [ARROW-1012](https://issues.apache.org/jira/browse/ARROW-1012) - [C++] Create a configurable implementation of RecordBatchReader that reads from Apache Parquet files
-* [ARROW-1207](https://issues.apache.org/jira/browse/ARROW-1207) - [C++] Implement Map logical type
-* [ARROW-1261](https://issues.apache.org/jira/browse/ARROW-1261) - [Java] Add container type for Map logical type
-* [ARROW-1278](https://issues.apache.org/jira/browse/ARROW-1278) - Integration tests for Fixed Size List type
-* [ARROW-1279](https://issues.apache.org/jira/browse/ARROW-1279) - [Integration][Java] Integration tests for Map type
-* [ARROW-1280](https://issues.apache.org/jira/browse/ARROW-1280) - [C++] Implement Fixed Size List type
-* [ARROW-1349](https://issues.apache.org/jira/browse/ARROW-1349) - [Packaging] Provide APT and Yum repositories
-* [ARROW-1496](https://issues.apache.org/jira/browse/ARROW-1496) - [JS] Upload coverage data to codecov.io
-* [ARROW-1558](https://issues.apache.org/jira/browse/ARROW-1558) - [C++] Implement boolean selection kernels
-* [ARROW-1587](https://issues.apache.org/jira/browse/ARROW-1587) - [Format] Add metadata for user-defined logical types
-* [ARROW-1774](https://issues.apache.org/jira/browse/ARROW-1774) - [C++] Add "view" function to create zero-copy views for compatible types, if supported
-* [ARROW-1833](https://issues.apache.org/jira/browse/ARROW-1833) - [Java] Add accessor methods for data buffers that skip null checking
-* [ARROW-1957](https://issues.apache.org/jira/browse/ARROW-1957) - [Python]
Write nanosecond timestamps using new NANO LogicalType Parquet unit
-* [ARROW-1983](https://issues.apache.org/jira/browse/ARROW-1983) - [Python] Add ability to write parquet \`\_metadata\` file
-* [ARROW-2057](https://issues.apache.org/jira/browse/ARROW-2057) - [Python] Configure size of data pages in pyarrow.parquet.write\_table
-* [ARROW-2102](https://issues.apache.org/jira/browse/ARROW-2102) - [C++] Implement take kernel functions - primitive value type
-* [ARROW-2103](https://issues.apache.org/jira/browse/ARROW-2103) - [C++] Implement take kernel functions - string/binary value type
-* [ARROW-2104](https://issues.apache.org/jira/browse/ARROW-2104) - [C++] Implement take kernel functions - nested array value type
-* [ARROW-2105](https://issues.apache.org/jira/browse/ARROW-2105) - [C++] Implement take kernel functions - properly handle special indices
-* [ARROW-2186](https://issues.apache.org/jira/browse/ARROW-2186) - [C++] Clean up architecture specific compiler flags
-* [ARROW-2217](https://issues.apache.org/jira/browse/ARROW-2217) - [C++] Add option to use dynamic linking for compression library dependencies
-* [ARROW-2298](https://issues.apache.org/jira/browse/ARROW-2298) - [Python] Add option to not consider NaN to be null when converting to an integer Arrow type
-* [ARROW-2412](https://issues.apache.org/jira/browse/ARROW-2412) - [Integration] Add nested dictionary integration test
-* [ARROW-2467](https://issues.apache.org/jira/browse/ARROW-2467) - [Rust] Generate code using Flatbuffers
-* [ARROW-2517](https://issues.apache.org/jira/browse/ARROW-2517) - [Java] Add list writer
-* [ARROW-2618](https://issues.apache.org/jira/browse/ARROW-2618) - [Rust] Bitmap constructor should accept for flag for default state (0 or 1)
-* [ARROW-2667](https://issues.apache.org/jira/browse/ARROW-2667) - [C++/Python] Add pandas-like take method to Array
-* [ARROW-2707](https://issues.apache.org/jira/browse/ARROW-2707) - [C++] Implement Table::Slice methods using Column::Slice
-* [ARROW-2709](https://issues.apache.org/jira/browse/ARROW-2709) - [Python] write\_to\_dataset poor performance when splitting
-* [ARROW-2730](https://issues.apache.org/jira/browse/ARROW-2730) - [C++] Set up CMAKE\_C\_FLAGS more thoughtfully instead of using CMAKE\_CXX\_FLAGS
-* [ARROW-2796](https://issues.apache.org/jira/browse/ARROW-2796) - [C++] Simplify symbols.map file, use when building libarrow\_python
-* [ARROW-2818](https://issues.apache.org/jira/browse/ARROW-2818) - [Python] Better error message when passing SparseDataFrame into Table.from\_pandas
-* [ARROW-2835](https://issues.apache.org/jira/browse/ARROW-2835) - [C++] ReadAt/WriteAt are inconsistent with moving the files position
-* [ARROW-2969](https://issues.apache.org/jira/browse/ARROW-2969) - [R] Convert between StructArray and "nested" data.frame column containing data frame in each cell
-* [ARROW-2981](https://issues.apache.org/jira/browse/ARROW-2981) - [C++] Support scripts / documentation for running clang-tidy on codebase
-* [ARROW-2984](https://issues.apache.org/jira/browse/ARROW-2984) - [JS] Refactor release verification script to share code with main source release verification script
-* [ARROW-3040](https://issues.apache.org/jira/browse/ARROW-3040) - [Go] add support for comparing Arrays
-* [ARROW-3041](https://issues.apache.org/jira/browse/ARROW-3041) - [Go] add support for TimeArray
-* [ARROW-3052](https://issues.apache.org/jira/browse/ARROW-3052) - [C++] Detect ORC system packages
-* [ARROW-3087](https://issues.apache.org/jira/browse/ARROW-3087) - [C++] Add kernels for comparison operations to scalars
-* [ARROW-3144](https://issues.apache.org/jira/browse/ARROW-3144) - [C++] Move "dictionary" member from DictionaryType to ArrayData to allow for changing dictionaries between Array chunks
-* [ARROW-3150](https://issues.apache.org/jira/browse/ARROW-3150) - [Python] Ship Flight-enabled Python wheels on Linux and Windows
-* [ARROW-3166](https://issues.apache.org/jira/browse/ARROW-3166) - [C++] Consolidate IO interfaces used in arrow/io and parquet-cpp
-* [ARROW-3191](https://issues.apache.org/jira/browse/ARROW-3191) - [Java] Add support for ArrowBuf to point to arbitrary memory.
-* [ARROW-3200](https://issues.apache.org/jira/browse/ARROW-3200) - [C++] Add support for reading Flight streams with dictionaries
-* [ARROW-3290](https://issues.apache.org/jira/browse/ARROW-3290) - [C++] Toolchain support for secure gRPC
-* [ARROW-3294](https://issues.apache.org/jira/browse/ARROW-3294) - [C++] Test Flight RPC on Windows / Appveyor
-* [ARROW-3314](https://issues.apache.org/jira/browse/ARROW-3314) - [R] Set -rpath using pkg-config when building
-* [ARROW-3330](https://issues.apache.org/jira/browse/ARROW-3330) - [C++] Spawn multiple Flight performance servers in flight-benchmark to test parallel get performance
-* [ARROW-3419](https://issues.apache.org/jira/browse/ARROW-3419) - [C++] Run include-what-you-use checks as nightly build
-* [ARROW-3459](https://issues.apache.org/jira/browse/ARROW-3459) - [C++][Gandiva] Add support for variable length output vectors
-* [ARROW-3475](https://issues.apache.org/jira/browse/ARROW-3475) - [C++] Int64Builder.Finish(NumericArray)
-* [ARROW-3570](https://issues.apache.org/jira/browse/ARROW-3570) - [Packaging] Don't bundle test data files with python wheels
-* [ARROW-3572](https://issues.apache.org/jira/browse/ARROW-3572) - [Packaging] Correctly handle ssh origin urls for crossbow
-* [ARROW-3671](https://issues.apache.org/jira/browse/ARROW-3671) - [Go] implement Interval array
-* [ARROW-3676](https://issues.apache.org/jira/browse/ARROW-3676) - [Go] implement Decimal128 array
-* [ARROW-3679](https://issues.apache.org/jira/browse/ARROW-3679) - [Go] implement IPC protocol
-* [ARROW-3680](https://issues.apache.org/jira/browse/ARROW-3680) - [Go] implement Float16 array
-* [ARROW-3686](https://issues.apache.org/jira/browse/ARROW-3686) - [Python] Support for masked arrays in to/from numpy
-* [ARROW-3702](https://issues.apache.org/jira/browse/ARROW-3702) - [R] POSIXct mapped to DateType not TimestampType?
-* [ARROW-3714](https://issues.apache.org/jira/browse/ARROW-3714) - [CI] Run RAT checks in pre-commit hooks -* [ARROW-3729](https://issues.apache.org/jira/browse/ARROW-3729) - [C++] Support for writing TIMESTAMP\_NANOS Parquet metadata -* [ARROW-3732](https://issues.apache.org/jira/browse/ARROW-3732) - [R] Add functions to write RecordBatch or Schema to Message value, then read back -* [ARROW-3758](https://issues.apache.org/jira/browse/ARROW-3758) - [R] Build R library on Windows, document build instructions for Windows developers -* [ARROW-3759](https://issues.apache.org/jira/browse/ARROW-3759) - [R][CI] Build and test on Windows in Appveyor -* [ARROW-3767](https://issues.apache.org/jira/browse/ARROW-3767) - [C++] Add cast for Null to any type -* [ARROW-3780](https://issues.apache.org/jira/browse/ARROW-3780) - [R] Failed to fetch data: invalid data when collecting int16 -* [ARROW-3791](https://issues.apache.org/jira/browse/ARROW-3791) - [C++] Add type inference for boolean values in CSV files -* [ARROW-3794](https://issues.apache.org/jira/browse/ARROW-3794) - [R] Consider mapping INT8 to integer() not raw() -* [ARROW-3804](https://issues.apache.org/jira/browse/ARROW-3804) - [R] Consider lowering required R runtime -* [ARROW-3810](https://issues.apache.org/jira/browse/ARROW-3810) - [R] type= argument for Array and ChunkedArray -* [ARROW-3811](https://issues.apache.org/jira/browse/ARROW-3811) - [R] struct arrays inference -* [ARROW-3814](https://issues.apache.org/jira/browse/ARROW-3814) - [R] RecordBatch$from\_arrays() -* [ARROW-3815](https://issues.apache.org/jira/browse/ARROW-3815) - [R] refine record batch factory -* [ARROW-3848](https://issues.apache.org/jira/browse/ARROW-3848) - [R] allow nbytes to be missing in RandomAccessFile$Read() -* [ARROW-3897](https://issues.apache.org/jira/browse/ARROW-3897) - [MATLAB] Add MATLAB support for writing numeric datatypes to a Feather file -* [ARROW-3904](https://issues.apache.org/jira/browse/ARROW-3904) - [C++/Python] Validate scale and precision of decimal128 type -* [ARROW-4013](https://issues.apache.org/jira/browse/ARROW-4013) - [Documentation][C++] Document how to build Apache Arrow on MSYS2 -* [ARROW-4020](https://issues.apache.org/jira/browse/ARROW-4020) - [Release] Remove source artifacts from dev dist system after release vote passes -* [ARROW-4047](https://issues.apache.org/jira/browse/ARROW-4047) - [Python] Document use of int96 timestamps and options in Parquet docs -* [ARROW-4086](https://issues.apache.org/jira/browse/ARROW-4086) - [Java] Add apis to debug alloc failures -* [ARROW-4121](https://issues.apache.org/jira/browse/ARROW-4121) - [C++] Refactor memory allocation from InvertKernel -* [ARROW-4159](https://issues.apache.org/jira/browse/ARROW-4159) - [C++] Check for -Wdocumentation issues -* [ARROW-4194](https://issues.apache.org/jira/browse/ARROW-4194) - [Format] Metadata.rst does not specify timezone for Timestamp type -* [ARROW-4302](https://issues.apache.org/jira/browse/ARROW-4302) - [C++] Add OpenSSL to C++ build toolchain -* [ARROW-4337](https://issues.apache.org/jira/browse/ARROW-4337) - [C\#] Array / RecordBatch Builder Fluent API -* [ARROW-4343](https://issues.apache.org/jira/browse/ARROW-4343) - [C++] Add as complete as possible Ubuntu Trusty / 14.04 build to docker-compose setup -* [ARROW-4356](https://issues.apache.org/jira/browse/ARROW-4356) - [CI] Add integration (docker) test for turbodbc -* [ARROW-4369](https://issues.apache.org/jira/browse/ARROW-4369) - [Packaging] Release verification script should test linux 
packages via docker -* [ARROW-4452](https://issues.apache.org/jira/browse/ARROW-4452) - [Python] Serializing sparse torch tensors -* [ARROW-4453](https://issues.apache.org/jira/browse/ARROW-4453) - [Python] Create Cython wrappers for SparseTensor -* [ARROW-4467](https://issues.apache.org/jira/browse/ARROW-4467) - [Rust] [DataFusion] Create a REPL & Dockerfile for DataFusion -* [ARROW-4503](https://issues.apache.org/jira/browse/ARROW-4503) - [C\#] ArrowStreamReader allocates and copies data excessively -* [ARROW-4504](https://issues.apache.org/jira/browse/ARROW-4504) - [C++] Reduce the number of unit test executables -* [ARROW-4505](https://issues.apache.org/jira/browse/ARROW-4505) - [C++] Nicer PrettyPrint for date32 -* [ARROW-4566](https://issues.apache.org/jira/browse/ARROW-4566) - [C++][Flight] Add option to run arrow-flight-benchmark against a perf server running on a different host -* [ARROW-4596](https://issues.apache.org/jira/browse/ARROW-4596) - [Rust] [DataFusion] Implement COUNT aggregate function -* [ARROW-4622](https://issues.apache.org/jira/browse/ARROW-4622) - [C++] [Python] MakeDense and MakeSparse in UnionArray should accept a vector of Field -* [ARROW-4625](https://issues.apache.org/jira/browse/ARROW-4625) - [Flight] Wrap server busy-wait methods -* [ARROW-4626](https://issues.apache.org/jira/browse/ARROW-4626) - [Flight] Add application metadata field to DoGet -* [ARROW-4627](https://issues.apache.org/jira/browse/ARROW-4627) - [Flight] Add application metadata field to DoPut -* [ARROW-4701](https://issues.apache.org/jira/browse/ARROW-4701) - [C++] Add JSON chunker benchmarks -* [ARROW-4702](https://issues.apache.org/jira/browse/ARROW-4702) - [C++] Upgrade dependency versions -* [ARROW-4708](https://issues.apache.org/jira/browse/ARROW-4708) - [C++] Add multithreaded JSON reader -* [ARROW-4708](https://issues.apache.org/jira/browse/ARROW-4708) - [C++] Add multithreaded JSON reader -* [ARROW-4714](https://issues.apache.org/jira/browse/ARROW-4714) - [C++][Java] Providing JNI interface to Read ORC file via Arrow C++ -* [ARROW-4717](https://issues.apache.org/jira/browse/ARROW-4717) - [C\#] Consider exposing ValueTask instead of Task -* [ARROW-4719](https://issues.apache.org/jira/browse/ARROW-4719) - [C\#] Implement ChunkedArray, Column and Table in C\# -* [ARROW-4741](https://issues.apache.org/jira/browse/ARROW-4741) - [Java] Add documentation to all classes and enable checkstyle for class javadocs -* [ARROW-4787](https://issues.apache.org/jira/browse/ARROW-4787) - [C++] Include "null" values (perhaps with an option to toggle on/off) in hash kernel actions -* [ARROW-4788](https://issues.apache.org/jira/browse/ARROW-4788) - [C++] Develop less verbose API for constructing StructArray -* [ARROW-4800](https://issues.apache.org/jira/browse/ARROW-4800) - [C++] Create/port a StatusOr implementation to be able to return a status or a type -* [ARROW-4805](https://issues.apache.org/jira/browse/ARROW-4805) - [Rust] Write temporal arrays to CSV -* [ARROW-4806](https://issues.apache.org/jira/browse/ARROW-4806) - [Rust] Support casting temporal arrays in cast kernels -* [ARROW-4824](https://issues.apache.org/jira/browse/ARROW-4824) - [Python] read\_csv should accept io.StringIO objects -* [ARROW-4827](https://issues.apache.org/jira/browse/ARROW-4827) - [C++] Implement benchmark comparison between two git revisions -* [ARROW-4847](https://issues.apache.org/jira/browse/ARROW-4847) - [Python] Add pyarrow.table factory function that dispatches to various ctors based on type of input -* 
[ARROW-4904](https://issues.apache.org/jira/browse/ARROW-4904) - [C++] Move implementations in arrow/ipc/test-common.h into libarrow\_testing -* [ARROW-4911](https://issues.apache.org/jira/browse/ARROW-4911) - [R] Support for building package for Windows -* [ARROW-4912](https://issues.apache.org/jira/browse/ARROW-4912) - [C++, Python] Allow specifying column names to CSV reader -* [ARROW-4913](https://issues.apache.org/jira/browse/ARROW-4913) - [Java][Memory] Limit number of ledgers and arrowbufs -* [ARROW-4945](https://issues.apache.org/jira/browse/ARROW-4945) - [Flight] Enable Flight integration tests in Travis -* [ARROW-4956](https://issues.apache.org/jira/browse/ARROW-4956) - [C\#] Allow ArrowBuffers to wrap external Memory in C\# -* [ARROW-4959](https://issues.apache.org/jira/browse/ARROW-4959) - [Gandiva][Crossbow] Builds broken -* [ARROW-4968](https://issues.apache.org/jira/browse/ARROW-4968) - [Rust] StructArray builder and From<\> methods should check that field types match schema -* [ARROW-4971](https://issues.apache.org/jira/browse/ARROW-4971) - [Go] DataType equality -* [ARROW-4972](https://issues.apache.org/jira/browse/ARROW-4972) - [Go] Array equality -* [ARROW-4973](https://issues.apache.org/jira/browse/ARROW-4973) - [Go] Slice Array equality -* [ARROW-4974](https://issues.apache.org/jira/browse/ARROW-4974) - [Go] Array approx equality -* [ARROW-4990](https://issues.apache.org/jira/browse/ARROW-4990) - [C++] Kernel to compare array with array -* [ARROW-4993](https://issues.apache.org/jira/browse/ARROW-4993) - [C++] Display summary at the end of CMake configuration -* [ARROW-5000](https://issues.apache.org/jira/browse/ARROW-5000) - [Python] Fix deprecation warning from setup.py -* [ARROW-5007](https://issues.apache.org/jira/browse/ARROW-5007) - [C++] Move DCHECK out of sse-utils -* [ARROW-5020](https://issues.apache.org/jira/browse/ARROW-5020) - [C++][Gandiva] Split Gandiva-related conda packages for builds into separate .yml conda env file -* [ARROW-5027](https://issues.apache.org/jira/browse/ARROW-5027) - [Python] Add JSON Reader -* [ARROW-5037](https://issues.apache.org/jira/browse/ARROW-5037) - [Rust] [DataFusion] Refactor aggregate module -* [ARROW-5038](https://issues.apache.org/jira/browse/ARROW-5038) - [Rust] [DataFusion] Implement AVG aggregate function -* [ARROW-5039](https://issues.apache.org/jira/browse/ARROW-5039) - [Rust] [DataFusion] Fix bugs in CAST support -* [ARROW-5040](https://issues.apache.org/jira/browse/ARROW-5040) - [C++] ArrayFromJSON can't parse Timestamp from strings -* [ARROW-5045](https://issues.apache.org/jira/browse/ARROW-5045) - [Rust] Code coverage silently failing in CI -* [ARROW-5053](https://issues.apache.org/jira/browse/ARROW-5053) - [Rust] [DataFusion] Use env var for location of arrow test data -* [ARROW-5054](https://issues.apache.org/jira/browse/ARROW-5054) - [C++][Release] Test Flight in verify-release-candidate.sh -* [ARROW-5056](https://issues.apache.org/jira/browse/ARROW-5056) - [Packaging] Adjust conda recipes to use ORC conda-forge package on unix systems -* [ARROW-5061](https://issues.apache.org/jira/browse/ARROW-5061) - [Release] Improve 03-binary performance -* [ARROW-5062](https://issues.apache.org/jira/browse/ARROW-5062) - [Java] Shade Java Guava dependency for Flight -* [ARROW-5063](https://issues.apache.org/jira/browse/ARROW-5063) - [Java] FlightClient should not create a child allocator -* [ARROW-5064](https://issues.apache.org/jira/browse/ARROW-5064) - [Release] Pass PKG\_CONFIG\_PATH to glib in the verification script 
-* [ARROW-5066](https://issues.apache.org/jira/browse/ARROW-5066) - [Integration] Add flags to enable/disable implementations in integration/integration\_test.py -* [ARROW-5071](https://issues.apache.org/jira/browse/ARROW-5071) - [Benchmarking] Performs a benchmark run with archery -* [ARROW-5076](https://issues.apache.org/jira/browse/ARROW-5076) - [Packaging] Improve post binary upload performance -* [ARROW-5077](https://issues.apache.org/jira/browse/ARROW-5077) - [Rust] Release process should change Cargo.toml to use release versions -* [ARROW-5078](https://issues.apache.org/jira/browse/ARROW-5078) - [Documentation] Sphinx is failed by RemovedInSphinx30Warning -* [ARROW-5079](https://issues.apache.org/jira/browse/ARROW-5079) - [Release] Add a script to release C\# package -* [ARROW-5080](https://issues.apache.org/jira/browse/ARROW-5080) - [Release] Add a script to release Rust packages -* [ARROW-5081](https://issues.apache.org/jira/browse/ARROW-5081) - [C++] Consistently use PATH\_SUFFIXES in CMake config -* [ARROW-5083](https://issues.apache.org/jira/browse/ARROW-5083) - [Developer] In merge\_arrow\_pr.py script, allow user to set a released Fix Version -* [ARROW-5088](https://issues.apache.org/jira/browse/ARROW-5088) - [C++] Do not set -Werror when using BUILD\_WARNING\_LEVEL=CHECKIN in release mode -* [ARROW-5091](https://issues.apache.org/jira/browse/ARROW-5091) - [Flight] Rename FlightGetInfo message to FlightInfo -* [ARROW-5093](https://issues.apache.org/jira/browse/ARROW-5093) - [Packaging] Add support for selective binary upload -* [ARROW-5094](https://issues.apache.org/jira/browse/ARROW-5094) - [Packaging] Add APT/Yum verification scripts -* [ARROW-5102](https://issues.apache.org/jira/browse/ARROW-5102) - [C++] Reduce header dependencies -* [ARROW-5108](https://issues.apache.org/jira/browse/ARROW-5108) - [Go] implement reading primitive arrays from Arrow file -* [ARROW-5109](https://issues.apache.org/jira/browse/ARROW-5109) - [Go] implement reading binary/string arrays from Arrow file -* [ARROW-5110](https://issues.apache.org/jira/browse/ARROW-5110) - [Go] implement reading struct arrays from Arrow file -* [ARROW-5111](https://issues.apache.org/jira/browse/ARROW-5111) - [Go] implement reading list arrays from Arrow file -* [ARROW-5112](https://issues.apache.org/jira/browse/ARROW-5112) - [Go] implement writing arrays to Arrow file -* [ARROW-5113](https://issues.apache.org/jira/browse/ARROW-5113) - [C++][Flight] Unit tests in C++ for DoPut -* [ARROW-5115](https://issues.apache.org/jira/browse/ARROW-5115) - [JS] Implement the Vector Builders -* [ARROW-5116](https://issues.apache.org/jira/browse/ARROW-5116) - [Rust] move kernel related files under compute/kernels -* [ARROW-5124](https://issues.apache.org/jira/browse/ARROW-5124) - [C++] Add support for Parquet in MinGW build -* [ARROW-5126](https://issues.apache.org/jira/browse/ARROW-5126) - [Rust] [Parquet] Convert parquet column desc to arrow data type -* [ARROW-5127](https://issues.apache.org/jira/browse/ARROW-5127) - [Rust] [Parquet] Add page iterator -* [ARROW-5136](https://issues.apache.org/jira/browse/ARROW-5136) - [Flight] Implement call options (timeouts) -* [ARROW-5137](https://issues.apache.org/jira/browse/ARROW-5137) - [Flight] Implement authentication APIs -* [ARROW-5145](https://issues.apache.org/jira/browse/ARROW-5145) - [C++] Release mode lacks convenience input validation -* [ARROW-5150](https://issues.apache.org/jira/browse/ARROW-5150) - [Ruby] Add Arrow::Table\#raw\_records -* 
[ARROW-5155](https://issues.apache.org/jira/browse/ARROW-5155) - [GLib][Ruby] Add support for building union arrays from data type -* [ARROW-5157](https://issues.apache.org/jira/browse/ARROW-5157) - [Website] Add MATLAB to powered by Apache Arrow page -* [ARROW-5162](https://issues.apache.org/jira/browse/ARROW-5162) - [Rust] [Parquet] Rename mod reader to arrow. -* [ARROW-5163](https://issues.apache.org/jira/browse/ARROW-5163) - [Gandiva] Cast timestamp/date are incorrectly evaluating year 0097 to 1997 -* [ARROW-5164](https://issues.apache.org/jira/browse/ARROW-5164) - [Gandiva] [C++] Introduce 32bit hash functions -* [ARROW-5165](https://issues.apache.org/jira/browse/ARROW-5165) - [Python][Documentation] Build docs don't suggest assigning $ARROW\_BUILD\_TYPE -* [ARROW-5168](https://issues.apache.org/jira/browse/ARROW-5168) - [GLib] Add garrow\_array\_take() -* [ARROW-5171](https://issues.apache.org/jira/browse/ARROW-5171) - [C++] Use LESS instead of LOWER in compare enum option. -* [ARROW-5172](https://issues.apache.org/jira/browse/ARROW-5172) - [Go] implement reading fixed-size binary arrays from Arrow file -* [ARROW-5178](https://issues.apache.org/jira/browse/ARROW-5178) - [Python] Allow creating Table from Python dict -* [ARROW-5179](https://issues.apache.org/jira/browse/ARROW-5179) - [Python] Return plain dicts, not OrderedDict, on Python 3.7+ -* [ARROW-5185](https://issues.apache.org/jira/browse/ARROW-5185) - [C++] Add support for Boost with CMake configuration file -* [ARROW-5187](https://issues.apache.org/jira/browse/ARROW-5187) - [Rust] Ability to flatten StructArray into a RecordBatch -* [ARROW-5188](https://issues.apache.org/jira/browse/ARROW-5188) - [Rust] Add temporal builders for StructArray -* [ARROW-5189](https://issues.apache.org/jira/browse/ARROW-5189) - [Rust] [Parquet] Format individual fields within a parquet row -* [ARROW-5190](https://issues.apache.org/jira/browse/ARROW-5190) - [R] Discussion: tibble dependency in R package -* [ARROW-5191](https://issues.apache.org/jira/browse/ARROW-5191) - [Rust] Expose CSV and JSON reader schemas -* [ARROW-5203](https://issues.apache.org/jira/browse/ARROW-5203) - [GLib] Add support for Compare filter -* [ARROW-5204](https://issues.apache.org/jira/browse/ARROW-5204) - [C++] Improve BufferBuilder performance -* [ARROW-5212](https://issues.apache.org/jira/browse/ARROW-5212) - [Go] Array BinaryBuilder in Go library has no access to resize the values buffer -* [ARROW-5218](https://issues.apache.org/jira/browse/ARROW-5218) - [C++] Improve build when third-party library locations are specified -* [ARROW-5219](https://issues.apache.org/jira/browse/ARROW-5219) - [C++] Build protobuf\_ep in parallel when using Ninja -* [ARROW-5222](https://issues.apache.org/jira/browse/ARROW-5222) - [Python] Issues with installing pyarrow for development on MacOS -* [ARROW-5225](https://issues.apache.org/jira/browse/ARROW-5225) - [Java] Improve performance of BaseValueVector\#getValidityBufferSizeFromCount -* [ARROW-5226](https://issues.apache.org/jira/browse/ARROW-5226) - [Gandiva] support compare operators for decimal -* [ARROW-5238](https://issues.apache.org/jira/browse/ARROW-5238) - [Python] Improve usability of pyarrow.dictionary function -* [ARROW-5241](https://issues.apache.org/jira/browse/ARROW-5241) - [Python] Add option to disable writing statistics to parquet file -* [ARROW-5250](https://issues.apache.org/jira/browse/ARROW-5250) - [Java] remove javadoc suppression on methods. 
-* [ARROW-5252](https://issues.apache.org/jira/browse/ARROW-5252) - [C++] Change variant implementation -* [ARROW-5256](https://issues.apache.org/jira/browse/ARROW-5256) - [Packaging][deb] Failed to build with LLVM 7.1.0 -* [ARROW-5257](https://issues.apache.org/jira/browse/ARROW-5257) - [Website] Update site to use "official" Apache Arrow logo, add clearly marked links to logo -* [ARROW-5258](https://issues.apache.org/jira/browse/ARROW-5258) - [C++/Python] Expose file metadata of dataset pieces to caller -* [ARROW-5261](https://issues.apache.org/jira/browse/ARROW-5261) - [C++] Finish implementation of scalar types for Duration and Interval -* [ARROW-5262](https://issues.apache.org/jira/browse/ARROW-5262) - [Python] Fix typo -* [ARROW-5264](https://issues.apache.org/jira/browse/ARROW-5264) - [Java] Allow enabling/disabling boundary checking by environmental variable -* [ARROW-5266](https://issues.apache.org/jira/browse/ARROW-5266) - [Go] implement read/write IPC for Float16 -* [ARROW-5268](https://issues.apache.org/jira/browse/ARROW-5268) - [GLib] Add GArrowJSONReader -* [ARROW-5269](https://issues.apache.org/jira/browse/ARROW-5269) - [C++] Whitelist benchmarks candidates for regression checks -* [ARROW-5275](https://issues.apache.org/jira/browse/ARROW-5275) - [C++] Write generic filesystem tests -* [ARROW-5281](https://issues.apache.org/jira/browse/ARROW-5281) - [Rust] [Parquet] Move DataPageBuilder to test\_common -* [ARROW-5284](https://issues.apache.org/jira/browse/ARROW-5284) - [Rust] Replace libc with std::alloc for memory allocation -* [ARROW-5286](https://issues.apache.org/jira/browse/ARROW-5286) - [Python] support Structs in Table.from\_pandas given a known schema -* [ARROW-5288](https://issues.apache.org/jira/browse/ARROW-5288) - [Documentation] Enrich the contribution guidelines -* [ARROW-5289](https://issues.apache.org/jira/browse/ARROW-5289) - [C++] Move arrow/util/concatenate.h to arrow/array/ -* [ARROW-5290](https://issues.apache.org/jira/browse/ARROW-5290) - [Java] Provide a flag to enable/disable null-checking in vectors' get methods -* [ARROW-5291](https://issues.apache.org/jira/browse/ARROW-5291) - [Python] Add wrapper for "take" kernel on Array -* [ARROW-5298](https://issues.apache.org/jira/browse/ARROW-5298) - [Rust] Add debug implementation for Buffer -* [ARROW-5299](https://issues.apache.org/jira/browse/ARROW-5299) - [C++] ListArray comparison is incorrect -* [ARROW-5309](https://issues.apache.org/jira/browse/ARROW-5309) - [Python] Add clarifications to Python "append" methods that return new objects -* [ARROW-5311](https://issues.apache.org/jira/browse/ARROW-5311) - [C++] Return more specific invalid Status in Take kernel -* [ARROW-5313](https://issues.apache.org/jira/browse/ARROW-5313) - [Format] Comments on Field table are a bit confusing -* [ARROW-5317](https://issues.apache.org/jira/browse/ARROW-5317) - [Rust] [Parquet] impl IntoIterator for SerializedFileReader -* [ARROW-5319](https://issues.apache.org/jira/browse/ARROW-5319) - [CI] Enable ccache with MinGW builds -* [ARROW-5321](https://issues.apache.org/jira/browse/ARROW-5321) - [Gandiva][C++] add isnull and isnotnull for utf8 and binary types -* [ARROW-5323](https://issues.apache.org/jira/browse/ARROW-5323) - [CI] Use compression with clcache -* [ARROW-5328](https://issues.apache.org/jira/browse/ARROW-5328) - [R] Add shell scripts to do a full package rebuild and test locally -* [ARROW-5329](https://issues.apache.org/jira/browse/ARROW-5329) - Add support for building MATLAB interface to Feather directly 
within MATLAB -* [ARROW-5334](https://issues.apache.org/jira/browse/ARROW-5334) - [C++] Add "Type" to names of arrow::Integer, arrow::FloatingPoint classes for consistency -* [ARROW-5335](https://issues.apache.org/jira/browse/ARROW-5335) - [Python] Raise on variable dictionaries when converting to pandas -* [ARROW-5339](https://issues.apache.org/jira/browse/ARROW-5339) - [C++] Add jemalloc to thirdparty dependency download script -* [ARROW-5341](https://issues.apache.org/jira/browse/ARROW-5341) - [C++] Add instructions about fixing and testing for -Wdocumentation clang warnings locally -* [ARROW-5342](https://issues.apache.org/jira/browse/ARROW-5342) - [Format] Formalize extension type metadata in IPC protocol -* [ARROW-5346](https://issues.apache.org/jira/browse/ARROW-5346) - [C++] Revert changes to qualify duration in vendored date code -* [ARROW-5349](https://issues.apache.org/jira/browse/ARROW-5349) - [Python/C++] Provide a way to specify the file path in parquet ColumnChunkMetaData -* [ARROW-5361](https://issues.apache.org/jira/browse/ARROW-5361) - [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144 -* [ARROW-5363](https://issues.apache.org/jira/browse/ARROW-5363) - [GLib] Fix coding styles -* [ARROW-5364](https://issues.apache.org/jira/browse/ARROW-5364) - [C++] Use ASCII rather than UTF-8 in BuildUtils.cmake comment -* [ARROW-5365](https://issues.apache.org/jira/browse/ARROW-5365) - [C++][CI] Add UBSan and ASAN into CI -* [ARROW-5368](https://issues.apache.org/jira/browse/ARROW-5368) - [C++] Disable jemalloc by default with MinGW -* [ARROW-5369](https://issues.apache.org/jira/browse/ARROW-5369) - [C++] Add support for glog on Windows -* [ARROW-5370](https://issues.apache.org/jira/browse/ARROW-5370) - [C++] Detect system uriparser by default -* [ARROW-5372](https://issues.apache.org/jira/browse/ARROW-5372) - [GLib] Add support for null/boolean values CSV read option -* [ARROW-5378](https://issues.apache.org/jira/browse/ARROW-5378) - [C++] Add local FileSystem implementation -* [ARROW-5384](https://issues.apache.org/jira/browse/ARROW-5384) - [Go] add FixedSizeList array -* [ARROW-5389](https://issues.apache.org/jira/browse/ARROW-5389) - [C++] Add an internal temporary directory API -* [ARROW-5392](https://issues.apache.org/jira/browse/ARROW-5392) - [C++][CI][MinGW] Disable static library build on AppVeyor -* [ARROW-5393](https://issues.apache.org/jira/browse/ARROW-5393) - [R] Add tests and example for read\_parquet() -* [ARROW-5395](https://issues.apache.org/jira/browse/ARROW-5395) - [C++] Utilize stream EOS in File format -* [ARROW-5396](https://issues.apache.org/jira/browse/ARROW-5396) - [JS] Ensure reader and writer support files and streams with no RecordBatches -* [ARROW-5401](https://issues.apache.org/jira/browse/ARROW-5401) - [CI] [C++] Print ccache statistics on Travis-CI -* [ARROW-5404](https://issues.apache.org/jira/browse/ARROW-5404) - [C++] nonstd::string\_view conflicts with std::string\_view in c++17 -* [ARROW-5407](https://issues.apache.org/jira/browse/ARROW-5407) - [C++] Integration test Travis CI entry builds many unnecessary targets -* [ARROW-5413](https://issues.apache.org/jira/browse/ARROW-5413) - [C++] CSV reader doesn't remove BOM -* [ARROW-5415](https://issues.apache.org/jira/browse/ARROW-5415) - [Release] Release script should update R version everywhere -* [ARROW-5416](https://issues.apache.org/jira/browse/ARROW-5416) - [Website] Add Homebrew to project installation page -* [ARROW-5418](https://issues.apache.org/jira/browse/ARROW-5418) - [CI][R] 
Run code coverage and report to codecov.io -* [ARROW-5420](https://issues.apache.org/jira/browse/ARROW-5420) - [Java] Implement or remove getCurrentSizeInBytes in VariableWidthVector -* [ARROW-5427](https://issues.apache.org/jira/browse/ARROW-5427) - [Python] RangeIndex serialization change implications -* [ARROW-5428](https://issues.apache.org/jira/browse/ARROW-5428) - [C++] Add option to set "read extent" in arrow::io::BufferedInputStream -* [ARROW-5429](https://issues.apache.org/jira/browse/ARROW-5429) - [Java] Provide alternative buffer allocation policy -* [ARROW-5432](https://issues.apache.org/jira/browse/ARROW-5432) - [Python] Add 'read\_at' method to pyarrow.NativeFile -* [ARROW-5433](https://issues.apache.org/jira/browse/ARROW-5433) - [C++][Parquet] improve parquet-reader columns information -* [ARROW-5434](https://issues.apache.org/jira/browse/ARROW-5434) - [Java] Introduce wrappers for backward compatibility for ArrowBuf changes in ARROW-3191 -* [ARROW-5436](https://issues.apache.org/jira/browse/ARROW-5436) - [Python] expose filters argument in parquet.read\_table -* [ARROW-5438](https://issues.apache.org/jira/browse/ARROW-5438) - [JS] Utilize stream EOS in File format -* [ARROW-5441](https://issues.apache.org/jira/browse/ARROW-5441) - [C++] Implement FindArrowFlight.cmake -* [ARROW-5442](https://issues.apache.org/jira/browse/ARROW-5442) - [Website] Clarify what makes a release artifact "official" -* [ARROW-5443](https://issues.apache.org/jira/browse/ARROW-5443) - [Gandiva][Crossbow] Turn parquet encryption off -* [ARROW-5447](https://issues.apache.org/jira/browse/ARROW-5447) - [CI] [Ruby] CI is failed on AppVeyor -* [ARROW-5449](https://issues.apache.org/jira/browse/ARROW-5449) - [C++] Local filesystem implementation: investigate Windows UNC paths -* [ARROW-5451](https://issues.apache.org/jira/browse/ARROW-5451) - [C++][Gandiva] Add round functions for decimals -* [ARROW-5452](https://issues.apache.org/jira/browse/ARROW-5452) - [R] Add documentation website (pkgdown) -* [ARROW-5461](https://issues.apache.org/jira/browse/ARROW-5461) - [Java] Add micro-benchmarks for Float8Vector and allocators -* [ARROW-5463](https://issues.apache.org/jira/browse/ARROW-5463) - [Rust] Implement AsRef for Buffer -* [ARROW-5464](https://issues.apache.org/jira/browse/ARROW-5464) - [Archery] Bad --benchmark-filter default -* [ARROW-5465](https://issues.apache.org/jira/browse/ARROW-5465) - [Crossbow] Support writing submitted job definition yaml to a file -* [ARROW-5466](https://issues.apache.org/jira/browse/ARROW-5466) - [Java] Dockerize Java builds in Travis CI, run multiple JDKs in single entry -* [ARROW-5467](https://issues.apache.org/jira/browse/ARROW-5467) - [Go] implement read/write IPC for Time32/Time64 arrays -* [ARROW-5468](https://issues.apache.org/jira/browse/ARROW-5468) - [Go] implement read/write IPC for Timestamp arrays -* [ARROW-5469](https://issues.apache.org/jira/browse/ARROW-5469) - [Go] implement read/write IPC for Date32/Date64 arrays -* [ARROW-5470](https://issues.apache.org/jira/browse/ARROW-5470) - [CI] C++ local filesystem patch breaks Travis R job -* [ARROW-5472](https://issues.apache.org/jira/browse/ARROW-5472) - [Development] Add warning to PR merge tool if no JIRA component is set -* [ARROW-5474](https://issues.apache.org/jira/browse/ARROW-5474) - [C++] Document required Boost version -* [ARROW-5475](https://issues.apache.org/jira/browse/ARROW-5475) - [Python] Add Python binding for arrow::Concatenate -* [ARROW-5476](https://issues.apache.org/jira/browse/ARROW-5476) - 
[Java][Memory] Fix Netty ArrowBuf Slice -* [ARROW-5477](https://issues.apache.org/jira/browse/ARROW-5477) - [C++] Check required RapidJSON version -* [ARROW-5478](https://issues.apache.org/jira/browse/ARROW-5478) - [Packaging] Drop Ubuntu 14.04 support -* [ARROW-5481](https://issues.apache.org/jira/browse/ARROW-5481) - [GLib] garrow\_seekable\_input\_stream\_peek() misses "error" parameter document -* [ARROW-5485](https://issues.apache.org/jira/browse/ARROW-5485) - [Gandiva][Crossbow] OSx builds failing -* [ARROW-5485](https://issues.apache.org/jira/browse/ARROW-5485) - [Gandiva][Crossbow] OSx builds failing -* [ARROW-5486](https://issues.apache.org/jira/browse/ARROW-5486) - [GLib] Add binding of gandiva::FunctionRegistry and related things -* [ARROW-5488](https://issues.apache.org/jira/browse/ARROW-5488) - [R] Workaround when C++ lib not available -* [ARROW-5490](https://issues.apache.org/jira/browse/ARROW-5490) - [C++] Remove ARROW\_BOOST\_HEADER\_ONLY -* [ARROW-5491](https://issues.apache.org/jira/browse/ARROW-5491) - [C++] Remove unecessary semicolons following MACRO definitions -* [ARROW-5492](https://issues.apache.org/jira/browse/ARROW-5492) - [R] Add "col\_select" argument to read\_\* functions to read subset of columns -* [ARROW-5495](https://issues.apache.org/jira/browse/ARROW-5495) - [C++] Use HTTPS consistently for downloading dependencies -* [ARROW-5496](https://issues.apache.org/jira/browse/ARROW-5496) - [R][CI] Fix relative paths in R codecov.io reporting -* [ARROW-5498](https://issues.apache.org/jira/browse/ARROW-5498) - [C++] Build failure with Flatbuffers 1.11.0 and MinGW -* [ARROW-5499](https://issues.apache.org/jira/browse/ARROW-5499) - [R] Alternate bindings for when libarrow is not found -* [ARROW-5500](https://issues.apache.org/jira/browse/ARROW-5500) - [R] read\_csv\_arrow() signature should match readr::read\_csv() -* [ARROW-5503](https://issues.apache.org/jira/browse/ARROW-5503) - [R] add read\_json() -* [ARROW-5504](https://issues.apache.org/jira/browse/ARROW-5504) - [R] move use\_threads argument to global option -* [ARROW-5509](https://issues.apache.org/jira/browse/ARROW-5509) - [R] write\_parquet() -* [ARROW-5511](https://issues.apache.org/jira/browse/ARROW-5511) - [Packaging] Enable Flight in Conda packages -* [ARROW-5512](https://issues.apache.org/jira/browse/ARROW-5512) - [C++] Draft initial public APIs for Datasets project -* [ARROW-5513](https://issues.apache.org/jira/browse/ARROW-5513) - [Java] Refactor method name for getstartOffset to use camel case -* [ARROW-5516](https://issues.apache.org/jira/browse/ARROW-5516) - [Python] Development page for pyarrow has a missing dependency in using pip -* [ARROW-5518](https://issues.apache.org/jira/browse/ARROW-5518) - [Java] Set VectorSchemaRoot rowCount to 0 on allocateNew and clear -* [ARROW-5524](https://issues.apache.org/jira/browse/ARROW-5524) - [C++] Turn off PARQUET\_BUILD\_ENCRYPTION in CMake if OpenSSL not found -* [ARROW-5526](https://issues.apache.org/jira/browse/ARROW-5526) - [Developer] Add more prominent notice to GitHub issue template to direct bug reports to JIRA -* [ARROW-5529](https://issues.apache.org/jira/browse/ARROW-5529) - [Flight] Allow serving with multiple TLS certificates -* [ARROW-5531](https://issues.apache.org/jira/browse/ARROW-5531) - [Python] Support binary, utf8, and nested types in Array.from\_buffers -* [ARROW-5533](https://issues.apache.org/jira/browse/ARROW-5533) - [Plasma] Plasma client should be thread-safe -* [ARROW-5534](https://issues.apache.org/jira/browse/ARROW-5534) - 
[GLib] Add garrow\_table\_concatenate() -* [ARROW-5535](https://issues.apache.org/jira/browse/ARROW-5535) - [GLib] Add garrow\_table\_slice() -* [ARROW-5537](https://issues.apache.org/jira/browse/ARROW-5537) - [JS] Support delta dictionaries in RecordBatchWriter and DictionaryBuilder -* [ARROW-5538](https://issues.apache.org/jira/browse/ARROW-5538) - [C++] Restrict minimum OpenSSL version to 1.0.2 -* [ARROW-5541](https://issues.apache.org/jira/browse/ARROW-5541) - [R] cast from negative int32 to uint32 and uint64 are now safe -* [ARROW-5544](https://issues.apache.org/jira/browse/ARROW-5544) - [Archery] should not return non-zero in \`benchmark diff\` sub command on regression -* [ARROW-5545](https://issues.apache.org/jira/browse/ARROW-5545) - [C++][Docs] Clarify expectation of UTC values for timestamps with time zones in C++ API docs -* [ARROW-5547](https://issues.apache.org/jira/browse/ARROW-5547) - [C++][FlightRPC] arrow-flight.pc isn't provided -* [ARROW-5552](https://issues.apache.org/jira/browse/ARROW-5552) - [Go] make Schema and Field implement Stringer -* [ARROW-5554](https://issues.apache.org/jira/browse/ARROW-5554) - Add a python wrapper for arrow::Concatenate -* [ARROW-5555](https://issues.apache.org/jira/browse/ARROW-5555) - [R] Add install\_arrow() function to assist the user in obtaining C++ runtime libraries -* [ARROW-5556](https://issues.apache.org/jira/browse/ARROW-5556) - [Doc] Document JSON reader -* [ARROW-5557](https://issues.apache.org/jira/browse/ARROW-5557) - [C++] Investigate performance of VisitBitsUnrolled on different platforms -* [ARROW-5565](https://issues.apache.org/jira/browse/ARROW-5565) - [Python] Document how to use gdb when working on pyarrow -* [ARROW-5567](https://issues.apache.org/jira/browse/ARROW-5567) - [C++] Fix build error of memory-benchmark -* [ARROW-5571](https://issues.apache.org/jira/browse/ARROW-5571) - [R] Rework handing of ARROW\_R\_WITH\_PARQUET -* [ARROW-5574](https://issues.apache.org/jira/browse/ARROW-5574) - [R] documentation error for read\_arrow() -* [ARROW-5581](https://issues.apache.org/jira/browse/ARROW-5581) - [Java] Provide interfaces and initial implementations for vector sorting -* [ARROW-5582](https://issues.apache.org/jira/browse/ARROW-5582) - [Go] add support for comparing Records -* [ARROW-5586](https://issues.apache.org/jira/browse/ARROW-5586) - [R] convert Array of LIST type to R lists -* [ARROW-5587](https://issues.apache.org/jira/browse/ARROW-5587) - [Java] Add more maven style check for Java code -* [ARROW-5590](https://issues.apache.org/jira/browse/ARROW-5590) - [R] Run "no libarrow" R build in the same CI entry if possible -* [ARROW-5591](https://issues.apache.org/jira/browse/ARROW-5591) - [Go] implement read/write IPC for Duration & Intervals -* [ARROW-5597](https://issues.apache.org/jira/browse/ARROW-5597) - [Packaging][deb] Add Flight packages -* [ARROW-5600](https://issues.apache.org/jira/browse/ARROW-5600) - [R] R package namespace cleanup -* [ARROW-5602](https://issues.apache.org/jira/browse/ARROW-5602) - [Java][Gandiva] Add test for decimal round functions -* [ARROW-5604](https://issues.apache.org/jira/browse/ARROW-5604) - [Go] improve test coverage of type-traits -* [ARROW-5609](https://issues.apache.org/jira/browse/ARROW-5609) - [C++] Set CMP0068 CMake policy to avoid macOS warnings -* [ARROW-5612](https://issues.apache.org/jira/browse/ARROW-5612) - [Python][Documentation] Clarify date\_as\_object option behavior -* [ARROW-5621](https://issues.apache.org/jira/browse/ARROW-5621) - [Go] implement read/write 
IPC for Decimal128 arrays -* [ARROW-5622](https://issues.apache.org/jira/browse/ARROW-5622) - [C++][Dataset] arrow-dataset.pc isn't provided -* [ARROW-5625](https://issues.apache.org/jira/browse/ARROW-5625) - [R] convert Array of struct type to data frame columns -* [ARROW-5632](https://issues.apache.org/jira/browse/ARROW-5632) - [Doc] Add some documentation describing compile/debug workflow on macOS with Xcode IDE -* [ARROW-5633](https://issues.apache.org/jira/browse/ARROW-5633) - [Python] Enable bz2 in Linux wheels -* [ARROW-5635](https://issues.apache.org/jira/browse/ARROW-5635) - [C++] Support "compacting" a table -* [ARROW-5637](https://issues.apache.org/jira/browse/ARROW-5637) - [Gandiva] [Java]Complete IN Expression -* [ARROW-5639](https://issues.apache.org/jira/browse/ARROW-5639) - [Java] Remove floating point computation from getOffsetBufferValueCapacity -* [ARROW-5641](https://issues.apache.org/jira/browse/ARROW-5641) - [GLib] Remove enums files generated by GNU Autotools from Git targets -* [ARROW-5643](https://issues.apache.org/jira/browse/ARROW-5643) - [Flight] Add ability to override hostname checking -* [ARROW-5650](https://issues.apache.org/jira/browse/ARROW-5650) - [Python] Update manylinux dependency versions -* [ARROW-5652](https://issues.apache.org/jira/browse/ARROW-5652) - [CI] Fix iwyu docker image -* [ARROW-5653](https://issues.apache.org/jira/browse/ARROW-5653) - [CI] Fix cpp docker image -* [ARROW-5656](https://issues.apache.org/jira/browse/ARROW-5656) - [Python] Enable Flight wheels on macOS -* [ARROW-5659](https://issues.apache.org/jira/browse/ARROW-5659) - [C++] Add support for finding OpenSSL installed by Homebrew -* [ARROW-5660](https://issues.apache.org/jira/browse/ARROW-5660) - [GLib][CI] Use the latest macOS image and all Homebrew based libraries -* [ARROW-5661](https://issues.apache.org/jira/browse/ARROW-5661) - Support hash functions for decimal in Gandiva -* [ARROW-5662](https://issues.apache.org/jira/browse/ARROW-5662) - [C++] Add support for BOOST\_SOURCE=AUTO|BUNDLED|SYSTEM -* [ARROW-5663](https://issues.apache.org/jira/browse/ARROW-5663) - [Packaging][RPM] Update CentOS packages for 0.14.0 -* [ARROW-5664](https://issues.apache.org/jira/browse/ARROW-5664) - [Crossbow] Execute nightly crossbow tests on CircleCI instead of Travis -* [ARROW-5668](https://issues.apache.org/jira/browse/ARROW-5668) - [Python] Display "not null" in Schema.\_\_repr\_\_ for non-nullable fields -* [ARROW-5669](https://issues.apache.org/jira/browse/ARROW-5669) - [Crossbow] manylinux1 wheel building failing -* [ARROW-5670](https://issues.apache.org/jira/browse/ARROW-5670) - [Crossbow] get\_apache\_mirror.py fails with TLS error on macOS with Python 3.5 -* [ARROW-5671](https://issues.apache.org/jira/browse/ARROW-5671) - [crossbow] mac os python wheels failing -* [ARROW-5672](https://issues.apache.org/jira/browse/ARROW-5672) - [Java] Refactor redundant method modifier -* [ARROW-5683](https://issues.apache.org/jira/browse/ARROW-5683) - [R] Add snappy to Rtools Windows builds -* [ARROW-5684](https://issues.apache.org/jira/browse/ARROW-5684) - [Packaging][deb] Add support for Ubuntu 19.04 -* [ARROW-5685](https://issues.apache.org/jira/browse/ARROW-5685) - [Packaging][deb] Add support for Apache Arrow Datasets -* [ARROW-5687](https://issues.apache.org/jira/browse/ARROW-5687) - [C++] Remove remaining uses of ARROW\_BOOST\_VENDORED -* [ARROW-5690](https://issues.apache.org/jira/browse/ARROW-5690) - [Packaging][Python] macOS wheels broken: libprotobuf.18.dylib missing -* 
[ARROW-5694](https://issues.apache.org/jira/browse/ARROW-5694) - [Python] List of decimals are not supported when converting to pandas -* [ARROW-5695](https://issues.apache.org/jira/browse/ARROW-5695) - [C\#][Release] Run sourcelink test in verify-release-candidate.sh -* [ARROW-5696](https://issues.apache.org/jira/browse/ARROW-5696) - [Gandiva] [C++] Introduce castVarcharVarchar -* [ARROW-5699](https://issues.apache.org/jira/browse/ARROW-5699) - [C++] Optimize parsing of Decimal128 in CSV -* [ARROW-5701](https://issues.apache.org/jira/browse/ARROW-5701) - [C++][Gandiva] Build expressions only for the required selection vector types -* [ARROW-5702](https://issues.apache.org/jira/browse/ARROW-5702) - [C++] parquet::arrow::FileReader::GetSchema() -* [ARROW-5704](https://issues.apache.org/jira/browse/ARROW-5704) - [C++] Stop using ARROW\_TEMPLATE\_EXPORT for SparseTensorImpl class -* [ARROW-5705](https://issues.apache.org/jira/browse/ARROW-5705) - [Java] Optimize BaseValueVector\#computeCombinedBufferSize logic -* [ARROW-5706](https://issues.apache.org/jira/browse/ARROW-5706) - [Java] Remove type conversion in getValidityBufferValueCapacity -* [ARROW-5707](https://issues.apache.org/jira/browse/ARROW-5707) - [Java] Improve the performance and code structure for ArrowRecordBatch -* [ARROW-5710](https://issues.apache.org/jira/browse/ARROW-5710) - [C++] Allow compiling Gandiva with Ninja on Windows -* [ARROW-5715](https://issues.apache.org/jira/browse/ARROW-5715) - [Release] Verify Ubuntu 19.04 APT repository -* [ARROW-5718](https://issues.apache.org/jira/browse/ARROW-5718) - [R] auto splice data frames in record\_batch() and table() -* [ARROW-5720](https://issues.apache.org/jira/browse/ARROW-5720) - [C++] Create benchmarks for decimal related classes. -* [ARROW-5721](https://issues.apache.org/jira/browse/ARROW-5721) - [Rust] Move array related code into a separate module -* [ARROW-5724](https://issues.apache.org/jira/browse/ARROW-5724) - [R] [CI] AppVeyor build should use ccache -* [ARROW-5725](https://issues.apache.org/jira/browse/ARROW-5725) - [Crossbow] Port conda recipes to azure pipelines -* [ARROW-5726](https://issues.apache.org/jira/browse/ARROW-5726) - [Java] Implement a common interface for int vectors -* [ARROW-5727](https://issues.apache.org/jira/browse/ARROW-5727) - [Python] [CI] Install pytest-faulthandler before running tests -* [ARROW-5748](https://issues.apache.org/jira/browse/ARROW-5748) - [Packaging][deb] Add support for Debian GNU/Linux buster -* [ARROW-5749](https://issues.apache.org/jira/browse/ARROW-5749) - [Python] Add Python binding for Table::CombineChunks() -* [ARROW-5751](https://issues.apache.org/jira/browse/ARROW-5751) - [Packaging][Python] Python macOS wheels have dynamic dependency on libcares -* [ARROW-5752](https://issues.apache.org/jira/browse/ARROW-5752) - [Java] Improve the performance of ArrowBuf\#setZero -* [ARROW-5755](https://issues.apache.org/jira/browse/ARROW-5755) - [Rust] [Parquet] Add derived clone for Type -* [ARROW-5768](https://issues.apache.org/jira/browse/ARROW-5768) - [Release] There are needless newlines at the end of CHANGELOG.md -* [ARROW-5773](https://issues.apache.org/jira/browse/ARROW-5773) - [R] Clean up documentation before release -* [ARROW-5780](https://issues.apache.org/jira/browse/ARROW-5780) - [C++] Add benchmark for Decimal128 operations -* [ARROW-5782](https://issues.apache.org/jira/browse/ARROW-5782) - [Release] Setup test data for Flight in dev/release/01-perform.sh -* 
[ARROW-5783](https://issues.apache.org/jira/browse/ARROW-5783) - [Release][C\#] Exclude dummy.git from RAT check
-* [ARROW-5785](https://issues.apache.org/jira/browse/ARROW-5785) - [Rust] Rust datafusion implementation should not depend on rustyline
-* [ARROW-5787](https://issues.apache.org/jira/browse/ARROW-5787) - [Release][Rust] Use local modules to verify RC
-* [ARROW-5793](https://issues.apache.org/jira/browse/ARROW-5793) - [Release] Avoid duplicate known host SSH error in dev/release/03-binary.sh
-* [ARROW-5794](https://issues.apache.org/jira/browse/ARROW-5794) - [Release] Skip uploading already uploaded binaries
-* [ARROW-5795](https://issues.apache.org/jira/browse/ARROW-5795) - [Release] Add missing waits on uploading binaries
-* [ARROW-5796](https://issues.apache.org/jira/browse/ARROW-5796) - [Release][APT] Update expected package list
-* [ARROW-5797](https://issues.apache.org/jira/browse/ARROW-5797) - [Release][APT] Update supported distributions
-* [ARROW-5818](https://issues.apache.org/jira/browse/ARROW-5818) - [Java][Gandiva] support varlen output vectors
-* [ARROW-5820](https://issues.apache.org/jira/browse/ARROW-5820) - [Release] Remove undefined variable check from verify script
-* [ARROW-5826](https://issues.apache.org/jira/browse/ARROW-5826) - [Website] Blog post for 0.14.0 release announcement
-* [PARQUET-1243](https://issues.apache.org/jira/browse/PARQUET-1243) - [C++] Improve quality of error message for zero-length files, otherwise corrupted files
-* [PARQUET-1411](https://issues.apache.org/jira/browse/PARQUET-1411) - [C++] Upgrade to use LogicalType annotations instead of ConvertedType
-* [PARQUET-1422](https://issues.apache.org/jira/browse/PARQUET-1422) - [C++] Use Arrow IO interfaces natively rather than current parquet:: wrappers
-* [PARQUET-1517](https://issues.apache.org/jira/browse/PARQUET-1517) - [C++] Update cpp crypto package to match signed-off specification
-* [PARQUET-1523](https://issues.apache.org/jira/browse/PARQUET-1523) - [C++] Vectorize comparator interface
-* [PARQUET-1569](https://issues.apache.org/jira/browse/PARQUET-1569) - [C++] Consolidate testing header files
-* [PARQUET-1582](https://issues.apache.org/jira/browse/PARQUET-1582) - [C++] Add ToString method ColumnDescriptor
-* [PARQUET-1583](https://issues.apache.org/jira/browse/PARQUET-1583) - [C++] Remove parquet::Vector class
-* [PARQUET-1586](https://issues.apache.org/jira/browse/PARQUET-1586) - [C++] Add --dump options to parquet-reader tool to dump def/rep levels
-* [PARQUET-1603](https://issues.apache.org/jira/browse/PARQUET-1603) - [C++] rename parquet::LogicalType to parquet::ConvertedType
-
-
-## Bug Fixes
-
-* [ARROW-61](https://issues.apache.org/jira/browse/ARROW-61) - [Java] Method can return the value bigger than long MAX\_VALUE
-* [ARROW-352](https://issues.apache.org/jira/browse/ARROW-352) - [Format] Interval(DAY\_TIME) has no unit
-* [ARROW-1837](https://issues.apache.org/jira/browse/ARROW-1837) - [Java] Unable to read unsigned integers outside signed range for bit width in integration tests
-* [ARROW-2119](https://issues.apache.org/jira/browse/ARROW-2119) - [C++][Java] Handle Arrow stream with zero record batch
-* [ARROW-2136](https://issues.apache.org/jira/browse/ARROW-2136) - [Python] Non-nullable schema fields not checked in conversions from pandas
-* [ARROW-2256](https://issues.apache.org/jira/browse/ARROW-2256) - [C++] Fuzzer builds fail out of the box on Ubuntu 16.04 using LLVM apt repos
-* [ARROW-2461](https://issues.apache.org/jira/browse/ARROW-2461) - [Python] Build
wheels for manylinux2010 tag -* [ARROW-2590](https://issues.apache.org/jira/browse/ARROW-2590) - [Python] Pyspark python\_udf serialization error on grouped map (Amazon EMR) -* [ARROW-3344](https://issues.apache.org/jira/browse/ARROW-3344) - [Python] test\_plasma.py fails (in test\_plasma\_list) -* [ARROW-3399](https://issues.apache.org/jira/browse/ARROW-3399) - [Python] Cannot serialize numpy matrix object -* [ARROW-3650](https://issues.apache.org/jira/browse/ARROW-3650) - [Python] Mixed column indexes are read back as strings -* [ARROW-3801](https://issues.apache.org/jira/browse/ARROW-3801) - [Python] Pandas-Arrow roundtrip makes pd categorical index not writeable -* [ARROW-4021](https://issues.apache.org/jira/browse/ARROW-4021) - [Ruby] Error building red-arrow on msys2 -* [ARROW-4076](https://issues.apache.org/jira/browse/ARROW-4076) - [Python] schema validation and filters -* [ARROW-4139](https://issues.apache.org/jira/browse/ARROW-4139) - [Python] Cast Parquet column statistics to unicode if UTF8 ConvertedType is set -* [ARROW-4301](https://issues.apache.org/jira/browse/ARROW-4301) - [Java][Gandiva] Maven snapshot version update does not seem to update Gandiva submodule -* [ARROW-4301](https://issues.apache.org/jira/browse/ARROW-4301) - [Java][Gandiva] Maven snapshot version update does not seem to update Gandiva submodule -* [ARROW-4324](https://issues.apache.org/jira/browse/ARROW-4324) - [Python] Array dtype inference incorrect when created from list of mixed numpy scalars -* [ARROW-4350](https://issues.apache.org/jira/browse/ARROW-4350) - [Python] dtype=object arrays cannot be converted to a list-of-list ListArray -* [ARROW-4433](https://issues.apache.org/jira/browse/ARROW-4433) - [R] Segmentation fault when instantiating arrow::table from data frame -* [ARROW-4447](https://issues.apache.org/jira/browse/ARROW-4447) - [C++] Investigate dynamic linking for libthift -* [ARROW-4516](https://issues.apache.org/jira/browse/ARROW-4516) - [Python] Error while creating a ParquetDataset on a path without \`\_common\_dataset\` but with an empty \`\_tempfile\` -* [ARROW-4523](https://issues.apache.org/jira/browse/ARROW-4523) - [JS] Add row proxy generation benchmark -* [ARROW-4651](https://issues.apache.org/jira/browse/ARROW-4651) - [Format] Flight Location should be more flexible than a (host, port) pair -* [ARROW-4665](https://issues.apache.org/jira/browse/ARROW-4665) - [C++] With glog activated, DCHECK macros are redefined -* [ARROW-4675](https://issues.apache.org/jira/browse/ARROW-4675) - [Python] Error serializing bool ndarray in py2 and deserializing in py3 -* [ARROW-4694](https://issues.apache.org/jira/browse/ARROW-4694) - [CI] detect-changes.py is inconsistent -* [ARROW-4723](https://issues.apache.org/jira/browse/ARROW-4723) - [Python] Skip \_files when reading a directory containing parquet files -* [ARROW-4725](https://issues.apache.org/jira/browse/ARROW-4725) - [C++] Dictionary tests disabled under MinGW builds -* [ARROW-4823](https://issues.apache.org/jira/browse/ARROW-4823) - [Python] read\_csv shouldn't close file handles it doesn't own -* [ARROW-4832](https://issues.apache.org/jira/browse/ARROW-4832) - [Python] pandas Index metadata for RangeIndex is incorrect -* [ARROW-4845](https://issues.apache.org/jira/browse/ARROW-4845) - [R] Compiler warnings on Windows MingW64 -* [ARROW-4851](https://issues.apache.org/jira/browse/ARROW-4851) - [Java] BoundsChecking.java defaulting behavior for old drill parameter seems off -* [ARROW-4877](https://issues.apache.org/jira/browse/ARROW-4877) 
- [Plasma] CI failure in test\_plasma\_list -* [ARROW-4884](https://issues.apache.org/jira/browse/ARROW-4884) - [C++] conda-forge thrift-cpp package not available via pkg-config or cmake -* [ARROW-4885](https://issues.apache.org/jira/browse/ARROW-4885) - [Python] read\_csv() can't handle decimal128 columns -* [ARROW-4886](https://issues.apache.org/jira/browse/ARROW-4886) - [Rust] Inconsistent behaviour with casting sliced primitive array to list array -* [ARROW-4923](https://issues.apache.org/jira/browse/ARROW-4923) - Expose setters for Decimal vector that take long and double inputs -* [ARROW-4934](https://issues.apache.org/jira/browse/ARROW-4934) - [Python] Address deprecation notice that will be a bug in Python 3.8 -* [ARROW-5019](https://issues.apache.org/jira/browse/ARROW-5019) - [C\#] ArrowStreamWriter doesn't work on a non-seekable stream -* [ARROW-5049](https://issues.apache.org/jira/browse/ARROW-5049) - [Python] org/apache/hadoop/fs/FileSystem class not found when pyarrow FileSystem used in spark -* [ARROW-5051](https://issues.apache.org/jira/browse/ARROW-5051) - [GLib][Gandiva] Test failure in release verification script -* [ARROW-5055](https://issues.apache.org/jira/browse/ARROW-5055) - [Ruby][MSYS2] libparquet needs to be installed in MSYS2 for ruby -* [ARROW-5058](https://issues.apache.org/jira/browse/ARROW-5058) - [Release] 02-source.sh generates e-mail template with wrong links -* [ARROW-5059](https://issues.apache.org/jira/browse/ARROW-5059) - [C++][Gandiva] cbrt\_\* floating point tests can fail due to exact comparisons -* [ARROW-5065](https://issues.apache.org/jira/browse/ARROW-5065) - [Rust] cast kernel does not support casting from Int64 -* [ARROW-5068](https://issues.apache.org/jira/browse/ARROW-5068) - [Gandiva][Packaging] Fix gandiva nightly builds after the CMake refactor -* [ARROW-5090](https://issues.apache.org/jira/browse/ARROW-5090) - Parquet linking fails on MacOS due to @rpath in dylib -* [ARROW-5092](https://issues.apache.org/jira/browse/ARROW-5092) - [C\#] Source Link doesn't work with the C\# release script -* [ARROW-5095](https://issues.apache.org/jira/browse/ARROW-5095) - [Flight][C++] Flight DoGet doesn't expose server error message -* [ARROW-5096](https://issues.apache.org/jira/browse/ARROW-5096) - [Packaging][deb] plasma-store-server packages are missing -* [ARROW-5097](https://issues.apache.org/jira/browse/ARROW-5097) - [Packaging][CentOS6] arrow-lib has unresolvable dependencies -* [ARROW-5098](https://issues.apache.org/jira/browse/ARROW-5098) - [Website] Update APT install document for 0.13.0 -* [ARROW-5100](https://issues.apache.org/jira/browse/ARROW-5100) - [JS] Writer swaps byte order if buffers share the same underlying ArrayBuffer -* [ARROW-5117](https://issues.apache.org/jira/browse/ARROW-5117) - [Go] Panic when appending zero slices after initializing a builder -* [ARROW-5119](https://issues.apache.org/jira/browse/ARROW-5119) - [Go] invalid Stringer implementation for array.Boolean -* [ARROW-5122](https://issues.apache.org/jira/browse/ARROW-5122) - [Python] pyarrow.parquet.read\_table raises non-file path error when given a windows path to a directory -* [ARROW-5128](https://issues.apache.org/jira/browse/ARROW-5128) - [Packaging][CentOS][Conda] Numpy not found in nightly builds -* [ARROW-5129](https://issues.apache.org/jira/browse/ARROW-5129) - [Rust][Parquet] Column writer bug: check dictionary encoder when adding a new data page -* [ARROW-5130](https://issues.apache.org/jira/browse/ARROW-5130) - [Python] Segfault when importing TensorFlow 
after Pyarrow -* [ARROW-5132](https://issues.apache.org/jira/browse/ARROW-5132) - [Java] Errors on building gandiva\_jni.dll on Windows with Visual Studio 2017 -* [ARROW-5138](https://issues.apache.org/jira/browse/ARROW-5138) - [Python/C++] Row group retrieval doesn't restore index properly -* [ARROW-5140](https://issues.apache.org/jira/browse/ARROW-5140) - [Bug?][Parquet] Can write a jagged array column of strings to disk, but hit \`ArrowNotImplementedError\` on read -* [ARROW-5142](https://issues.apache.org/jira/browse/ARROW-5142) - [CI] Fix conda calls in AppVeyor scripts -* [ARROW-5144](https://issues.apache.org/jira/browse/ARROW-5144) - [Python] ParquetDataset and ParquetPiece not serializable -* [ARROW-5146](https://issues.apache.org/jira/browse/ARROW-5146) - [Dev] Merge script imposes directory name -* [ARROW-5147](https://issues.apache.org/jira/browse/ARROW-5147) - [C++] get an error in building: Could NOT find DoubleConversion -* [ARROW-5148](https://issues.apache.org/jira/browse/ARROW-5148) - [CI] [C++] LLVM-related compile errors -* [ARROW-5149](https://issues.apache.org/jira/browse/ARROW-5149) - [Packaging][Wheel] Pin LLVM to version 7 in windows builds -* [ARROW-5152](https://issues.apache.org/jira/browse/ARROW-5152) - [Python] CMake warnings when building -* [ARROW-5159](https://issues.apache.org/jira/browse/ARROW-5159) - Unable to build benches in arrow crate. -* [ARROW-5160](https://issues.apache.org/jira/browse/ARROW-5160) - [C++] ABORT\_NOT\_OK evalutes expression twice -* [ARROW-5166](https://issues.apache.org/jira/browse/ARROW-5166) - [Python][Parquet] Statistics for uint64 columns may overflow -* [ARROW-5167](https://issues.apache.org/jira/browse/ARROW-5167) - [C++] Upgrade string-view-light to latest -* [ARROW-5169](https://issues.apache.org/jira/browse/ARROW-5169) - [Python] non-nullable fields are converted to nullable in {{Table.from\_pandas}} -* [ARROW-5173](https://issues.apache.org/jira/browse/ARROW-5173) - [Go] handle multiple concatenated streams back-to-back -* [ARROW-5174](https://issues.apache.org/jira/browse/ARROW-5174) - [Go] implement Stringer for DataTypes -* [ARROW-5177](https://issues.apache.org/jira/browse/ARROW-5177) - [Python] ParquetReader.read\_column() doesn't check bounds -* [ARROW-5183](https://issues.apache.org/jira/browse/ARROW-5183) - [CI] MinGW build failures on AppVeyor -* [ARROW-5184](https://issues.apache.org/jira/browse/ARROW-5184) - [Rust] Broken links and other documentation warnings -* [ARROW-5186](https://issues.apache.org/jira/browse/ARROW-5186) - [Plasma] Crash on deleting CUDA memory -* [ARROW-5194](https://issues.apache.org/jira/browse/ARROW-5194) - [C++][Plasma] TEST(PlasmaSerialization, GetReply) is failing -* [ARROW-5195](https://issues.apache.org/jira/browse/ARROW-5195) - [Python] read\_csv ignores null\_values on string types -* [ARROW-5201](https://issues.apache.org/jira/browse/ARROW-5201) - [Python] Import ABCs from collections is deprecated in Python 3.7 -* [ARROW-5208](https://issues.apache.org/jira/browse/ARROW-5208) - [Python] Inconsistent resulting type during casting in pa.array() when mask is present -* [ARROW-5214](https://issues.apache.org/jira/browse/ARROW-5214) - [C++] Offline dependency downloader misses some libraries -* [ARROW-5217](https://issues.apache.org/jira/browse/ARROW-5217) - [Rust] [CI] DataFusion test failure -* [ARROW-5232](https://issues.apache.org/jira/browse/ARROW-5232) - [Java] value vector size increases rapidly in case of clear/setSafe loop -* 
[ARROW-5233](https://issues.apache.org/jira/browse/ARROW-5233) - [Go] migrate to new flatbuffers-v1.11.0 -* [ARROW-5237](https://issues.apache.org/jira/browse/ARROW-5237) - [Python] pandas\_version key in pandas metadata no longer populated -* [ARROW-5240](https://issues.apache.org/jira/browse/ARROW-5240) - [C++][CI] cmake\_format 0.5.0 appears to fail the build -* [ARROW-5242](https://issues.apache.org/jira/browse/ARROW-5242) - [C++] Arrow doesn't compile cleanly with Visual Studio 2017 Update 9 or later due to narrowing -* [ARROW-5243](https://issues.apache.org/jira/browse/ARROW-5243) - [Java][Gandiva] Add test for decimal compare functions -* [ARROW-5245](https://issues.apache.org/jira/browse/ARROW-5245) - [C++][CI] Unpin cmake\_format -* [ARROW-5246](https://issues.apache.org/jira/browse/ARROW-5246) - [Go] use Go-1.12 in CI -* [ARROW-5249](https://issues.apache.org/jira/browse/ARROW-5249) - [Java] Flight client doesn't handle auth correctly in some cases -* [ARROW-5253](https://issues.apache.org/jira/browse/ARROW-5253) - [C++] external Snappy fails on Alpine -* [ARROW-5254](https://issues.apache.org/jira/browse/ARROW-5254) - [Flight][Java] DoAction does not support result streams -* [ARROW-5255](https://issues.apache.org/jira/browse/ARROW-5255) - [Java] Implement user-defined data types API -* [ARROW-5260](https://issues.apache.org/jira/browse/ARROW-5260) - [Python][C++] Crash when deserializing from components in a fresh new process -* [ARROW-5274](https://issues.apache.org/jira/browse/ARROW-5274) - [JavaScript] Wrong array type for countBy -* [ARROW-5283](https://issues.apache.org/jira/browse/ARROW-5283) - [C++][Plasma] Server crash when creating an aborted object 3 times -* [ARROW-5285](https://issues.apache.org/jira/browse/ARROW-5285) - [C++][Plasma] GpuProcessHandle is not released when GPU object deleted -* [ARROW-5293](https://issues.apache.org/jira/browse/ARROW-5293) - [C++] Take kernel on DictionaryArray does not preserve ordered flag -* [ARROW-5294](https://issues.apache.org/jira/browse/ARROW-5294) - [CI] setuptools\_scm failures -* [ARROW-5296](https://issues.apache.org/jira/browse/ARROW-5296) - [Java] Sporadic Flight test failures -* [ARROW-5301](https://issues.apache.org/jira/browse/ARROW-5301) - [Python] parquet documentation outdated on nthreads argument -* [ARROW-5304](https://issues.apache.org/jira/browse/ARROW-5304) - [C++] CudaDeviceManager::GetInstance is not thread-safe -* [ARROW-5306](https://issues.apache.org/jira/browse/ARROW-5306) - [CI] [GLib] Disable GTK-Doc -* [ARROW-5308](https://issues.apache.org/jira/browse/ARROW-5308) - [Go] remove deprecated Feather format -* [ARROW-5314](https://issues.apache.org/jira/browse/ARROW-5314) - [Go] Incorrect Printing for String Arrays with Offsets -* [ARROW-5314](https://issues.apache.org/jira/browse/ARROW-5314) - [Go] Incorrect Printing for String Arrays with Offsets -* [ARROW-5318](https://issues.apache.org/jira/browse/ARROW-5318) - [Python] pyarrow hdfs reader overrequests -* [ARROW-5325](https://issues.apache.org/jira/browse/ARROW-5325) - [Archery][Benchmark] Output properly formatted jsonlines from benchmark diff cli command -* [ARROW-5330](https://issues.apache.org/jira/browse/ARROW-5330) - [Python] [CI] Run Python Flight tests on Travis-CI -* [ARROW-5332](https://issues.apache.org/jira/browse/ARROW-5332) - [R] R package fails to build/install: error in dyn.load() -* [ARROW-5348](https://issues.apache.org/jira/browse/ARROW-5348) - [CI] [Java] Gandiva checkstyle failure -* 
[ARROW-5360](https://issues.apache.org/jira/browse/ARROW-5360) - [Rust] Builds are broken by rustyline on nightly 2019-05-16+ -* [ARROW-5362](https://issues.apache.org/jira/browse/ARROW-5362) - [C++] Compression round trip test can cause some sanitizers to fail -* [ARROW-5371](https://issues.apache.org/jira/browse/ARROW-5371) - [Release] Add tests for dev/release/00-prepare.sh -* [ARROW-5373](https://issues.apache.org/jira/browse/ARROW-5373) - [Java] Add missing details for Gandiva Java Build -* [ARROW-5376](https://issues.apache.org/jira/browse/ARROW-5376) - [C++] Compile failure on gcc 5.4.0 -* [ARROW-5383](https://issues.apache.org/jira/browse/ARROW-5383) - [Go] update IPC flatbuf (new Duration type) -* [ARROW-5387](https://issues.apache.org/jira/browse/ARROW-5387) - [Go] properly handle sub-slice of List -* [ARROW-5388](https://issues.apache.org/jira/browse/ARROW-5388) - [Go] use arrow.TypeEqual in array.NewChunked -* [ARROW-5390](https://issues.apache.org/jira/browse/ARROW-5390) - [CI] Job time limit exceeded on Travis -* [ARROW-5397](https://issues.apache.org/jira/browse/ARROW-5397) - Test Flight TLS support -* [ARROW-5398](https://issues.apache.org/jira/browse/ARROW-5398) - [Python] Flight tests broken by URI changes -* [ARROW-5403](https://issues.apache.org/jira/browse/ARROW-5403) - [C++] Test failures not propagated in Windows shared builds -* [ARROW-5411](https://issues.apache.org/jira/browse/ARROW-5411) - [C++][Python] Build error building on Mac OS Mojave -* [ARROW-5412](https://issues.apache.org/jira/browse/ARROW-5412) - [Java] Integration test fails with UnsupportedOperationException -* [ARROW-5419](https://issues.apache.org/jira/browse/ARROW-5419) - [C++] CSV strings\_can\_be\_null option doesn't respect all null\_values -* [ARROW-5421](https://issues.apache.org/jira/browse/ARROW-5421) - [Packaging][Crossbow] Duplicated key in nightly test configuration -* [ARROW-5422](https://issues.apache.org/jira/browse/ARROW-5422) - [CI] [C++] Build failure with Google Benchmark -* [ARROW-5430](https://issues.apache.org/jira/browse/ARROW-5430) - [Python] Can read but not write parquet partitioned on large ints -* [ARROW-5435](https://issues.apache.org/jira/browse/ARROW-5435) - [Java] add test for IntervalYearVector\#getAsStringBuilder -* [ARROW-5437](https://issues.apache.org/jira/browse/ARROW-5437) - [Python] Missing pandas pytest marker from parquet tests -* [ARROW-5446](https://issues.apache.org/jira/browse/ARROW-5446) - [C++] Use cmake header install directory instead of include -* [ARROW-5448](https://issues.apache.org/jira/browse/ARROW-5448) - [CI] MinGW build failures on AppVeyor -* [ARROW-5453](https://issues.apache.org/jira/browse/ARROW-5453) - [C++] Just-released cmake-format 0.5.2 breaks the build -* [ARROW-5455](https://issues.apache.org/jira/browse/ARROW-5455) - [Rust] Build broken by 2019-05-30 Rust nightly -* [ARROW-5456](https://issues.apache.org/jira/browse/ARROW-5456) - [GLib][Plasma] Installed plasma-glib may be used on building document -* [ARROW-5457](https://issues.apache.org/jira/browse/ARROW-5457) - [GLib][Plasma] Environment variable name for test is wrong -* [ARROW-5459](https://issues.apache.org/jira/browse/ARROW-5459) - [Go] implement Stringer for Float16 DataType -* [ARROW-5462](https://issues.apache.org/jira/browse/ARROW-5462) - [Go] support writing zero-length List -* [ARROW-5479](https://issues.apache.org/jira/browse/ARROW-5479) - [Rust] [DataFusion] Use ARROW\_TEST\_DATA instead of relative path for testing -* 
[ARROW-5487](https://issues.apache.org/jira/browse/ARROW-5487) - [CI] [Python] Failure in docs build -* [ARROW-5493](https://issues.apache.org/jira/browse/ARROW-5493) - [Integration/Go] add Go support for IPC integration tests -* [ARROW-5507](https://issues.apache.org/jira/browse/ARROW-5507) - [Plasma] [CUDA] Compile error -* [ARROW-5514](https://issues.apache.org/jira/browse/ARROW-5514) - [C++] Printer for uint64 shows wrong values -* [ARROW-5517](https://issues.apache.org/jira/browse/ARROW-5517) - [C++] Header collection CMake logic should only consider filename without directory included -* [ARROW-5520](https://issues.apache.org/jira/browse/ARROW-5520) - [C++][Packaging] No NVidia CUDA toolkit on AArch64C -* [ARROW-5521](https://issues.apache.org/jira/browse/ARROW-5521) - [Packaging] License check fails with Apache RAT 0.13 -* [ARROW-5528](https://issues.apache.org/jira/browse/ARROW-5528) - Concatenate() crashes when concatenating empty binary arrays. -* [ARROW-5532](https://issues.apache.org/jira/browse/ARROW-5532) - [JS] Field Metadata Not Read -* [ARROW-5551](https://issues.apache.org/jira/browse/ARROW-5551) - [Go] invalid FixedSizeArray representation -* [ARROW-5553](https://issues.apache.org/jira/browse/ARROW-5553) - [Ruby] red-arrow gem does not compile on ruby:2.5 docker image -* [ARROW-5576](https://issues.apache.org/jira/browse/ARROW-5576) - [C++] Flaky thrift\_ep tarball downloads -* [ARROW-5577](https://issues.apache.org/jira/browse/ARROW-5577) - [C++] Link failure due to googletest shared library on Alpine Linux -* [ARROW-5583](https://issues.apache.org/jira/browse/ARROW-5583) - [Java] When the isSet of a NullableValueHolder is 0, the buffer field should not be used -* [ARROW-5584](https://issues.apache.org/jira/browse/ARROW-5584) - [Java] Add import for link reference in FieldReader javadoc -* [ARROW-5589](https://issues.apache.org/jira/browse/ARROW-5589) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 2354085db0125113f04f7bd23f54b85cca104713 -* [ARROW-5592](https://issues.apache.org/jira/browse/ARROW-5592) - [Go] implement Duration array -* [ARROW-5596](https://issues.apache.org/jira/browse/ARROW-5596) - [Python] Flight tests failing on Python 2.7 -* [ARROW-5601](https://issues.apache.org/jira/browse/ARROW-5601) - [gandiva] Error when projector with a string field -* [ARROW-5603](https://issues.apache.org/jira/browse/ARROW-5603) - [Python] register pytest markers to avoid warnings -* [ARROW-5605](https://issues.apache.org/jira/browse/ARROW-5605) - [C++][Fuzzing] arrow-ipc-fuzzing-test crash 74aec871d14bb6b07c72ea8f0e8c9f72cbe6b73c -* [ARROW-5606](https://issues.apache.org/jira/browse/ARROW-5606) - [Python] pandas.RangeIndex.\_start/\_stop/\_step are deprecated -* [ARROW-5608](https://issues.apache.org/jira/browse/ARROW-5608) - [C++][parquet] Invalid memory access when using parquet::arrow::ColumnReader -* [ARROW-5615](https://issues.apache.org/jira/browse/ARROW-5615) - [C++] Compilation error due to C++11 string literals on gcc 5.4.0 Ubuntu 16.04 -* [ARROW-5616](https://issues.apache.org/jira/browse/ARROW-5616) - [Python] C++ build failure against Python 2.7 headers -* [ARROW-5617](https://issues.apache.org/jira/browse/ARROW-5617) - [C++] thrift\_ep 0.12.0 fails to build when using ARROW\_BOOST\_VENDORED=ON -* [ARROW-5619](https://issues.apache.org/jira/browse/ARROW-5619) - [C++] get\_apache\_mirror.py doesn't work with Python 3.5 -* [ARROW-5623](https://issues.apache.org/jira/browse/ARROW-5623) - [CI][GLib] Failed on macOS -* 
[ARROW-5624](https://issues.apache.org/jira/browse/ARROW-5624) - [C++] -Duriparser\_SOURCE=BUNDLED is broken -* [ARROW-5626](https://issues.apache.org/jira/browse/ARROW-5626) - [C++][Gandiva] Expression cache should consider precision and scale too -* [ARROW-5629](https://issues.apache.org/jira/browse/ARROW-5629) - [C++] Fix Coverity issues -* [ARROW-5631](https://issues.apache.org/jira/browse/ARROW-5631) - [C++] CMake 3.2 build is broken -* [ARROW-5644](https://issues.apache.org/jira/browse/ARROW-5644) - [Python] test\_flight.py::test\_tls\_do\_get appears to hang -* [ARROW-5647](https://issues.apache.org/jira/browse/ARROW-5647) - [Python] Accessing a file from Databricks using pandas read\_parquet using the pyarrow engine fails with : Passed non-file path: /mnt/aa/example.parquet -* [ARROW-5648](https://issues.apache.org/jira/browse/ARROW-5648) - [C++] Build fails on mingw without codecvt -* [ARROW-5654](https://issues.apache.org/jira/browse/ARROW-5654) - [C++] ChunkedArray should validate the types of the arrays -* [ARROW-5657](https://issues.apache.org/jira/browse/ARROW-5657) - [C++] "docker-compose run cpp" broken in master -* [ARROW-5674](https://issues.apache.org/jira/browse/ARROW-5674) - [Python] Missing pandas pytest markers from test\_parquet.py -* [ARROW-5675](https://issues.apache.org/jira/browse/ARROW-5675) - [Doc] Fix typo in documentation describing compile/debug workflow on macOS with Xcode IDE -* [ARROW-5678](https://issues.apache.org/jira/browse/ARROW-5678) - [R][Lint] Fix hadolint docker linting error -* [ARROW-5693](https://issues.apache.org/jira/browse/ARROW-5693) - [Go] skip IPC integration test for Decimal128 -* [ARROW-5697](https://issues.apache.org/jira/browse/ARROW-5697) - [GLib] c\_glib/Dockerfile is broken -* [ARROW-5698](https://issues.apache.org/jira/browse/ARROW-5698) - [R] r/Dockerfile docker-compose build is broken -* [ARROW-5709](https://issues.apache.org/jira/browse/ARROW-5709) - [C++] gandiva-date\_time\_test failure on Windows -* [ARROW-5714](https://issues.apache.org/jira/browse/ARROW-5714) - [JS] Inconsistent behavior in Int64Builder with/without BigNum -* [ARROW-5723](https://issues.apache.org/jira/browse/ARROW-5723) - [Gandiva][Crossbow] Builds failing -* [ARROW-5728](https://issues.apache.org/jira/browse/ARROW-5728) - [Python] [CI] Travis-CI failures in test\_jvm.py -* [ARROW-5729](https://issues.apache.org/jira/browse/ARROW-5729) - [Python][Java] ArrowType.Int object has no attribute 'isSigned' -* [ARROW-5730](https://issues.apache.org/jira/browse/ARROW-5730) - [Python][CI] Selectively skip test cases in the dask integration test -* [ARROW-5732](https://issues.apache.org/jira/browse/ARROW-5732) - [C++] macOS builds failing idiosyncratically on master with warnings from pmmintrin.h -* [ARROW-5735](https://issues.apache.org/jira/browse/ARROW-5735) - [C++] Appveyor builds failing persistently in thrift\_ep build -* [ARROW-5737](https://issues.apache.org/jira/browse/ARROW-5737) - [C++][Gandiva] Gandiva not building in manylinux -* [ARROW-5738](https://issues.apache.org/jira/browse/ARROW-5738) - [Crossbow][Conda] OSX package builds are failing with missing intrinsics -* [ARROW-5739](https://issues.apache.org/jira/browse/ARROW-5739) - [CI] Fix docker python build -* [ARROW-5750](https://issues.apache.org/jira/browse/ARROW-5750) - [Java] Java compilation failures on master -* [ARROW-5754](https://issues.apache.org/jira/browse/ARROW-5754) - [C++]Missing override for \~GrpcStreamWriter? 
-* [ARROW-5765](https://issues.apache.org/jira/browse/ARROW-5765) - [C++] TestDictionary.Validate test is crashed with release build -* [ARROW-5769](https://issues.apache.org/jira/browse/ARROW-5769) - [Java] org.apache.arrow.flight.TestTls is failed via dev/release/00-prepare.sh -* [ARROW-5770](https://issues.apache.org/jira/browse/ARROW-5770) - [C++] Fix -Wpessimizing-move in result.h -* [ARROW-5771](https://issues.apache.org/jira/browse/ARROW-5771) - [Python] Docker python-nopandas job fails -* [ARROW-5774](https://issues.apache.org/jira/browse/ARROW-5774) - [Java][Documentation] Document the need to checkout git submodules for flight -* [ARROW-5781](https://issues.apache.org/jira/browse/ARROW-5781) - [Archery] Ensure benchmark clone accepts remotes in revision -* [ARROW-5791](https://issues.apache.org/jira/browse/ARROW-5791) - [Python] pyarrow.csv.read\_csv hangs + eats all RAM -* [ARROW-5816](https://issues.apache.org/jira/browse/ARROW-5816) - [Release] Parallel curl does not work reliably in verify-release-candidate-sh -* [ARROW-5922](https://issues.apache.org/jira/browse/ARROW-5922) - [Python] Unable to connect to HDFS from a worker/data node on a Kerberized cluster using pyarrow's hdfs API -* [PARQUET-1402](https://issues.apache.org/jira/browse/PARQUET-1402) - [C++] incorrect calculation column start offset for files created by parquet-mr 1.8.1 -* [PARQUET-1405](https://issues.apache.org/jira/browse/PARQUET-1405) - [C++] 'Couldn't deserialize thrift' error when reading large binary column -* [PARQUET-1405](https://issues.apache.org/jira/browse/PARQUET-1405) - [C++] 'Couldn't deserialize thrift' error when reading large binary column -* [PARQUET-1565](https://issues.apache.org/jira/browse/PARQUET-1565) - [C++] SEGV in FromParquetSchema with corrupt file from PARQUET-1481 -* [PARQUET-1571](https://issues.apache.org/jira/browse/PARQUET-1571) - [C++] Can't read data from parquet file in C++ library -* [PARQUET-1574](https://issues.apache.org/jira/browse/PARQUET-1574) - [C++] parquet-encoding-test failed with msvc -* [PARQUET-1581](https://issues.apache.org/jira/browse/PARQUET-1581) - [C++] Fix undefined behavior in encoding.cc when num\_dictionary\_values is 0. 
- - - -# Apache Arrow 0.13.0 (2019-04-01) - -## Bug Fixes - -* [ARROW-295](https://issues.apache.org/jira/browse/ARROW-295) - Create DOAP File -* [ARROW-1171](https://issues.apache.org/jira/browse/ARROW-1171) - [C++] Segmentation faults on Fedora 24 with pyarrow-manylinux1 and self-compiled turbodbc -* [ARROW-2392](https://issues.apache.org/jira/browse/ARROW-2392) - [Python] pyarrow RecordBatchStreamWriter allows writing batches with different schemas -* [ARROW-2399](https://issues.apache.org/jira/browse/ARROW-2399) - [Rust] Builder should not provide a set() method -* [ARROW-2598](https://issues.apache.org/jira/browse/ARROW-2598) - [Python] table.to\_pandas segfault -* [ARROW-3086](https://issues.apache.org/jira/browse/ARROW-3086) - [GLib] GISCAN fails due to conda-shipped openblas -* [ARROW-3096](https://issues.apache.org/jira/browse/ARROW-3096) - [Python] Update Python source build instructions given Anaconda/conda-forge toolchain migration -* [ARROW-3133](https://issues.apache.org/jira/browse/ARROW-3133) - [C++] Logical boolean kernels in kernels/boolean.cc cannot write into preallocated memory -* [ARROW-3133](https://issues.apache.org/jira/browse/ARROW-3133) - [C++] Logical boolean kernels in kernels/boolean.cc cannot write into preallocated memory -* [ARROW-3208](https://issues.apache.org/jira/browse/ARROW-3208) - [C++] Segmentation fault when casting dictionary to numeric with nullptr valid\_bitmap -* [ARROW-3426](https://issues.apache.org/jira/browse/ARROW-3426) - [CI] Java integration test very verbose -* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] writing version 2.0 parquet format with dictionary encoding enabled -* [ARROW-3578](https://issues.apache.org/jira/browse/ARROW-3578) - [Release] Address spurious Apache RAT failures in source release script -* [ARROW-3593](https://issues.apache.org/jira/browse/ARROW-3593) - [R] CI builds failing due to GitHub API rate limits -* [ARROW-3606](https://issues.apache.org/jira/browse/ARROW-3606) - [Python] flake8 fails on Crossbow -* [ARROW-3669](https://issues.apache.org/jira/browse/ARROW-3669) - [Python] Convert big-endian numbers or raise error in pyarrow.array -* [ARROW-3843](https://issues.apache.org/jira/browse/ARROW-3843) - [Python] Writing Parquet file from empty table created with Table.from\_pandas(..., preserve\_index=False) fails -* [ARROW-3923](https://issues.apache.org/jira/browse/ARROW-3923) - [Java] JDBC-to-Arrow Conversion: Unnecessary Calendar Requirement -* [ARROW-4007](https://issues.apache.org/jira/browse/ARROW-4007) - [Java][Plasma] Plasma JNI tests failing -* [ARROW-4050](https://issues.apache.org/jira/browse/ARROW-4050) - [Python][Parquet] core dump on reading parquet file -* [ARROW-4081](https://issues.apache.org/jira/browse/ARROW-4081) - [Go] Sum methods on Mac OS X panic when the array is empty -* [ARROW-4104](https://issues.apache.org/jira/browse/ARROW-4104) - [Java] race in AllocationManager during release -* [ARROW-4108](https://issues.apache.org/jira/browse/ARROW-4108) - [Python/Java] Spark integration tests do not work -* [ARROW-4117](https://issues.apache.org/jira/browse/ARROW-4117) - [Python] "asv dev" command fails with latest revision -* [ARROW-4140](https://issues.apache.org/jira/browse/ARROW-4140) - [C++][Gandiva] Compiled LLVM bitcode file path may result in libraries being non-relocatable -* [ARROW-4145](https://issues.apache.org/jira/browse/ARROW-4145) - [C++] Find Windows-compatible strptime implementation -* [ARROW-4181](https://issues.apache.org/jira/browse/ARROW-4181) 
- [Python] TestConvertStructTypes.test\_from\_numpy\_large failing -* [ARROW-4192](https://issues.apache.org/jira/browse/ARROW-4192) - "./dev/run\_docker\_compose.sh" is out of date -* [ARROW-4213](https://issues.apache.org/jira/browse/ARROW-4213) - [Flight] C++ and Java implementations are incompatible -* [ARROW-4244](https://issues.apache.org/jira/browse/ARROW-4244) - Clarify language around padding/alignment -* [ARROW-4250](https://issues.apache.org/jira/browse/ARROW-4250) - [C++][Gandiva] Use approximate comparisons for floating point numbers in gandiva-projector-test -* [ARROW-4252](https://issues.apache.org/jira/browse/ARROW-4252) - [C++] Status error context strings missing lines of code -* [ARROW-4253](https://issues.apache.org/jira/browse/ARROW-4253) - [GLib] Cannot use non-system Boost specified with $BOOST\_ROOT -* [ARROW-4254](https://issues.apache.org/jira/browse/ARROW-4254) - [C++] Gandiva tests fail to compile with Boost in Ubuntu 14.04 apt -* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe -* [ARROW-4261](https://issues.apache.org/jira/browse/ARROW-4261) - [C++] CMake paths for IPC, Flight, Thrift, and Plasma don't support using Arrow as a subproject -* [ARROW-4264](https://issues.apache.org/jira/browse/ARROW-4264) - [C++] Document why DCHECKs are used in kernels -* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns -* [ARROW-4274](https://issues.apache.org/jira/browse/ARROW-4274) - [Gandiva] static jni library broken after decimal changes -* [ARROW-4275](https://issues.apache.org/jira/browse/ARROW-4275) - [C++] gandiva-decimal\_single\_test extremely slow -* [ARROW-4280](https://issues.apache.org/jira/browse/ARROW-4280) - [C++][Documentation] It looks like flex and bison are required for parquet -* [ARROW-4282](https://issues.apache.org/jira/browse/ARROW-4282) - [Rust] builder benchmark is broken -* [ARROW-4284](https://issues.apache.org/jira/browse/ARROW-4284) - [C\#] File / Stream serialization fails due to type mismatch / missing footer -* [ARROW-4295](https://issues.apache.org/jira/browse/ARROW-4295) - [Plasma] Incorrect log message when evicting objects -* [ARROW-4296](https://issues.apache.org/jira/browse/ARROW-4296) - [Plasma] Starting Plasma store with use\_one\_memory\_mapped\_file enabled crashes due to improper memory alignment -* [ARROW-4308](https://issues.apache.org/jira/browse/ARROW-4308) - [Python] pyarrow has a hard dependency on pandas -* [ARROW-4311](https://issues.apache.org/jira/browse/ARROW-4311) - [Python] Regression on pq.ParquetWriter incorrectly handling source string -* [ARROW-4312](https://issues.apache.org/jira/browse/ARROW-4312) - [C++] Lint doesn't work anymore ("[Errno 24] Too many open files") -* [ARROW-4319](https://issues.apache.org/jira/browse/ARROW-4319) - plasma/store.h pulls in flatbuffer dependency -* [ARROW-4320](https://issues.apache.org/jira/browse/ARROW-4320) - [C++] Add tests for non-contiguous tensors -* [ARROW-4322](https://issues.apache.org/jira/browse/ARROW-4322) - [CI] docker nightlies fails after conda-forge compiler migration -* [ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda forge builds -* [ARROW-4326](https://issues.apache.org/jira/browse/ARROW-4326) - [C++] Development instructions in python/development.rst will not work for many Linux distros with new conda-forge toolchain -* 
[ARROW-4327](https://issues.apache.org/jira/browse/ARROW-4327) - [Python] Add requirements-build.txt file to simplify setting up Python build environment -* [ARROW-4328](https://issues.apache.org/jira/browse/ARROW-4328) - Make R build compatible with DARROW\_TENSORFLOW=ON -* [ARROW-4329](https://issues.apache.org/jira/browse/ARROW-4329) - Python should include the parquet headers -* [ARROW-4342](https://issues.apache.org/jira/browse/ARROW-4342) - [Gandiva][Java] spurious failures in projector cache test -* [ARROW-4347](https://issues.apache.org/jira/browse/ARROW-4347) - [Python] Run Python Travis CI unit tests on Linux when Java codebase changed -* [ARROW-4349](https://issues.apache.org/jira/browse/ARROW-4349) - [C++] Build all benchmarks on Windows without failing -* [ARROW-4351](https://issues.apache.org/jira/browse/ARROW-4351) - [C++] Fail to build with static parquet -* [ARROW-4355](https://issues.apache.org/jira/browse/ARROW-4355) - [C++] test-util functions are no longer part of libarrow -* [ARROW-4360](https://issues.apache.org/jira/browse/ARROW-4360) - [C++] Query homebrew for Thrift -* [ARROW-4364](https://issues.apache.org/jira/browse/ARROW-4364) - [C++] Fix -weverything -wextra compilation errors -* [ARROW-4366](https://issues.apache.org/jira/browse/ARROW-4366) - [Docs] Change extension from format/README.md to format/README.rst -* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries -* [ARROW-4368](https://issues.apache.org/jira/browse/ARROW-4368) - Bintray repository signature verification fails -* [ARROW-4370](https://issues.apache.org/jira/browse/ARROW-4370) - [Python] Table to pandas conversion fails for list of bool -* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count -* [ARROW-4381](https://issues.apache.org/jira/browse/ARROW-4381) - [Docker] docker-compose build lint fails -* [ARROW-4382](https://issues.apache.org/jira/browse/ARROW-4382) - [C++] Improve new cpplint output readability -* [ARROW-4384](https://issues.apache.org/jira/browse/ARROW-4384) - [C++] Running "format" target on new Windows 10 install opens "how do you want to open this file" dialog -* [ARROW-4385](https://issues.apache.org/jira/browse/ARROW-4385) - [Python] default\_version of a release should not include SNAPSHOT -* [ARROW-4389](https://issues.apache.org/jira/browse/ARROW-4389) - [R] Installing clang-tools in CI is failing on trusty -* [ARROW-4395](https://issues.apache.org/jira/browse/ARROW-4395) - ts-node throws type error running \`bin/arrow2csv.js\` -* [ARROW-4400](https://issues.apache.org/jira/browse/ARROW-4400) - [CI] install of clang tools failing -* [ARROW-4403](https://issues.apache.org/jira/browse/ARROW-4403) - [Rust] CI fails due to formatting errors -* [ARROW-4404](https://issues.apache.org/jira/browse/ARROW-4404) - [CI] AppVeyor toolchain build does not build anything -* [ARROW-4407](https://issues.apache.org/jira/browse/ARROW-4407) - [C++] ExternalProject\_Add does not capture CC/CXX correctly -* [ARROW-4410](https://issues.apache.org/jira/browse/ARROW-4410) - [C++] Fix InvertKernel edge cases -* [ARROW-4413](https://issues.apache.org/jira/browse/ARROW-4413) - [Python] pyarrow.hdfs.connect() failing -* [ARROW-4414](https://issues.apache.org/jira/browse/ARROW-4414) - [C++] Stop using cmake COMMAND\_EXPAND\_LISTS because it breaks package builds for older distros -* 
[ARROW-4417](https://issues.apache.org/jira/browse/ARROW-4417) - [C++] Doc build broken -* [ARROW-4420](https://issues.apache.org/jira/browse/ARROW-4420) - [INTEGRATION] Make spark integration test pass and test against spark's master branch -* [ARROW-4421](https://issues.apache.org/jira/browse/ARROW-4421) - [Flight][C++] Handle large Flight data messages -* [ARROW-4434](https://issues.apache.org/jira/browse/ARROW-4434) - [Python] Cannot create empty StructArray via pa.StructArray.from\_arrays -* [ARROW-4440](https://issues.apache.org/jira/browse/ARROW-4440) - [C++] Fix flatbuffers build using msvc -* [ARROW-4457](https://issues.apache.org/jira/browse/ARROW-4457) - [Python] Cannot create Decimal128 array using integers -* [ARROW-4469](https://issues.apache.org/jira/browse/ARROW-4469) - [Python][C++] CI Failing for Python 2.7 and 3.6 with valgrind -* [ARROW-4471](https://issues.apache.org/jira/browse/ARROW-4471) - [C++] Pass AR and RANLIB to all external projects -* [ARROW-4474](https://issues.apache.org/jira/browse/ARROW-4474) - [Flight] FlightInfo should use signed integer types for payload size -* [ARROW-4480](https://issues.apache.org/jira/browse/ARROW-4480) - [Python] Drive letter removed when writing parquet file -* [ARROW-4487](https://issues.apache.org/jira/browse/ARROW-4487) - [C++] Appveyor toolchain build does not actually build the project -* [ARROW-4494](https://issues.apache.org/jira/browse/ARROW-4494) - [Java] arrow-jdbc JAR is not uploaded on release -* [ARROW-4496](https://issues.apache.org/jira/browse/ARROW-4496) - [CI] CI failing for python Xcode 7.3 -* [ARROW-4498](https://issues.apache.org/jira/browse/ARROW-4498) - [Plasma] Plasma fails building with CUDA enabled -* [ARROW-4500](https://issues.apache.org/jira/browse/ARROW-4500) - [C++] librt and pthread hacks can cause linking problems -* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings -* [ARROW-4525](https://issues.apache.org/jira/browse/ARROW-4525) - [Rust] [Parquet] Convert ArrowError to ParquetError -* [ARROW-4527](https://issues.apache.org/jira/browse/ARROW-4527) - [Packaging] Update linux packaging tasks to align with the LLVM 7 migration -* [ARROW-4532](https://issues.apache.org/jira/browse/ARROW-4532) - [Java] varchar value buffer much larger than expected -* [ARROW-4533](https://issues.apache.org/jira/browse/ARROW-4533) - [Python] Document how to run hypothesis tests -* [ARROW-4535](https://issues.apache.org/jira/browse/ARROW-4535) - [C++] Fix MakeBuilder to preserve ListType's field name -* [ARROW-4536](https://issues.apache.org/jira/browse/ARROW-4536) - Add data\_type argument in garrow\_list\_array\_new -* [ARROW-4538](https://issues.apache.org/jira/browse/ARROW-4538) - [PYTHON] Remove index column from subschema in write\_to\_dataframe -* [ARROW-4549](https://issues.apache.org/jira/browse/ARROW-4549) - [C++] Can't build benchmark code on CUDA enabled build -* [ARROW-4550](https://issues.apache.org/jira/browse/ARROW-4550) - [JS] Fix AMD pattern -* [ARROW-4559](https://issues.apache.org/jira/browse/ARROW-4559) - [Python] pyarrow can't read/write filenames with special characters -* [ARROW-4563](https://issues.apache.org/jira/browse/ARROW-4563) - [Python] pa.decimal128 should validate inputs -* [ARROW-4571](https://issues.apache.org/jira/browse/ARROW-4571) - [Format] Tensor.fbs file has multiple root\_type declarations -* [ARROW-4573](https://issues.apache.org/jira/browse/ARROW-4573) - [Python] Add Flight unit tests -* 
[ARROW-4576](https://issues.apache.org/jira/browse/ARROW-4576) - [Python] Benchmark failures -* [ARROW-4577](https://issues.apache.org/jira/browse/ARROW-4577) - [C++] Interface link libraries declared on arrow\_shared target that are actually non-interface -* [ARROW-4581](https://issues.apache.org/jira/browse/ARROW-4581) - [C++] gbenchmark\_ep is a dependency of unit tests when ARROW\_BUILD\_BENCHMARKS=ON -* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion -* [ARROW-4584](https://issues.apache.org/jira/browse/ARROW-4584) - [Python] Add built wheel to manylinux1 dockerignore. -* [ARROW-4585](https://issues.apache.org/jira/browse/ARROW-4585) - [C++] Dependency of Flight C++ sources on generated protobuf is not respected -* [ARROW-4587](https://issues.apache.org/jira/browse/ARROW-4587) - Flight C++ DoPut segfaults -* [ARROW-4597](https://issues.apache.org/jira/browse/ARROW-4597) - [C++] Targets for system Google Mock shared library are missing -* [ARROW-4601](https://issues.apache.org/jira/browse/ARROW-4601) - [Python] Master build is broken due to missing licence for .dockerignore -* [ARROW-4606](https://issues.apache.org/jira/browse/ARROW-4606) - [Rust] [DataFusion] FilterRelation created RecordBatch with empty schema -* [ARROW-4608](https://issues.apache.org/jira/browse/ARROW-4608) - [C++] cmake script assumes that double-conversion installs static libs -* [ARROW-4617](https://issues.apache.org/jira/browse/ARROW-4617) - [C++] Support double-conversion<3.1 -* [ARROW-4624](https://issues.apache.org/jira/browse/ARROW-4624) - [C++] Linker errors when building benchmarks -* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to arrow conversion slowed down by local imports -* [ARROW-4635](https://issues.apache.org/jira/browse/ARROW-4635) - [Java] StructVector growing validity buffer unnecessarily -* [ARROW-4639](https://issues.apache.org/jira/browse/ARROW-4639) - [CI] Crossbow build failing for Gandiva jars -* [ARROW-4641](https://issues.apache.org/jira/browse/ARROW-4641) - [C++] Flight builds complain of -Wstrict-aliasing -* [ARROW-4642](https://issues.apache.org/jira/browse/ARROW-4642) - [R] Change \`f\` to \`file\` in \`read\_parquet\_file()\` -* [ARROW-4653](https://issues.apache.org/jira/browse/ARROW-4653) - [C++] decimal multiply broken when both args are negative -* [ARROW-4654](https://issues.apache.org/jira/browse/ARROW-4654) - [C++] Implicit Flight target dependencies cause compilation failure -* [ARROW-4657](https://issues.apache.org/jira/browse/ARROW-4657) - [Release] gbenchmark should not be needed for verification -* [ARROW-4658](https://issues.apache.org/jira/browse/ARROW-4658) - [C++] Shared gflags is also a run-time conda requirement -* [ARROW-4659](https://issues.apache.org/jira/browse/ARROW-4659) - [CI] ubuntu/debian nightlies fail because of missing gandiva files -* [ARROW-4660](https://issues.apache.org/jira/browse/ARROW-4660) - [C++] gflags fails to build due to CMake error -* [ARROW-4664](https://issues.apache.org/jira/browse/ARROW-4664) - [C++] DCHECK macro conditions are evaluated in release builds -* [ARROW-4669](https://issues.apache.org/jira/browse/ARROW-4669) - [Java] No Bounds checking on ArrowBuf.slice -* [ARROW-4672](https://issues.apache.org/jira/browse/ARROW-4672) - [C++] clang-7 matrix entry is build using gcc -* [ARROW-4680](https://issues.apache.org/jira/browse/ARROW-4680) - [CI] [Rust] Travis CI builds fail with latest Rust 1.34.0-nightly (2019-02-25) -* 
[ARROW-4684](https://issues.apache.org/jira/browse/ARROW-4684) - [Python] CI failures in test\_cython.py -* [ARROW-4687](https://issues.apache.org/jira/browse/ARROW-4687) - [Python] FlightServerBase.run should exit on Ctrl-C -* [ARROW-4688](https://issues.apache.org/jira/browse/ARROW-4688) - [C++][Parquet] 16MB limit on (nested) column chunk prevents tuning row\_group\_size -* [ARROW-4696](https://issues.apache.org/jira/browse/ARROW-4696) - Verify release script is over optimist with CUDA detection -* [ARROW-4699](https://issues.apache.org/jira/browse/ARROW-4699) - [C++] json parser should not rely on null terminated buffers -* [ARROW-4704](https://issues.apache.org/jira/browse/ARROW-4704) - [CI][GLib] Plasma test is flaky -* [ARROW-4710](https://issues.apache.org/jira/browse/ARROW-4710) - [C++][R] New linting script skip files with "cpp" extension -* [ARROW-4712](https://issues.apache.org/jira/browse/ARROW-4712) - [C++][CI] Clang7 Valgrind complains when not move shared\_ptr -* [ARROW-4721](https://issues.apache.org/jira/browse/ARROW-4721) - [Rust] [DataFusion] Propagate schema in filter -* [ARROW-4724](https://issues.apache.org/jira/browse/ARROW-4724) - [C++] Python not being built nor test under MinGW builds -* [ARROW-4728](https://issues.apache.org/jira/browse/ARROW-4728) - [JS] Failing test Table\#assign with a zero-length Null column round-trips through serialization -* [ARROW-4737](https://issues.apache.org/jira/browse/ARROW-4737) - [C\#] tests are not running in CI -* [ARROW-4744](https://issues.apache.org/jira/browse/ARROW-4744) - [CI][C++] Mingw32 builds failing -* [ARROW-4750](https://issues.apache.org/jira/browse/ARROW-4750) - [C++] RapidJSON triggers Wclass-memaccess on GCC 8+ -* [ARROW-4760](https://issues.apache.org/jira/browse/ARROW-4760) - [C++] protobuf 3.7 defines EXPECT\_OK that clashes with Arrow's macro -* [ARROW-4766](https://issues.apache.org/jira/browse/ARROW-4766) - [C++] Casting empty boolean array causes segfault -* [ARROW-4767](https://issues.apache.org/jira/browse/ARROW-4767) - [C\#] ArrowStreamReader crashes while reading the end of a stream -* [ARROW-4768](https://issues.apache.org/jira/browse/ARROW-4768) - [C++][CI] arrow-test-array sometimes gets stuck in MinGW build -* [ARROW-4774](https://issues.apache.org/jira/browse/ARROW-4774) - [C++][Parquet] Call Table::Validate when writing a table -* [ARROW-4775](https://issues.apache.org/jira/browse/ARROW-4775) - [Website] Site navbar cannot be expanded -* [ARROW-4783](https://issues.apache.org/jira/browse/ARROW-4783) - [C++][CI] Mingw32 builds sometimes timeout -* [ARROW-4793](https://issues.apache.org/jira/browse/ARROW-4793) - [Ruby] Suppress unused variable warning -* [ARROW-4796](https://issues.apache.org/jira/browse/ARROW-4796) - [Flight][Python] segfault in simple server implementation -* [ARROW-4802](https://issues.apache.org/jira/browse/ARROW-4802) - [Python] Hadoop classpath discovery broken HADOOP\_HOME is a symlink -* [ARROW-4807](https://issues.apache.org/jira/browse/ARROW-4807) - [Rust] Fix csv\_writer benchmark -* [ARROW-4811](https://issues.apache.org/jira/browse/ARROW-4811) - [C++] An incorrect dependency leads "ninja" to re-evaluate steps unnecessarily on subsequent calls -* [ARROW-4813](https://issues.apache.org/jira/browse/ARROW-4813) - [Ruby] Add tests for \#== and \#!= -* [ARROW-4820](https://issues.apache.org/jira/browse/ARROW-4820) - [Python] hadoop class path derived not correct -* [ARROW-4822](https://issues.apache.org/jira/browse/ARROW-4822) - [C++/Python] pyarrow.Table.equals 
segmentation fault on None -* [ARROW-4828](https://issues.apache.org/jira/browse/ARROW-4828) - [Python] manylinux1 docker-compose context should be python/manylinux1 -* [ARROW-4850](https://issues.apache.org/jira/browse/ARROW-4850) - [CI] Integration test failures do not fail the Travis CI build -* [ARROW-4853](https://issues.apache.org/jira/browse/ARROW-4853) - [Rust] Array slice doesn't work on ListArray and StructArray -* [ARROW-4857](https://issues.apache.org/jira/browse/ARROW-4857) - [C++/Python/CI] docker-compose in manylinux1 crossbow jobs too old -* [ARROW-4866](https://issues.apache.org/jira/browse/ARROW-4866) - [C++] zstd ExternalProject failing on Windows -* [ARROW-4867](https://issues.apache.org/jira/browse/ARROW-4867) - [Python] Table.from\_pandas() column order not respected -* [ARROW-4869](https://issues.apache.org/jira/browse/ARROW-4869) - [C++] Use of gmock fails in compute/kernels/util-internal-test.cc -* [ARROW-4870](https://issues.apache.org/jira/browse/ARROW-4870) - [Ruby] gemspec has wrong msys2 dependency listed -* [ARROW-4871](https://issues.apache.org/jira/browse/ARROW-4871) - [Flight][Java] Handle large Flight messages -* [ARROW-4872](https://issues.apache.org/jira/browse/ARROW-4872) - [Python] Keep backward compatibility for ParquetDatasetPiece -* [ARROW-4879](https://issues.apache.org/jira/browse/ARROW-4879) - [C++] cmake can't use conda's flatbuffers -* [ARROW-4881](https://issues.apache.org/jira/browse/ARROW-4881) - [Python] bundle\_zlib CMake function still uses ARROW\_BUILD\_TOOLCHAIN -* [ARROW-4900](https://issues.apache.org/jira/browse/ARROW-4900) - mingw-w64 < 5 does not have \_\_cpuidex -* [ARROW-4903](https://issues.apache.org/jira/browse/ARROW-4903) - [C++] Building tests using only static libs not possible -* [ARROW-4906](https://issues.apache.org/jira/browse/ARROW-4906) - [Format] Fix document to describe that SparseMatrixIndexCSR assumes indptr is sorted for each row -* [ARROW-4918](https://issues.apache.org/jira/browse/ARROW-4918) - [C++] Add cmake-format to pre-commit -* [ARROW-4928](https://issues.apache.org/jira/browse/ARROW-4928) - [Python] Hypothesis test failures -* [ARROW-4931](https://issues.apache.org/jira/browse/ARROW-4931) - [C++] CMake fails on gRPC ExternalProject -* [ARROW-4938](https://issues.apache.org/jira/browse/ARROW-4938) - [Glib] Undefined symbols error occurred when GIR file is being generated. 
-* [ARROW-4942](https://issues.apache.org/jira/browse/ARROW-4942) - [Ruby] Remove needless omits -* [ARROW-4948](https://issues.apache.org/jira/browse/ARROW-4948) - [JS] Nightly test failing with "Cannot assign to read only property" -* [ARROW-4950](https://issues.apache.org/jira/browse/ARROW-4950) - [C++] Thirdparty CMake error get\_target\_property() called with non-existent target LZ4::lz4 -* [ARROW-4952](https://issues.apache.org/jira/browse/ARROW-4952) - [C++] Equals / ApproxEquals behaviour undefined on FP NaNs -* [ARROW-4953](https://issues.apache.org/jira/browse/ARROW-4953) - [Ruby] Not loading libarrow-glib -* [ARROW-4954](https://issues.apache.org/jira/browse/ARROW-4954) - [Python] test failure with Flight enabled -* [ARROW-4958](https://issues.apache.org/jira/browse/ARROW-4958) - [C++] Purely static linking broken -* [ARROW-4961](https://issues.apache.org/jira/browse/ARROW-4961) - [C++][Python] Add GTest\_SOURCE=BUNDLED to relevant build docs that use conda-forge toolchain -* [ARROW-4962](https://issues.apache.org/jira/browse/ARROW-4962) - [C++] Warning level to CHECKIN can't compile on modern GCC -* [ARROW-4976](https://issues.apache.org/jira/browse/ARROW-4976) - [JS] RecordBatchReader should reset its Node/DOM streams -* [ARROW-4982](https://issues.apache.org/jira/browse/ARROW-4982) - [GLib][CI] Run tests on AppVeyor -* [ARROW-4984](https://issues.apache.org/jira/browse/ARROW-4984) - [Flight][C++] Flight server segfaults when port is in use -* [ARROW-4986](https://issues.apache.org/jira/browse/ARROW-4986) - [CI] Travis fails to install llvm@7 -* [ARROW-4989](https://issues.apache.org/jira/browse/ARROW-4989) - [C++] Builds fails to find Ubuntu-packaged re2 library -* [ARROW-4991](https://issues.apache.org/jira/browse/ARROW-4991) - [CI] Bump travis node version to 11.12 -* [ARROW-4997](https://issues.apache.org/jira/browse/ARROW-4997) - [C\#] ArrowStreamReader doesn't consume whole stream and doesn't implement sync read -* [ARROW-5009](https://issues.apache.org/jira/browse/ARROW-5009) - [C++] Cleanup using to std::\* in files -* [ARROW-5010](https://issues.apache.org/jira/browse/ARROW-5010) - [Release] Fix release script with llvm-7 -* [ARROW-5012](https://issues.apache.org/jira/browse/ARROW-5012) - [C++] "testing" headers not installed -* [ARROW-5023](https://issues.apache.org/jira/browse/ARROW-5023) - [Release] Default value syntax in shell is wrong -* [ARROW-5024](https://issues.apache.org/jira/browse/ARROW-5024) - [Release] crossbow.py --arrow-version causes missing variable error -* [ARROW-5025](https://issues.apache.org/jira/browse/ARROW-5025) - [Python][Packaging] wheel for Windows are broken -* [ARROW-5026](https://issues.apache.org/jira/browse/ARROW-5026) - [Python][Packaging] conda package on non Windows is broken -* [ARROW-5029](https://issues.apache.org/jira/browse/ARROW-5029) - [C++] Compilation warnings in release mode -* [ARROW-5031](https://issues.apache.org/jira/browse/ARROW-5031) - [Dev] Release verification script does not run CUDA tests in Python -* [ARROW-5042](https://issues.apache.org/jira/browse/ARROW-5042) - [Release] Wrong ARROW\_DEPENDENCY\_SOURCE in verification script -* [ARROW-5043](https://issues.apache.org/jira/browse/ARROW-5043) - [Release][Ruby] red-arrow dependency can't be resolve in verification script -* [ARROW-5044](https://issues.apache.org/jira/browse/ARROW-5044) - [Release][Rust] Format error in verification script -* [ARROW-5046](https://issues.apache.org/jira/browse/ARROW-5046) - [Release][C++] Plasma test is fragile in verification 
script -* [ARROW-5047](https://issues.apache.org/jira/browse/ARROW-5047) - [Release] Always set up parquet-testing in verification script -* [ARROW-5048](https://issues.apache.org/jira/browse/ARROW-5048) - [Release][Rust] arrow-testing is missing in verification script -* [ARROW-5050](https://issues.apache.org/jira/browse/ARROW-5050) - [C++] cares\_ep should build before grpc\_ep -* [ARROW-5087](https://issues.apache.org/jira/browse/ARROW-5087) - [Debian] APT repository no longer contains libarrow-dev -* [ARROW-5658](https://issues.apache.org/jira/browse/ARROW-5658) - [JAVA] Provide ability to resync VectorSchemaRoot if types change -* [PARQUET-1482](https://issues.apache.org/jira/browse/PARQUET-1482) - [C++] Unable to read data from parquet file generated with parquetjs -* [PARQUET-1494](https://issues.apache.org/jira/browse/PARQUET-1494) - [C++] Can't access parquet statistics on binary columns -* [PARQUET-1532](https://issues.apache.org/jira/browse/PARQUET-1532) - [C++] Can't build column reader test with MinGW - - -## New Features and Improvements - -* [ARROW-47](https://issues.apache.org/jira/browse/ARROW-47) - [C++] Consider adding a scalar type object model -* [ARROW-331](https://issues.apache.org/jira/browse/ARROW-331) - [Python] Timeline for dropping Python 2.7 support -* [ARROW-549](https://issues.apache.org/jira/browse/ARROW-549) - [C++] Add function to concatenate like-typed arrays -* [ARROW-572](https://issues.apache.org/jira/browse/ARROW-572) - [C++] Apply visitor pattern in IPC metadata -* [ARROW-585](https://issues.apache.org/jira/browse/ARROW-585) - [C++] Define public API for user-defined data types -* [ARROW-694](https://issues.apache.org/jira/browse/ARROW-694) - [C++] Build JSON "scanner" for reading record batches from line-delimited JSON files -* [ARROW-1425](https://issues.apache.org/jira/browse/ARROW-1425) - [Python] Document semantic differences between Spark timestamps and Arrow timestamps -* [ARROW-1572](https://issues.apache.org/jira/browse/ARROW-1572) - [C++] Implement "value counts" kernels for tabulating value frequencies -* [ARROW-1639](https://issues.apache.org/jira/browse/ARROW-1639) - [Python] More efficient serialization for RangeIndex in serialize\_pandas -* [ARROW-1642](https://issues.apache.org/jira/browse/ARROW-1642) - [GLib] Build GLib using Meson in Appveyor -* [ARROW-1807](https://issues.apache.org/jira/browse/ARROW-1807) - [JAVA] Reduce Heap Usage (Phase 3): consolidate buffers -* [ARROW-1896](https://issues.apache.org/jira/browse/ARROW-1896) - [C++] Do not allocate memory for primitive outputs in CastKernel::Call implementation -* [ARROW-2015](https://issues.apache.org/jira/browse/ARROW-2015) - [Java] Use Java Time and Date APIs instead of JodaTime -* [ARROW-2022](https://issues.apache.org/jira/browse/ARROW-2022) - [Format] Add custom metadata field specific to a RecordBatch message -* [ARROW-2112](https://issues.apache.org/jira/browse/ARROW-2112) - [C++] Enable cpplint to be run on Windows -* [ARROW-2243](https://issues.apache.org/jira/browse/ARROW-2243) - [C++] Enable IPO/LTO -* [ARROW-2409](https://issues.apache.org/jira/browse/ARROW-2409) - [Rust] Test for build warnings, remove current warnings -* [ARROW-2460](https://issues.apache.org/jira/browse/ARROW-2460) - [Rust] Schema and DataType::Struct should use Vec\> -* [ARROW-2487](https://issues.apache.org/jira/browse/ARROW-2487) - [C++] Provide a variant of AppendValues that takes bytemaps for the nullability -* [ARROW-2523](https://issues.apache.org/jira/browse/ARROW-2523) - [Rust] Implement 
CAST operations for arrays -* [ARROW-2620](https://issues.apache.org/jira/browse/ARROW-2620) - [Rust] Integrate memory pool abstraction with rest of codebase -* [ARROW-2627](https://issues.apache.org/jira/browse/ARROW-2627) - [Python] Add option (or some equivalent) to toggle memory mapping functionality when using parquet.ParquetFile or other read entry points -* [ARROW-2904](https://issues.apache.org/jira/browse/ARROW-2904) - [C++] Use FirstTimeBitmapWriter instead of SetBit functions in builder.h/cc -* [ARROW-3066](https://issues.apache.org/jira/browse/ARROW-3066) - [Wiki] Add "How to contribute" to developer wiki -* [ARROW-3084](https://issues.apache.org/jira/browse/ARROW-3084) - [Python] Do we need to build both unicode variants of pyarrow wheels? -* [ARROW-3107](https://issues.apache.org/jira/browse/ARROW-3107) - [C++] arrow::PrettyPrint for Column instances -* [ARROW-3121](https://issues.apache.org/jira/browse/ARROW-3121) - [C++] Mean kernel aggregate -* [ARROW-3123](https://issues.apache.org/jira/browse/ARROW-3123) - [C++] Incremental Count, Count Not Null aggregator -* [ARROW-3135](https://issues.apache.org/jira/browse/ARROW-3135) - [C++] Add helper functions for validity bitmap propagation in kernel context -* [ARROW-3149](https://issues.apache.org/jira/browse/ARROW-3149) - [C++] Use gRPC (when it exists) from conda-forge for CI builds -* [ARROW-3162](https://issues.apache.org/jira/browse/ARROW-3162) - [Python] Enable Flight servers to be implemented in pure Python -* [ARROW-3162](https://issues.apache.org/jira/browse/ARROW-3162) - [Python] Enable Flight servers to be implemented in pure Python -* [ARROW-3239](https://issues.apache.org/jira/browse/ARROW-3239) - [C++] Improve random data generation functions -* [ARROW-3255](https://issues.apache.org/jira/browse/ARROW-3255) - [C++/Python] Migrate Travis CI jobs off Xcode 6.4 -* [ARROW-3289](https://issues.apache.org/jira/browse/ARROW-3289) - [C++] Implement DoPut command for Flight on client and server side -* [ARROW-3292](https://issues.apache.org/jira/browse/ARROW-3292) - [C++] Test Flight RPC in Travis CI -* [ARROW-3295](https://issues.apache.org/jira/browse/ARROW-3295) - [Packaging] Package gRPC libraries in conda-forge for use in builds, packaging -* [ARROW-3297](https://issues.apache.org/jira/browse/ARROW-3297) - [Python] Python bindings for Flight C++ client -* [ARROW-3311](https://issues.apache.org/jira/browse/ARROW-3311) - [R] Functions for deserializing IPC components from arrow::Buffer or from IO interface -* [ARROW-3328](https://issues.apache.org/jira/browse/ARROW-3328) - [Flight] Allow for optional unique flight identifier to be sent with FlightGetInfo -* [ARROW-3361](https://issues.apache.org/jira/browse/ARROW-3361) - [R] Run cpp/build-support/cpplint.py on C++ source files -* [ARROW-3364](https://issues.apache.org/jira/browse/ARROW-3364) - [Doc] Document docker compose setup -* [ARROW-3367](https://issues.apache.org/jira/browse/ARROW-3367) - [INTEGRATION] Port Spark integration test to the docker-compose setup -* [ARROW-3422](https://issues.apache.org/jira/browse/ARROW-3422) - [C++] Add "toolchain" target to ensure that all required toolchain libraries are built -* [ARROW-3434](https://issues.apache.org/jira/browse/ARROW-3434) - [Packaging] Add Apache ORC C++ library to conda-forge -* [ARROW-3435](https://issues.apache.org/jira/browse/ARROW-3435) - [C++] Add option to use dynamic linking with re2 -* [ARROW-3511](https://issues.apache.org/jira/browse/ARROW-3511) - [Gandiva] support input selection vectors for both 
projector and filter -* [ARROW-3532](https://issues.apache.org/jira/browse/ARROW-3532) - [Python] Schema, StructType, StructArray field retrieval by name should raise warning or exception for multiple matches -* [ARROW-3550](https://issues.apache.org/jira/browse/ARROW-3550) - [C++] Use kUnknownNullCount in NumericArray constructor -* [ARROW-3554](https://issues.apache.org/jira/browse/ARROW-3554) - [C++] Reverse traits for C++ -* [ARROW-3594](https://issues.apache.org/jira/browse/ARROW-3594) - [Packaging] Build "cares" library in conda-forge -* [ARROW-3595](https://issues.apache.org/jira/browse/ARROW-3595) - [Packaging] Build boringssl in conda-forge -* [ARROW-3596](https://issues.apache.org/jira/browse/ARROW-3596) - [Packaging] Build gRPC in conda-forge -* [ARROW-3619](https://issues.apache.org/jira/browse/ARROW-3619) - [R] Expose global thread pool options -* [ARROW-3631](https://issues.apache.org/jira/browse/ARROW-3631) - [C\#] Add Appveyor build for C\# -* [ARROW-3653](https://issues.apache.org/jira/browse/ARROW-3653) - [Python/C++] Support data copying between different GPU devices -* [ARROW-3735](https://issues.apache.org/jira/browse/ARROW-3735) - [Python] Proper error handling in \_ensure\_type -* [ARROW-3761](https://issues.apache.org/jira/browse/ARROW-3761) - [R] Bindings for CompressedInputStream, CompressedOutputStream -* [ARROW-3763](https://issues.apache.org/jira/browse/ARROW-3763) - [C++] Write Parquet ByteArray / FixedLenByteArray reader batches directly into arrow::BinaryBuilder -* [ARROW-3769](https://issues.apache.org/jira/browse/ARROW-3769) - [C++] Support reading non-dictionary encoded binary Parquet columns directly as DictionaryArray -* [ARROW-3770](https://issues.apache.org/jira/browse/ARROW-3770) - [C++] Validate or add option to validate arrow::Table schema in parquet::arrow::FileWriter::WriteTable -* [ARROW-3816](https://issues.apache.org/jira/browse/ARROW-3816) - [R] nrow.RecordBatch method -* [ARROW-3824](https://issues.apache.org/jira/browse/ARROW-3824) - [R] Document developer workflow for building project, running unit tests in r/README.md -* [ARROW-3838](https://issues.apache.org/jira/browse/ARROW-3838) - [Rust] Implement CSV Writer -* [ARROW-3846](https://issues.apache.org/jira/browse/ARROW-3846) - [Gandiva] Build on Windows -* [ARROW-3882](https://issues.apache.org/jira/browse/ARROW-3882) - [Rust] PrimitiveArray should support cast operations -* [ARROW-3903](https://issues.apache.org/jira/browse/ARROW-3903) - [Python] Random array generator for Arrow conversion and Parquet testing -* [ARROW-3926](https://issues.apache.org/jira/browse/ARROW-3926) - [Python] Add Gandiva bindings to Python wheels -* [ARROW-3951](https://issues.apache.org/jira/browse/ARROW-3951) - [Go] implement a CSV writer -* [ARROW-3954](https://issues.apache.org/jira/browse/ARROW-3954) - [Rust] Add Slice to Array and ArrayData -* [ARROW-3965](https://issues.apache.org/jira/browse/ARROW-3965) - [Java] JDBC-to-Arrow Conversion: Configuration Object -* [ARROW-3966](https://issues.apache.org/jira/browse/ARROW-3966) - [Java] JDBC-to-Arrow Conversion: JDBC Metadata in Schema Fields -* [ARROW-3972](https://issues.apache.org/jira/browse/ARROW-3972) - [C++] Update to LLVM and Clang bits to 7.0 -* [ARROW-3981](https://issues.apache.org/jira/browse/ARROW-3981) - [C++] Rename json.h -* [ARROW-3985](https://issues.apache.org/jira/browse/ARROW-3985) - [C++] Pass -C option when compiling with ccache to avoid some warnings -* [ARROW-4012](https://issues.apache.org/jira/browse/ARROW-4012) - 
[Documentation][C++] Document how to install Apache Arrow on MSYS2 -* [ARROW-4014](https://issues.apache.org/jira/browse/ARROW-4014) - [C++] Fix "LIBCMT" warnings on MSVC -* [ARROW-4023](https://issues.apache.org/jira/browse/ARROW-4023) - [Gandiva] Address long CI times in macOS builds -* [ARROW-4024](https://issues.apache.org/jira/browse/ARROW-4024) - [Python] Cython compilation error on cython==0.27.3 -* [ARROW-4031](https://issues.apache.org/jira/browse/ARROW-4031) - [C++] Refactor ArrayBuilder bitmap logic into TypedBufferBuilder -* [ARROW-4040](https://issues.apache.org/jira/browse/ARROW-4040) - [Rust] Add array\_ops method for filtering an array -* [ARROW-4056](https://issues.apache.org/jira/browse/ARROW-4056) - [C++] Upgrade to boost-cpp 1.69.0 again -* [ARROW-4061](https://issues.apache.org/jira/browse/ARROW-4061) - [Rust] [Parquet] Implement "spaced" version for non-dictionary encoding/decoding -* [ARROW-4068](https://issues.apache.org/jira/browse/ARROW-4068) - [Gandiva] Support building with Xcode 6.4 -* [ARROW-4071](https://issues.apache.org/jira/browse/ARROW-4071) - [Rust] Add rustfmt as a pre-commit hook -* [ARROW-4072](https://issues.apache.org/jira/browse/ARROW-4072) - [Rust] Set default value for PARQUET\_TEST\_DATA -* [ARROW-4092](https://issues.apache.org/jira/browse/ARROW-4092) - [Rust] Implement common Reader / DataSource trait for CSV and Parquet -* [ARROW-4094](https://issues.apache.org/jira/browse/ARROW-4094) - [Python] Store RangeIndex in Parquet files as metadata rather than a physical data column -* [ARROW-4110](https://issues.apache.org/jira/browse/ARROW-4110) - [C++] Do not generate distinct cast kernels when input and output type are the same -* [ARROW-4123](https://issues.apache.org/jira/browse/ARROW-4123) - [C++] Improve linting workflow and documentation for Windows-based developers -* [ARROW-4124](https://issues.apache.org/jira/browse/ARROW-4124) - [C++] Abstract aggregation kernel API -* [ARROW-4142](https://issues.apache.org/jira/browse/ARROW-4142) - [Java] JDBC-to-Arrow: JDBC Arrays -* [ARROW-4165](https://issues.apache.org/jira/browse/ARROW-4165) - [C++] Port cpp/apidoc/Windows.md and other files to Sphinx / rst -* [ARROW-4180](https://issues.apache.org/jira/browse/ARROW-4180) - [Java] Reduce verbose logging of ArrowBuf creation events? 
-* [ARROW-4196](https://issues.apache.org/jira/browse/ARROW-4196) - [Rust] Add explicit SIMD vectorization for arithmetic ops in "array\_ops" -* [ARROW-4198](https://issues.apache.org/jira/browse/ARROW-4198) - [Gandiva] Add support to cast timestamp -* [ARROW-4204](https://issues.apache.org/jira/browse/ARROW-4204) - [Gandiva] implement decimal subtract -* [ARROW-4205](https://issues.apache.org/jira/browse/ARROW-4205) - [Gandiva] Implement decimal multiply -* [ARROW-4206](https://issues.apache.org/jira/browse/ARROW-4206) - [Gandiva] Implement decimal divide -* [ARROW-4212](https://issues.apache.org/jira/browse/ARROW-4212) - [Python] [CUDA] Creating a CUDA buffer from Numba device array should be easier -* [ARROW-4230](https://issues.apache.org/jira/browse/ARROW-4230) - [C++] Enable building flight against system gRPC -* [ARROW-4232](https://issues.apache.org/jira/browse/ARROW-4232) - [C++] Follow conda-forge compiler ABI migration -* [ARROW-4234](https://issues.apache.org/jira/browse/ARROW-4234) - [C++] Add memory bandwidth benchmarks to arrow/util/machine-benchmark.cc -* [ARROW-4235](https://issues.apache.org/jira/browse/ARROW-4235) - [GLib] Use "column\_builder" in GArrowRecordBatchBuilder -* [ARROW-4236](https://issues.apache.org/jira/browse/ARROW-4236) - [JAVA] Distinct plasma client create exceptions -* [ARROW-4245](https://issues.apache.org/jira/browse/ARROW-4245) - [Rust] Add Rustdoc header to each source file -* [ARROW-4247](https://issues.apache.org/jira/browse/ARROW-4247) - [Packaging] Update verify script for 0.12.0 -* [ARROW-4251](https://issues.apache.org/jira/browse/ARROW-4251) - [C++] Add option to use vendored Boost in verify-release-candidate.sh -* [ARROW-4262](https://issues.apache.org/jira/browse/ARROW-4262) - [Website] Blog post to give preview into using R and Arrow with Apache Spark -* [ARROW-4263](https://issues.apache.org/jira/browse/ARROW-4263) - [Rust] Donate DataFusion -* [ARROW-4265](https://issues.apache.org/jira/browse/ARROW-4265) - [C++] Automatic conversion between Table and std::vector\> -* [ARROW-4268](https://issues.apache.org/jira/browse/ARROW-4268) - [C++] Add C primitive to Arrow:Type compile time in TypeTraits -* [ARROW-4271](https://issues.apache.org/jira/browse/ARROW-4271) - [Rust] Move Parquet specific info to Parquet Readme -* [ARROW-4273](https://issues.apache.org/jira/browse/ARROW-4273) - [Release] Fix verification script to use cf201901 conda-forge label -* [ARROW-4277](https://issues.apache.org/jira/browse/ARROW-4277) - [C++] Add gmock to toolchain -* [ARROW-4281](https://issues.apache.org/jira/browse/ARROW-4281) - [CI] Use Ubuntu Xenial (16.04) VMs on Travis-CI -* [ARROW-4285](https://issues.apache.org/jira/browse/ARROW-4285) - [Python] Use proper builder interface for serialization -* [ARROW-4287](https://issues.apache.org/jira/browse/ARROW-4287) - [C++] Ensure minimal bison version on OSX for Thrift -* [ARROW-4289](https://issues.apache.org/jira/browse/ARROW-4289) - [C++] Forward AR and RANLIB to thirdparty builds -* [ARROW-4290](https://issues.apache.org/jira/browse/ARROW-4290) - [C++/Gandiva] Support detecting correct LLVM version in Homebrew -* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts -* [ARROW-4294](https://issues.apache.org/jira/browse/ARROW-4294) - [Plasma] Add support for evicting objects to external store -* [ARROW-4297](https://issues.apache.org/jira/browse/ARROW-4297) - [C++] Fix build for 32-bit MSYS2 -* 
[ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11 -* [ARROW-4299](https://issues.apache.org/jira/browse/ARROW-4299) - [Ruby] Depend on the same version as Red Arrow -* [ARROW-4300](https://issues.apache.org/jira/browse/ARROW-4300) - [C++] Restore apache-arrow Homebrew recipe and define process for maintaining and updating for releases -* [ARROW-4303](https://issues.apache.org/jira/browse/ARROW-4303) - [Gandiva/Python] Build LLVM with RTTI in manylinux1 container -* [ARROW-4305](https://issues.apache.org/jira/browse/ARROW-4305) - [Rust] Fix parquet version number in README -* [ARROW-4307](https://issues.apache.org/jira/browse/ARROW-4307) - [C++] Fix doxygen warnings, include doxygen warning checks in CI linting -* [ARROW-4310](https://issues.apache.org/jira/browse/ARROW-4310) - [Website] Update install document for 0.12.0 -* [ARROW-4313](https://issues.apache.org/jira/browse/ARROW-4313) - Define general benchmark database schema -* [ARROW-4315](https://issues.apache.org/jira/browse/ARROW-4315) - [Website] Home page of https://arrow.apache.org/ does not mention Go or Rust -* [ARROW-4318](https://issues.apache.org/jira/browse/ARROW-4318) - [C++] Add Tensor::CountNonZero -* [ARROW-4321](https://issues.apache.org/jira/browse/ARROW-4321) - [CI] Setup conda-forge channel globally in docker containers -* [ARROW-4330](https://issues.apache.org/jira/browse/ARROW-4330) - [C++] Use FindThreads.cmake to handle -pthread compiler/link options -* [ARROW-4331](https://issues.apache.org/jira/browse/ARROW-4331) - [C++] Extend Scalar Datum to support more types -* [ARROW-4332](https://issues.apache.org/jira/browse/ARROW-4332) - [Website] Instructions and scripts for publishing web site appear to be incorrect -* [ARROW-4334](https://issues.apache.org/jira/browse/ARROW-4334) - [CI] Setup conda-forge channel globally in travis builds -* [ARROW-4335](https://issues.apache.org/jira/browse/ARROW-4335) - [C++] Better document sparse tensor support -* [ARROW-4336](https://issues.apache.org/jira/browse/ARROW-4336) - [C++] Default BUILD\_WARNING\_LEVEL to CHECKIN -* [ARROW-4339](https://issues.apache.org/jira/browse/ARROW-4339) - [C++] rewrite cpp/README shorter, with a separate contribution guide -* [ARROW-4340](https://issues.apache.org/jira/browse/ARROW-4340) - [C++] Update IWYU version in the \`lint\` dockerfile -* [ARROW-4341](https://issues.apache.org/jira/browse/ARROW-4341) - [C++] Use TypedBufferBuilder in BooleanBuilder -* [ARROW-4344](https://issues.apache.org/jira/browse/ARROW-4344) - [Java] Further cleanup maven output -* [ARROW-4345](https://issues.apache.org/jira/browse/ARROW-4345) - [C++] Add Apache 2.0 license file to the Parquet-testing repository -* [ARROW-4346](https://issues.apache.org/jira/browse/ARROW-4346) - [C++] Fix compiler warnings with gcc 8.2.0 -* [ARROW-4352](https://issues.apache.org/jira/browse/ARROW-4352) - [C++] Add support for system Google Test -* [ARROW-4353](https://issues.apache.org/jira/browse/ARROW-4353) - [CI] Add jobs for 32-bit and 64-bit MinGW -* [ARROW-4358](https://issues.apache.org/jira/browse/ARROW-4358) - [Gandiva][Crossbow] Trusty build broken -* [ARROW-4361](https://issues.apache.org/jira/browse/ARROW-4361) - [Website] Update committers list -* [ARROW-4362](https://issues.apache.org/jira/browse/ARROW-4362) - [Java] Test OpenJDK 11 in CI -* [ARROW-4363](https://issues.apache.org/jira/browse/ARROW-4363) - [C++] Add CMake format checks -* [ARROW-4372](https://issues.apache.org/jira/browse/ARROW-4372) - [C++] 
Embed precompiled bitcode in the gandiva library -* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX -* [ARROW-4375](https://issues.apache.org/jira/browse/ARROW-4375) - [CI] Sphinx dependencies were removed from docs conda environment -* [ARROW-4376](https://issues.apache.org/jira/browse/ARROW-4376) - [Rust] Implement from\_buf\_reader for csv::Reader -* [ARROW-4377](https://issues.apache.org/jira/browse/ARROW-4377) - [Rust] Implement std::fmt::Debug for all PrimitiveArrays -* [ARROW-4379](https://issues.apache.org/jira/browse/ARROW-4379) - Register pyarrow serializers for collections.Counter and collections.deque. -* [ARROW-4383](https://issues.apache.org/jira/browse/ARROW-4383) - [C++] Use the CMake's standard find features -* [ARROW-4386](https://issues.apache.org/jira/browse/ARROW-4386) - [Rust] Implement Date and Time Arrays -* [ARROW-4388](https://issues.apache.org/jira/browse/ARROW-4388) - [Go] add DimNames() method to tensor Interface? -* [ARROW-4393](https://issues.apache.org/jira/browse/ARROW-4393) - [Rust] coding style: apply 90 characters per line limit -* [ARROW-4396](https://issues.apache.org/jira/browse/ARROW-4396) - Update Typedoc to support TypeScript 3.2 -* [ARROW-4397](https://issues.apache.org/jira/browse/ARROW-4397) - [C++] dim\_names in Tensor and SparseTensor -* [ARROW-4399](https://issues.apache.org/jira/browse/ARROW-4399) - [C++] Remove usage of "extern template class" from NumericArray -* [ARROW-4401](https://issues.apache.org/jira/browse/ARROW-4401) - [Python] Alpine dockerfile fails to build because pandas requires numpy as build dependency -* [ARROW-4406](https://issues.apache.org/jira/browse/ARROW-4406) - Ignore "\*\_$folder$" files on S3 -* [ARROW-4408](https://issues.apache.org/jira/browse/ARROW-4408) - [CPP/Doc] Remove outdated Parquet documentation -* [ARROW-4422](https://issues.apache.org/jira/browse/ARROW-4422) - [Plasma] Enforce memory limit in plasma, rather than relying on dlmalloc\_set\_footprint\_limit -* [ARROW-4423](https://issues.apache.org/jira/browse/ARROW-4423) - [C++] Update version of vendored gtest to 1.8.1 -* [ARROW-4424](https://issues.apache.org/jira/browse/ARROW-4424) - [Python] Manylinux CI builds failing -* [ARROW-4425](https://issues.apache.org/jira/browse/ARROW-4425) - Add link to 'Contributing' page in the top-level Arrow README -* [ARROW-4430](https://issues.apache.org/jira/browse/ARROW-4430) - [C++] add unit test for currently unused append method -* [ARROW-4431](https://issues.apache.org/jira/browse/ARROW-4431) - [C++] Build gRPC as ExternalProject without allowing it to build its vendored dependencies -* [ARROW-4435](https://issues.apache.org/jira/browse/ARROW-4435) - [C\#] Add .sln file and minor .csproj fix ups -* [ARROW-4436](https://issues.apache.org/jira/browse/ARROW-4436) - [Documentation] Clarify instructions for building documentation -* [ARROW-4442](https://issues.apache.org/jira/browse/ARROW-4442) - [JS] Overly broad type annotation for Chunked typeId leading to type mismatches in generated typing -* [ARROW-4444](https://issues.apache.org/jira/browse/ARROW-4444) - [Testing] Add DataFusion test files to arrow-testing repo -* [ARROW-4445](https://issues.apache.org/jira/browse/ARROW-4445) - [C++][Gandiva] Run Gandiva-LLVM tests in Appveyor -* [ARROW-4446](https://issues.apache.org/jira/browse/ARROW-4446) - [Python] Run Gandiva tests on Windows and Appveyor -* [ARROW-4448](https://issues.apache.org/jira/browse/ARROW-4448) - [JAVA][Flight] Flaky 
Flight java test -* [ARROW-4449](https://issues.apache.org/jira/browse/ARROW-4449) - [Rust] Convert File to T: Read + Seek for schema inference -* [ARROW-4454](https://issues.apache.org/jira/browse/ARROW-4454) - [C++] fix unused parameter warnings -* [ARROW-4455](https://issues.apache.org/jira/browse/ARROW-4455) - [Plasma] g++ 8 reports class-memaccess warnings -* [ARROW-4459](https://issues.apache.org/jira/browse/ARROW-4459) - [Testing] Add git submodule for arrow-testing data files -* [ARROW-4460](https://issues.apache.org/jira/browse/ARROW-4460) - [Website] Write blog post to announce DataFusion donation -* [ARROW-4461](https://issues.apache.org/jira/browse/ARROW-4461) - [C++] Expose bit-util methods for binary boolean operations that don't allocate -* [ARROW-4462](https://issues.apache.org/jira/browse/ARROW-4462) - [C++] Upgrade LZ4 v1.7.5 to v1.8.3 to compile with VS2017 -* [ARROW-4464](https://issues.apache.org/jira/browse/ARROW-4464) - [Rust] [DataFusion] Add support for LIMIT -* [ARROW-4466](https://issues.apache.org/jira/browse/ARROW-4466) - [Rust] [DataFusion] Add support for Parquet data sources -* [ARROW-4468](https://issues.apache.org/jira/browse/ARROW-4468) - [Rust] Implement BitAnd/BitOr for &Buffer (with SIMD) -* [ARROW-4472](https://issues.apache.org/jira/browse/ARROW-4472) - [Website][Python] Blog post about Python string memory use improvements in 0.12 -* [ARROW-4475](https://issues.apache.org/jira/browse/ARROW-4475) - [Python] Serializing objects that contain themselves -* [ARROW-4476](https://issues.apache.org/jira/browse/ARROW-4476) - [Rust] [DataFusion] Post donation clean up tasks -* [ARROW-4481](https://issues.apache.org/jira/browse/ARROW-4481) - [Website] Instructions for publishing web site are missing a step -* [ARROW-4483](https://issues.apache.org/jira/browse/ARROW-4483) - [Website] Fix broken link (author) in DataFusion blog post -* [ARROW-4485](https://issues.apache.org/jira/browse/ARROW-4485) - [CI] Determine maintenance approach to pinned conda-forge binutils package -* [ARROW-4486](https://issues.apache.org/jira/browse/ARROW-4486) - [Python][CUDA] pyarrow.cuda.Context.foreign\_buffer should have a \`base=None\` argument -* [ARROW-4488](https://issues.apache.org/jira/browse/ARROW-4488) - [Rust] From AsRef<[u8]\> for Buffer does not ensure correct padding -* [ARROW-4489](https://issues.apache.org/jira/browse/ARROW-4489) - [Rust] PrimitiveArray.value\_slice performs bounds checking when it should not -* [ARROW-4490](https://issues.apache.org/jira/browse/ARROW-4490) - [Rust] Add explicit SIMD vectorization for boolean ops in "array\_ops" -* [ARROW-4491](https://issues.apache.org/jira/browse/ARROW-4491) - [Python] Remove usage of std::to\_string and std::stoi -* [ARROW-4499](https://issues.apache.org/jira/browse/ARROW-4499) - [Python][CI] Upgrade to latest flake8 3.7.5 in travis\_lint.sh -* [ARROW-4502](https://issues.apache.org/jira/browse/ARROW-4502) - [C\#] Add support for zero-copy reads -* [ARROW-4506](https://issues.apache.org/jira/browse/ARROW-4506) - [Ruby] Add Arrow::RecordBatch\#raw\_records -* [ARROW-4513](https://issues.apache.org/jira/browse/ARROW-4513) - [Rust] Implement BitAnd/BitOr for &Bitmap -* [ARROW-4517](https://issues.apache.org/jira/browse/ARROW-4517) - [JS] remove version number as it is not used -* [ARROW-4518](https://issues.apache.org/jira/browse/ARROW-4518) - [JS] add jsdelivr to package.json -* [ARROW-4528](https://issues.apache.org/jira/browse/ARROW-4528) - [C++] Update lint docker container to LLVM-7 -* 
[ARROW-4529](https://issues.apache.org/jira/browse/ARROW-4529) - [C++] Add test coverage for BitUtils::RoundDown -* [ARROW-4531](https://issues.apache.org/jira/browse/ARROW-4531) - [C++] Handling of non-aligned slices in Sum kernel -* [ARROW-4537](https://issues.apache.org/jira/browse/ARROW-4537) - [CI] Suppress shell warning on travis-ci -* [ARROW-4539](https://issues.apache.org/jira/browse/ARROW-4539) - [Java] List vector child value count not set correctly -* [ARROW-4540](https://issues.apache.org/jira/browse/ARROW-4540) - [Rust] Add basic JSON reader -* [ARROW-4543](https://issues.apache.org/jira/browse/ARROW-4543) - [C\#] Update Flat Buffers code to latest version -* [ARROW-4546](https://issues.apache.org/jira/browse/ARROW-4546) - [C++] LICENSE.txt should be updated. -* [ARROW-4547](https://issues.apache.org/jira/browse/ARROW-4547) - [Python][Documentation] Update python/development.rst with instructions for CUDA-enabled builds -* [ARROW-4556](https://issues.apache.org/jira/browse/ARROW-4556) - [Rust] Preserve order of JSON inferred schema -* [ARROW-4558](https://issues.apache.org/jira/browse/ARROW-4558) - [C++][Flight] Avoid undefined behavior with gRPC memory optimizations -* [ARROW-4560](https://issues.apache.org/jira/browse/ARROW-4560) - [R] array() needs to take single input, not ... -* [ARROW-4562](https://issues.apache.org/jira/browse/ARROW-4562) - [C++][Flight] Create outgoing composite grpc::ByteBuffer instead of allocating contiguous slice and copying IpcPayload into it -* [ARROW-4564](https://issues.apache.org/jira/browse/ARROW-4564) - [C++] IWYU docker image silently fails -* [ARROW-4565](https://issues.apache.org/jira/browse/ARROW-4565) - [R] Reading records with all non-null decimals SEGFAULTs -* [ARROW-4568](https://issues.apache.org/jira/browse/ARROW-4568) - [C++] Add version macros to headers -* [ARROW-4572](https://issues.apache.org/jira/browse/ARROW-4572) - [C++] Remove memory zeroing from PrimitiveAllocatingUnaryKernel -* [ARROW-4583](https://issues.apache.org/jira/browse/ARROW-4583) - [Plasma] There are bugs reported by code scan tool -* [ARROW-4586](https://issues.apache.org/jira/browse/ARROW-4586) - [Rust] Remove arrow/mod.rs as it is not needed -* [ARROW-4589](https://issues.apache.org/jira/browse/ARROW-4589) - [Rust] [DataFusion] Implement projection push down query optimizer rule -* [ARROW-4590](https://issues.apache.org/jira/browse/ARROW-4590) - [Rust] Add explicit SIMD vectorization for comparison ops in "array\_ops" -* [ARROW-4592](https://issues.apache.org/jira/browse/ARROW-4592) - [GLib] Stop configure immediately when GLib isn't available -* [ARROW-4593](https://issues.apache.org/jira/browse/ARROW-4593) - [Ruby] Arrow::Array\#[out\_of\_range] returns nil -* [ARROW-4594](https://issues.apache.org/jira/browse/ARROW-4594) - [Ruby] Arrow::StructArray\#[] returns Arrow::Struct instead of Arrow::Array -* [ARROW-4595](https://issues.apache.org/jira/browse/ARROW-4595) - [Rust] [DataFusion] Implement DataFrame style API -* [ARROW-4598](https://issues.apache.org/jira/browse/ARROW-4598) - [CI] Remove needless LLVM\_DIR for macOS -* [ARROW-4599](https://issues.apache.org/jira/browse/ARROW-4599) - [C++] Add support for system GFlags -* [ARROW-4602](https://issues.apache.org/jira/browse/ARROW-4602) - [Rust] [DataFusion] Integrate query optimizer with ExecutionContext -* [ARROW-4603](https://issues.apache.org/jira/browse/ARROW-4603) - [Rust] [DataFusion] Execution context should allow in-memory data sources to be registered -* 
[ARROW-4604](https://issues.apache.org/jira/browse/ARROW-4604) - [Rust] [DataFusion] Add benchmarks for SQL query execution -* [ARROW-4605](https://issues.apache.org/jira/browse/ARROW-4605) - [Rust] Move filter and limit code from DataFusion into compute module -* [ARROW-4609](https://issues.apache.org/jira/browse/ARROW-4609) - [C++] Use google benchmark from toolchain -* [ARROW-4610](https://issues.apache.org/jira/browse/ARROW-4610) - [Plasma] Avoid JNI from crashing -* [ARROW-4611](https://issues.apache.org/jira/browse/ARROW-4611) - [C++] Rework CMake third-party logic -* [ARROW-4612](https://issues.apache.org/jira/browse/ARROW-4612) - [Python] Use cython from PyPI for windows wheels build -* [ARROW-4613](https://issues.apache.org/jira/browse/ARROW-4613) - [C++] Alpine build failing as libgtestd.so is not found -* [ARROW-4614](https://issues.apache.org/jira/browse/ARROW-4614) - [C++/CI] Activate flight build in ci/docker\_build\_cpp.sh -* [ARROW-4615](https://issues.apache.org/jira/browse/ARROW-4615) - [C++] Add checked\_pointer\_cast -* [ARROW-4616](https://issues.apache.org/jira/browse/ARROW-4616) - [C++] Log message in BuildUtils as STATUS -* [ARROW-4618](https://issues.apache.org/jira/browse/ARROW-4618) - [Docker] Makefile to build dependent docker images -* [ARROW-4619](https://issues.apache.org/jira/browse/ARROW-4619) - [R]: Fix the autobrew script -* [ARROW-4620](https://issues.apache.org/jira/browse/ARROW-4620) - [C\#] Add unit tests for "Types" in arrow/csharp -* [ARROW-4623](https://issues.apache.org/jira/browse/ARROW-4623) - [R] update Rcpp dependency -* [ARROW-4628](https://issues.apache.org/jira/browse/ARROW-4628) - [Rust] [DataFusion] Implement type coercion query optimizer rule -* [ARROW-4632](https://issues.apache.org/jira/browse/ARROW-4632) - [Ruby] Add BigDecimal\#to\_arrow -* [ARROW-4634](https://issues.apache.org/jira/browse/ARROW-4634) - [Rust] [Parquet] Reorganize test\_common mod to allow more test util codes. 
-* [ARROW-4637](https://issues.apache.org/jira/browse/ARROW-4637) - [Python] Avoid importing Pandas unless necessary -* [ARROW-4638](https://issues.apache.org/jira/browse/ARROW-4638) - [R] install instructions using brew -* [ARROW-4640](https://issues.apache.org/jira/browse/ARROW-4640) - [Python] Add docker-compose configuration to build and test the project without pandas installed -* [ARROW-4643](https://issues.apache.org/jira/browse/ARROW-4643) - [C++] Add compiler diagnostic color when using Ninja -* [ARROW-4644](https://issues.apache.org/jira/browse/ARROW-4644) - [C++/Docker] Build Gandiva in the docker containers -* [ARROW-4645](https://issues.apache.org/jira/browse/ARROW-4645) - [C++/Packaging] Ship Gandiva with OSX and Windows wheels -* [ARROW-4646](https://issues.apache.org/jira/browse/ARROW-4646) - [C++/Packaging] Ship gandiva with the conda-forge packages -* [ARROW-4655](https://issues.apache.org/jira/browse/ARROW-4655) - [Packaging] Parallelize binary upload -* [ARROW-4662](https://issues.apache.org/jira/browse/ARROW-4662) - [Python] Add type\_codes property in UnionType -* [ARROW-4667](https://issues.apache.org/jira/browse/ARROW-4667) - [C++] Suppress unused function warnings with MinGW -* [ARROW-4670](https://issues.apache.org/jira/browse/ARROW-4670) - [Rust] compute::sum performance issue -* [ARROW-4671](https://issues.apache.org/jira/browse/ARROW-4671) - [C++] MakeBuilder doesn't support Type::DICTIONARY -* [ARROW-4673](https://issues.apache.org/jira/browse/ARROW-4673) - [C++] Implement AssertDatumEquals -* [ARROW-4676](https://issues.apache.org/jira/browse/ARROW-4676) - [C++] Add support for debug build with MinGW -* [ARROW-4678](https://issues.apache.org/jira/browse/ARROW-4678) - [Rust] Minimize unstable feature usage -* [ARROW-4679](https://issues.apache.org/jira/browse/ARROW-4679) - [Rust] [DataFusion] Implement in-memory DataSource -* [ARROW-4681](https://issues.apache.org/jira/browse/ARROW-4681) - [Rust] [DataFusion] Implement parallel query execution using threads -* [ARROW-4686](https://issues.apache.org/jira/browse/ARROW-4686) - Only accept 'y' or 'n' in merge\_arrow\_pr.py prompts -* [ARROW-4689](https://issues.apache.org/jira/browse/ARROW-4689) - [Go] add support for WASM -* [ARROW-4690](https://issues.apache.org/jira/browse/ARROW-4690) - [Python] Building TensorFlow compatible wheels for Arrow -* [ARROW-4692](https://issues.apache.org/jira/browse/ARROW-4692) - [Format][Documentation] Add more details about "sidecar" to flight proto -* [ARROW-4693](https://issues.apache.org/jira/browse/ARROW-4693) - [CI] Build boost library with multi precision -* [ARROW-4697](https://issues.apache.org/jira/browse/ARROW-4697) - [C++] Add URI parsing facility -* [ARROW-4703](https://issues.apache.org/jira/browse/ARROW-4703) - [C++] Upgrade dependency versions -* [ARROW-4705](https://issues.apache.org/jira/browse/ARROW-4705) - [Rust] CSV reader should show line number and error message when failing to parse a line -* [ARROW-4707](https://issues.apache.org/jira/browse/ARROW-4707) - [C++] move BitsetStack to bit-util.h -* [ARROW-4718](https://issues.apache.org/jira/browse/ARROW-4718) - Add ArrowStreamWriter/Reader ctors that leave open the underlying Stream -* [ARROW-4727](https://issues.apache.org/jira/browse/ARROW-4727) - [Rust] Implement ability to check if two schemas are the same -* [ARROW-4730](https://issues.apache.org/jira/browse/ARROW-4730) - [C++] Add docker-compose entry for testing Fedora build with system packages -* 
[ARROW-4731](https://issues.apache.org/jira/browse/ARROW-4731) - [C++] Add docker-compose entry for testing Ubuntu Xenial build with system packages -* [ARROW-4732](https://issues.apache.org/jira/browse/ARROW-4732) - [C++] Add docker-compose entry for testing Debian Testing build with system packages -* [ARROW-4733](https://issues.apache.org/jira/browse/ARROW-4733) - [C++] Add CI entry that builds without the conda-forge toolchain but with system packages -* [ARROW-4734](https://issues.apache.org/jira/browse/ARROW-4734) - [Go] Add option to write a header for CSV writer -* [ARROW-4735](https://issues.apache.org/jira/browse/ARROW-4735) - [Go] Benchmark strconv.Format vs. fmt.Sprintf for CSV writer -* [ARROW-4739](https://issues.apache.org/jira/browse/ARROW-4739) - [Rust] [DataFusion] It should be possible to share a logical plan between threads -* [ARROW-4740](https://issues.apache.org/jira/browse/ARROW-4740) - [Java] Upgrade to JUnit 5 -* [ARROW-4743](https://issues.apache.org/jira/browse/ARROW-4743) - [Java] Fix documentation in arrow memory module -* [ARROW-4745](https://issues.apache.org/jira/browse/ARROW-4745) - [C++][Documentation] Document process for replicating static\_crt builds on windows -* [ARROW-4749](https://issues.apache.org/jira/browse/ARROW-4749) - [Rust] RecordBatch::new() should return result instead of panicking -* [ARROW-4751](https://issues.apache.org/jira/browse/ARROW-4751) - [C++] Add pkg-config to conda\_env\_cpp.yml -* [ARROW-4754](https://issues.apache.org/jira/browse/ARROW-4754) - [CI][Java] Flaky TestAuth Flight test -* [ARROW-4756](https://issues.apache.org/jira/browse/ARROW-4756) - [CI] document the procedure to update docker image for manylinux1 builds -* [ARROW-4758](https://issues.apache.org/jira/browse/ARROW-4758) - [Flight] Build fails on Mac due to missing Schema\_generated.h -* [ARROW-4769](https://issues.apache.org/jira/browse/ARROW-4769) - [Rust] Improve array limit function where max records \> len -* [ARROW-4772](https://issues.apache.org/jira/browse/ARROW-4772) - Provide new ORC adapter interface that allows the user to specify row number -* [ARROW-4776](https://issues.apache.org/jira/browse/ARROW-4776) - [C++] DictionaryBuilder should support bootstrapping from an existing dict type -* [ARROW-4777](https://issues.apache.org/jira/browse/ARROW-4777) - [C++/Python] manylinux1: Update lz4 to 1.8.3 -* [ARROW-4778](https://issues.apache.org/jira/browse/ARROW-4778) - [C++/Python] manylinux1: Update Thrift to 0.12.0 -* [ARROW-4782](https://issues.apache.org/jira/browse/ARROW-4782) - [C++] Prototype scalar and array expression types for developing deferred operator algebra -* [ARROW-4786](https://issues.apache.org/jira/browse/ARROW-4786) - [C++/Python] Support better parallelisation in manylinux1 base build -* [ARROW-4789](https://issues.apache.org/jira/browse/ARROW-4789) - [C++] Deprecate and later remove arrow::io::ReadableFileInterface -* [ARROW-4790](https://issues.apache.org/jira/browse/ARROW-4790) - [Python/Packaging] Update manylinux docker image in crossbow task -* [ARROW-4791](https://issues.apache.org/jira/browse/ARROW-4791) - Unused dependencies in arrow and datafusion -* [ARROW-4794](https://issues.apache.org/jira/browse/ARROW-4794) - [Python] Make pandas an optional test dependency -* [ARROW-4797](https://issues.apache.org/jira/browse/ARROW-4797) - [Plasma] Avoid store crash if not enough memory is available -* [ARROW-4801](https://issues.apache.org/jira/browse/ARROW-4801) - [GLib] Suppress pkgconfig.generate() warnings -* 
[ARROW-4808](https://issues.apache.org/jira/browse/ARROW-4808) - [Java][Vector] Convenience methods for setting decimal vector -* [ARROW-4812](https://issues.apache.org/jira/browse/ARROW-4812) - [Rust] [DataFusion] Table.scan() should return one iterator per partition -* [ARROW-4817](https://issues.apache.org/jira/browse/ARROW-4817) - [Rust] [DataFusion] Small re-org of modules -* [ARROW-4818](https://issues.apache.org/jira/browse/ARROW-4818) - [Rust] [DataFusion] Parquet data source does not support null values -* [ARROW-4826](https://issues.apache.org/jira/browse/ARROW-4826) - [Go] export Flush method for CSV writer -* [ARROW-4831](https://issues.apache.org/jira/browse/ARROW-4831) - [C++] CMAKE\_AR is not passed to ZSTD thirdparty dependency -* [ARROW-4833](https://issues.apache.org/jira/browse/ARROW-4833) - [Release] Document how to update the brew formula in the release management guide -* [ARROW-4834](https://issues.apache.org/jira/browse/ARROW-4834) - [R] Feature flag to disable parquet -* [ARROW-4835](https://issues.apache.org/jira/browse/ARROW-4835) - [GLib] Add boolean operations -* [ARROW-4837](https://issues.apache.org/jira/browse/ARROW-4837) - [C++] Support c++filt on a custom path in the run-test.sh script -* [ARROW-4839](https://issues.apache.org/jira/browse/ARROW-4839) - [C\#] Add NuGet support -* [ARROW-4843](https://issues.apache.org/jira/browse/ARROW-4843) - [Rust] [DataFusion] Parquet data source should support DATE -* [ARROW-4846](https://issues.apache.org/jira/browse/ARROW-4846) - [Java] Update Jackson to 2.9.8 -* [ARROW-4849](https://issues.apache.org/jira/browse/ARROW-4849) - [C++] Add docker-compose entry for testing Ubuntu Bionic build with system packages -* [ARROW-4854](https://issues.apache.org/jira/browse/ARROW-4854) - [Rust] Use Array Slice for limit kernel -* [ARROW-4855](https://issues.apache.org/jira/browse/ARROW-4855) - [Packaging] Generate default package version based on cpp tags in crossbow.py -* [ARROW-4858](https://issues.apache.org/jira/browse/ARROW-4858) - [Flight][Python] Enable custom FlightDataStream in Python -* [ARROW-4859](https://issues.apache.org/jira/browse/ARROW-4859) - [GLib] Add garrow\_numeric\_array\_mean() -* [ARROW-4862](https://issues.apache.org/jira/browse/ARROW-4862) - [GLib] Add GArrowCastOptions::allow-invalid-utf8 property -* [ARROW-4865](https://issues.apache.org/jira/browse/ARROW-4865) - [Rust] Support casting lists and primitives to lists -* [ARROW-4873](https://issues.apache.org/jira/browse/ARROW-4873) - [C++] Clarify documentation about how to use external ARROW\_PACKAGE\_PREFIX while also using CONDA dependency resolution -* [ARROW-4878](https://issues.apache.org/jira/browse/ARROW-4878) - [C++] ARROW\_DEPENDENCY\_SOURCE=CONDA does not work properly with MSVC -* [ARROW-4882](https://issues.apache.org/jira/browse/ARROW-4882) - [GLib] Add "Sum" functions -* [ARROW-4887](https://issues.apache.org/jira/browse/ARROW-4887) - [GLib] Add garrow\_array\_count() -* [ARROW-4889](https://issues.apache.org/jira/browse/ARROW-4889) - [C++] Add STATUS messages for Protobuf in CMake -* [ARROW-4891](https://issues.apache.org/jira/browse/ARROW-4891) - [C++] ZLIB include directories not added -* [ARROW-4892](https://issues.apache.org/jira/browse/ARROW-4892) - [Rust] [DataFusion] Move SQL parser and planner into sql package -* [ARROW-4893](https://issues.apache.org/jira/browse/ARROW-4893) - [C++] conda packages should 
use $PREFIX inside of conda-build -* [ARROW-4894](https://issues.apache.org/jira/browse/ARROW-4894) - [Rust] [DataFusion] Remove all uses of panic! from aggregate.rs -* [ARROW-4895](https://issues.apache.org/jira/browse/ARROW-4895) - [Rust] [DataFusion] Move error.rs to top level package -* [ARROW-4896](https://issues.apache.org/jira/browse/ARROW-4896) - [Rust] [DataFusion] Remove all uses of panic! from tests -* [ARROW-4897](https://issues.apache.org/jira/browse/ARROW-4897) - [Rust] [DataFusion] Improve Rustdoc -* [ARROW-4898](https://issues.apache.org/jira/browse/ARROW-4898) - [C++] Old versions of FindProtobuf.cmake use ALL-CAPS for variables -* [ARROW-4899](https://issues.apache.org/jira/browse/ARROW-4899) - [Rust] [DataFusion] Remove all uses of panic! from expression.rs -* [ARROW-4901](https://issues.apache.org/jira/browse/ARROW-4901) - [Go] Run tests in Appveyor -* [ARROW-4905](https://issues.apache.org/jira/browse/ARROW-4905) - [C++][Plasma] Remove dlmalloc from client library -* [ARROW-4907](https://issues.apache.org/jira/browse/ARROW-4907) - [CI] Add docker container to inspect docker context -* [ARROW-4908](https://issues.apache.org/jira/browse/ARROW-4908) - [Rust] [DataFusion] Add support for parquet date/time in int32/64 encoding -* [ARROW-4909](https://issues.apache.org/jira/browse/ARROW-4909) - [CI] Use hadolint to lint Dockerfiles -* [ARROW-4910](https://issues.apache.org/jira/browse/ARROW-4910) - [Rust] [DataFusion] Remove all uses of unimplemented! -* [ARROW-4915](https://issues.apache.org/jira/browse/ARROW-4915) - [GLib] Add support for arrow::NullBuilder -* [ARROW-4922](https://issues.apache.org/jira/browse/ARROW-4922) - [Packaging] Use system libraries for .deb and .rpm -* [ARROW-4924](https://issues.apache.org/jira/browse/ARROW-4924) - [Ruby] Add Decimal128\#to\_s(scale=nil) -* [ARROW-4925](https://issues.apache.org/jira/browse/ARROW-4925) - [Rust] [DataFusion] Remove duplicate implementations of collect\_expr -* [ARROW-4926](https://issues.apache.org/jira/browse/ARROW-4926) - [Rust] [DataFusion] Update README for 0.13.0 release -* [ARROW-4929](https://issues.apache.org/jira/browse/ARROW-4929) - [GLib] Add garrow\_array\_count\_values() -* [ARROW-4932](https://issues.apache.org/jira/browse/ARROW-4932) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE macro -* [ARROW-4933](https://issues.apache.org/jira/browse/ARROW-4933) - [R] Autodetect Parquet support using pkg-config -* [ARROW-4937](https://issues.apache.org/jira/browse/ARROW-4937) - [R] Clean pkg-config related logic -* [ARROW-4939](https://issues.apache.org/jira/browse/ARROW-4939) - [Python] Add wrapper for "sum" kernel -* [ARROW-4940](https://issues.apache.org/jira/browse/ARROW-4940) - [Rust] Enhance documentation for datafusion -* [ARROW-4944](https://issues.apache.org/jira/browse/ARROW-4944) - [C++] Raise minimal required thrift-cpp to 0.11 in conda environment -* [ARROW-4946](https://issues.apache.org/jira/browse/ARROW-4946) - [C++] Support detection of flatbuffers without FlatbuffersConfig.cmake -* [ARROW-4947](https://issues.apache.org/jira/browse/ARROW-4947) - [Flight][C++/Python] Remove redundant schema parameter in DoGet -* [ARROW-4951](https://issues.apache.org/jira/browse/ARROW-4951) - [C++] Turn off cpp benchmarks in cpp docker images -* [ARROW-4955](https://issues.apache.org/jira/browse/ARROW-4955) - [GLib] Add garrow\_file\_is\_closed() -* [ARROW-4964](https://issues.apache.org/jira/browse/ARROW-4964) - [Ruby] Add closed check if available on auto close -* 
[ARROW-4969](https://issues.apache.org/jira/browse/ARROW-4969) - [C++] Set RPATH in correct order for test executables on OSX -* [ARROW-4977](https://issues.apache.org/jira/browse/ARROW-4977) - [Ruby] Add support for building on Windows -* [ARROW-4978](https://issues.apache.org/jira/browse/ARROW-4978) - [Ruby] Fix wrong internal variable name for table data -* [ARROW-4979](https://issues.apache.org/jira/browse/ARROW-4979) - [GLib] Add missing lock to garrow::GIOInputStream -* [ARROW-4980](https://issues.apache.org/jira/browse/ARROW-4980) - [GLib] Use GInputStream as the parent of GArrowInputStream -* [ARROW-4981](https://issues.apache.org/jira/browse/ARROW-4981) - [Ruby] Add support for CSV data encoding conversion -* [ARROW-4983](https://issues.apache.org/jira/browse/ARROW-4983) - [Plasma] Unmap memory when the client is destroyed -* [ARROW-4994](https://issues.apache.org/jira/browse/ARROW-4994) - [website] Update Details for ptgoetz -* [ARROW-4995](https://issues.apache.org/jira/browse/ARROW-4995) - [R] Make sure winbuilder tests pass for package -* [ARROW-4996](https://issues.apache.org/jira/browse/ARROW-4996) - [Plasma] There are many log files in /tmp -* [ARROW-5003](https://issues.apache.org/jira/browse/ARROW-5003) - [R] remove dependency on withr -* [ARROW-5006](https://issues.apache.org/jira/browse/ARROW-5006) - [R] parquet.cpp does not include enough Rcpp -* [ARROW-5011](https://issues.apache.org/jira/browse/ARROW-5011) - [Release] Add support in the source release script for custom hash -* [ARROW-5013](https://issues.apache.org/jira/browse/ARROW-5013) - [Rust] [DataFusion] Refactor runtime expression support -* [ARROW-5014](https://issues.apache.org/jira/browse/ARROW-5014) - [Java] Fix typos in Flight module -* [ARROW-5018](https://issues.apache.org/jira/browse/ARROW-5018) - [Release] Include JavaScript implementation -* [ARROW-5032](https://issues.apache.org/jira/browse/ARROW-5032) - [C++] Headers in vendored/datetime directory aren't installed -* [ARROW-5041](https://issues.apache.org/jira/browse/ARROW-5041) - [Release][C++] use bundled gtest and gmock in verify-release-candidate.bat -* [ARROW-5075](https://issues.apache.org/jira/browse/ARROW-5075) - [Release] Add 0.13.0 release note -* [ARROW-5084](https://issues.apache.org/jira/browse/ARROW-5084) - [Website] Blog post / release announcement for 0.13.0 -* [PARQUET-1477](https://issues.apache.org/jira/browse/PARQUET-1477) - Thrift crypto updates -* [PARQUET-1508](https://issues.apache.org/jira/browse/PARQUET-1508) - [C++] Enable reading from ByteArray and FixedLenByteArray decoders directly into arrow::BinaryBuilder or arrow::BinaryDictionaryBuilder -* [PARQUET-1519](https://issues.apache.org/jira/browse/PARQUET-1519) - [C++] Remove use of "extern template class" from parquet/column\_reader.h -* [PARQUET-1521](https://issues.apache.org/jira/browse/PARQUET-1521) - [C++] Do not use "extern template class" with parquet::ColumnWriter -* [PARQUET-1525](https://issues.apache.org/jira/browse/PARQUET-1525) - [C++] remove dependency on getopt in parquet tools - - - -# Apache Arrow 0.12.1 (2019-02-25) - -## Bug Fixes - -* [ARROW-3564](https://issues.apache.org/jira/browse/ARROW-3564) - [Python] writing version 2.0 parquet format with dictionary encoding enabled -* [ARROW-4255](https://issues.apache.org/jira/browse/ARROW-4255) - [C++] Schema::GetFieldIndex is not thread-safe -* [ARROW-4267](https://issues.apache.org/jira/browse/ARROW-4267) - [Python/C++][Parquet] Segfault when reading rowgroups with duplicated columns -* 
[ARROW-4323](https://issues.apache.org/jira/browse/ARROW-4323) - [Packaging] Fix failing OSX clang conda forge builds -* [ARROW-4367](https://issues.apache.org/jira/browse/ARROW-4367) - [C++] StringDictionaryBuilder segfaults on Finish with only null entries -* [ARROW-4374](https://issues.apache.org/jira/browse/ARROW-4374) - [C++] DictionaryBuilder does not correctly report length and null\_count -* [ARROW-4492](https://issues.apache.org/jira/browse/ARROW-4492) - [Python] Failure reading Parquet column as pandas Categorical in 0.12 -* [ARROW-4501](https://issues.apache.org/jira/browse/ARROW-4501) - [C++] Unique returns non-unique strings -* [ARROW-4582](https://issues.apache.org/jira/browse/ARROW-4582) - [C++/Python] Memory corruption on Pandas-\>Arrow conversion -* [ARROW-4629](https://issues.apache.org/jira/browse/ARROW-4629) - [Python] Pandas to arrow conversion slowed down by local imports -* [ARROW-4636](https://issues.apache.org/jira/browse/ARROW-4636) - [Python/Packaging] Crossbow builds for conda-osx fail on upload with Ruby linkage errors -* [ARROW-4647](https://issues.apache.org/jira/browse/ARROW-4647) - [Packaging] dev/release/00-prepare.sh fails for minor version changes - - -## New Features and Improvements - -* [ARROW-4291](https://issues.apache.org/jira/browse/ARROW-4291) - [Dev] Support selecting features in release scripts -* [ARROW-4298](https://issues.apache.org/jira/browse/ARROW-4298) - [Java] Building Flight fails with OpenJDK 11 -* [ARROW-4373](https://issues.apache.org/jira/browse/ARROW-4373) - [Packaging] Travis fails to deploy conda packages on OSX - - - -# Apache Arrow 0.12.0 (2019-01-20) - -## New Features and Improvements - -* [ARROW-45](https://issues.apache.org/jira/browse/ARROW-45) - [Python] Add unnest/flatten function for List types -* [ARROW-536](https://issues.apache.org/jira/browse/ARROW-536) - [C++] Provide non-SSE4 versions of functions that use CPU intrinsics for older processors -* [ARROW-554](https://issues.apache.org/jira/browse/ARROW-554) - [C++] Implement functions to conform unequal dictionaries amongst multiple Arrow arrays -* [ARROW-766](https://issues.apache.org/jira/browse/ARROW-766) - [C++] Introduce zero-copy "StringPiece" type -* [ARROW-854](https://issues.apache.org/jira/browse/ARROW-854) - [Format] Support sparse tensor -* [ARROW-912](https://issues.apache.org/jira/browse/ARROW-912) - [Python] Account for multiarch systems in development.rst -* [ARROW-1019](https://issues.apache.org/jira/browse/ARROW-1019) - [C++] Implement input stream and output stream with Gzip codec -* [ARROW-1055](https://issues.apache.org/jira/browse/ARROW-1055) - [C++] GPU support library development -* [ARROW-1262](https://issues.apache.org/jira/browse/ARROW-1262) - [Packaging] Packaging automation in arrow-dist -* [ARROW-1423](https://issues.apache.org/jira/browse/ARROW-1423) - [C++] Create non-owned CudaContext from context handle provided by thirdparty user -* [ARROW-1492](https://issues.apache.org/jira/browse/ARROW-1492) - [C++] Type casting function kernel suite -* [ARROW-1688](https://issues.apache.org/jira/browse/ARROW-1688) - [Java] Fail build on checkstyle warnings -* [ARROW-1696](https://issues.apache.org/jira/browse/ARROW-1696) - [C++] Add codec benchmarks -* [ARROW-1822](https://issues.apache.org/jira/browse/ARROW-1822) - [C++] Add SSE4.2-accelerated hash kernels and use if host CPU supports -* [ARROW-1993](https://issues.apache.org/jira/browse/ARROW-1993) - [Python] Add function for determining implied Arrow schema from pandas.DataFrame -* 
[ARROW-1994](https://issues.apache.org/jira/browse/ARROW-1994) - [Python] Test against Pandas master -* [ARROW-2183](https://issues.apache.org/jira/browse/ARROW-2183) - [C++] Add helper CMake function for globbing the right header files -* [ARROW-2211](https://issues.apache.org/jira/browse/ARROW-2211) - [C++] Use simpler hash functions for integers -* [ARROW-2216](https://issues.apache.org/jira/browse/ARROW-2216) - [CI] CI descriptions and envars are misleading -* [ARROW-2337](https://issues.apache.org/jira/browse/ARROW-2337) - [Scripts] Windows release verification script should use boost DSOs instead of static linkage -* [ARROW-2374](https://issues.apache.org/jira/browse/ARROW-2374) - [Rust] Add support for array of List -* [ARROW-2475](https://issues.apache.org/jira/browse/ARROW-2475) - [Format] Confusing array length description -* [ARROW-2476](https://issues.apache.org/jira/browse/ARROW-2476) - [Python/Question] Maximum length of an Array created from ndarray -* [ARROW-2483](https://issues.apache.org/jira/browse/ARROW-2483) - [Rust] use bit-packing for boolean vectors -* [ARROW-2504](https://issues.apache.org/jira/browse/ARROW-2504) - [Website] Add ApacheCon NA link -* [ARROW-2535](https://issues.apache.org/jira/browse/ARROW-2535) - [Python] Provide pre-commit hooks that check flake8 -* [ARROW-2560](https://issues.apache.org/jira/browse/ARROW-2560) - [Rust] The Rust README should include Rust-specific information on contributing -* [ARROW-2624](https://issues.apache.org/jira/browse/ARROW-2624) - [Python] Random schema and data generator for Arrow conversion and Parquet testing -* [ARROW-2637](https://issues.apache.org/jira/browse/ARROW-2637) - [C++/Python] Build support and instructions for development on Alpine Linux -* [ARROW-2648](https://issues.apache.org/jira/browse/ARROW-2648) - [Packaging] Follow up packaging tasks -* [ARROW-2653](https://issues.apache.org/jira/browse/ARROW-2653) - [C++] Refactor hash table support -* [ARROW-2670](https://issues.apache.org/jira/browse/ARROW-2670) - [C++/Python] Add Ubuntu 18.04 / gcc7 as a nightly build -* [ARROW-2673](https://issues.apache.org/jira/browse/ARROW-2673) - [Python] Add documentation + docstring for ARROW-2661 -* [ARROW-2684](https://issues.apache.org/jira/browse/ARROW-2684) - [Python] Various documentation improvements -* [ARROW-2712](https://issues.apache.org/jira/browse/ARROW-2712) - [C\#] Initial C\# .NET library -* [ARROW-2720](https://issues.apache.org/jira/browse/ARROW-2720) - [C++] Clean up cmake CXX\_STANDARD and PIC flag setting -* [ARROW-2759](https://issues.apache.org/jira/browse/ARROW-2759) - Export notification socket of Plasma -* [ARROW-2803](https://issues.apache.org/jira/browse/ARROW-2803) - [C++] Put hashing function into src/arrow/util -* [ARROW-2807](https://issues.apache.org/jira/browse/ARROW-2807) - [Python] Enable memory-mapping to be toggled in get\_reader when reading Parquet files -* [ARROW-2808](https://issues.apache.org/jira/browse/ARROW-2808) - [Python] Add unit tests for ProxyMemoryPool, enable new default MemoryPool to be constructed -* [ARROW-2919](https://issues.apache.org/jira/browse/ARROW-2919) - [C++] Improve error message when listing empty HDFS file -* [ARROW-2968](https://issues.apache.org/jira/browse/ARROW-2968) - [R] Multi-threaded conversion from Arrow table to R data.frame -* [ARROW-2995](https://issues.apache.org/jira/browse/ARROW-2995) - [CI] Build Python libraries in same run when running C++ unit tests so project does not need to be rebuilt again right away -* 
[ARROW-3020](https://issues.apache.org/jira/browse/ARROW-3020) - [Python] Addition of option to allow empty Parquet row groups -* [ARROW-3038](https://issues.apache.org/jira/browse/ARROW-3038) - [Go] add support for StringArray -* [ARROW-3063](https://issues.apache.org/jira/browse/ARROW-3063) - [Go] move list of supported/TODO features to confluence -* [ARROW-3070](https://issues.apache.org/jira/browse/ARROW-3070) - [Release] Host binary artifacts for RCs and releases on ASF Bintray account instead of dist/mirror system -* [ARROW-3108](https://issues.apache.org/jira/browse/ARROW-3108) - [C++] arrow::PrettyPrint for Table instances -* [ARROW-3126](https://issues.apache.org/jira/browse/ARROW-3126) - [Python] Make Buffered\* IO classes available to Python, incorporate into input\_stream, output\_stream factory functions -* [ARROW-3131](https://issues.apache.org/jira/browse/ARROW-3131) - [Go] add test for Go-1.11 -* [ARROW-3161](https://issues.apache.org/jira/browse/ARROW-3161) - [Packaging] Ensure to run pyarrow unit tests in conda and wheel builds -* [ARROW-3169](https://issues.apache.org/jira/browse/ARROW-3169) - [C++] Break array-test.cc and array.cc into multiple compilation units -* [ARROW-3184](https://issues.apache.org/jira/browse/ARROW-3184) - [C++] Add modular build targets, "all" target, and require explicit target when invoking make or ninja -* [ARROW-3194](https://issues.apache.org/jira/browse/ARROW-3194) - [Java] Fix setValueCount in spitAndTransfer for variable width vectors -* [ARROW-3199](https://issues.apache.org/jira/browse/ARROW-3199) - [Plasma] Check for EAGAIN in recvmsg and sendmsg -* [ARROW-3209](https://issues.apache.org/jira/browse/ARROW-3209) - [C++] Rename libarrow\_gpu to libarrow\_cuda -* [ARROW-3230](https://issues.apache.org/jira/browse/ARROW-3230) - [Python] Missing comparisons on ChunkedArray, Table -* [ARROW-3233](https://issues.apache.org/jira/browse/ARROW-3233) - [Python] Sphinx documentation for pyarrow.cuda GPU support -* [ARROW-3248](https://issues.apache.org/jira/browse/ARROW-3248) - [C++] Arrow tests should have label "arrow" -* [ARROW-3254](https://issues.apache.org/jira/browse/ARROW-3254) - [C++] Add option to ADD\_ARROW\_TEST to compose a test executable from multiple .cc files containing unit tests -* [ARROW-3260](https://issues.apache.org/jira/browse/ARROW-3260) - [CI] Make linting a separate job -* [ARROW-3272](https://issues.apache.org/jira/browse/ARROW-3272) - [Java] Document checkstyle deviations from Google style guide -* [ARROW-3273](https://issues.apache.org/jira/browse/ARROW-3273) - [Java] checkstyle - fix javadoc style -* [ARROW-3278](https://issues.apache.org/jira/browse/ARROW-3278) - [Python] Retrieve StructType's and StructArray's field by name -* [ARROW-3291](https://issues.apache.org/jira/browse/ARROW-3291) - [C++] Convenience API for constructing arrow::io::BufferReader from std::string -* [ARROW-3293](https://issues.apache.org/jira/browse/ARROW-3293) - [C++] Test Flight RPC in Travis CI -* [ARROW-3296](https://issues.apache.org/jira/browse/ARROW-3296) - [Python] Add Flight support to manylinux1 wheels -* [ARROW-3303](https://issues.apache.org/jira/browse/ARROW-3303) - [C++] Enable example arrays to be written with a simplified JSON representation -* [ARROW-3306](https://issues.apache.org/jira/browse/ARROW-3306) - [R] Objects and support functions different kinds of arrow::Buffer -* [ARROW-3307](https://issues.apache.org/jira/browse/ARROW-3307) - [R] Convert chunked arrow::Column to R vector -* 
[ARROW-3310](https://issues.apache.org/jira/browse/ARROW-3310) - [R] Create wrapper classes for various Arrow IO interfaces -* [ARROW-3312](https://issues.apache.org/jira/browse/ARROW-3312) - [R] Use same .clang-format file for both R binding C++ code and main C++ codebase -* [ARROW-3315](https://issues.apache.org/jira/browse/ARROW-3315) - [R] Support for multi-threaded conversions from RecordBatch, Table to R data.frame -* [ARROW-3318](https://issues.apache.org/jira/browse/ARROW-3318) - [C++] Convenience method for reading all batches from an IPC stream or file as arrow::Table -* [ARROW-3323](https://issues.apache.org/jira/browse/ARROW-3323) - [Java] checkstyle - fix naming -* [ARROW-3331](https://issues.apache.org/jira/browse/ARROW-3331) - [C++] Add re2 to ThirdpartyToolchain -* [ARROW-3340](https://issues.apache.org/jira/browse/ARROW-3340) - [R] support for dates and time classes -* [ARROW-3347](https://issues.apache.org/jira/browse/ARROW-3347) - [Rust] Implement PrimitiveArrayBuilder -* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels -* [ARROW-3355](https://issues.apache.org/jira/browse/ARROW-3355) - [R] Support for factors -* [ARROW-3358](https://issues.apache.org/jira/browse/ARROW-3358) - [Gandiva][C++] Replace usages of gandiva/status.h with arrow/status.h -* [ARROW-3362](https://issues.apache.org/jira/browse/ARROW-3362) - [R] Guard against null buffers -* [ARROW-3366](https://issues.apache.org/jira/browse/ARROW-3366) - [R] Dockerfile for docker-compose setup -* [ARROW-3368](https://issues.apache.org/jira/browse/ARROW-3368) - [Integration/CI/Python] Add dask integration test to docker-compose setup -* [ARROW-3380](https://issues.apache.org/jira/browse/ARROW-3380) - [Python] Support reading CSV files and more from a gzipped file -* [ARROW-3381](https://issues.apache.org/jira/browse/ARROW-3381) - [C++] Implement InputStream for bz2 files -* [ARROW-3383](https://issues.apache.org/jira/browse/ARROW-3383) - [Java] Run Gandiva tests in Travis CI -* [ARROW-3384](https://issues.apache.org/jira/browse/ARROW-3384) - [Gandiva] Sync remaining commits from gandiva repo -* [ARROW-3385](https://issues.apache.org/jira/browse/ARROW-3385) - [Java] [Gandiva] Deploy gandiva snapshot jars automatically -* [ARROW-3387](https://issues.apache.org/jira/browse/ARROW-3387) - [C++] Function to cast binary to string/utf8 with UTF8 validation -* [ARROW-3398](https://issues.apache.org/jira/browse/ARROW-3398) - [Rust] Update existing Builder to use MutableBuffer internally -* [ARROW-3402](https://issues.apache.org/jira/browse/ARROW-3402) - [Gandiva][C++] Utilize common bitmap operation implementations in precompiled IR routines -* [ARROW-3407](https://issues.apache.org/jira/browse/ARROW-3407) - [C++] Add UTF8 conversion modes in CSV reader conversion options -* [ARROW-3409](https://issues.apache.org/jira/browse/ARROW-3409) - [C++] Add streaming compression interfaces -* [ARROW-3421](https://issues.apache.org/jira/browse/ARROW-3421) - [C++] Add include-what-you-use setup to primary docker-compose.yml -* [ARROW-3427](https://issues.apache.org/jira/browse/ARROW-3427) - [C++] Add Windows support, Unix static libs for double-conversion package in conda-forge -* [ARROW-3429](https://issues.apache.org/jira/browse/ARROW-3429) - [Packaging] Add a script to release binaries that use source archive at dist.apache.org -* [ARROW-3430](https://issues.apache.org/jira/browse/ARROW-3430) - [Packaging] Add workaround to verify 0.11.0 -* 
[ARROW-3431](https://issues.apache.org/jira/browse/ARROW-3431) - [GLib] Include Gemfile to archive -* [ARROW-3432](https://issues.apache.org/jira/browse/ARROW-3432) - [Packaging] Variables aren't expanded Subversion commit message -* [ARROW-3433](https://issues.apache.org/jira/browse/ARROW-3433) - [C++] Validate re2 with Windows toolchain, EP -* [ARROW-3439](https://issues.apache.org/jira/browse/ARROW-3439) - [R] R language bindings for Feather format -* [ARROW-3440](https://issues.apache.org/jira/browse/ARROW-3440) - [Gandiva][C++] Remove outdated cpp/src/gandiva/README.md, add build documentation to cpp/README.md -* [ARROW-3441](https://issues.apache.org/jira/browse/ARROW-3441) - [Gandiva][C++] Produce fewer test executables -* [ARROW-3442](https://issues.apache.org/jira/browse/ARROW-3442) - [C++] Use dynamic linking for unit tests, ensure coverage working properly with clang -* [ARROW-3450](https://issues.apache.org/jira/browse/ARROW-3450) - [R] Wrap MemoryMappedFile class -* [ARROW-3451](https://issues.apache.org/jira/browse/ARROW-3451) - [Python] Allocate CUDA memory from a CUcontext created by numba.cuda -* [ARROW-3455](https://issues.apache.org/jira/browse/ARROW-3455) - [Gandiva][C++] Support pkg-config for Gandiva -* [ARROW-3456](https://issues.apache.org/jira/browse/ARROW-3456) - [CI] Reuse docker images and optimize docker-compose containers -* [ARROW-3460](https://issues.apache.org/jira/browse/ARROW-3460) - [Packaging] Add a script to rebase master on local release branch -* [ARROW-3461](https://issues.apache.org/jira/browse/ARROW-3461) - [Packaging] Add a script to upload RC artifacts as the official release -* [ARROW-3462](https://issues.apache.org/jira/browse/ARROW-3462) - [Packaging] Update CHANGELOG for 0.11.0 -* [ARROW-3463](https://issues.apache.org/jira/browse/ARROW-3463) - [Website] Update for 0.11.0 -* [ARROW-3464](https://issues.apache.org/jira/browse/ARROW-3464) - [Packaging] Build shared libraries for gandiva fat JAR via crossbow -* [ARROW-3465](https://issues.apache.org/jira/browse/ARROW-3465) - [Documentation] Fix gen\_apidocs' docker image -* [ARROW-3469](https://issues.apache.org/jira/browse/ARROW-3469) - [Gandiva] add travis entry for gandiva on OSX -* [ARROW-3472](https://issues.apache.org/jira/browse/ARROW-3472) - [Gandiva] remove gandiva helpers library -* [ARROW-3473](https://issues.apache.org/jira/browse/ARROW-3473) - [Format] Update Layout.md document to clarify use of 64-bit array lengths -* [ARROW-3474](https://issues.apache.org/jira/browse/ARROW-3474) - [GLib] Extend gparquet API with get\_schema and read\_column -* [ARROW-3479](https://issues.apache.org/jira/browse/ARROW-3479) - [R] Support to write record\_batch as stream -* [ARROW-3482](https://issues.apache.org/jira/browse/ARROW-3482) - [C++] Build with JEMALLOC by default -* [ARROW-3487](https://issues.apache.org/jira/browse/ARROW-3487) - [Gandiva] simplify NULL\_IF\_NULL functions that can return errors -* [ARROW-3488](https://issues.apache.org/jira/browse/ARROW-3488) - [Packaging] Separate crossbow task definition files for packaging and tests -* [ARROW-3489](https://issues.apache.org/jira/browse/ARROW-3489) - [Gandiva] Support for in expressions -* [ARROW-3490](https://issues.apache.org/jira/browse/ARROW-3490) - [R] streaming arrow objects to output streams -* [ARROW-3492](https://issues.apache.org/jira/browse/ARROW-3492) - [C++] Build jemalloc in parallel -* [ARROW-3493](https://issues.apache.org/jira/browse/ARROW-3493) - [Java] Document BOUNDS\_CHECKING\_ENABLED -* 
[ARROW-3499](https://issues.apache.org/jira/browse/ARROW-3499) - [R] Expose arrow::ipc::Message type -* [ARROW-3501](https://issues.apache.org/jira/browse/ARROW-3501) - [Gandiva] Enable building with gcc 4.8.x on Ubuntu Trusty, similar distros -* [ARROW-3504](https://issues.apache.org/jira/browse/ARROW-3504) - [Plasma] Add support for Plasma Client to put/get raw bytes without pyarrow serialization. -* [ARROW-3505](https://issues.apache.org/jira/browse/ARROW-3505) - [R] Read record batch and table -* [ARROW-3506](https://issues.apache.org/jira/browse/ARROW-3506) - [Packaging] Nightly tests for docker-compose images -* [ARROW-3508](https://issues.apache.org/jira/browse/ARROW-3508) - [C++] Build against double-conversion from conda-forge -* [ARROW-3515](https://issues.apache.org/jira/browse/ARROW-3515) - Introduce NumericTensor class -* [ARROW-3518](https://issues.apache.org/jira/browse/ARROW-3518) - [C++] Detect HOMEBREW\_PREFIX automatically -* [ARROW-3519](https://issues.apache.org/jira/browse/ARROW-3519) - [Gandiva] Add support for functions that can return variable len output -* [ARROW-3521](https://issues.apache.org/jira/browse/ARROW-3521) - [GLib] Run Python using find\_program in meson.build -* [ARROW-3529](https://issues.apache.org/jira/browse/ARROW-3529) - [Ruby] Import Red Parquet -* [ARROW-3530](https://issues.apache.org/jira/browse/ARROW-3530) - [Java/Python] Add conversion for pyarrow.Schema from org.apache…pojo.Schema -* [ARROW-3533](https://issues.apache.org/jira/browse/ARROW-3533) - [Python/Documentation] Use sphinx\_rtd\_theme instead of Bootstrap -* [ARROW-3536](https://issues.apache.org/jira/browse/ARROW-3536) - [C++] Fast UTF8 validation functions -* [ARROW-3537](https://issues.apache.org/jira/browse/ARROW-3537) - [Rust] Implement Tensor Type -* [ARROW-3539](https://issues.apache.org/jira/browse/ARROW-3539) - [CI/Packaging] Update scripts to build against vendored jemalloc -* [ARROW-3540](https://issues.apache.org/jira/browse/ARROW-3540) - [Rust] Incorporate BooleanArray into PrimitiveArray -* [ARROW-3542](https://issues.apache.org/jira/browse/ARROW-3542) - [C++] Use unsafe appends when building array from CSV -* [ARROW-3545](https://issues.apache.org/jira/browse/ARROW-3545) - [C++/Python] Normalize child/field terminology with StructType -* [ARROW-3547](https://issues.apache.org/jira/browse/ARROW-3547) - [R] Protect against Null crash when reading from RecordBatch -* [ARROW-3548](https://issues.apache.org/jira/browse/ARROW-3548) - Speed up storing small objects in the object store. -* [ARROW-3551](https://issues.apache.org/jira/browse/ARROW-3551) - Change MapD to OmniSci on Powered By page -* [ARROW-3553](https://issues.apache.org/jira/browse/ARROW-3553) - [R] Error when losing data on int64, uint64 conversions to double -* [ARROW-3555](https://issues.apache.org/jira/browse/ARROW-3555) - [Plasma] Unify plasma client get function using metadata. -* [ARROW-3556](https://issues.apache.org/jira/browse/ARROW-3556) - [CI] Disable optimizations on Windows -* [ARROW-3557](https://issues.apache.org/jira/browse/ARROW-3557) - [Python] Set language\_level in Cython sources -* [ARROW-3558](https://issues.apache.org/jira/browse/ARROW-3558) - [Plasma] Remove fatal error when plasma client calls get on an unsealed object that it created. -* [ARROW-3559](https://issues.apache.org/jira/browse/ARROW-3559) - Statically link libraries for plasma\_store\_server executable. 
-* [ARROW-3562](https://issues.apache.org/jira/browse/ARROW-3562) - [R] Disallow creation of objects with null shared\_ptr -* [ARROW-3563](https://issues.apache.org/jira/browse/ARROW-3563) - [C++] Declare public link dependencies so arrow\_static, plasma\_static automatically pull in transitive dependencies -* [ARROW-3566](https://issues.apache.org/jira/browse/ARROW-3566) - Clarify that the type of dictionary encoded field should be the encoded(index) type -* [ARROW-3567](https://issues.apache.org/jira/browse/ARROW-3567) - [Gandiva] [GLib] Add GLib bindings of Gandiva -* [ARROW-3568](https://issues.apache.org/jira/browse/ARROW-3568) - [Packaging] Run pyarrow unittests for windows wheels -* [ARROW-3569](https://issues.apache.org/jira/browse/ARROW-3569) - [Packaging] Run pyarrow unittests when building conda package -* [ARROW-3574](https://issues.apache.org/jira/browse/ARROW-3574) - Fix remaining bug with plasma static versus shared libraries. -* [ARROW-3575](https://issues.apache.org/jira/browse/ARROW-3575) - [Python] New documentation page for CSV reader -* [ARROW-3576](https://issues.apache.org/jira/browse/ARROW-3576) - [Python] Expose compressed file readers as NativeFile -* [ARROW-3577](https://issues.apache.org/jira/browse/ARROW-3577) - [Go] add support for ChunkedArray -* [ARROW-3581](https://issues.apache.org/jira/browse/ARROW-3581) - [Gandiva][C++] ARROW\_PROTOBUF\_USE\_SHARED isn't used -* [ARROW-3582](https://issues.apache.org/jira/browse/ARROW-3582) - [CI] Gandiva C++ build is always triggered -* [ARROW-3583](https://issues.apache.org/jira/browse/ARROW-3583) - [Python/Java] Create RecordBatch from VectorSchemaRoot -* [ARROW-3584](https://issues.apache.org/jira/browse/ARROW-3584) - [Go] add support for Table -* [ARROW-3587](https://issues.apache.org/jira/browse/ARROW-3587) - [Python] Efficient serialization for Arrow Objects (array, table, tensor, etc) -* [ARROW-3588](https://issues.apache.org/jira/browse/ARROW-3588) - [Java] checkstyle - fix license -* [ARROW-3589](https://issues.apache.org/jira/browse/ARROW-3589) - [Gandiva] Make it possible to compile gandiva without JNI -* [ARROW-3591](https://issues.apache.org/jira/browse/ARROW-3591) - [R] Support to collect decimal type -* [ARROW-3592](https://issues.apache.org/jira/browse/ARROW-3592) - [Python] Get BinaryArray value as zero copy memory view -* [ARROW-3597](https://issues.apache.org/jira/browse/ARROW-3597) - [Gandiva] gandiva should integrate with ADD\_ARROW\_TEST for tests -* [ARROW-3600](https://issues.apache.org/jira/browse/ARROW-3600) - [Packaging] Support Ubuntu 18.10 -* [ARROW-3601](https://issues.apache.org/jira/browse/ARROW-3601) - [Rust] Release 0.11.0 -* [ARROW-3602](https://issues.apache.org/jira/browse/ARROW-3602) - [Gandiva] [Python] Add preliminary Cython bindings for Gandiva -* [ARROW-3603](https://issues.apache.org/jira/browse/ARROW-3603) - [Gandiva][C++] Can't build with vendored Boost -* [ARROW-3605](https://issues.apache.org/jira/browse/ARROW-3605) - Remove AE library from plasma header files. 
-* [ARROW-3607](https://issues.apache.org/jira/browse/ARROW-3607) - [Java] delete() method via JNI for plasma -* [ARROW-3608](https://issues.apache.org/jira/browse/ARROW-3608) - [R] Support for time32 and time64 array types -* [ARROW-3609](https://issues.apache.org/jira/browse/ARROW-3609) - [Gandiva] Move benchmark tests out of unit test -* [ARROW-3610](https://issues.apache.org/jira/browse/ARROW-3610) - [C++] Add interface to turn stl\_allocator into arrow::MemoryPool -* [ARROW-3611](https://issues.apache.org/jira/browse/ARROW-3611) - Give error more quickly when pyarrow serialization context is used incorrectly. -* [ARROW-3612](https://issues.apache.org/jira/browse/ARROW-3612) - [Go] implement RecordBatch and RecordBatchReader -* [ARROW-3615](https://issues.apache.org/jira/browse/ARROW-3615) - [R] Support for NaN -* [ARROW-3616](https://issues.apache.org/jira/browse/ARROW-3616) - [Java] checkstyle - fix remaining coding checks -* [ARROW-3618](https://issues.apache.org/jira/browse/ARROW-3618) - [Packaging/Documentation] Add \`-c conda-forge\` option to avoid PackagesNotFoundError -* [ARROW-3620](https://issues.apache.org/jira/browse/ARROW-3620) - [Python] Document multithreading options in Sphinx and add to api.rst -* [ARROW-3621](https://issues.apache.org/jira/browse/ARROW-3621) - [Go] implement TableBatchReader -* [ARROW-3622](https://issues.apache.org/jira/browse/ARROW-3622) - [Go] implement Schema.Equal -* [ARROW-3623](https://issues.apache.org/jira/browse/ARROW-3623) - [Go] implement Field.Equal -* [ARROW-3624](https://issues.apache.org/jira/browse/ARROW-3624) - [Python/C++] Support for zero-sized device buffers -* [ARROW-3625](https://issues.apache.org/jira/browse/ARROW-3625) - [Go] add examples for Table, Record and {Table,Record}Reader -* [ARROW-3626](https://issues.apache.org/jira/browse/ARROW-3626) - [Go] add a CSV TableReader -* [ARROW-3627](https://issues.apache.org/jira/browse/ARROW-3627) - [Go] add RecordBatchBuilder -* [ARROW-3629](https://issues.apache.org/jira/browse/ARROW-3629) - [Python] Add write\_to\_dataset to Python Sphinx API listing -* [ARROW-3630](https://issues.apache.org/jira/browse/ARROW-3630) - [Plasma] [GLib] Add GLib bindings of Plasma -* [ARROW-3632](https://issues.apache.org/jira/browse/ARROW-3632) - [Packaging] Update deb names in dev/tasks/tasks.yml in dev/release/00-prepare.sh -* [ARROW-3633](https://issues.apache.org/jira/browse/ARROW-3633) - [Packaging] Update deb names in dev/tasks/tasks.yml for 0.12.0 -* [ARROW-3636](https://issues.apache.org/jira/browse/ARROW-3636) - [C++/Python] Update arrow/python/pyarrow\_api.h -* [ARROW-3638](https://issues.apache.org/jira/browse/ARROW-3638) - [C++][Python] Move reading from Feather as Table feature to C++ from Python -* [ARROW-3639](https://issues.apache.org/jira/browse/ARROW-3639) - [Packaging] Run gandiva nightly packaging tasks -* [ARROW-3640](https://issues.apache.org/jira/browse/ARROW-3640) - [Go] add support for Tensors -* [ARROW-3641](https://issues.apache.org/jira/browse/ARROW-3641) - [C++/Python] remove public keyword from Cython api functions -* [ARROW-3642](https://issues.apache.org/jira/browse/ARROW-3642) - [C++] Add arrowConfig.cmake generation -* [ARROW-3644](https://issues.apache.org/jira/browse/ARROW-3644) - [Rust] Implement ListArrayBuilder -* [ARROW-3645](https://issues.apache.org/jira/browse/ARROW-3645) - [Python] Document compression support in Sphinx -* [ARROW-3646](https://issues.apache.org/jira/browse/ARROW-3646) - [Python] Add convenience factories to create IO streams -* 
[ARROW-3647](https://issues.apache.org/jira/browse/ARROW-3647) - [R] Crash after unloading bit64 package -* [ARROW-3648](https://issues.apache.org/jira/browse/ARROW-3648) - [Plasma] Add API to get metadata and data at the same time -* [ARROW-3649](https://issues.apache.org/jira/browse/ARROW-3649) - [Rust] Refactor MutableBuffer's resize -* [ARROW-3656](https://issues.apache.org/jira/browse/ARROW-3656) - [C++] Allow whitespace in numeric CSV fields -* [ARROW-3657](https://issues.apache.org/jira/browse/ARROW-3657) - [R] Require bit64 package -* [ARROW-3659](https://issues.apache.org/jira/browse/ARROW-3659) - [C++] Clang Travis build (matrix entry 2) might not actually be using clang -* [ARROW-3660](https://issues.apache.org/jira/browse/ARROW-3660) - [C++] Don't unnecessarily lock MemoryMappedFile for resizing in readonly files -* [ARROW-3661](https://issues.apache.org/jira/browse/ARROW-3661) - [Gandiva][GLib] Improve constant name -* [ARROW-3662](https://issues.apache.org/jira/browse/ARROW-3662) - [C++] Add a const overload to MemoryMappedFile::GetSize -* [ARROW-3664](https://issues.apache.org/jira/browse/ARROW-3664) - [Rust] Add benchmark for PrimitiveArrayBuilder -* [ARROW-3665](https://issues.apache.org/jira/browse/ARROW-3665) - [Rust] Implement StructArrayBuilder -* [ARROW-3666](https://issues.apache.org/jira/browse/ARROW-3666) - [C++] Improve CSV parser performance -* [ARROW-3672](https://issues.apache.org/jira/browse/ARROW-3672) - [Go] implement Time32 array -* [ARROW-3673](https://issues.apache.org/jira/browse/ARROW-3673) - [Go] implement Time64 array -* [ARROW-3674](https://issues.apache.org/jira/browse/ARROW-3674) - [Go] implement Date32 array -* [ARROW-3675](https://issues.apache.org/jira/browse/ARROW-3675) - [Go] implement Date64 array -* [ARROW-3677](https://issues.apache.org/jira/browse/ARROW-3677) - [Go] implement FixedSizedBinary array -* [ARROW-3681](https://issues.apache.org/jira/browse/ARROW-3681) - [Go] add benchmarks for CSV reader -* [ARROW-3682](https://issues.apache.org/jira/browse/ARROW-3682) - [Go] unexport encoding/csv.Reader from CSV reader -* [ARROW-3683](https://issues.apache.org/jira/browse/ARROW-3683) - [Go] add functional-option style to CSV reader -* [ARROW-3684](https://issues.apache.org/jira/browse/ARROW-3684) - [Go] add chunk size option to CSV reader -* [ARROW-3692](https://issues.apache.org/jira/browse/ARROW-3692) - [Gandiva] [Ruby] Add Ruby bindings of Gandiva -* [ARROW-3693](https://issues.apache.org/jira/browse/ARROW-3693) - [R] Invalid buffer for empty characters with null data -* [ARROW-3694](https://issues.apache.org/jira/browse/ARROW-3694) - [Java] Avoid superfluous string creation when logging level is disabled -* [ARROW-3695](https://issues.apache.org/jira/browse/ARROW-3695) - [Gandiva] Use add\_arrow\_lib() -* [ARROW-3696](https://issues.apache.org/jira/browse/ARROW-3696) - [C++] Add feather::TableWriter::Write(table) -* [ARROW-3697](https://issues.apache.org/jira/browse/ARROW-3697) - [Ruby] Add schema\#[] -* [ARROW-3701](https://issues.apache.org/jira/browse/ARROW-3701) - [Gandiva] Add support for decimal operations -* [ARROW-3708](https://issues.apache.org/jira/browse/ARROW-3708) - [Packaging] Nightly CentOS builds are failing -* [ARROW-3713](https://issues.apache.org/jira/browse/ARROW-3713) - [Rust] Implement BinaryArrayBuilder -* [ARROW-3718](https://issues.apache.org/jira/browse/ARROW-3718) - [Gandiva] Remove spurious gtest include -* [ARROW-3719](https://issues.apache.org/jira/browse/ARROW-3719) - [GLib] Support read/write table to/from 
Feather -* [ARROW-3720](https://issues.apache.org/jira/browse/ARROW-3720) - [GLib] Use "indices" instead of "indexes" -* [ARROW-3721](https://issues.apache.org/jira/browse/ARROW-3721) - [Gandiva] [Python] Support all Gandiva literals -* [ARROW-3722](https://issues.apache.org/jira/browse/ARROW-3722) - [C++] Allow specifying column types to CSV reader -* [ARROW-3723](https://issues.apache.org/jira/browse/ARROW-3723) - [Plasma] [Ruby] Add Ruby bindings of Plasma -* [ARROW-3724](https://issues.apache.org/jira/browse/ARROW-3724) - [GLib] Update gitignore -* [ARROW-3725](https://issues.apache.org/jira/browse/ARROW-3725) - [GLib] Add field readers to GArrowStructDataType -* [ARROW-3726](https://issues.apache.org/jira/browse/ARROW-3726) - [Rust] CSV Reader & Writer -* [ARROW-3727](https://issues.apache.org/jira/browse/ARROW-3727) - [Python] Document use of pyarrow.foreign\_buffer, cuda.foreign\_buffer in Sphinx -* [ARROW-3731](https://issues.apache.org/jira/browse/ARROW-3731) - [R] R API for reading and writing Parquet files -* [ARROW-3733](https://issues.apache.org/jira/browse/ARROW-3733) - [GLib] Add to\_string() to GArrowTable and GArrowColumn -* [ARROW-3736](https://issues.apache.org/jira/browse/ARROW-3736) - [CI/Docker] Ninja test in docker-compose run cpp hangs -* [ARROW-3738](https://issues.apache.org/jira/browse/ARROW-3738) - [C++] Add CSV conversion option to parse ISO8601-like timestamp strings -* [ARROW-3741](https://issues.apache.org/jira/browse/ARROW-3741) - [R] Add support for arrow::compute::Cast to convert Arrow arrays from one type to another -* [ARROW-3743](https://issues.apache.org/jira/browse/ARROW-3743) - [Ruby] Add support for saving/loading Feather -* [ARROW-3744](https://issues.apache.org/jira/browse/ARROW-3744) - [Ruby] Use garrow\_table\_to\_string() in Arrow::Table\#to\_s -* [ARROW-3746](https://issues.apache.org/jira/browse/ARROW-3746) - [Gandiva] [Python] Make it possible to list all functions registered with Gandiva -* [ARROW-3747](https://issues.apache.org/jira/browse/ARROW-3747) - [C++] Flip order of data members in arrow::Decimal128 -* [ARROW-3748](https://issues.apache.org/jira/browse/ARROW-3748) - [GLib] Add GArrowCSVReader -* [ARROW-3749](https://issues.apache.org/jira/browse/ARROW-3749) - [GLib] Typos in documentation and test case name -* [ARROW-3751](https://issues.apache.org/jira/browse/ARROW-3751) - [Python] Add more cython bindings for gandiva -* [ARROW-3752](https://issues.apache.org/jira/browse/ARROW-3752) - [C++] Remove unused status::ArrowError -* [ARROW-3753](https://issues.apache.org/jira/browse/ARROW-3753) - [Gandiva] Remove debug print -* [ARROW-3755](https://issues.apache.org/jira/browse/ARROW-3755) - [GLib] Support for CompressedInputStream, CompressedOutputStream -* [ARROW-3760](https://issues.apache.org/jira/browse/ARROW-3760) - [R] Support Arrow CSV reader -* [ARROW-3773](https://issues.apache.org/jira/browse/ARROW-3773) - [C++] Remove duplicated AssertArraysEqual code in parquet/arrow/arrow-reader-writer-test.cc -* [ARROW-3778](https://issues.apache.org/jira/browse/ARROW-3778) - [C++] Don't put implementations in test-util.h -* [ARROW-3781](https://issues.apache.org/jira/browse/ARROW-3781) - [C++] Configure buffer size in arrow::io::BufferedOutputStream -* [ARROW-3782](https://issues.apache.org/jira/browse/ARROW-3782) - [C++] Implement BufferedReader for C++ -* [ARROW-3784](https://issues.apache.org/jira/browse/ARROW-3784) - [R] Array with type fails with x is not a vector -* [ARROW-3785](https://issues.apache.org/jira/browse/ARROW-3785) - 
[C++] Use double-conversion conda package in CI toolchain -* [ARROW-3787](https://issues.apache.org/jira/browse/ARROW-3787) - Implement From for BinaryArray -* [ARROW-3788](https://issues.apache.org/jira/browse/ARROW-3788) - [Ruby] Add support for CSV parser writtin in C++ -* [ARROW-3795](https://issues.apache.org/jira/browse/ARROW-3795) - [R] Support for retrieving NAs from INT64 arrays -* [ARROW-3796](https://issues.apache.org/jira/browse/ARROW-3796) - [Rust] Add Example for PrimitiveArrayBuilder -* [ARROW-3798](https://issues.apache.org/jira/browse/ARROW-3798) - [GLib] Add support for column type CSV read options -* [ARROW-3800](https://issues.apache.org/jira/browse/ARROW-3800) - [C++] Vendor a string\_view backport -* [ARROW-3803](https://issues.apache.org/jira/browse/ARROW-3803) - [C++/Python] Split C++ and Python unit test Travis CI jobs, run all C++ tests (including Gandiva) together -* [ARROW-3807](https://issues.apache.org/jira/browse/ARROW-3807) - [R] Missing Field API -* [ARROW-3819](https://issues.apache.org/jira/browse/ARROW-3819) - [Packaging] Update conda variant files to conform with feedstock after compiler migration -* [ARROW-3821](https://issues.apache.org/jira/browse/ARROW-3821) - [Format/Documentation]: Fix typos and grammar issues in Flight.proto comments -* [ARROW-3823](https://issues.apache.org/jira/browse/ARROW-3823) - [R] + buffer.complex -* [ARROW-3825](https://issues.apache.org/jira/browse/ARROW-3825) - [Python] The Python README.md does not show how to run the unit test suite -* [ARROW-3826](https://issues.apache.org/jira/browse/ARROW-3826) - [C++] Determine if using ccache caching in Travis CI actually improves build times -* [ARROW-3830](https://issues.apache.org/jira/browse/ARROW-3830) - [GLib] Add GArrowCodec -* [ARROW-3834](https://issues.apache.org/jira/browse/ARROW-3834) - [Doc] Merge Python & C++ and move to top-level -* [ARROW-3836](https://issues.apache.org/jira/browse/ARROW-3836) - [C++] Add PREFIX option to ADD\_ARROW\_BENCHMARK -* [ARROW-3839](https://issues.apache.org/jira/browse/ARROW-3839) - [Rust] Add ability to infer schema in CSV reader -* [ARROW-3841](https://issues.apache.org/jira/browse/ARROW-3841) - [C++] warning: catching polymorphic type by value -* [ARROW-3842](https://issues.apache.org/jira/browse/ARROW-3842) - [R] RecordBatchStreamWriter api -* [ARROW-3844](https://issues.apache.org/jira/browse/ARROW-3844) - [C++] Remove ARROW\_USE\_SSE and ARROW\_SSE3 -* [ARROW-3845](https://issues.apache.org/jira/browse/ARROW-3845) - [Gandiva] [GLib] Add GGandivaNode -* [ARROW-3847](https://issues.apache.org/jira/browse/ARROW-3847) - [GLib] Remove unnecessary “\”. -* [ARROW-3849](https://issues.apache.org/jira/browse/ARROW-3849) - Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64. 
-* [ARROW-3851](https://issues.apache.org/jira/browse/ARROW-3851) - [C++] "make check-format" is slow -* [ARROW-3852](https://issues.apache.org/jira/browse/ARROW-3852) - [C++] used uninitialized warning -* [ARROW-3853](https://issues.apache.org/jira/browse/ARROW-3853) - [C++] Implement string to timestamp cast -* [ARROW-3854](https://issues.apache.org/jira/browse/ARROW-3854) - [GLib] Deprecate garrow\_gio\_{input,output}\_stream\_get\_raw() -* [ARROW-3855](https://issues.apache.org/jira/browse/ARROW-3855) - [Rust] Schema/Field/Datatype should implement serde traits -* [ARROW-3856](https://issues.apache.org/jira/browse/ARROW-3856) - [Ruby] Support compressed CSV save/load -* [ARROW-3858](https://issues.apache.org/jira/browse/ARROW-3858) - [GLib] Use {class\_name}\_get\_instance\_private -* [ARROW-3859](https://issues.apache.org/jira/browse/ARROW-3859) - [Java] Fix ComplexWriter backward incompatible change -* [ARROW-3860](https://issues.apache.org/jira/browse/ARROW-3860) - [Gandiva] [C++] Add option to use -static-libstdc++ when building libgandiva\_jni.so -* [ARROW-3862](https://issues.apache.org/jira/browse/ARROW-3862) - [C++] Improve dependencies download script -* [ARROW-3863](https://issues.apache.org/jira/browse/ARROW-3863) - [GLib] Use travis\_retry with brew bundle command -* [ARROW-3864](https://issues.apache.org/jira/browse/ARROW-3864) - [GLib] Add support for allow-float-truncate cast option -* [ARROW-3865](https://issues.apache.org/jira/browse/ARROW-3865) - [Packaging] Add double-conversion dependency to conda forge recipes and the windows wheel build -* [ARROW-3867](https://issues.apache.org/jira/browse/ARROW-3867) - [Documentation] Uploading binary realase artifacts to Bintray -* [ARROW-3868](https://issues.apache.org/jira/browse/ARROW-3868) - [Rust] Build against nightly Rust in CI -* [ARROW-3870](https://issues.apache.org/jira/browse/ARROW-3870) - [C++] Add Peek to InputStream API -* [ARROW-3871](https://issues.apache.org/jira/browse/ARROW-3871) - [R] Replace usages of C++ GetValuesSafely with new methods on ArrayData -* [ARROW-3878](https://issues.apache.org/jira/browse/ARROW-3878) - [Rust] Improve primitive types -* [ARROW-3880](https://issues.apache.org/jira/browse/ARROW-3880) - [Rust] PrimitiveArray should support simple math operations -* [ARROW-3881](https://issues.apache.org/jira/browse/ARROW-3881) - [Rust] PrimitiveArray should support comparison operators -* [ARROW-3883](https://issues.apache.org/jira/browse/ARROW-3883) - [Rust] Update Rust README to reflect new functionality -* [ARROW-3884](https://issues.apache.org/jira/browse/ARROW-3884) - [Python] Add LLVM6 to manylinux1 base image -* [ARROW-3885](https://issues.apache.org/jira/browse/ARROW-3885) - [Rust] Update version to 0.12.0 and update release instructions on wiki -* [ARROW-3886](https://issues.apache.org/jira/browse/ARROW-3886) - [C++] Additional test cases for ARROW-3831 -* [ARROW-3891](https://issues.apache.org/jira/browse/ARROW-3891) - [Java] Remove Long.bitCount with simple bitmap operations -* [ARROW-3893](https://issues.apache.org/jira/browse/ARROW-3893) - [C++] Improve adaptive int builder performance -* [ARROW-3895](https://issues.apache.org/jira/browse/ARROW-3895) - [Rust] CSV reader should return Result\> not Option\> -* [ARROW-3899](https://issues.apache.org/jira/browse/ARROW-3899) - [Python] Table.to\_pandas converts Arrow date32[day] to pandas datetime64[ns] -* [ARROW-3900](https://issues.apache.org/jira/browse/ARROW-3900) - [GLib] Add garrow\_mutable\_buffer\_set\_data() -* 
[ARROW-3905](https://issues.apache.org/jira/browse/ARROW-3905) - [Ruby] Add StructDataType\#[] -* [ARROW-3906](https://issues.apache.org/jira/browse/ARROW-3906) - [C++] Break builder.cc into multiple compilation units -* [ARROW-3908](https://issues.apache.org/jira/browse/ARROW-3908) - [Rust] Update rust dockerfile to use nightly toolchain -* [ARROW-3910](https://issues.apache.org/jira/browse/ARROW-3910) - [Python] Set date\_as\_object to True in \*.to\_pandas as default after deduplicating logic implemented -* [ARROW-3911](https://issues.apache.org/jira/browse/ARROW-3911) - [Python] Deduplicate datetime.date objects in Table.to\_pandas internals -* [ARROW-3912](https://issues.apache.org/jira/browse/ARROW-3912) - [Plasma][GLib] Add support for creating and referring objects -* [ARROW-3913](https://issues.apache.org/jira/browse/ARROW-3913) - [Gandiva] [GLib] Add GGandivaLiteralNode -* [ARROW-3914](https://issues.apache.org/jira/browse/ARROW-3914) - [C++/Python/Packaging] Docker-compose setup for Alpine linux -* [ARROW-3916](https://issues.apache.org/jira/browse/ARROW-3916) - [Python] Support caller-provided filesystem in \`ParquetWriter\` constructor -* [ARROW-3921](https://issues.apache.org/jira/browse/ARROW-3921) - [CI][GLib] Log Homebrew output -* [ARROW-3922](https://issues.apache.org/jira/browse/ARROW-3922) - [C++] improve the performance of bitmap operations -* [ARROW-3924](https://issues.apache.org/jira/browse/ARROW-3924) - [Packaging][Plasma] Add support for Plasma deb/rpm packages -* [ARROW-3925](https://issues.apache.org/jira/browse/ARROW-3925) - [Python] Include autoconf in Linux/macOS dependencies in conda environment -* [ARROW-3928](https://issues.apache.org/jira/browse/ARROW-3928) - [Python] Add option to deduplicate PyBytes / PyString / PyUnicode objects in Table.to\_pandas conversion path -* [ARROW-3929](https://issues.apache.org/jira/browse/ARROW-3929) - [Go] improve memory usage of CSV reader to improve runtime performances -* [ARROW-3930](https://issues.apache.org/jira/browse/ARROW-3930) - [C++] Random test data generation is slow -* [ARROW-3932](https://issues.apache.org/jira/browse/ARROW-3932) - [Python/Documentation] Include Benchmarks.md in Sphinx docs -* [ARROW-3934](https://issues.apache.org/jira/browse/ARROW-3934) - [Gandiva] Don't compile precompiled tests if ARROW\_GANDIVA\_BUILD\_TESTS=off -* [ARROW-3938](https://issues.apache.org/jira/browse/ARROW-3938) - [Packaging] Stop to refer java/pom.xml to get version information -* [ARROW-3939](https://issues.apache.org/jira/browse/ARROW-3939) - [Rust] Remove macro definition for ListArrayBuilder -* [ARROW-3945](https://issues.apache.org/jira/browse/ARROW-3945) - [Website] Blog post about Gandiva code donation -* [ARROW-3946](https://issues.apache.org/jira/browse/ARROW-3946) - [GLib] Add support for union -* [ARROW-3948](https://issues.apache.org/jira/browse/ARROW-3948) - [CI][GLib] Set timeout to Homebrew -* [ARROW-3950](https://issues.apache.org/jira/browse/ARROW-3950) - [Plasma] Don't force loading the TensorFlow op on import -* [ARROW-3952](https://issues.apache.org/jira/browse/ARROW-3952) - [Rust] Specify edition="2018" in Cargo.toml -* [ARROW-3958](https://issues.apache.org/jira/browse/ARROW-3958) - [Plasma] Reduce number of IPCs -* [ARROW-3959](https://issues.apache.org/jira/browse/ARROW-3959) - [Rust] Time and Timestamp Support -* [ARROW-3960](https://issues.apache.org/jira/browse/ARROW-3960) - [Rust] remove extern crate for Rust 2018 -* [ARROW-3963](https://issues.apache.org/jira/browse/ARROW-3963) - 
[Packaging/Docker] Nightly test for building sphinx documentations -* [ARROW-3964](https://issues.apache.org/jira/browse/ARROW-3964) - [Go] More readable example for csv.Reader -* [ARROW-3967](https://issues.apache.org/jira/browse/ARROW-3967) - [Gandiva] [C++] Make gandiva/node.h public -* [ARROW-3970](https://issues.apache.org/jira/browse/ARROW-3970) - [Gandiva][C++] Remove unnecessary boost dependencies -* [ARROW-3971](https://issues.apache.org/jira/browse/ARROW-3971) - [Python] Remove APIs deprecated in 0.11 and prior -* [ARROW-3974](https://issues.apache.org/jira/browse/ARROW-3974) - [C++] Combine field\_builders\_ and children\_ members in array/builder.h -* [ARROW-3982](https://issues.apache.org/jira/browse/ARROW-3982) - [C++] Allow "binary" input in simple JSON format -* [ARROW-3983](https://issues.apache.org/jira/browse/ARROW-3983) - [Gandiva][Crossbow] Use static boost while packaging -* [ARROW-3984](https://issues.apache.org/jira/browse/ARROW-3984) - [C++] Exit with error if user hits zstd ExternalProject path -* [ARROW-3986](https://issues.apache.org/jira/browse/ARROW-3986) - [C++] Write prose documentation -* [ARROW-3987](https://issues.apache.org/jira/browse/ARROW-3987) - [Java] Benchmark results for ARROW-1807 -* [ARROW-3988](https://issues.apache.org/jira/browse/ARROW-3988) - [C++] Do not build unit tests by default in build system -* [ARROW-3993](https://issues.apache.org/jira/browse/ARROW-3993) - [JS] CI Jobs Failing -* [ARROW-3994](https://issues.apache.org/jira/browse/ARROW-3994) - [C++] Remove ARROW\_GANDIVA\_BUILD\_TESTS option -* [ARROW-3995](https://issues.apache.org/jira/browse/ARROW-3995) - [CI] Use understandable names in Travis Matrix -* [ARROW-3997](https://issues.apache.org/jira/browse/ARROW-3997) - [C++] [Doc] Clarify dictionary encoding integer signedness (and width?) 
-* [ARROW-4002](https://issues.apache.org/jira/browse/ARROW-4002) - [C++][Gandiva] Remove CMake version check -* [ARROW-4004](https://issues.apache.org/jira/browse/ARROW-4004) - [GLib] Replace GPU with CUDA -* [ARROW-4005](https://issues.apache.org/jira/browse/ARROW-4005) - [Plasma] [GLib] Add gplasma\_client\_disconnect() -* [ARROW-4006](https://issues.apache.org/jira/browse/ARROW-4006) - Add CODE\_OF\_CONDUCT.md -* [ARROW-4009](https://issues.apache.org/jira/browse/ARROW-4009) - [CI] Run Valgrind and C++ code coverage in different bulds -* [ARROW-4010](https://issues.apache.org/jira/browse/ARROW-4010) - [C++] Enable Travis CI scripts to only build and install only certain targets -* [ARROW-4015](https://issues.apache.org/jira/browse/ARROW-4015) - [Plasma] remove legacy interfaces for plasma manager -* [ARROW-4017](https://issues.apache.org/jira/browse/ARROW-4017) - [C++] Check and update vendored libraries -* [ARROW-4026](https://issues.apache.org/jira/browse/ARROW-4026) - [C++] Use separate modular $COMPONENT-test targets for unit tests -* [ARROW-4028](https://issues.apache.org/jira/browse/ARROW-4028) - [Rust] Merge parquet-rs codebase -* [ARROW-4029](https://issues.apache.org/jira/browse/ARROW-4029) - [C++] Define and document naming convention for internal / private header files not to be installed -* [ARROW-4030](https://issues.apache.org/jira/browse/ARROW-4030) - [CI] Use travis\_terminate to halt builds when a step fails -* [ARROW-4035](https://issues.apache.org/jira/browse/ARROW-4035) - [Ruby] Support msys2 mingw dependencies -* [ARROW-4037](https://issues.apache.org/jira/browse/ARROW-4037) - [Packaging] Remove workaround to verify 0.11.0 -* [ARROW-4038](https://issues.apache.org/jira/browse/ARROW-4038) - [Rust] Add array\_ops methods for boolean AND, OR, NOT -* [ARROW-4039](https://issues.apache.org/jira/browse/ARROW-4039) - [Python] Update link to 'development.rst' page from Python README.md -* [ARROW-4042](https://issues.apache.org/jira/browse/ARROW-4042) - [Rust] Inconsistent method naming between BinaryArray and PrimitiveArray -* [ARROW-4043](https://issues.apache.org/jira/browse/ARROW-4043) - [Packaging/Docker] Python tests on alpine miss pytest dependency -* [ARROW-4044](https://issues.apache.org/jira/browse/ARROW-4044) - [Packaging/Python] Add hypothesis test dependency to pyarrow conda recipe -* [ARROW-4045](https://issues.apache.org/jira/browse/ARROW-4045) - [Packaging/Python] Add hypothesis test dependency to wheel crossbow tests -* [ARROW-4048](https://issues.apache.org/jira/browse/ARROW-4048) - [GLib] Return ChunkedArray instead of Array in gparquet\_arrow\_file\_reader\_read\_column -* [ARROW-4051](https://issues.apache.org/jira/browse/ARROW-4051) - [Gandiva] [GLib] Add support for null literal -* [ARROW-4054](https://issues.apache.org/jira/browse/ARROW-4054) - [Python] Update gtest, flatbuffers and OpenSSL in manylinux1 base image -* [ARROW-4060](https://issues.apache.org/jira/browse/ARROW-4060) - [Rust] Add Parquet/Arrow schema converter -* [ARROW-4069](https://issues.apache.org/jira/browse/ARROW-4069) - [Python] Add tests for casting from binary to utf8 -* [ARROW-4075](https://issues.apache.org/jira/browse/ARROW-4075) - [Rust] Reuse array builder after calling finish() -* [ARROW-4079](https://issues.apache.org/jira/browse/ARROW-4079) - [C++] Add machine benchmarks -* [ARROW-4080](https://issues.apache.org/jira/browse/ARROW-4080) - [Rust] Improving lengthy build times in Appveyor -* [ARROW-4082](https://issues.apache.org/jira/browse/ARROW-4082) - [C++] CMake 
tweaks: allow RelWithDebInfo, improve FindClangTools -* [ARROW-4084](https://issues.apache.org/jira/browse/ARROW-4084) - [C++] Simplify Status and stringstream boilerplate -* [ARROW-4085](https://issues.apache.org/jira/browse/ARROW-4085) - [GLib] Use "field" for struct data type -* [ARROW-4087](https://issues.apache.org/jira/browse/ARROW-4087) - [C++] Make CSV nulls configurable -* [ARROW-4093](https://issues.apache.org/jira/browse/ARROW-4093) - [C++] Deprecated method suggests wrong method -* [ARROW-4098](https://issues.apache.org/jira/browse/ARROW-4098) - [Python] Deprecate pyarrow.open\_stream,open\_file in favor of pa.ipc.open\_stream/open\_file -* [ARROW-4100](https://issues.apache.org/jira/browse/ARROW-4100) - [Gandiva][C++] Fix regex to ignore "." character -* [ARROW-4102](https://issues.apache.org/jira/browse/ARROW-4102) - [C++] FixedSizeBinary identity cast not implemented -* [ARROW-4103](https://issues.apache.org/jira/browse/ARROW-4103) - [Documentation] Add README to docs/ root -* [ARROW-4105](https://issues.apache.org/jira/browse/ARROW-4105) - Add rust-toolchain to enforce user to use nightly toolchain for building -* [ARROW-4107](https://issues.apache.org/jira/browse/ARROW-4107) - [Python] Use ninja in pyarrow manylinux1 build -* [ARROW-4112](https://issues.apache.org/jira/browse/ARROW-4112) - [Packaging][Gandiva] Add support for deb packages -* [ARROW-4116](https://issues.apache.org/jira/browse/ARROW-4116) - [Python] Clarify in development.rst that virtualenv cannot be used with miniconda/Anaconda -* [ARROW-4122](https://issues.apache.org/jira/browse/ARROW-4122) - [C++] Initialize some uninitialized class members -* [ARROW-4127](https://issues.apache.org/jira/browse/ARROW-4127) - [Documentation] Add Docker build instructions -* [ARROW-4129](https://issues.apache.org/jira/browse/ARROW-4129) - [Python] Fix syntax problem in benchmark docs -* [ARROW-4132](https://issues.apache.org/jira/browse/ARROW-4132) - [GLib] Add more GArrowTable constructors -* [ARROW-4141](https://issues.apache.org/jira/browse/ARROW-4141) - [Ruby] Add support for creating schema from raw Ruby objects -* [ARROW-4148](https://issues.apache.org/jira/browse/ARROW-4148) - [CI/Python] Disable ORC on nightly Alpine builds -* [ARROW-4150](https://issues.apache.org/jira/browse/ARROW-4150) - [C++] Do not return buffers containing nullptr from internal allocations -* [ARROW-4151](https://issues.apache.org/jira/browse/ARROW-4151) - [Rust] Restructure project directories -* [ARROW-4152](https://issues.apache.org/jira/browse/ARROW-4152) - [GLib] Remove an example to show Torch integration -* [ARROW-4153](https://issues.apache.org/jira/browse/ARROW-4153) - [GLib] Add builder\_append\_value() for consistency -* [ARROW-4154](https://issues.apache.org/jira/browse/ARROW-4154) - [GLib] Add GArrowDecimal128DataType -* [ARROW-4155](https://issues.apache.org/jira/browse/ARROW-4155) - [Rust] Implement array\_ops::sum() for PrimitiveArray -* [ARROW-4156](https://issues.apache.org/jira/browse/ARROW-4156) - [C++] xcodebuild failure for cmake generated project -* [ARROW-4158](https://issues.apache.org/jira/browse/ARROW-4158) - [Dev] Allow maintainers to use a GitHub API token when merging pull requests -* [ARROW-4160](https://issues.apache.org/jira/browse/ARROW-4160) - [Rust] Add README and executable files to parquet -* [ARROW-4161](https://issues.apache.org/jira/browse/ARROW-4161) - [GLib] Add GPlasmaClientOptions -* [ARROW-4162](https://issues.apache.org/jira/browse/ARROW-4162) - [Ruby] Add support for creating data types from 
description -* [ARROW-4166](https://issues.apache.org/jira/browse/ARROW-4166) - [Ruby] Add support for saving to and loading from buffer -* [ARROW-4167](https://issues.apache.org/jira/browse/ARROW-4167) - [Gandiva] switch to arrow/util/variant -* [ARROW-4168](https://issues.apache.org/jira/browse/ARROW-4168) - [GLib] Use property to keep GArrowDataType passed in garrow\_field\_new() -* [ARROW-4172](https://issues.apache.org/jira/browse/ARROW-4172) - [Rust] more consistent naming in array builders -* [ARROW-4174](https://issues.apache.org/jira/browse/ARROW-4174) - [Ruby] Add support for building composite array from raw Ruby objects -* [ARROW-4175](https://issues.apache.org/jira/browse/ARROW-4175) - [GLib] Add support for decimal compare operators -* [ARROW-4177](https://issues.apache.org/jira/browse/ARROW-4177) - [C++] Add ThreadPool and TaskGroup microbenchmarks -* [ARROW-4183](https://issues.apache.org/jira/browse/ARROW-4183) - [Ruby] Add Arrow::Struct as an element of Arrow::StructArray -* [ARROW-4184](https://issues.apache.org/jira/browse/ARROW-4184) - [Ruby] Add Arrow::RecordBatch\#to\_table -* [ARROW-4191](https://issues.apache.org/jira/browse/ARROW-4191) - [C++] Use same CC and AR for jemalloc as for the main sources -* [ARROW-4199](https://issues.apache.org/jira/browse/ARROW-4199) - [GLib] Add garrow\_seekable\_input\_stream\_peek() -* [ARROW-4207](https://issues.apache.org/jira/browse/ARROW-4207) - [Gandiva] [GLib] Add support for IfNode -* [ARROW-4210](https://issues.apache.org/jira/browse/ARROW-4210) - [Python] Mention boost-cpp directly in the conda meta.yaml for pyarrow -* [ARROW-4211](https://issues.apache.org/jira/browse/ARROW-4211) - [GLib] Add GArrowFixedSizeBinaryDataType -* [ARROW-4214](https://issues.apache.org/jira/browse/ARROW-4214) - [Ruby] Add support for building RecordBatch from raw Ruby objects -* [ARROW-4216](https://issues.apache.org/jira/browse/ARROW-4216) - [Python] Add CUDA API docs -* [ARROW-4228](https://issues.apache.org/jira/browse/ARROW-4228) - [GLib] Add garrow\_list\_data\_type\_get\_field() -* [ARROW-4229](https://issues.apache.org/jira/browse/ARROW-4229) - [Packaging] Set crossbow target explicitly to enable building arbitrary arrow repo -* [ARROW-4233](https://issues.apache.org/jira/browse/ARROW-4233) - [Packaging] Create a Dockerfile to build source archive -* [ARROW-4239](https://issues.apache.org/jira/browse/ARROW-4239) - [Release] Updating .deb package names in the prepare script failed to run on OSX -* [ARROW-4240](https://issues.apache.org/jira/browse/ARROW-4240) - [Packaging] Documents for Plasma GLib and Gandiva GLib are missing in source archive -* [ARROW-4241](https://issues.apache.org/jira/browse/ARROW-4241) - [Packaging] Disable crossbow conda OSX clang builds -* [ARROW-4243](https://issues.apache.org/jira/browse/ARROW-4243) - [Python] Test failure with pandas 0.24.0rc1 -* [ARROW-4249](https://issues.apache.org/jira/browse/ARROW-4249) - [Plasma] Remove reference to logging.h from plasma/common.h -* [ARROW-4257](https://issues.apache.org/jira/browse/ARROW-4257) - [Release] Update release verification script to check binaries on Bintray -* [ARROW-4266](https://issues.apache.org/jira/browse/ARROW-4266) - [Python][CI] Disable ORC tests in dask integration test -* [ARROW-4269](https://issues.apache.org/jira/browse/ARROW-4269) - [Python] AttributeError: module 'pandas.core' has no attribute 'arrays' -* [ARROW-4270](https://issues.apache.org/jira/browse/ARROW-4270) - [Packaging][Conda] Update xcode version and remove toolchain builds -* 
[ARROW-4276](https://issues.apache.org/jira/browse/ARROW-4276) - [Release] Remove needless Bintray authentication from binaries verify script -* [ARROW-4306](https://issues.apache.org/jira/browse/ARROW-4306) - [Release] Update website and add blog post announcing 0.12.0 release -* [PARQUET-690](https://issues.apache.org/jira/browse/PARQUET-690) - [C++] Investigate / improve performance of Thrift utilities -* [PARQUET-1271](https://issues.apache.org/jira/browse/PARQUET-1271) - [C++] "parquet\_reader" should be "parquet-reader" -* [PARQUET-1439](https://issues.apache.org/jira/browse/PARQUET-1439) - [C++] Parquet build fails when PARQUET\_ARROW\_LINKAGE is static -* [PARQUET-1449](https://issues.apache.org/jira/browse/PARQUET-1449) - [C++] Can't build with ARROW\_BOOST\_VENDORED=ON -* [PARQUET-1463](https://issues.apache.org/jira/browse/PARQUET-1463) - [C++] Utilize revamped common hashing machinery for dictionary encoding -* [PARQUET-1467](https://issues.apache.org/jira/browse/PARQUET-1467) - [C++] Remove ChunkedAllocator code, now unused -* [PARQUET-1473](https://issues.apache.org/jira/browse/PARQUET-1473) - [C++] Add helper function that converts ParquetVersion to human-friendly string -* [PARQUET-1484](https://issues.apache.org/jira/browse/PARQUET-1484) - [C++] Improve memory usage of FileMetaDataBuilder - - -## Bug Fixes - -* [ARROW-1847](https://issues.apache.org/jira/browse/ARROW-1847) - [Doc] Document the difference between RecordBatch and Table in an FAQ fashion -* [ARROW-2026](https://issues.apache.org/jira/browse/ARROW-2026) - [Python] Cast all timestamp resolutions to INT96 use\_deprecated\_int96\_timestamps=True -* [ARROW-2038](https://issues.apache.org/jira/browse/ARROW-2038) - [Python] Follow-up bug fixes for s3fs Parquet support -* [ARROW-2113](https://issues.apache.org/jira/browse/ARROW-2113) - [Python] Incomplete CLASSPATH with "hadoop" contained in it can fool the classpath setting HDFS logic -* [ARROW-2591](https://issues.apache.org/jira/browse/ARROW-2591) - [Python] Segmentation fault when writing empty ListType column to Parquet -* [ARROW-2592](https://issues.apache.org/jira/browse/ARROW-2592) - [Python] Error reading old Parquet file due to metadata backwards compatibility issue -* [ARROW-2654](https://issues.apache.org/jira/browse/ARROW-2654) - [Python] Error with errno 22 when loading 3.6 GB Parquet file -* [ARROW-2708](https://issues.apache.org/jira/browse/ARROW-2708) - [C++] Internal GetValues function in arrow::compute should check for nullptr -* [ARROW-2831](https://issues.apache.org/jira/browse/ARROW-2831) - [Plasma] MemoryError in teardown -* [ARROW-2970](https://issues.apache.org/jira/browse/ARROW-2970) - [Python] NumPyConverter::Visit for Binary/String/FixedSizeBinary can overflow -* [ARROW-2987](https://issues.apache.org/jira/browse/ARROW-2987) - [Python] test\_cython\_api can fail if run in an environment where vsvarsall.bat has been run more than once -* [ARROW-3048](https://issues.apache.org/jira/browse/ARROW-3048) - [Python] Import pyarrow fails if scikit-learn is installed from conda (boost-cpp / libboost issue) -* [ARROW-3058](https://issues.apache.org/jira/browse/ARROW-3058) - [Python] Feather reads fail with unintuitive error when conversion from pandas yields ChunkedArray -* [ARROW-3186](https://issues.apache.org/jira/browse/ARROW-3186) - [GLib] mesonbuild failures in Travis CI -* [ARROW-3202](https://issues.apache.org/jira/browse/ARROW-3202) - [C++] Build does not succeed on Alpine Linux -* 
[ARROW-3225](https://issues.apache.org/jira/browse/ARROW-3225) - [C++/Python] Pandas object conversion of ListType and ListType -* [ARROW-3324](https://issues.apache.org/jira/browse/ARROW-3324) - [Parquet] Free more internal resources when writing multiple row groups -* [ARROW-3343](https://issues.apache.org/jira/browse/ARROW-3343) - [Java] Java tests fail non-deterministically with memory leak from Flight tests -* [ARROW-3405](https://issues.apache.org/jira/browse/ARROW-3405) - [Python] Document CSV reader -* [ARROW-3428](https://issues.apache.org/jira/browse/ARROW-3428) - [Python] from\_pandas gives incorrect results when converting floating point to bool -* [ARROW-3436](https://issues.apache.org/jira/browse/ARROW-3436) - [C++] Boost version required by Gandiva is too new for Ubuntu 14.04 -* [ARROW-3437](https://issues.apache.org/jira/browse/ARROW-3437) - [Gandiva][C++] Configure static linking of libgcc, libstdc++ with LDFLAGS -* [ARROW-3438](https://issues.apache.org/jira/browse/ARROW-3438) - [Packaging] Escaped bulletpoints in changelog -* [ARROW-3445](https://issues.apache.org/jira/browse/ARROW-3445) - [GLib] Parquet GLib doesn't link Arrow GLib -* [ARROW-3449](https://issues.apache.org/jira/browse/ARROW-3449) - [C++] Support CMake 3.2 for "out of the box" builds -* [ARROW-3466](https://issues.apache.org/jira/browse/ARROW-3466) - [Python] Crash when importing tensorflow and pyarrow -* [ARROW-3467](https://issues.apache.org/jira/browse/ARROW-3467) - Building against external double conversion is broken -* [ARROW-3470](https://issues.apache.org/jira/browse/ARROW-3470) - [C++] Row-wise conversion tutorial has fallen out of date -* [ARROW-3477](https://issues.apache.org/jira/browse/ARROW-3477) - [C++] Testsuite fails on 32 bit arch -* [ARROW-3480](https://issues.apache.org/jira/browse/ARROW-3480) - [Website] Install document for Ubuntu is broken -* [ARROW-3483](https://issues.apache.org/jira/browse/ARROW-3483) - [CI] Python 3.6 build failure on Travis-CI -* [ARROW-3485](https://issues.apache.org/jira/browse/ARROW-3485) - [C++] Examples fail with Protobuf error -* [ARROW-3494](https://issues.apache.org/jira/browse/ARROW-3494) - [C++] re2 conda-forge package not working in toolchain -* [ARROW-3498](https://issues.apache.org/jira/browse/ARROW-3498) - [R] Make IPC APIs consistent -* [ARROW-3516](https://issues.apache.org/jira/browse/ARROW-3516) - [C++] Use unsigned type for difference of pointers in parallel\_memcpy -* [ARROW-3517](https://issues.apache.org/jira/browse/ARROW-3517) - [C++] MinGW 32bit build causes g++ segv -* [ARROW-3524](https://issues.apache.org/jira/browse/ARROW-3524) - [C++] Fix compiler warnings from ARROW-3409 on clang-6 -* [ARROW-3527](https://issues.apache.org/jira/browse/ARROW-3527) - [R] Unused variables in R-package C++ code -* [ARROW-3528](https://issues.apache.org/jira/browse/ARROW-3528) - [R] Typo in R documentation -* [ARROW-3535](https://issues.apache.org/jira/browse/ARROW-3535) - [Python] pip install tensorflow install too new numpy in manylinux1 build -* [ARROW-3541](https://issues.apache.org/jira/browse/ARROW-3541) - [Rust] Update BufferBuilder to allow for new bit-packed BooleanArray -* [ARROW-3544](https://issues.apache.org/jira/browse/ARROW-3544) - [Gandiva] Populate function registry in multiple compilation units to mitigate long compile times in release mode -* [ARROW-3549](https://issues.apache.org/jira/browse/ARROW-3549) - [Rust] Replace i64 with usize for some bit utility functions -* [ARROW-3573](https://issues.apache.org/jira/browse/ARROW-3573) - 
[Rust] with\_bitset does not set valid bits correctly -* [ARROW-3580](https://issues.apache.org/jira/browse/ARROW-3580) - [Gandiva][C++] Build error with g++ 8.2.0 -* [ARROW-3586](https://issues.apache.org/jira/browse/ARROW-3586) - [Python] Segmentation fault when converting empty table to pandas with categoricals -* [ARROW-3598](https://issues.apache.org/jira/browse/ARROW-3598) - [Plasma] plasma\_store\_server fails linking with GPU enabled -* [ARROW-3613](https://issues.apache.org/jira/browse/ARROW-3613) - [Go] Resize does not correctly update the length -* [ARROW-3614](https://issues.apache.org/jira/browse/ARROW-3614) - [R] Handle Type::TIMESTAMP from Arrow to R -* [ARROW-3634](https://issues.apache.org/jira/browse/ARROW-3634) - [GLib] cuda.cpp compile error -* [ARROW-3637](https://issues.apache.org/jira/browse/ARROW-3637) - [Go] Implement Stringer for arrays -* [ARROW-3658](https://issues.apache.org/jira/browse/ARROW-3658) - [Rust] validation of offsets buffer is incorrect for \`List\` -* [ARROW-3670](https://issues.apache.org/jira/browse/ARROW-3670) - [C++] Use FindBacktrace to find execinfo.h support -* [ARROW-3687](https://issues.apache.org/jira/browse/ARROW-3687) - [Rust] Anything measuring array slots should be \`usize\` -* [ARROW-3698](https://issues.apache.org/jira/browse/ARROW-3698) - [C++] Segmentation fault when using a large table in Gandiva -* [ARROW-3700](https://issues.apache.org/jira/browse/ARROW-3700) - [C++] CSV parser should allow ignoring empty lines -* [ARROW-3703](https://issues.apache.org/jira/browse/ARROW-3703) - [Python] DataFrame.to\_parquet crashes if datetime column has time zones -* [ARROW-3704](https://issues.apache.org/jira/browse/ARROW-3704) - [Gandiva] Can't build with g++ 8.2.0 -* [ARROW-3707](https://issues.apache.org/jira/browse/ARROW-3707) - [C++] test failure with zstd 1.3.7 -* [ARROW-3711](https://issues.apache.org/jira/browse/ARROW-3711) - [C++] Don't pass CXX\_FLAGS to C\_FLAGS -* [ARROW-3712](https://issues.apache.org/jira/browse/ARROW-3712) - [CI] License check regression (RAT failure) -* [ARROW-3715](https://issues.apache.org/jira/browse/ARROW-3715) - [C++] gflags\_ep fails to build with CMake 3.13 -* [ARROW-3716](https://issues.apache.org/jira/browse/ARROW-3716) - [R] Missing cases for ChunkedArray conversion -* [ARROW-3728](https://issues.apache.org/jira/browse/ARROW-3728) - [Python] Merging Parquet Files - Pandas Meta in Schema Mismatch -* [ARROW-3734](https://issues.apache.org/jira/browse/ARROW-3734) - [C++] Linking static zstd library fails on Arch x86-64 -* [ARROW-3740](https://issues.apache.org/jira/browse/ARROW-3740) - [C++] Calling ArrayBuilder::Resize with length smaller than current appended length results in invalid state -* [ARROW-3742](https://issues.apache.org/jira/browse/ARROW-3742) - Fix pyarrow.types & gandiva cython bindings -* [ARROW-3745](https://issues.apache.org/jira/browse/ARROW-3745) - [C++] CMake passes static libraries multiple times to linker -* [ARROW-3754](https://issues.apache.org/jira/browse/ARROW-3754) - [Packaging] Zstd configure error on linux package builds -* [ARROW-3756](https://issues.apache.org/jira/browse/ARROW-3756) - [CI/Docker/Java] Java tests are failing in docker-compose setup -* [ARROW-3765](https://issues.apache.org/jira/browse/ARROW-3765) - [Gandiva] Segfault when the validity bitmap has not been allocated -* [ARROW-3766](https://issues.apache.org/jira/browse/ARROW-3766) - 
[Python] pa.Table.from\_pandas doesn't use schema ordering -* [ARROW-3768](https://issues.apache.org/jira/browse/ARROW-3768) - [Python] set classpath to hdfs not hadoop executable -* [ARROW-3775](https://issues.apache.org/jira/browse/ARROW-3775) - [C++] Handling Parquet Arrow reads that overflow a BinaryArray capacity -* [ARROW-3790](https://issues.apache.org/jira/browse/ARROW-3790) - [C++] Signed to unsigned integer cast yields incorrect results when type sizes are the same -* [ARROW-3792](https://issues.apache.org/jira/browse/ARROW-3792) - [Python] Segmentation fault when writing empty RecordBatches to Parquet -* [ARROW-3793](https://issues.apache.org/jira/browse/ARROW-3793) - [C++] TestScalarAppendUnsafe is not testing unsafe appends -* [ARROW-3797](https://issues.apache.org/jira/browse/ARROW-3797) - [Rust] BinaryArray::value\_offset incorrect in offset case -* [ARROW-3805](https://issues.apache.org/jira/browse/ARROW-3805) - [Gandiva] handle null validity bitmap in if-else expressions -* [ARROW-3831](https://issues.apache.org/jira/browse/ARROW-3831) - [C++] arrow::util::Codec::Decompress() doesn't return decompressed data size -* [ARROW-3835](https://issues.apache.org/jira/browse/ARROW-3835) - [C++] arrow::io::CompressedOutputStream::raw() impementation is missing -* [ARROW-3837](https://issues.apache.org/jira/browse/ARROW-3837) - [C++] gflags link errors on Windows -* [ARROW-3866](https://issues.apache.org/jira/browse/ARROW-3866) - [Python] Column metadata is not transferred to tables in pyarrow -* [ARROW-3869](https://issues.apache.org/jira/browse/ARROW-3869) - [Rust] "invalid fastbin errors" since Rust nightly-2018-11-03 -* [ARROW-3874](https://issues.apache.org/jira/browse/ARROW-3874) - [Gandiva] Cannot build: LLVM not detected correctly -* [ARROW-3879](https://issues.apache.org/jira/browse/ARROW-3879) - [C++] cuda-test failure -* [ARROW-3888](https://issues.apache.org/jira/browse/ARROW-3888) - [C++] Compilation warnings with gcc 7.3.0 -* [ARROW-3889](https://issues.apache.org/jira/browse/ARROW-3889) - [Python] creating schema with invalid paramaters causes segmanetation fault -* [ARROW-3890](https://issues.apache.org/jira/browse/ARROW-3890) - [Python] Creating Array with explicit string type fails on Python 2.7 -* [ARROW-3894](https://issues.apache.org/jira/browse/ARROW-3894) - [Python] Error reading IPC file with no record batches -* [ARROW-3898](https://issues.apache.org/jira/browse/ARROW-3898) - parquet-arrow example has compilation errors -* [ARROW-3909](https://issues.apache.org/jira/browse/ARROW-3909) - [Python] Table.from\_pandas call that seemingly should zero copy does not -* [ARROW-3918](https://issues.apache.org/jira/browse/ARROW-3918) - [Python] ParquetWriter.write\_table doesn't support coerce\_timestamps or allow\_truncated\_timestamps -* [ARROW-3920](https://issues.apache.org/jira/browse/ARROW-3920) - Plasma reference counting not properly done in TensorFlow custom operator. 
-* [ARROW-3931](https://issues.apache.org/jira/browse/ARROW-3931) - [C++] Make possible to build regardless of LANG -* [ARROW-3936](https://issues.apache.org/jira/browse/ARROW-3936) - Add \_O\_NOINHERIT to the file open flags on Windows -* [ARROW-3937](https://issues.apache.org/jira/browse/ARROW-3937) - [Rust] Rust nightly build is failing -* [ARROW-3940](https://issues.apache.org/jira/browse/ARROW-3940) - [Python/Documentation] Add required packages to the development instruction -* [ARROW-3941](https://issues.apache.org/jira/browse/ARROW-3941) - [R] RecordBatchStreamReader$schema -* [ARROW-3942](https://issues.apache.org/jira/browse/ARROW-3942) - [R] Feather api fixes -* [ARROW-3953](https://issues.apache.org/jira/browse/ARROW-3953) - Compat with pandas 0.24 rename of MultiIndex labels -\> codes -* [ARROW-3955](https://issues.apache.org/jira/browse/ARROW-3955) - [GLib] Add (transfer full) to free when no longer needed -* [ARROW-3957](https://issues.apache.org/jira/browse/ARROW-3957) - [Python] Better error message when user connects to HDFS cluster with wrong port -* [ARROW-3961](https://issues.apache.org/jira/browse/ARROW-3961) - [Python/Documentation] Fix wrong path in the pyarrow README -* [ARROW-3969](https://issues.apache.org/jira/browse/ARROW-3969) - [Rust] CI build broken because rustfmt not available on nightly toolchain -* [ARROW-3976](https://issues.apache.org/jira/browse/ARROW-3976) - [Ruby] Homebrew donation solicitation on CLI breaking CI builds -* [ARROW-3977](https://issues.apache.org/jira/browse/ARROW-3977) - [Gandiva] gandiva cpp tests not running in CI -* [ARROW-3979](https://issues.apache.org/jira/browse/ARROW-3979) - [Gandiva] fix all valgrind reported errors -* [ARROW-3980](https://issues.apache.org/jira/browse/ARROW-3980) - [C++] Fix CRTP use in json-simple.cc -* [ARROW-3989](https://issues.apache.org/jira/browse/ARROW-3989) - [Rust] CSV reader should handle case sensitivity for boolean values -* [ARROW-3996](https://issues.apache.org/jira/browse/ARROW-3996) - [C++] Insufficient description on build -* [ARROW-4008](https://issues.apache.org/jira/browse/ARROW-4008) - [C++] Integration test executable failure -* [ARROW-4011](https://issues.apache.org/jira/browse/ARROW-4011) - [Gandiva] Refer irhelpers.bc in build directory -* [ARROW-4019](https://issues.apache.org/jira/browse/ARROW-4019) - [C++] Fix coverity issues -* [ARROW-4033](https://issues.apache.org/jira/browse/ARROW-4033) - [C++] thirdparty/download\_dependencies.sh uses tools or options not available in older Linuxes -* [ARROW-4034](https://issues.apache.org/jira/browse/ARROW-4034) - [Ruby] Interface for FileOutputStream doesn't respect append=True -* [ARROW-4041](https://issues.apache.org/jira/browse/ARROW-4041) - [CI] Python 2.7 run uses Python 3.6 -* [ARROW-4049](https://issues.apache.org/jira/browse/ARROW-4049) - [C++] Arrow never use glog even though glog is linked. 
-* [ARROW-4052](https://issues.apache.org/jira/browse/ARROW-4052) - [C++] Linker errors with glog and gflags -* [ARROW-4053](https://issues.apache.org/jira/browse/ARROW-4053) - [Python/Integration] HDFS Tests failing with I/O operation on closed file -* [ARROW-4055](https://issues.apache.org/jira/browse/ARROW-4055) - [Python] Fails to convert pytz.utc with versions 2018.3 and earlier -* [ARROW-4058](https://issues.apache.org/jira/browse/ARROW-4058) - [C++] arrow-io-hdfs-test fails when run against HDFS cluster from docker-compose -* [ARROW-4065](https://issues.apache.org/jira/browse/ARROW-4065) - [C++] arrowTargets.cmake is broken -* [ARROW-4066](https://issues.apache.org/jira/browse/ARROW-4066) - Instructions to create Sphinx documentation -* [ARROW-4070](https://issues.apache.org/jira/browse/ARROW-4070) - [C++] ARROW\_BOOST\_VENDORED doesn't work properly with ninja build -* [ARROW-4073](https://issues.apache.org/jira/browse/ARROW-4073) - [Python] Parquet test failures on AppVeyor -* [ARROW-4074](https://issues.apache.org/jira/browse/ARROW-4074) - [Python] test\_get\_library\_dirs\_win32 fails if libraries installed someplace different from conda or wheel packages -* [ARROW-4078](https://issues.apache.org/jira/browse/ARROW-4078) - [CI] Run Travis job where documentation is built when docs/ is changed -* [ARROW-4088](https://issues.apache.org/jira/browse/ARROW-4088) - [Python] Table.from\_batches() fails when passed a schema with metadata -* [ARROW-4089](https://issues.apache.org/jira/browse/ARROW-4089) - [Plasma] The tutorial is wrong regarding the parameter type of PlasmaClient.Create -* [ARROW-4101](https://issues.apache.org/jira/browse/ARROW-4101) - [C++] Binary identity cast not implemented -* [ARROW-4106](https://issues.apache.org/jira/browse/ARROW-4106) - [Python] Tests fail to run because hypothesis update broke its API -* [ARROW-4109](https://issues.apache.org/jira/browse/ARROW-4109) - [Packaging] Missing glog dependency from arrow-cpp conda recipe -* [ARROW-4113](https://issues.apache.org/jira/browse/ARROW-4113) - [R] Version number patch broke build -* [ARROW-4114](https://issues.apache.org/jira/browse/ARROW-4114) - [C++][DOCUMENTATION] Add "python" to Linux build instructions -* [ARROW-4115](https://issues.apache.org/jira/browse/ARROW-4115) - [Gandiva] valgrind complains that boolean output data buffer has uninited data -* [ARROW-4118](https://issues.apache.org/jira/browse/ARROW-4118) - [Python] Error with "asv run" -* [ARROW-4125](https://issues.apache.org/jira/browse/ARROW-4125) - [Python] ASV benchmarks fail to run if Plasma extension is not built (e.g. 
on Windows) -* [ARROW-4126](https://issues.apache.org/jira/browse/ARROW-4126) - [Go] offset not used when accessing boolean array -* [ARROW-4128](https://issues.apache.org/jira/browse/ARROW-4128) - [C++][DOCUMENTATION] Update style guide to reflect some more exceptions -* [ARROW-4130](https://issues.apache.org/jira/browse/ARROW-4130) - [Go] offset not used when accessing binary array -* [ARROW-4134](https://issues.apache.org/jira/browse/ARROW-4134) - [Packaging] Properly setup timezone in docker tests to prevent ORC adapter's abort -* [ARROW-4135](https://issues.apache.org/jira/browse/ARROW-4135) - [Python] Can't reload a pandas dataframe containing a list of datetime.time -* [ARROW-4137](https://issues.apache.org/jira/browse/ARROW-4137) - [Rust] Move parquet code into a separate crate -* [ARROW-4138](https://issues.apache.org/jira/browse/ARROW-4138) - [Python] setuptools\_scm customization does not work for versions above 0.9.0 on Windows -* [ARROW-4147](https://issues.apache.org/jira/browse/ARROW-4147) - [JAVA] Reduce heap usage for variable width vectors -* [ARROW-4149](https://issues.apache.org/jira/browse/ARROW-4149) - [CI/C++] Parquet test misses ZSTD compression codec in CMake 3.2 nightly builds -* [ARROW-4157](https://issues.apache.org/jira/browse/ARROW-4157) - [C++] -Wdocumentation failures with clang 6.0 on Ubuntu 18.04 -* [ARROW-4171](https://issues.apache.org/jira/browse/ARROW-4171) - [Rust] fix parquet crate release version -* [ARROW-4173](https://issues.apache.org/jira/browse/ARROW-4173) - JIRA library name is wrong in error message of dev/merge\_arrow\_pr.py -* [ARROW-4178](https://issues.apache.org/jira/browse/ARROW-4178) - [C++] Fix TSan and UBSan errors -* [ARROW-4179](https://issues.apache.org/jira/browse/ARROW-4179) - [Python] Tests crashing on all platforms in CI -* [ARROW-4182](https://issues.apache.org/jira/browse/ARROW-4182) - [Python][CI] SEGV frequency -* [ARROW-4185](https://issues.apache.org/jira/browse/ARROW-4185) - [Rust] Appveyor builds are broken -* [ARROW-4186](https://issues.apache.org/jira/browse/ARROW-4186) - [C++] BitmapWriters clobber the first byte when length=0 -* [ARROW-4188](https://issues.apache.org/jira/browse/ARROW-4188) - [Rust] There should be a README in the top level rust directory -* [ARROW-4197](https://issues.apache.org/jira/browse/ARROW-4197) - [C++] Emscripten compiler fails building Arrow -* [ARROW-4200](https://issues.apache.org/jira/browse/ARROW-4200) - [C++] conda\_env\_\* files cannot be used to create a fresh conda environment on Windows -* [ARROW-4209](https://issues.apache.org/jira/browse/ARROW-4209) - [Gandiva] returning IR structs causes issues with windows -* [ARROW-4215](https://issues.apache.org/jira/browse/ARROW-4215) - [GLib] Fix typos in documentation -* [ARROW-4227](https://issues.apache.org/jira/browse/ARROW-4227) - [GLib] Field in composite data type returns wrong data type -* [ARROW-4237](https://issues.apache.org/jira/browse/ARROW-4237) - [Packaging] Fix CMAKE\_INSTALL\_LIBDIR in release verification script -* [ARROW-4238](https://issues.apache.org/jira/browse/ARROW-4238) - [Packaging] Fix RC version conflict between crossbow and rake -* [ARROW-4246](https://issues.apache.org/jira/browse/ARROW-4246) - [Plasma][Python] PlasmaClient.list doesn't work with CUDA enabled Plasma -* [ARROW-4256](https://issues.apache.org/jira/browse/ARROW-4256) - [Release] Update Windows verification 
script for 0.12 release -* [ARROW-4258](https://issues.apache.org/jira/browse/ARROW-4258) - [Python] Safe cast fails from numpy float64 array with nans to integer -* [ARROW-4260](https://issues.apache.org/jira/browse/ARROW-4260) - [Python] test\_serialize\_deserialize\_pandas is failing in multiple build entries -* [PARQUET-1426](https://issues.apache.org/jira/browse/PARQUET-1426) - [C++] parquet-dump-schema has poor usability -* [PARQUET-1458](https://issues.apache.org/jira/browse/PARQUET-1458) - [C++] parquet::CompressionToString not recognizing brotli compression -* [PARQUET-1469](https://issues.apache.org/jira/browse/PARQUET-1469) - [C++] DefinitionLevelsToBitmap can overwrite prior decoded data -* [PARQUET-1471](https://issues.apache.org/jira/browse/PARQUET-1471) - [C++] Out of bounds access in statistics UpdateSpaced when writing optional list with null list slots -* [PARQUET-1481](https://issues.apache.org/jira/browse/PARQUET-1481) - [C++] SEGV when reading corrupt parquet file - - - -# Apache Arrow 0.11.1 (2018-10-23) - -## New Features and Improvements - -* [ARROW-3353](https://issues.apache.org/jira/browse/ARROW-3353) - [Packaging] Build python 3.7 wheels -* [ARROW-3534](https://issues.apache.org/jira/browse/ARROW-3534) - [Python] Update zlib library in manylinux1 image -* [ARROW-3546](https://issues.apache.org/jira/browse/ARROW-3546) - [Python] Provide testing setup to verify wheel binaries work in one or more common Linux distributions -* [ARROW-3565](https://issues.apache.org/jira/browse/ARROW-3565) - [Python] Pin tensorflow to 1.11.0 in manylinux1 container - - -## Bug Fixes - -* [ARROW-3514](https://issues.apache.org/jira/browse/ARROW-3514) - [Python] zlib deflate exception when writing Parquet file -* [ARROW-3907](https://issues.apache.org/jira/browse/ARROW-3907) - [Python] from\_pandas errors when schemas are used with lower resolution timestamps - - - -# Apache Arrow 0.11.0 (2018-10-08) - -## New Features and Improvements - -* [ARROW-25](https://issues.apache.org/jira/browse/ARROW-25) - [C++] Implement delimited file scanner / CSV reader -* [ARROW-249](https://issues.apache.org/jira/browse/ARROW-249) - [Flight] Define GRPC IDL / wire protocol for messaging with Arrow data -* [ARROW-614](https://issues.apache.org/jira/browse/ARROW-614) - [C++] Use glog (or some other tool) to print stack traces in debug builds on errors -* [ARROW-1325](https://issues.apache.org/jira/browse/ARROW-1325) - [R] Bootstrap R bindings subproject -* [ARROW-1424](https://issues.apache.org/jira/browse/ARROW-1424) - [Python] Initial bindings for libarrow\_gpu -* [ARROW-1491](https://issues.apache.org/jira/browse/ARROW-1491) - [C++] Add casting implementations from strings to numbers or boolean -* [ARROW-1521](https://issues.apache.org/jira/browse/ARROW-1521) - [C++] Add Reset method to BufferOutputStream to enable object reuse -* [ARROW-1563](https://issues.apache.org/jira/browse/ARROW-1563) - [C++] Implement logical unary and binary kernels for boolean arrays -* [ARROW-1860](https://issues.apache.org/jira/browse/ARROW-1860) - [C++] Add data structure to "stage" a sequence of IPC messages from in-memory data -* [ARROW-1949](https://issues.apache.org/jira/browse/ARROW-1949) - [Python/C++] Add option to Array.from\_pandas and pyarrow.array to perform unsafe casts -* [ARROW-1963](https://issues.apache.org/jira/browse/ARROW-1963) - [C++/Python] Create Array from sequence 
of numpy.datetime64 -* [ARROW-1968](https://issues.apache.org/jira/browse/ARROW-1968) - [Python] Unit testing setup for ORC files -* [ARROW-2165](https://issues.apache.org/jira/browse/ARROW-2165) - enhance AllocatorListener to listen for child allocator addition and removal -* [ARROW-2338](https://issues.apache.org/jira/browse/ARROW-2338) - [Scripts] Windows release verification script should create a conda environment -* [ARROW-2352](https://issues.apache.org/jira/browse/ARROW-2352) - [C++/Python] Test OSX packaging in Travis matrix -* [ARROW-2519](https://issues.apache.org/jira/browse/ARROW-2519) - [Rust] Implement min/max for primitive arrays -* [ARROW-2520](https://issues.apache.org/jira/browse/ARROW-2520) - [Rust] CI should also build against nightly Rust -* [ARROW-2555](https://issues.apache.org/jira/browse/ARROW-2555) - [Python] Provide an option to convert on coerce\_timestamps instead of error -* [ARROW-2583](https://issues.apache.org/jira/browse/ARROW-2583) - [Rust] Buffer should be typeless -* [ARROW-2617](https://issues.apache.org/jira/browse/ARROW-2617) - [Rust] Schema should contain fields not columns -* [ARROW-2687](https://issues.apache.org/jira/browse/ARROW-2687) - [JS] Example usage in README is outdated -* [ARROW-2734](https://issues.apache.org/jira/browse/ARROW-2734) - [Python] Cython api example doesn't work by default on macOS -* [ARROW-2750](https://issues.apache.org/jira/browse/ARROW-2750) - [MATLAB] Add MATLAB support for reading numeric types from Feather files -* [ARROW-2799](https://issues.apache.org/jira/browse/ARROW-2799) - [Python] Add safe option to Table.from\_pandas to avoid unsafe casts -* [ARROW-2813](https://issues.apache.org/jira/browse/ARROW-2813) - [C++] Strip uninformative lcov output from Travis CI logs -* [ARROW-2813](https://issues.apache.org/jira/browse/ARROW-2813) - [C++] Strip uninformative lcov output from Travis CI logs -* [ARROW-2817](https://issues.apache.org/jira/browse/ARROW-2817) - [C++] Enable libraries to be installed in msys2 on Windows -* [ARROW-2840](https://issues.apache.org/jira/browse/ARROW-2840) - [C++] See if stream alignment logic can be simplified -* [ARROW-2865](https://issues.apache.org/jira/browse/ARROW-2865) - [C++/Python] Reduce some duplicated code in python/builtin\_convert.cc -* [ARROW-2889](https://issues.apache.org/jira/browse/ARROW-2889) - [C++] Add optional argument to ADD\_ARROW\_TEST CMake function to add unit test prefix -* [ARROW-2900](https://issues.apache.org/jira/browse/ARROW-2900) - [Python] Improve performance of appending nested NumPy arrays in builtin\_convert.cc -* [ARROW-2936](https://issues.apache.org/jira/browse/ARROW-2936) - [Python] Implement Table.cast for casting from one schema to another (if possible) -* [ARROW-2948](https://issues.apache.org/jira/browse/ARROW-2948) - [Packaging] Generate changelog with crossbow -* [ARROW-2950](https://issues.apache.org/jira/browse/ARROW-2950) - [C++] Clean up util/bit-util.h -* [ARROW-2952](https://issues.apache.org/jira/browse/ARROW-2952) - [C++] Dockerfile for running include-what-you-use checks -* [ARROW-2958](https://issues.apache.org/jira/browse/ARROW-2958) - [C++] Flatbuffers EP fails to compile with GCC 8.1 -* [ARROW-2960](https://issues.apache.org/jira/browse/ARROW-2960) - [Packaging] Fix verify-release-candidate for binary packages and fix release cutting script for lib64 cmake issue -* [ARROW-2964](https://issues.apache.org/jira/browse/ARROW-2964) - [Go] wire all currently implemented array types in array.MakeFromData -* 
[ARROW-2971](https://issues.apache.org/jira/browse/ARROW-2971) - [Python] Give more descriptive names to python\_to\_arrow.cc/arrow\_to\_python.cc -* [ARROW-2972](https://issues.apache.org/jira/browse/ARROW-2972) - [Python] Implement inference logic for uint64 conversions in builtin\_convert.cc -* [ARROW-2975](https://issues.apache.org/jira/browse/ARROW-2975) - [Plasma] TensorFlow op: Compilation only working if arrow found by pkg-config -* [ARROW-2976](https://issues.apache.org/jira/browse/ARROW-2976) - [Python] Directory in pyarrow.get\_library\_dirs() on Travis doesn't contain libarrow.so -* [ARROW-2979](https://issues.apache.org/jira/browse/ARROW-2979) - [GLib] Add operator functions in GArrowDecimal128 -* [ARROW-2983](https://issues.apache.org/jira/browse/ARROW-2983) - [Packaging] Verify source release and binary artifacts in different scripts -* [ARROW-2989](https://issues.apache.org/jira/browse/ARROW-2989) - [C++] Remove deprecated APIs in 0.10.0 and below -* [ARROW-2991](https://issues.apache.org/jira/browse/ARROW-2991) - [CI] Cut down number of AppVeyor jobs -* [ARROW-2994](https://issues.apache.org/jira/browse/ARROW-2994) - [C++] Only include Python C header directories for Python-related compilation units -* [ARROW-2996](https://issues.apache.org/jira/browse/ARROW-2996) - [C++] Fix typo in cpp/.clang-tidy -* [ARROW-2998](https://issues.apache.org/jira/browse/ARROW-2998) - [C++] Add variants of AllocateBuffer, AllocateResizeableBuffer that return unique\_ptr -* [ARROW-2999](https://issues.apache.org/jira/browse/ARROW-2999) - [Python] Do not run ASV benchmarks in every Travis CI build to improve runtimes -* [ARROW-3000](https://issues.apache.org/jira/browse/ARROW-3000) - [Python] Do not build unit tests other than python-test in travis\_script\_python.sh -* [ARROW-3001](https://issues.apache.org/jira/browse/ARROW-3001) - [Packaging] Don't modify PATH during rust release verification -* [ARROW-3002](https://issues.apache.org/jira/browse/ARROW-3002) - [Python] Implement better DataType hash function -* [ARROW-3003](https://issues.apache.org/jira/browse/ARROW-3003) - [Doc] Enable Java doc in dev/gen\_apidocs/create\_documents.sh -* [ARROW-3005](https://issues.apache.org/jira/browse/ARROW-3005) - [Website] Update website and write blog post for 0.10.0 release announcement -* [ARROW-3008](https://issues.apache.org/jira/browse/ARROW-3008) - [Packaging] Verify GPU related modules if available -* [ARROW-3009](https://issues.apache.org/jira/browse/ARROW-3009) - [Python] pyarrow.orc uses APIs now prohibited in 0.10.0 -* [ARROW-3010](https://issues.apache.org/jira/browse/ARROW-3010) - [GLib] Update README to use Bundler -* [ARROW-3017](https://issues.apache.org/jira/browse/ARROW-3017) - [C++] Don't throw exception in arrow/util/thread-pool.h -* [ARROW-3018](https://issues.apache.org/jira/browse/ARROW-3018) - [Plasma] Improve random ObjectID generation -* [ARROW-3018](https://issues.apache.org/jira/browse/ARROW-3018) - [Plasma] Improve random ObjectID generation -* [ARROW-3019](https://issues.apache.org/jira/browse/ARROW-3019) - [Packaging] Use Bundler to verify Arrow GLib -* [ARROW-3021](https://issues.apache.org/jira/browse/ARROW-3021) - [Go] support for List -* [ARROW-3022](https://issues.apache.org/jira/browse/ARROW-3022) - [Go] support for Struct -* [ARROW-3023](https://issues.apache.org/jira/browse/ARROW-3023) - [C++] Use gold linker in builds if it is available -* [ARROW-3024](https://issues.apache.org/jira/browse/ARROW-3024) - [C++] Replace usages of std::mutex with atomics in 
memory\_pool.cc -* [ARROW-3025](https://issues.apache.org/jira/browse/ARROW-3025) - [C++] Add option to switch between dynamic and static linking in unit test executables -* [ARROW-3026](https://issues.apache.org/jira/browse/ARROW-3026) - [Plasma] Only run Plasma Python unit tests under valgrind once instead of twice in CI -* [ARROW-3027](https://issues.apache.org/jira/browse/ARROW-3027) - [Ruby] Stop "git tag" by "rake release" -* [ARROW-3028](https://issues.apache.org/jira/browse/ARROW-3028) - [Python] Trim unneeded work from documentation build in Travis CI -* [ARROW-3029](https://issues.apache.org/jira/browse/ARROW-3029) - [Python] pkg\_resources is slow -* [ARROW-3031](https://issues.apache.org/jira/browse/ARROW-3031) - [Go] Streamline release of Arrays and Builders -* [ARROW-3033](https://issues.apache.org/jira/browse/ARROW-3033) - [Dev] docker-compose test tooling does not seem to cache built Docker images -* [ARROW-3034](https://issues.apache.org/jira/browse/ARROW-3034) - [Packaging] Source archive can't be extracted by bsdtar on MSYS2 -* [ARROW-3035](https://issues.apache.org/jira/browse/ARROW-3035) - [Rust] Examples in README.md do not run -* [ARROW-3036](https://issues.apache.org/jira/browse/ARROW-3036) - [Go] add support for slicing Arrays -* [ARROW-3037](https://issues.apache.org/jira/browse/ARROW-3037) - [Go] add support NullArray -* [ARROW-3042](https://issues.apache.org/jira/browse/ARROW-3042) - [Go] add badge to GoDoc in the Go-Arrow README -* [ARROW-3043](https://issues.apache.org/jira/browse/ARROW-3043) - [C++] pthread doesn't exist on MinGW -* [ARROW-3044](https://issues.apache.org/jira/browse/ARROW-3044) - [Python] Remove all occurrences of cython's legacy property definition syntax -* [ARROW-3045](https://issues.apache.org/jira/browse/ARROW-3045) - [Python] Remove nullcheck from ipc Message and MessageReader -* [ARROW-3046](https://issues.apache.org/jira/browse/ARROW-3046) - [GLib] Use rubyish method in test-orc-file-reader.rb -* [ARROW-3050](https://issues.apache.org/jira/browse/ARROW-3050) - [C++] Adopt HiveServer2 client C++ codebase -* [ARROW-3051](https://issues.apache.org/jira/browse/ARROW-3051) - [C++] Status performance optimization from Impala/Kudu -* [ARROW-3057](https://issues.apache.org/jira/browse/ARROW-3057) - [INTEGRATION] Fix spark and hdfs dockerfiles -* [ARROW-3059](https://issues.apache.org/jira/browse/ARROW-3059) - [C++] Streamline namespace array::test -* [ARROW-3060](https://issues.apache.org/jira/browse/ARROW-3060) - [C++] Factor out parsing routines -* [ARROW-3062](https://issues.apache.org/jira/browse/ARROW-3062) - [Python] Extend fast libtensorflow\_framework.so compatibility workaround to Python 2.7 -* [ARROW-3064](https://issues.apache.org/jira/browse/ARROW-3064) - [C++] Add option to ADD\_ARROW\_TEST to indicate additional dependencies for particular unit test executables -* [ARROW-3067](https://issues.apache.org/jira/browse/ARROW-3067) - [Packaging] Support dev/rc/release .deb/.rpm builds -* [ARROW-3068](https://issues.apache.org/jira/browse/ARROW-3068) - [Packaging] Bump version to 0.11.0-SNAPSHOT -* [ARROW-3069](https://issues.apache.org/jira/browse/ARROW-3069) - [Release] Stop using SHA1 checksums per ASF policy -* [ARROW-3072](https://issues.apache.org/jira/browse/ARROW-3072) - [C++] Use ARROW\_RETURN\_NOT\_OK instead of RETURN\_NOT\_OK in header files -* [ARROW-3075](https://issues.apache.org/jira/browse/ARROW-3075) - [C++] Incorporate apache/parquet-cpp codebase into Arrow C++ codebase and build system -* 
[ARROW-3076](https://issues.apache.org/jira/browse/ARROW-3076) - [Website] Add Google Analytics tags to C++, Python API docs -* [ARROW-3088](https://issues.apache.org/jira/browse/ARROW-3088) - [Rust] Use internal \`Result\` type instead of \`Result\` -* [ARROW-3090](https://issues.apache.org/jira/browse/ARROW-3090) - [Rust] Accompany error messages with assertions -* [ARROW-3094](https://issues.apache.org/jira/browse/ARROW-3094) - [Python] Allow lighter construction of pa.Schema / pa.StructType -* [ARROW-3099](https://issues.apache.org/jira/browse/ARROW-3099) - [C++] Add benchmark for number parsing -* [ARROW-3105](https://issues.apache.org/jira/browse/ARROW-3105) - [Plasma] Improve flushing error message -* [ARROW-3106](https://issues.apache.org/jira/browse/ARROW-3106) - [Website] Update committers and PMC roster on website -* [ARROW-3109](https://issues.apache.org/jira/browse/ARROW-3109) - [Python] Add Python 3.7 virtualenvs to manylinux1 container -* [ARROW-3110](https://issues.apache.org/jira/browse/ARROW-3110) - [C++] Compilation warnings with gcc 7.3.0 -* [ARROW-3111](https://issues.apache.org/jira/browse/ARROW-3111) - [Java] Enable changing default logging level when running tests -* [ARROW-3114](https://issues.apache.org/jira/browse/ARROW-3114) - [Website] Add information about user@ mailing list to website / Community page -* [ARROW-3115](https://issues.apache.org/jira/browse/ARROW-3115) - [Java] Style Checks - Fix import ordering -* [ARROW-3116](https://issues.apache.org/jira/browse/ARROW-3116) - [Plasma] Add "ls" to object store -* [ARROW-3117](https://issues.apache.org/jira/browse/ARROW-3117) - [GLib] Add garrow\_chunked\_array\_to\_string() -* [ARROW-3119](https://issues.apache.org/jira/browse/ARROW-3119) - [Packaging] Nightly packaging script fails -* [ARROW-3127](https://issues.apache.org/jira/browse/ARROW-3127) - [C++] Add Tutorial about Sending Tensor from C++ to Python -* [ARROW-3128](https://issues.apache.org/jira/browse/ARROW-3128) - [C++] Support system shared zlib -* [ARROW-3129](https://issues.apache.org/jira/browse/ARROW-3129) - [Packaging] Stop to use deprecated BuildRoot and Group in .rpm -* [ARROW-3130](https://issues.apache.org/jira/browse/ARROW-3130) - [Go] add initial support for Go modules -* [ARROW-3136](https://issues.apache.org/jira/browse/ARROW-3136) - [C++] Clean up arrow:: public API -* [ARROW-3142](https://issues.apache.org/jira/browse/ARROW-3142) - [C++] Fetch all libs from toolchain environment -* [ARROW-3143](https://issues.apache.org/jira/browse/ARROW-3143) - [C++] CopyBitmap into existing memory -* [ARROW-3146](https://issues.apache.org/jira/browse/ARROW-3146) - [C++] Barebones Flight RPC server and client implementations -* [ARROW-3147](https://issues.apache.org/jira/browse/ARROW-3147) - [C++] MSVC version isn't detected in code page 932 -* [ARROW-3148](https://issues.apache.org/jira/browse/ARROW-3148) - [C++] MSVC shows C4819 warning on code page 932 -* [ARROW-3152](https://issues.apache.org/jira/browse/ARROW-3152) - [C++][Packaging] Use dynamic linking for zlib in conda recipes -* [ARROW-3153](https://issues.apache.org/jira/browse/ARROW-3153) - [Packaging] Fix broken nightly package builds introduced with recent cmake changes and orc tests -* [ARROW-3157](https://issues.apache.org/jira/browse/ARROW-3157) - [C++] Improve buffer creation for typed data -* [ARROW-3158](https://issues.apache.org/jira/browse/ARROW-3158) - [C++] Handle float truncation during casting -* [ARROW-3160](https://issues.apache.org/jira/browse/ARROW-3160) - [Python] Improve 
pathlib.Path support in parquet and filesystem modules -* [ARROW-3163](https://issues.apache.org/jira/browse/ARROW-3163) - [Python] Cython dependency is missing in non wheel package -* [ARROW-3167](https://issues.apache.org/jira/browse/ARROW-3167) - [CI] Limit clcache cache size -* [ARROW-3168](https://issues.apache.org/jira/browse/ARROW-3168) - [C++] Restore pkgconfig for Parquet C++ libraries -* [ARROW-3170](https://issues.apache.org/jira/browse/ARROW-3170) - [C++] Implement "readahead spooler" class for background input buffering -* [ARROW-3171](https://issues.apache.org/jira/browse/ARROW-3171) - [Java] checkstyle - fix line length and indentation -* [ARROW-3172](https://issues.apache.org/jira/browse/ARROW-3172) - [Rust] Update documentation for datatypes.rs -* [ARROW-3174](https://issues.apache.org/jira/browse/ARROW-3174) - [Rust] run examples as part of CI -* [ARROW-3177](https://issues.apache.org/jira/browse/ARROW-3177) - [Rust] Update expected error messages for tests that 'should panic' -* [ARROW-3180](https://issues.apache.org/jira/browse/ARROW-3180) - [C++] Add docker-compose setup to simulate Travis CI run locally -* [ARROW-3181](https://issues.apache.org/jira/browse/ARROW-3181) - [Packaging] Adjust conda package scripts to account for Parquet codebase migration -* [ARROW-3182](https://issues.apache.org/jira/browse/ARROW-3182) - [C++] Merge Gandiva codebase -* [ARROW-3187](https://issues.apache.org/jira/browse/ARROW-3187) - [Plasma] Make Plasma Log pluggable with glog -* [ARROW-3195](https://issues.apache.org/jira/browse/ARROW-3195) - [C++] NumPy initialization error check is missing in test -* [ARROW-3196](https://issues.apache.org/jira/browse/ARROW-3196) - Enable merge\_arrow\_py.py script to merge Parquet patches and set fix versions -* [ARROW-3197](https://issues.apache.org/jira/browse/ARROW-3197) - [C++] Add instructions to cpp/README.md about Parquet-only development and Arrow+Parquet -* [ARROW-3198](https://issues.apache.org/jira/browse/ARROW-3198) - [Website] Blog post for 0.11 release -* [ARROW-3211](https://issues.apache.org/jira/browse/ARROW-3211) - [C++] gold linker doesn't work with MinGW-w64 -* [ARROW-3212](https://issues.apache.org/jira/browse/ARROW-3212) - [C++] Create deterministic IPC metadata -* [ARROW-3213](https://issues.apache.org/jira/browse/ARROW-3213) - [C++] Use CMake to build vendored Snappy on Windows -* [ARROW-3214](https://issues.apache.org/jira/browse/ARROW-3214) - [C++] Disable insecure warnings with MinGW build -* [ARROW-3215](https://issues.apache.org/jira/browse/ARROW-3215) - [C++] Add support for finding libpython on MSYS2 -* [ARROW-3216](https://issues.apache.org/jira/browse/ARROW-3216) - [C++] libpython isn't linked to libarrow\_python in MinGW build -* [ARROW-3217](https://issues.apache.org/jira/browse/ARROW-3217) - [C++] ARROW\_STATIC definition is missing in MinGW build -* [ARROW-3218](https://issues.apache.org/jira/browse/ARROW-3218) - [C++] Utilities has needless pthread link in MinGW build -* [ARROW-3219](https://issues.apache.org/jira/browse/ARROW-3219) - [C++] Use Win32 API in MinGW -* [ARROW-3223](https://issues.apache.org/jira/browse/ARROW-3223) - [GLib] Use the same shared object versioning rule in C++ -* [ARROW-3229](https://issues.apache.org/jira/browse/ARROW-3229) - [Packaging]: Adjust wheel package scripts to account for Parquet codebase migration -* [ARROW-3234](https://issues.apache.org/jira/browse/ARROW-3234) - [C++] Link order is wrong when ARROW\_ORC=on and ARROW\_PROTOBUF\_USE\_SHARED=ON -* 
[ARROW-3235](https://issues.apache.org/jira/browse/ARROW-3235) - [Packaging] Update deb names -* [ARROW-3236](https://issues.apache.org/jira/browse/ARROW-3236) - [C++] OutputStream bookkeeping logic when writing IPC file format is incorrect -* [ARROW-3240](https://issues.apache.org/jira/browse/ARROW-3240) - [GLib] Add build instructions using Meson -* [ARROW-3242](https://issues.apache.org/jira/browse/ARROW-3242) - [C++] Use coarser-grained dispatch to SIMD hash functions -* [ARROW-3249](https://issues.apache.org/jira/browse/ARROW-3249) - [Python] Run flake8 on integration\_test.py and crossbow.py -* [ARROW-3250](https://issues.apache.org/jira/browse/ARROW-3250) - [C++] Create Buffer implementation that takes ownership for the memory from a std::string via std::move -* [ARROW-3252](https://issues.apache.org/jira/browse/ARROW-3252) - [C++] Do not hard code the "v" part of versions in thirdparty toolchain -* [ARROW-3257](https://issues.apache.org/jira/browse/ARROW-3257) - [C++] Stop to use IMPORTED\_LINK\_INTERFACE\_LIBRARIES -* [ARROW-3258](https://issues.apache.org/jira/browse/ARROW-3258) - [GLib] CI is failed on macOS -* [ARROW-3259](https://issues.apache.org/jira/browse/ARROW-3259) - [GLib] Rename "writeable" to "writable" -* [ARROW-3261](https://issues.apache.org/jira/browse/ARROW-3261) - [Python] Add "field" method to select fields from StructArray -* [ARROW-3262](https://issues.apache.org/jira/browse/ARROW-3262) - [Python] Implement \_\_getitem\_\_ with integers on pyarrow.Column -* [ARROW-3264](https://issues.apache.org/jira/browse/ARROW-3264) - [Java] checkstyle - fix whitespace -* [ARROW-3267](https://issues.apache.org/jira/browse/ARROW-3267) - [Python] Create empty table from schema -* [ARROW-3268](https://issues.apache.org/jira/browse/ARROW-3268) - [CI] Reduce conda times on AppVeyor -* [ARROW-3269](https://issues.apache.org/jira/browse/ARROW-3269) - [Python] Fix warnings in unit test suite -* [ARROW-3270](https://issues.apache.org/jira/browse/ARROW-3270) - [Release] Adjust release verification scripts to recent parquet migration -* [ARROW-3274](https://issues.apache.org/jira/browse/ARROW-3274) - [Packaging] Missing glog dependency from conda-forge recipes -* [ARROW-3276](https://issues.apache.org/jira/browse/ARROW-3276) - [Packaging] Add support Parquet related Linux packages -* [ARROW-3281](https://issues.apache.org/jira/browse/ARROW-3281) - [Java] Make sure that WritableByteChannel in WriteChannel writes out complete bytes -* [ARROW-3282](https://issues.apache.org/jira/browse/ARROW-3282) - [R] initial R functionality -* [ARROW-3284](https://issues.apache.org/jira/browse/ARROW-3284) - [R] Adding R Error in Status -* [ARROW-3285](https://issues.apache.org/jira/browse/ARROW-3285) - [GLib] Add arrow\_cpp\_build\_type and arrow\_cpp\_build\_dir Meson options -* [ARROW-3286](https://issues.apache.org/jira/browse/ARROW-3286) - [C++] ARROW\_EXPORT for RecordBatchBuilder is missing -* [ARROW-3287](https://issues.apache.org/jira/browse/ARROW-3287) - [C++] "redeclared without dllimport attribute after being referenced with dll linkage" with MinGW -* [ARROW-3288](https://issues.apache.org/jira/browse/ARROW-3288) - [GLib] Add new API index for 0.11.0 -* [ARROW-3300](https://issues.apache.org/jira/browse/ARROW-3300) - [Release] Update .deb package names in preparation -* [ARROW-3301](https://issues.apache.org/jira/browse/ARROW-3301) - [Website] Update Jekyll and Bootstrap 4 -* [ARROW-3305](https://issues.apache.org/jira/browse/ARROW-3305) - [JS] Incorrect development documentation link in 
javascript readme -* [ARROW-3309](https://issues.apache.org/jira/browse/ARROW-3309) - [JS] Missing links from DEVELOP.md -* [ARROW-3313](https://issues.apache.org/jira/browse/ARROW-3313) - [R] Run clang-format, cpplint checks on R C++ code -* [ARROW-3313](https://issues.apache.org/jira/browse/ARROW-3313) - [R] Run clang-format, cpplint checks on R C++ code -* [ARROW-3319](https://issues.apache.org/jira/browse/ARROW-3319) - [GLib] Expose AlignStream methods in InputStream, OutputStream classes -* [ARROW-3320](https://issues.apache.org/jira/browse/ARROW-3320) - [C++] Improve float parsing performance -* [ARROW-3321](https://issues.apache.org/jira/browse/ARROW-3321) - [C++] Improve integer parsing performance -* [ARROW-3334](https://issues.apache.org/jira/browse/ARROW-3334) - [Python] Update conda packages to new numpy requirement -* [ARROW-3335](https://issues.apache.org/jira/browse/ARROW-3335) - [Python] Add ccache to manylinux1 container -* [ARROW-3339](https://issues.apache.org/jira/browse/ARROW-3339) - [R] Support for character vectors -* [ARROW-3341](https://issues.apache.org/jira/browse/ARROW-3341) - [R] Support for logical vector -* [ARROW-3349](https://issues.apache.org/jira/browse/ARROW-3349) - [C++] Use aligned API in MinGW -* [ARROW-3350](https://issues.apache.org/jira/browse/ARROW-3350) - [Website] Fix powered by links -* [ARROW-3352](https://issues.apache.org/jira/browse/ARROW-3352) - [Packaging] Fix recently failing wheel builds -* [ARROW-3356](https://issues.apache.org/jira/browse/ARROW-3356) - [Python] Document parameters of Table.to\_pandas method -* [ARROW-3357](https://issues.apache.org/jira/browse/ARROW-3357) - [Rust] Add a mutable buffer implementation -* [ARROW-3360](https://issues.apache.org/jira/browse/ARROW-3360) - [GLib] Import Parquet bindings -* [ARROW-3363](https://issues.apache.org/jira/browse/ARROW-3363) - [C++/Python] Add helper functions to detect scalar Python types -* [ARROW-3371](https://issues.apache.org/jira/browse/ARROW-3371) - [Python] Remove check\_metadata argument for Field.equals docstring -* [ARROW-3375](https://issues.apache.org/jira/browse/ARROW-3375) - [Rust] Remove memory\_pool.rs -* [ARROW-3376](https://issues.apache.org/jira/browse/ARROW-3376) - [C++] Add double-conversion to cpp/thirdparty/download\_dependencies.sh -* [ARROW-3377](https://issues.apache.org/jira/browse/ARROW-3377) - [Gandiva][C++] Remove If statement from bit map set function -* [ARROW-3382](https://issues.apache.org/jira/browse/ARROW-3382) - [C++] Run Gandiva tests in Travis CI -* [ARROW-3392](https://issues.apache.org/jira/browse/ARROW-3392) - [Python] Support filters in disjunctive normal form in ParquetDataset -* [ARROW-3395](https://issues.apache.org/jira/browse/ARROW-3395) - [C++/Python] Add docker container for linting -* [ARROW-3397](https://issues.apache.org/jira/browse/ARROW-3397) - [C++] Use relative CMake path for modules -* [ARROW-3400](https://issues.apache.org/jira/browse/ARROW-3400) - [Packaging] Add support Parquet GLib related Linux packages -* [ARROW-3404](https://issues.apache.org/jira/browse/ARROW-3404) - [C++] Make CSV chunker faster -* [ARROW-3411](https://issues.apache.org/jira/browse/ARROW-3411) - [Packaging] dev/release/01-perform.sh doesn't have executable bit -* [ARROW-3412](https://issues.apache.org/jira/browse/ARROW-3412) - [Packaging] rat failure in dev/release/02-source.sh -* [ARROW-3413](https://issues.apache.org/jira/browse/ARROW-3413) - [Packaging] dev/release/02-source.sh doesn't generate Parquet GLib document -* 
[ARROW-3415](https://issues.apache.org/jira/browse/ARROW-3415) - [Packaging] dev/release/verify-release-candidate.sh fails in "conda activate arrow-test" -* [ARROW-3416](https://issues.apache.org/jira/browse/ARROW-3416) - [Packaging] dev/release/02-source.sh must use SHA512 instead of SHA1 -* [ARROW-3417](https://issues.apache.org/jira/browse/ARROW-3417) - [Packaging] dev/release/verify-release-candidate.sh fails Parquet C++ test -* [ARROW-3418](https://issues.apache.org/jira/browse/ARROW-3418) - [C++] Update Parquet snapshot version for release -* [ARROW-3423](https://issues.apache.org/jira/browse/ARROW-3423) - [Packaging] Remove RC information from deb/rpm -* [ARROW-3443](https://issues.apache.org/jira/browse/ARROW-3443) - [Java] Flight reports memory leaks in TestBasicOperation -* [PARQUET-169](https://issues.apache.org/jira/browse/PARQUET-169) - Parquet-cpp: Implement support for bulk reading and writing repetition/definition levels. -* [PARQUET-267](https://issues.apache.org/jira/browse/PARQUET-267) - Detach thirdparty code from build configuration. -* [PARQUET-416](https://issues.apache.org/jira/browse/PARQUET-416) - C++11, cpplint cleanup, package target and header installation -* [PARQUET-418](https://issues.apache.org/jira/browse/PARQUET-418) - Add a utility to print contents of a Parquet file to stdout -* [PARQUET-428](https://issues.apache.org/jira/browse/PARQUET-428) - Support INT96 and FIXED\_LEN\_BYTE\_ARRAY types -* [PARQUET-434](https://issues.apache.org/jira/browse/PARQUET-434) - Add a ParquetFileReader class to encapsulate some low-level details of interacting with Parquet files -* [PARQUET-435](https://issues.apache.org/jira/browse/PARQUET-435) - Provide vectorized ColumnReader interface -* [PARQUET-436](https://issues.apache.org/jira/browse/PARQUET-436) - Implement ParquetFileWriter class entry point for generating new Parquet files -* [PARQUET-437](https://issues.apache.org/jira/browse/PARQUET-437) - Incorporate googletest thirdparty dependency and add cmake tools (ADD\_PARQUET\_TEST) to simplify adding new unit tests -* [PARQUET-438](https://issues.apache.org/jira/browse/PARQUET-438) - Update RLE encoder/decoder modules from Impala upstream changes and adapt unit tests -* [PARQUET-439](https://issues.apache.org/jira/browse/PARQUET-439) - Conform all copyright headers to ASF requirements -* [PARQUET-442](https://issues.apache.org/jira/browse/PARQUET-442) - Convert flat SchemaElement vector to implied nested schema data structure -* [PARQUET-448](https://issues.apache.org/jira/browse/PARQUET-448) - Add cmake option to skip building the unit tests -* [PARQUET-449](https://issues.apache.org/jira/browse/PARQUET-449) - Update to latest parquet.thrift -* [PARQUET-451](https://issues.apache.org/jira/browse/PARQUET-451) - Add a RowGroup reader interface class -* [PARQUET-456](https://issues.apache.org/jira/browse/PARQUET-456) - Add zlib codec support -* [PARQUET-463](https://issues.apache.org/jira/browse/PARQUET-463) - Add DCHECK\* macros for assertions in debug builds -* [PARQUET-468](https://issues.apache.org/jira/browse/PARQUET-468) - Add a cmake option to generate the Parquet thrift headers with the thriftc in the environment -* [PARQUET-477](https://issues.apache.org/jira/browse/PARQUET-477) - Enable clang-format check during the Travis CI build -* [PARQUET-482](https://issues.apache.org/jira/browse/PARQUET-482) - Organize src code file structure to have a very clear folder with public headers. 
-* [PARQUET-485](https://issues.apache.org/jira/browse/PARQUET-485) - Decouple data page delimiting from column reader / scanner classes, create test fixtures -* [PARQUET-488](https://issues.apache.org/jira/browse/PARQUET-488) - Add SSE-related cmake options to manage compiler flags -* [PARQUET-489](https://issues.apache.org/jira/browse/PARQUET-489) - Add visibility macros to be used for public and internal APIs of libparquet -* [PARQUET-494](https://issues.apache.org/jira/browse/PARQUET-494) - Implement PLAIN\_DICTIONARY encoding and decoding -* [PARQUET-496](https://issues.apache.org/jira/browse/PARQUET-496) - Fix cpplint configuration to be more restrictive -* [PARQUET-497](https://issues.apache.org/jira/browse/PARQUET-497) - Decouple Parquet physical file structure from FileReader class -* [PARQUET-499](https://issues.apache.org/jira/browse/PARQUET-499) - Complete PlainEncoder implementation for all primitive types and test end to end -* [PARQUET-501](https://issues.apache.org/jira/browse/PARQUET-501) - Add an OutputStream abstraction (capable of memory allocation) for Encoder public API -* [PARQUET-503](https://issues.apache.org/jira/browse/PARQUET-503) - Re-enable parquet 2.0 encodings -* [PARQUET-508](https://issues.apache.org/jira/browse/PARQUET-508) - Add ParquetFilePrinter -* [PARQUET-508](https://issues.apache.org/jira/browse/PARQUET-508) - Add ParquetFilePrinter -* [PARQUET-512](https://issues.apache.org/jira/browse/PARQUET-512) - Add optional google/benchmark 3rd-party dependency for performance testing -* [PARQUET-515](https://issues.apache.org/jira/browse/PARQUET-515) - Add "Reset" to LevelEncoder and LevelDecoder -* [PARQUET-518](https://issues.apache.org/jira/browse/PARQUET-518) - Review usages of size\_t and unsigned integers generally per Google style guide -* [PARQUET-519](https://issues.apache.org/jira/browse/PARQUET-519) - Disable compiler warning suppressions and fix all DEBUG build warnings -* [PARQUET-520](https://issues.apache.org/jira/browse/PARQUET-520) - Add version of LocalFileSource that uses memory-mapping for zero-copy reads -* [PARQUET-533](https://issues.apache.org/jira/browse/PARQUET-533) - Simplify RandomAccessSource API to combine Seek/Read -* [PARQUET-538](https://issues.apache.org/jira/browse/PARQUET-538) - Improve ColumnReader Tests -* [PARQUET-542](https://issues.apache.org/jira/browse/PARQUET-542) - Support memory allocation from external memory -* [PARQUET-545](https://issues.apache.org/jira/browse/PARQUET-545) - Improve API to support Decimal type -* [PARQUET-547](https://issues.apache.org/jira/browse/PARQUET-547) - Refactor most templates to use DataType structs rather than the Type::type enum -* [PARQUET-551](https://issues.apache.org/jira/browse/PARQUET-551) - Handle compiler warnings due to disabled DCHECKs in release builds -* [PARQUET-556](https://issues.apache.org/jira/browse/PARQUET-556) - Extend RowGroupStatistics to include "min" "max" statistics -* [PARQUET-559](https://issues.apache.org/jira/browse/PARQUET-559) - Enable InputStream as a source to the ParquetFileReader -* [PARQUET-564](https://issues.apache.org/jira/browse/PARQUET-564) - Add option to run unit tests with valgrind --tool=memcheck -* [PARQUET-566](https://issues.apache.org/jira/browse/PARQUET-566) - Add method to retrieve the full column path -* [PARQUET-568](https://issues.apache.org/jira/browse/PARQUET-568) - Read only specified top-level columns in DebugPrint -* [PARQUET-572](https://issues.apache.org/jira/browse/PARQUET-572) - Rename parquet\_cpp namespace to parquet 
-* [PARQUET-573](https://issues.apache.org/jira/browse/PARQUET-573) - C++: Create a public API for reading and writing file metadata -* [PARQUET-582](https://issues.apache.org/jira/browse/PARQUET-582) - Conversion functions for Parquet enums to Thrift enums -* [PARQUET-583](https://issues.apache.org/jira/browse/PARQUET-583) - Implement Parquet to Thrift schema conversion -* [PARQUET-587](https://issues.apache.org/jira/browse/PARQUET-587) - Implement BufferReader::Read(int64\_t,uint8\_t\*) -* [PARQUET-589](https://issues.apache.org/jira/browse/PARQUET-589) - Implement Chunked InMemoryInputStream for better memory usage -* [PARQUET-592](https://issues.apache.org/jira/browse/PARQUET-592) - Support compressed writes -* [PARQUET-593](https://issues.apache.org/jira/browse/PARQUET-593) - Add API for writing Page statistics -* [PARQUET-595](https://issues.apache.org/jira/browse/PARQUET-595) - Add API for key-value metadata -* [PARQUET-595](https://issues.apache.org/jira/browse/PARQUET-595) - Add API for key-value metadata -* [PARQUET-597](https://issues.apache.org/jira/browse/PARQUET-597) - Add data rates to benchmark output -* [PARQUET-598](https://issues.apache.org/jira/browse/PARQUET-598) - [C++] Test writing all primitive data types -* [PARQUET-600](https://issues.apache.org/jira/browse/PARQUET-600) - Add benchmarks for RLE-Level encoding -* [PARQUET-603](https://issues.apache.org/jira/browse/PARQUET-603) - Implement missing information in schema descriptor -* [PARQUET-605](https://issues.apache.org/jira/browse/PARQUET-605) - Expose schema node in ColumnDescriptor -* [PARQUET-607](https://issues.apache.org/jira/browse/PARQUET-607) - Public Writer header -* [PARQUET-610](https://issues.apache.org/jira/browse/PARQUET-610) - Print ColumnMetaData for each RowGroup -* [PARQUET-616](https://issues.apache.org/jira/browse/PARQUET-616) - C++: WriteBatch should accept const arrays -* [PARQUET-619](https://issues.apache.org/jira/browse/PARQUET-619) - C++: Add OutputStream for local files -* [PARQUET-625](https://issues.apache.org/jira/browse/PARQUET-625) - Improve RLE read performance -* [PARQUET-633](https://issues.apache.org/jira/browse/PARQUET-633) - Add version to WriterProperties -* [PARQUET-634](https://issues.apache.org/jira/browse/PARQUET-634) - Consistent private linking of dependencies -* [PARQUET-636](https://issues.apache.org/jira/browse/PARQUET-636) - Expose selection for different encodings -* [PARQUET-641](https://issues.apache.org/jira/browse/PARQUET-641) - Instantiate stringstream only if needed in SerializedPageReader::NextPage -* [PARQUET-646](https://issues.apache.org/jira/browse/PARQUET-646) - [C++] Enable easier 3rd-party toolchain clang builds on Linux -* [PARQUET-666](https://issues.apache.org/jira/browse/PARQUET-666) - PLAIN\_DICTIONARY write support -* [PARQUET-671](https://issues.apache.org/jira/browse/PARQUET-671) - Improve performance of RLE/bit-packed decoding in parquet-cpp -* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows -* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows -* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows -* [PARQUET-679](https://issues.apache.org/jira/browse/PARQUET-679) - [C++] Build and unit tests support for MSVC on Windows -* [PARQUET-681](https://issues.apache.org/jira/browse/PARQUET-681) - Add tool to scan a parquet file -* 
[PARQUET-681](https://issues.apache.org/jira/browse/PARQUET-681) - Add tool to scan a parquet file -* [PARQUET-687](https://issues.apache.org/jira/browse/PARQUET-687) - C++: Switch to PLAIN encoding if dictionary grows too large -* [PARQUET-689](https://issues.apache.org/jira/browse/PARQUET-689) - C++: Compress DataPages eagerly -* [PARQUET-699](https://issues.apache.org/jira/browse/PARQUET-699) - Update parquet.thrift from https://github.com/apache/parquet-format -* [PARQUET-712](https://issues.apache.org/jira/browse/PARQUET-712) - C++: Read into Arrow memory -* [PARQUET-721](https://issues.apache.org/jira/browse/PARQUET-721) - Performance benchmarks for reading into Arrow structures -* [PARQUET-724](https://issues.apache.org/jira/browse/PARQUET-724) - Test more advanced properties setting -* [PARQUET-728](https://issues.apache.org/jira/browse/PARQUET-728) - [C++] Bring parquet::arrow up to date with API changes in arrow::io -* [PARQUET-728](https://issues.apache.org/jira/browse/PARQUET-728) - [C++] Bring parquet::arrow up to date with API changes in arrow::io -* [PARQUET-731](https://issues.apache.org/jira/browse/PARQUET-731) - [CPP] Add API to return metadata size and Skip reading values -* [PARQUET-737](https://issues.apache.org/jira/browse/PARQUET-737) - Use absolute namespace in macros -* [PARQUET-752](https://issues.apache.org/jira/browse/PARQUET-752) - [C++] Conform parquet\_arrow to upstream API changes -* [PARQUET-762](https://issues.apache.org/jira/browse/PARQUET-762) - C++: Use optimistic allocation instead of Arrow Builders -* [PARQUET-763](https://issues.apache.org/jira/browse/PARQUET-763) - C++: Expose ParquetFileReader through Arrow reader -* [PARQUET-769](https://issues.apache.org/jira/browse/PARQUET-769) - C++: Add support for Brotli Compression -* [PARQUET-778](https://issues.apache.org/jira/browse/PARQUET-778) - Standardize the schema output to match the parquet-mr format -* [PARQUET-782](https://issues.apache.org/jira/browse/PARQUET-782) - C++: Support writing to Arrow sinks -* [PARQUET-785](https://issues.apache.org/jira/browse/PARQUET-785) - C++: List conversion for Arrow Schemas -* [PARQUET-805](https://issues.apache.org/jira/browse/PARQUET-805) - C++: Read Int96 into Arrow Timestamp(ns) -* [PARQUET-807](https://issues.apache.org/jira/browse/PARQUET-807) - [C++] Add API to read file metadata only from a file handle -* [PARQUET-807](https://issues.apache.org/jira/browse/PARQUET-807) - [C++] Add API to read file metadata only from a file handle -* [PARQUET-809](https://issues.apache.org/jira/browse/PARQUET-809) - [C++] Add API to determine if two files' schemas are compatible -* [PARQUET-813](https://issues.apache.org/jira/browse/PARQUET-813) - C++: Build dependencies using CMake External project -* [PARQUET-820](https://issues.apache.org/jira/browse/PARQUET-820) - C++: Decoders should directly emit arrays with spacing for null entries -* [PARQUET-829](https://issues.apache.org/jira/browse/PARQUET-829) - C++: Make use of ARROW-469 -* [PARQUET-830](https://issues.apache.org/jira/browse/PARQUET-830) - [C++] Add additional configuration options to parquet::arrow::OpenFile -* [PARQUET-833](https://issues.apache.org/jira/browse/PARQUET-833) - C++: Provide API to write spaced arrays (e.g. 
Arrow) -* [PARQUET-834](https://issues.apache.org/jira/browse/PARQUET-834) - C++: Support r/w of arrow::ListArray -* [PARQUET-835](https://issues.apache.org/jira/browse/PARQUET-835) - [C++] Add option to parquet::arrow to read columns in parallel using a thread pool -* [PARQUET-836](https://issues.apache.org/jira/browse/PARQUET-836) - [C++] Add column selection to parquet::arrow::FileReader -* [PARQUET-844](https://issues.apache.org/jira/browse/PARQUET-844) - [C++] Consolidate encodings, schema, and compression subdirectories into fewer files -* [PARQUET-848](https://issues.apache.org/jira/browse/PARQUET-848) - [C++] Consolidate libparquet\_thrift subcomponent -* [PARQUET-857](https://issues.apache.org/jira/browse/PARQUET-857) - [C++] Flatten parquet/encodings directory -* [PARQUET-858](https://issues.apache.org/jira/browse/PARQUET-858) - [C++] Flatten parquet/column directory, consolidate related code -* [PARQUET-859](https://issues.apache.org/jira/browse/PARQUET-859) - [C++] Flatten parquet/file directory -* [PARQUET-862](https://issues.apache.org/jira/browse/PARQUET-862) - Provide default cache size values if CPU info probing is not available -* [PARQUET-866](https://issues.apache.org/jira/browse/PARQUET-866) - [C++] Account for API changes in ARROW-33 -* [PARQUET-867](https://issues.apache.org/jira/browse/PARQUET-867) - [C++] Support writing sliced Arrow arrays -* [PARQUET-874](https://issues.apache.org/jira/browse/PARQUET-874) - [C++] Use default memory allocator from Arrow -* [PARQUET-877](https://issues.apache.org/jira/browse/PARQUET-877) - C++: Update Arrow Hash, update Version in metadata. -* [PARQUET-882](https://issues.apache.org/jira/browse/PARQUET-882) - [CPP] Improve Application Version parsing -* [PARQUET-890](https://issues.apache.org/jira/browse/PARQUET-890) - C++: Support I/O of DATE columns in parquet\_arrow -* [PARQUET-894](https://issues.apache.org/jira/browse/PARQUET-894) - Fix compilation warning -* [PARQUET-894](https://issues.apache.org/jira/browse/PARQUET-894) - Fix compilation warning -* [PARQUET-897](https://issues.apache.org/jira/browse/PARQUET-897) - [C++] Only use designated public headers from libarrow -* [PARQUET-903](https://issues.apache.org/jira/browse/PARQUET-903) - C++: Add option to set RPATH to ORIGIN -* [PARQUET-909](https://issues.apache.org/jira/browse/PARQUET-909) - [CPP]: Reduce buffer allocations (mallocs) on critical path -* [PARQUET-909](https://issues.apache.org/jira/browse/PARQUET-909) - [CPP]: Reduce buffer allocations (mallocs) on critical path -* [PARQUET-911](https://issues.apache.org/jira/browse/PARQUET-911) - C++: Support nested structs in parquet\_arrow -* [PARQUET-928](https://issues.apache.org/jira/browse/PARQUET-928) - [C++] Support pkg-config -* [PARQUET-929](https://issues.apache.org/jira/browse/PARQUET-929) - [C++] Handle arrow::DictionaryArray when writing Arrow data -* [PARQUET-930](https://issues.apache.org/jira/browse/PARQUET-930) - [C++] Account for all Arrow date/time types -* [PARQUET-934](https://issues.apache.org/jira/browse/PARQUET-934) - [C++] Support multiarch on Debian -* [PARQUET-935](https://issues.apache.org/jira/browse/PARQUET-935) - [C++] Set shared library version for .deb packages -* [PARQUET-946](https://issues.apache.org/jira/browse/PARQUET-946) - [C++] Refactoring in parquet::arrow::FileReader to be able to read a single row group -* [PARQUET-953](https://issues.apache.org/jira/browse/PARQUET-953) - [C++] Change arrow::FileWriter API to be initialized from a Schema, and provide for writing multiple tables 
-* [PARQUET-967](https://issues.apache.org/jira/browse/PARQUET-967) - [C++] Combine libparquet/libparquet\_arrow libraries -* [PARQUET-970](https://issues.apache.org/jira/browse/PARQUET-970) - Add Lz4 and Zstd compression codecs -* [PARQUET-978](https://issues.apache.org/jira/browse/PARQUET-978) - [C++] Minimizing footer reads for small(ish) metadata -* [PARQUET-984](https://issues.apache.org/jira/browse/PARQUET-984) - C++: Add abi and so version to pkg-config -* [PARQUET-991](https://issues.apache.org/jira/browse/PARQUET-991) - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor -* [PARQUET-991](https://issues.apache.org/jira/browse/PARQUET-991) - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor -* [PARQUET-991](https://issues.apache.org/jira/browse/PARQUET-991) - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor -* [PARQUET-999](https://issues.apache.org/jira/browse/PARQUET-999) - Improve MSVC build - Enable PARQUET\_BUILD\_BENCHMARKS -* [PARQUET-1008](https://issues.apache.org/jira/browse/PARQUET-1008) - Update TypedColumnReader::ReadBatch method to accept batch\_size as int64\_t -* [PARQUET-1035](https://issues.apache.org/jira/browse/PARQUET-1035) - Write Int96 from Arrow Timestamp(ns) -* [PARQUET-1037](https://issues.apache.org/jira/browse/PARQUET-1037) - Allow final RowGroup to be unfilled -* [PARQUET-1041](https://issues.apache.org/jira/browse/PARQUET-1041) - C++: Support Arrow's NullArray -* [PARQUET-1043](https://issues.apache.org/jira/browse/PARQUET-1043) - [C++] Raise minimum supported CMake version to 3.2 -* [PARQUET-1044](https://issues.apache.org/jira/browse/PARQUET-1044) - [C++] Use compression libraries from Apache Arrow -* [PARQUET-1045](https://issues.apache.org/jira/browse/PARQUET-1045) - [C++] Refactor to account for computational utility code migration in ARROW-1154 -* [PARQUET-1053](https://issues.apache.org/jira/browse/PARQUET-1053) - Fix unused result warnings due to unchecked Statuses -* [PARQUET-1053](https://issues.apache.org/jira/browse/PARQUET-1053) - Fix unused result warnings due to unchecked Statuses -* [PARQUET-1068](https://issues.apache.org/jira/browse/PARQUET-1068) - [C++] Use more vanilla Google C++ code formatting -* [PARQUET-1068](https://issues.apache.org/jira/browse/PARQUET-1068) - [C++] Use more vanilla Google C++ code formatting -* [PARQUET-1072](https://issues.apache.org/jira/browse/PARQUET-1072) - [C++] Add ARROW\_NO\_DEPRECATED\_API to CI to check for deprecated API use -* [PARQUET-1078](https://issues.apache.org/jira/browse/PARQUET-1078) - [C++] Add Arrow writer option to coerce timestamps to milliseconds or microseconds -* [PARQUET-1079](https://issues.apache.org/jira/browse/PARQUET-1079) - [C++] Account for Arrow API change in ARROW-1335 -* [PARQUET-1083](https://issues.apache.org/jira/browse/PARQUET-1083) - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking -* [PARQUET-1083](https://issues.apache.org/jira/browse/PARQUET-1083) - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking -* [PARQUET-1086](https://issues.apache.org/jira/browse/PARQUET-1086) - [C++] Remove usage of arrow/util/compiler-util.h after 1.3.0 release -* [PARQUET-1087](https://issues.apache.org/jira/browse/PARQUET-1087) - [C++] Add wrapper for ScanFileContents in parquet::arrow that catches exceptions -* [PARQUET-1092](https://issues.apache.org/jira/browse/PARQUET-1092) - [C++] Write Arrow tables with chunked 
columns -* [PARQUET-1093](https://issues.apache.org/jira/browse/PARQUET-1093) - C++: Improve Arrow level generation error message -* [PARQUET-1094](https://issues.apache.org/jira/browse/PARQUET-1094) - C++: Add benchmark for boolean Arrow column I/O -* [PARQUET-1095](https://issues.apache.org/jira/browse/PARQUET-1095) - [C++] Read and write Arrow decimal values -* [PARQUET-1104](https://issues.apache.org/jira/browse/PARQUET-1104) - [C++] Upgrade to Apache Arrow 0.7.0 RC0 -* [PARQUET-1150](https://issues.apache.org/jira/browse/PARQUET-1150) - C++: Hide statically linked boost symbols -* [PARQUET-1160](https://issues.apache.org/jira/browse/PARQUET-1160) - [C++] Implement BYTE\_ARRAY-backed Decimal reads -* [PARQUET-1164](https://issues.apache.org/jira/browse/PARQUET-1164) - [C++] Follow API changes in ARROW-1808 -* [PARQUET-1165](https://issues.apache.org/jira/browse/PARQUET-1165) - [C++] Pin clang-format version to 4.0 -* [PARQUET-1166](https://issues.apache.org/jira/browse/PARQUET-1166) - [API Proposal] Add GetRecordBatchReader in parquet/arrow/reader.h -* [PARQUET-1177](https://issues.apache.org/jira/browse/PARQUET-1177) - [C++] Add more extensive compiler warnings when using Clang -* [PARQUET-1177](https://issues.apache.org/jira/browse/PARQUET-1177) - [C++] Add more extensive compiler warnings when using Clang -* [PARQUET-1196](https://issues.apache.org/jira/browse/PARQUET-1196) - [C++] Provide a parquet\_arrow example project incl. CMake setup -* [PARQUET-1200](https://issues.apache.org/jira/browse/PARQUET-1200) - [C++] Support reading a single Arrow column from a Parquet file -* [PARQUET-1218](https://issues.apache.org/jira/browse/PARQUET-1218) - [C++] More informative error message on too short pages -* [PARQUET-1225](https://issues.apache.org/jira/browse/PARQUET-1225) - NaN values may lead to incorrect filtering under certain circumstances -* [PARQUET-1227](https://issues.apache.org/jira/browse/PARQUET-1227) - Thrift crypto metadata structures -* [PARQUET-1256](https://issues.apache.org/jira/browse/PARQUET-1256) - [C++] Add --print-key-value-metadata option to parquet\_reader tool -* [PARQUET-1256](https://issues.apache.org/jira/browse/PARQUET-1256) - [C++] Add --print-key-value-metadata option to parquet\_reader tool -* [PARQUET-1267](https://issues.apache.org/jira/browse/PARQUET-1267) - replace "unsafe" std::equal by std::memcmp -* [PARQUET-1276](https://issues.apache.org/jira/browse/PARQUET-1276) - [C++] Reduce the amount of memory used for writing null decimal values -* [PARQUET-1279](https://issues.apache.org/jira/browse/PARQUET-1279) - Use ASSERT\_NO\_FATAL\_FAILURE in C++ unit tests -* [PARQUET-1301](https://issues.apache.org/jira/browse/PARQUET-1301) - [C++] Crypto package in parquet-cpp -* [PARQUET-1308](https://issues.apache.org/jira/browse/PARQUET-1308) - [C++] parquet::arrow should use thread pool, not ParallelFor -* [PARQUET-1323](https://issues.apache.org/jira/browse/PARQUET-1323) - [C++] Fix compiler warnings with clang-6.0 -* [PARQUET-1332](https://issues.apache.org/jira/browse/PARQUET-1332) - [C++] Add bloom filter utility class -* [PARQUET-1340](https://issues.apache.org/jira/browse/PARQUET-1340) - [C++] Fix Travis Ci valgrind errors related to std::random\_device -* [PARQUET-1346](https://issues.apache.org/jira/browse/PARQUET-1346) - [C++] Protect against null values data in empty Arrow array -* [PARQUET-1348](https://issues.apache.org/jira/browse/PARQUET-1348) - [C++] Allow Arrow FileWriter To Write FileMetaData -* 
[PARQUET-1350](https://issues.apache.org/jira/browse/PARQUET-1350) - [C++] Use abstract ResizableBuffer instead of concrete PoolBuffer -* [PARQUET-1360](https://issues.apache.org/jira/browse/PARQUET-1360) - [C++] Minor API + style changes follow up to PARQUET-1348 -* [PARQUET-1366](https://issues.apache.org/jira/browse/PARQUET-1366) - [C++] Streamline use of Arrow bit-util.h -* [PARQUET-1372](https://issues.apache.org/jira/browse/PARQUET-1372) - [C++] Add an API to allow writing RowGroups based on their size rather than num\_rows -* [PARQUET-1372](https://issues.apache.org/jira/browse/PARQUET-1372) - [C++] Add an API to allow writing RowGroups based on their size rather than num\_rows -* [PARQUET-1378](https://issues.apache.org/jira/browse/PARQUET-1378) - [c++] Allow RowGroups with zero rows to be written -* [PARQUET-1382](https://issues.apache.org/jira/browse/PARQUET-1382) - [C++] Prepare for arrow::test namespace removal -* [PARQUET-1392](https://issues.apache.org/jira/browse/PARQUET-1392) - [C++] Supply row group indices to parquet::arrow::FileReader::ReadTable -* [PARQUET-1398](https://issues.apache.org/jira/browse/PARQUET-1398) - Separate iv\_prefix for GCM and CTR modes -* [PARQUET-1401](https://issues.apache.org/jira/browse/PARQUET-1401) - RowGroup offset and total compressed size fields -* [PARQUET-1427](https://issues.apache.org/jira/browse/PARQUET-1427) - [C++] Move example executables and CLI tools to Apache Arrow repo -* [PARQUET-1431](https://issues.apache.org/jira/browse/PARQUET-1431) - [C++] Automatically set thrift to use boost for thrift versions before 0.11 - - -## Bug Fixes - -* [ARROW-1380](https://issues.apache.org/jira/browse/ARROW-1380) - [C++] Fix "still reachable" valgrind warnings when PLASMA\_VALGRIND=1 -* [ARROW-1661](https://issues.apache.org/jira/browse/ARROW-1661) - [Python] Python 3.7 support -* [ARROW-1799](https://issues.apache.org/jira/browse/ARROW-1799) - [Plasma C++] Make unittest does not create plasma store executable -* [ARROW-1996](https://issues.apache.org/jira/browse/ARROW-1996) - [Python] pyarrow.read\_serialized cannot read concatenated records -* [ARROW-2027](https://issues.apache.org/jira/browse/ARROW-2027) - [C++] ipc::Message::SerializeTo does not pad the message body -* [ARROW-2220](https://issues.apache.org/jira/browse/ARROW-2220) - Change default fix version in merge tool to be the next mainline release version -* [ARROW-2310](https://issues.apache.org/jira/browse/ARROW-2310) - Source release scripts fail with Java8 -* [ARROW-2646](https://issues.apache.org/jira/browse/ARROW-2646) - [C++/Python] Pandas roundtrip for date objects -* [ARROW-2775](https://issues.apache.org/jira/browse/ARROW-2775) - [Python] ccache error when building manylinux1 wheels -* [ARROW-2776](https://issues.apache.org/jira/browse/ARROW-2776) - [C++] Do not pass -Wno-noexcept-type for compilers that do not support it -* [ARROW-2782](https://issues.apache.org/jira/browse/ARROW-2782) - [Python] Ongoing Travis CI failures in Plasma unit tests -* [ARROW-2785](https://issues.apache.org/jira/browse/ARROW-2785) - [C++] Crash in json-integration-test -* [ARROW-2814](https://issues.apache.org/jira/browse/ARROW-2814) - [Python] Unify PyObject\* sequence conversion paths for built-in sequences, NumPy arrays -* [ARROW-2854](https://issues.apache.org/jira/browse/ARROW-2854) - [C++/Python] Casting float NaN to int should raise an error on safe cast -* [ARROW-2925](https://issues.apache.org/jira/browse/ARROW-2925) - [JS] Documentation failing in docker container -* 
[ARROW-2965](https://issues.apache.org/jira/browse/ARROW-2965) - [Python] Possible uint64 overflow issues in python\_to\_arrow.cc -* [ARROW-2966](https://issues.apache.org/jira/browse/ARROW-2966) - [Python] Data type conversion error -* [ARROW-2973](https://issues.apache.org/jira/browse/ARROW-2973) - [Python] pitrou/asv.git@customize\_commands does not work with the "new" way of activating conda -* [ARROW-2974](https://issues.apache.org/jira/browse/ARROW-2974) - [Python] Replace usages of "source activate" with "conda activate" in CI scripts -* [ARROW-2986](https://issues.apache.org/jira/browse/ARROW-2986) - [C++] /EHsc possibly needed for Visual Studio 2015 builds -* [ARROW-2992](https://issues.apache.org/jira/browse/ARROW-2992) - [Python] Parquet benchmark failure -* [ARROW-2992](https://issues.apache.org/jira/browse/ARROW-2992) - [Python] Parquet benchmark failure -* [ARROW-3006](https://issues.apache.org/jira/browse/ARROW-3006) - [GLib] .gir/.typelib for GPU aren't installed -* [ARROW-3007](https://issues.apache.org/jira/browse/ARROW-3007) - [Packaging] libarrow-gpu10 deb for Ubuntu 18.04 has broken dependencies -* [ARROW-3011](https://issues.apache.org/jira/browse/ARROW-3011) - [CI] Remove Slack notification -* [ARROW-3012](https://issues.apache.org/jira/browse/ARROW-3012) - [Python] Installation crashes with setuptools\_scm error -* [ARROW-3013](https://issues.apache.org/jira/browse/ARROW-3013) - [Website] Fix download links on website for tarballs, checksums -* [ARROW-3015](https://issues.apache.org/jira/browse/ARROW-3015) - [Python] Fix documentation typo for pa.uint8 -* [ARROW-3047](https://issues.apache.org/jira/browse/ARROW-3047) - [C++] cmake downloads and builds ORC even though it's installed -* [ARROW-3049](https://issues.apache.org/jira/browse/ARROW-3049) - [C++/Python] ORC reader fails on empty file -* [ARROW-3053](https://issues.apache.org/jira/browse/ARROW-3053) - [Python] Pandas decimal conversion segfault -* [ARROW-3056](https://issues.apache.org/jira/browse/ARROW-3056) - [Python] Indicate in NativeFile docstrings methods that are part of the RawIOBase API but not implemented -* [ARROW-3061](https://issues.apache.org/jira/browse/ARROW-3061) - [Java] headroom does not take into account reservation -* [ARROW-3065](https://issues.apache.org/jira/browse/ARROW-3065) - [Python] concat\_tables() failing from bad Pandas Metadata -* [ARROW-3083](https://issues.apache.org/jira/browse/ARROW-3083) - [Python] Version in manylinux1 wheel builds is wrong -* [ARROW-3093](https://issues.apache.org/jira/browse/ARROW-3093) - [C++] Linking errors with ORC enabled -* [ARROW-3095](https://issues.apache.org/jira/browse/ARROW-3095) - [Python] test\_plasma.py fails -* [ARROW-3098](https://issues.apache.org/jira/browse/ARROW-3098) - [Python] BufferReader doesn't adhere to the seek protocol -* [ARROW-3100](https://issues.apache.org/jira/browse/ARROW-3100) - [CI] C/glib build broken on OS X -* [ARROW-3125](https://issues.apache.org/jira/browse/ARROW-3125) - [Python] Update ASV instructions -* [ARROW-3125](https://issues.apache.org/jira/browse/ARROW-3125) - [Python] Update ASV instructions -* [ARROW-3132](https://issues.apache.org/jira/browse/ARROW-3132) - Regenerate 0.10.0 changelog -* [ARROW-3137](https://issues.apache.org/jira/browse/ARROW-3137) - [Python] pyarrow 0.10 requires newer version of numpy than specified in requirements -* [ARROW-3140](https://issues.apache.org/jira/browse/ARROW-3140) - [Plasma] Plasma fails building with GPU enabled -* 
[ARROW-3141](https://issues.apache.org/jira/browse/ARROW-3141) - [Python] Tensorflow support in pyarrow wheels pins numpy\>=1.14 -* [ARROW-3145](https://issues.apache.org/jira/browse/ARROW-3145) - [C++] Thrift compiler reruns in arrow/dbi/hiveserver2/thrift when using Ninja build -* [ARROW-3173](https://issues.apache.org/jira/browse/ARROW-3173) - [Rust] dynamic\_types example does not run -* [ARROW-3175](https://issues.apache.org/jira/browse/ARROW-3175) - [Java] Upgrade to official FlatBuffers release (Flatbuffers incompatibility) -* [ARROW-3183](https://issues.apache.org/jira/browse/ARROW-3183) - [Python] get\_library\_dirs on Windows can give the wrong directory -* [ARROW-3188](https://issues.apache.org/jira/browse/ARROW-3188) - [Python] Table.from\_arrays segfaults if lists and schema are passed -* [ARROW-3190](https://issues.apache.org/jira/browse/ARROW-3190) - [C++] "WriteableFile" is misspelled, should be renamed "WritableFile" with deprecation for old name -* [ARROW-3206](https://issues.apache.org/jira/browse/ARROW-3206) - [C++] Building with ARROW\_HIVESERVER2=ON with unit tests disabled causes error -* [ARROW-3227](https://issues.apache.org/jira/browse/ARROW-3227) - [Python] NativeFile.write shouldn't accept unicode strings -* [ARROW-3228](https://issues.apache.org/jira/browse/ARROW-3228) - [Python] Immutability of bytes is ignored -* [ARROW-3231](https://issues.apache.org/jira/browse/ARROW-3231) - [Python] Sphinx's autodoc\_default\_flags is now deprecated -* [ARROW-3237](https://issues.apache.org/jira/browse/ARROW-3237) - [CI] Update linux packaging filenames in rat exclusion list -* [ARROW-3241](https://issues.apache.org/jira/browse/ARROW-3241) - [Plasma] test\_plasma\_list test failure on Ubuntu 14.04 -* [ARROW-3251](https://issues.apache.org/jira/browse/ARROW-3251) - [C++] Conversion warnings in cast.cc -* [ARROW-3256](https://issues.apache.org/jira/browse/ARROW-3256) - [JS] File footer and message metadata is inconsistent -* [ARROW-3271](https://issues.apache.org/jira/browse/ARROW-3271) - [Python] Manylinux1 builds timing out in Travis CI -* [ARROW-3279](https://issues.apache.org/jira/browse/ARROW-3279) - [C++] Allow linking Arrow tests dynamically on Windows -* [ARROW-3299](https://issues.apache.org/jira/browse/ARROW-3299) - [C++] Appveyor builds failing -* [ARROW-3322](https://issues.apache.org/jira/browse/ARROW-3322) - [CI] Rust job always runs on AppVeyor -* [ARROW-3327](https://issues.apache.org/jira/browse/ARROW-3327) - [Python] manylinux container confusing -* [ARROW-3338](https://issues.apache.org/jira/browse/ARROW-3338) - [Python] Crash when schema and columns do not match -* [ARROW-3342](https://issues.apache.org/jira/browse/ARROW-3342) - Appveyor builds have stopped triggering on GitHub -* [ARROW-3348](https://issues.apache.org/jira/browse/ARROW-3348) - Plasma store dies when an object that a dead client is waiting for gets created. -* [ARROW-3354](https://issues.apache.org/jira/browse/ARROW-3354) - [Python] read\_record\_batch interfaces differ in pyarrow and pyarrow.cuda -* [ARROW-3369](https://issues.apache.org/jira/browse/ARROW-3369) - [Packaging] Wheel builds are failing due to wheel 0.32 release -* [ARROW-3370](https://issues.apache.org/jira/browse/ARROW-3370) - [Packaging] Centos 6 build is failing -* [ARROW-3373](https://issues.apache.org/jira/browse/ARROW-3373) - Fix bug in which plasma store can die when client gets multiple objects and object becomes available. 
-* [ARROW-3374](https://issues.apache.org/jira/browse/ARROW-3374) - [Python] Dictionary has out-of-bound index when creating DictionaryArray from Pandas with NaN -* [ARROW-3390](https://issues.apache.org/jira/browse/ARROW-3390) - [C++] cmake file under windows msys2 system doesn't work -* [ARROW-3393](https://issues.apache.org/jira/browse/ARROW-3393) - [C++] Fix compiler warning in util/task-group-cc on clang 6 -* [ARROW-3394](https://issues.apache.org/jira/browse/ARROW-3394) - [Java] Remove duplicate dependency entry in Flight -* [ARROW-3403](https://issues.apache.org/jira/browse/ARROW-3403) - [Website] Source tarball link missing from install page -* [ARROW-3420](https://issues.apache.org/jira/browse/ARROW-3420) - [C++] Fix outstanding include-what-you-use issues in src/arrow, src/parquet codebases -* [PARQUET-232](https://issues.apache.org/jira/browse/PARQUET-232) - minor compilation issue -* [PARQUET-446](https://issues.apache.org/jira/browse/PARQUET-446) - Hide thrift dependency in parquet-cpp -* [PARQUET-454](https://issues.apache.org/jira/browse/PARQUET-454) - Address inconsistencies in boolean decoding -* [PARQUET-455](https://issues.apache.org/jira/browse/PARQUET-455) - Fix compiler warnings on OS X / Clang -* [PARQUET-457](https://issues.apache.org/jira/browse/PARQUET-457) - Add compressed data page unit tests -* [PARQUET-469](https://issues.apache.org/jira/browse/PARQUET-469) - Roll back Thrift bindings to 0.9.0 -* [PARQUET-472](https://issues.apache.org/jira/browse/PARQUET-472) - Clean up InputStream ownership semantics in ColumnReader -* [PARQUET-505](https://issues.apache.org/jira/browse/PARQUET-505) - Column reader: automatically handle large data pages -* [PARQUET-507](https://issues.apache.org/jira/browse/PARQUET-507) - Improve runtime of rle-test.cc -* [PARQUET-513](https://issues.apache.org/jira/browse/PARQUET-513) - Valgrind errors are not failing the Travis CI build -* [PARQUET-525](https://issues.apache.org/jira/browse/PARQUET-525) - Test coverage for malformed file failure modes on the read path -* [PARQUET-537](https://issues.apache.org/jira/browse/PARQUET-537) - LocalFileSource leaks resources -* [PARQUET-549](https://issues.apache.org/jira/browse/PARQUET-549) - Add scanner and column reader tests for dictionary data pages -* [PARQUET-555](https://issues.apache.org/jira/browse/PARQUET-555) - Dictionary page metadata handling inconsistencies -* [PARQUET-561](https://issues.apache.org/jira/browse/PARQUET-561) - ParquetFileReader::Contents PIMPL missing a virtual destructor -* [PARQUET-599](https://issues.apache.org/jira/browse/PARQUET-599) - ColumnWriter::RleEncodeLevels' size estimation might be wrong -* [PARQUET-604](https://issues.apache.org/jira/browse/PARQUET-604) - Install writer.h headers -* [PARQUET-614](https://issues.apache.org/jira/browse/PARQUET-614) - C++: Remove unneeded LZ4-related code -* [PARQUET-620](https://issues.apache.org/jira/browse/PARQUET-620) - C++: Duplicate calls to ParquetFileWriter::Close cause duplicate metdata writes -* [PARQUET-621](https://issues.apache.org/jira/browse/PARQUET-621) - C++: Uninitialised DecimalMetadata is read -* [PARQUET-629](https://issues.apache.org/jira/browse/PARQUET-629) - RowGroupSerializer should only close itself once -* [PARQUET-639](https://issues.apache.org/jira/browse/PARQUET-639) - Do not export DCHECK in public headers -* [PARQUET-643](https://issues.apache.org/jira/browse/PARQUET-643) - Add const modifier to schema pointer reference in ParquetFileWriter -* 
[PARQUET-657](https://issues.apache.org/jira/browse/PARQUET-657) - [C++] Don't define DISALLOW\_COPY\_AND\_ASSIGN if already defined -* [PARQUET-658](https://issues.apache.org/jira/browse/PARQUET-658) - ColumnReader has no virtual destructor -* [PARQUET-659](https://issues.apache.org/jira/browse/PARQUET-659) - [C++] Instantiated template visibility is broken on clang / OS X -* [PARQUET-662](https://issues.apache.org/jira/browse/PARQUET-662) - [C++] ParquetException must be explicitly exported in dynamic libraries -* [PARQUET-676](https://issues.apache.org/jira/browse/PARQUET-676) - MAX\_VALUES\_PER\_LITERAL\_RUN causes RLE encoding failure -* [PARQUET-691](https://issues.apache.org/jira/browse/PARQUET-691) - [C++] Write ColumnChunk metadata after each column chunk in the file -* [PARQUET-694](https://issues.apache.org/jira/browse/PARQUET-694) - C++: Revert default data page size back to 1M -* [PARQUET-700](https://issues.apache.org/jira/browse/PARQUET-700) - C++: Disable dictionary encoding for boolean columns -* [PARQUET-701](https://issues.apache.org/jira/browse/PARQUET-701) - C++: Dictionary is written multiple times if close is called multiple times. -* [PARQUET-702](https://issues.apache.org/jira/browse/PARQUET-702) - Add a writer + reader example with detailed comments -* [PARQUET-702](https://issues.apache.org/jira/browse/PARQUET-702) - Add a writer + reader example with detailed comments -* [PARQUET-703](https://issues.apache.org/jira/browse/PARQUET-703) - [C++] Validate num\_values metadata for columns with nulls -* [PARQUET-704](https://issues.apache.org/jira/browse/PARQUET-704) - [C++] scan-all.h is not being installed -* [PARQUET-708](https://issues.apache.org/jira/browse/PARQUET-708) - [C++] RleEncoder does not account for "worst case scenario" in MaxBufferSize for bit\_width \> 1 -* [PARQUET-710](https://issues.apache.org/jira/browse/PARQUET-710) - Remove unneeded private member variables from RowGroupReader ABI -* [PARQUET-711](https://issues.apache.org/jira/browse/PARQUET-711) - Use metadata builders in parquet writer -* [PARQUET-711](https://issues.apache.org/jira/browse/PARQUET-711) - Use metadata builders in parquet writer -* [PARQUET-718](https://issues.apache.org/jira/browse/PARQUET-718) - Reading boolean pages written by parquet-cpp fails -* [PARQUET-719](https://issues.apache.org/jira/browse/PARQUET-719) - Fix WriterBatch API to handle NULL values -* [PARQUET-720](https://issues.apache.org/jira/browse/PARQUET-720) - Parquet-cpp fails to link when included in multiple TUs -* [PARQUET-739](https://issues.apache.org/jira/browse/PARQUET-739) - Rle-decoding uses static buffer that is shared accross threads -* [PARQUET-739](https://issues.apache.org/jira/browse/PARQUET-739) - Rle-decoding uses static buffer that is shared accross threads -* [PARQUET-741](https://issues.apache.org/jira/browse/PARQUET-741) - compression\_buffer\_ is reused although it shouldn't -* [PARQUET-742](https://issues.apache.org/jira/browse/PARQUET-742) - Add missing license headers -* [PARQUET-745](https://issues.apache.org/jira/browse/PARQUET-745) - TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType -* [PARQUET-747](https://issues.apache.org/jira/browse/PARQUET-747) - [C++] TypedRowGroupStatistics are not being exported in libparquet.so -* [PARQUET-759](https://issues.apache.org/jira/browse/PARQUET-759) - Cannot store columns consisting of empty strings -* [PARQUET-760](https://issues.apache.org/jira/browse/PARQUET-760) - On switching from dictionary to the fallback encoding, 
an incorrect encoding is set -* [PARQUET-764](https://issues.apache.org/jira/browse/PARQUET-764) - [CPP] Parquet Writer does not write Boolean values correctly -* [PARQUET-766](https://issues.apache.org/jira/browse/PARQUET-766) - C++: Expose ParquetFileReader through Arrow reader as const -* [PARQUET-775](https://issues.apache.org/jira/browse/PARQUET-775) - C++: TrackingAllocator is not thread-safe -* [PARQUET-779](https://issues.apache.org/jira/browse/PARQUET-779) - Export TypedRowGroupStatistics in libparquet -* [PARQUET-780](https://issues.apache.org/jira/browse/PARQUET-780) - WriterBatch API does not properly handle NULL values for byte array types -* [PARQUET-789](https://issues.apache.org/jira/browse/PARQUET-789) - [C++] Catch and translate ParquetException in parquet::arrow::FileReader::{ReadFlatColumn, ReadFlatTable}} -* [PARQUET-793](https://issues.apache.org/jira/browse/PARQUET-793) - [CPP] Do not return incorrect statistics -* [PARQUET-797](https://issues.apache.org/jira/browse/PARQUET-797) - [C++] Update for API changes in ARROW-418 -* [PARQUET-799](https://issues.apache.org/jira/browse/PARQUET-799) - concurrent usage of the file reader API -* [PARQUET-812](https://issues.apache.org/jira/browse/PARQUET-812) - [C++] Failure reading BYTE\_ARRAY data from file in parquet-compatibility project -* [PARQUET-816](https://issues.apache.org/jira/browse/PARQUET-816) - [C++] Failure decoding sample dict-encoded file from parquet-compatibility project -* [PARQUET-818](https://issues.apache.org/jira/browse/PARQUET-818) - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow -* [PARQUET-818](https://issues.apache.org/jira/browse/PARQUET-818) - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow -* [PARQUET-819](https://issues.apache.org/jira/browse/PARQUET-819) - C++: Trying to install non-existing parquet/arrow/utils.h -* [PARQUET-827](https://issues.apache.org/jira/browse/PARQUET-827) - [C++] Incorporate addition of arrow::MemoryPool::Reallocate -* [PARQUET-828](https://issues.apache.org/jira/browse/PARQUET-828) - [C++] "version" field set improperly in file metadata -* [PARQUET-837](https://issues.apache.org/jira/browse/PARQUET-837) - [C++] SerializedFile::ParseMetaData uses Seek, followed by Read, and could have race conditions -* [PARQUET-841](https://issues.apache.org/jira/browse/PARQUET-841) - [C++] Writing wrong format version when using ParquetVersion::PARQUET\_1\_0 -* [PARQUET-842](https://issues.apache.org/jira/browse/PARQUET-842) - [C++] Impala rejects DOUBLE columns if decimal metadata is set -* [PARQUET-843](https://issues.apache.org/jira/browse/PARQUET-843) - [C++] Impala unable to read files created by parquet-cpp -* [PARQUET-846](https://issues.apache.org/jira/browse/PARQUET-846) - [CPP] CpuInfo::Init() is not thread safe -* [PARQUET-880](https://issues.apache.org/jira/browse/PARQUET-880) - [CPP] Prevent destructors from throwing -* [PARQUET-888](https://issues.apache.org/jira/browse/PARQUET-888) - C++ Memory leak in RowGroupSerializer -* [PARQUET-889](https://issues.apache.org/jira/browse/PARQUET-889) - Fix compilation when PARQUET\_USE\_SSE is on -* [PARQUET-892](https://issues.apache.org/jira/browse/PARQUET-892) - [C++] Clean up link library targets in CMake files -* [PARQUET-895](https://issues.apache.org/jira/browse/PARQUET-895) - Reading of nested columns is broken -* [PARQUET-898](https://issues.apache.org/jira/browse/PARQUET-898) - [C++] Change Travis CI OS X image to Xcode 6.4 
and fix our thirdparty build -* [PARQUET-908](https://issues.apache.org/jira/browse/PARQUET-908) - Fix for PARQUET-890 introduces undefined symbol in libparquet\_arrow.so -* [PARQUET-914](https://issues.apache.org/jira/browse/PARQUET-914) - [C++] Throw more informative exception when user writes too many values to a column in a row group -* [PARQUET-915](https://issues.apache.org/jira/browse/PARQUET-915) - Support Arrow Time Types in Schema -* [PARQUET-918](https://issues.apache.org/jira/browse/PARQUET-918) - FromParquetSchema API crashes on nested schemas -* [PARQUET-918](https://issues.apache.org/jira/browse/PARQUET-918) - FromParquetSchema API crashes on nested schemas -* [PARQUET-919](https://issues.apache.org/jira/browse/PARQUET-919) - [C++] Account for API changes in ARROW-683 -* [PARQUET-923](https://issues.apache.org/jira/browse/PARQUET-923) - [C++] Account for Time metadata changes in ARROW-686 -* [PARQUET-933](https://issues.apache.org/jira/browse/PARQUET-933) - [C++] Account for Arrow Table API changes coming in ARROW-728 -* [PARQUET-936](https://issues.apache.org/jira/browse/PARQUET-936) - [C++] parquet::arrow::WriteTable can enter infinite loop if chunk\_size is 0 -* [PARQUET-943](https://issues.apache.org/jira/browse/PARQUET-943) - [C++] Overflow build error on x86 -* [PARQUET-947](https://issues.apache.org/jira/browse/PARQUET-947) - [C++] Refactor to account for ARROW-795 Arrow core library consolidation -* [PARQUET-958](https://issues.apache.org/jira/browse/PARQUET-958) - [C++] Print Parquet metadata in JSON format -* [PARQUET-958](https://issues.apache.org/jira/browse/PARQUET-958) - [C++] Print Parquet metadata in JSON format -* [PARQUET-963](https://issues.apache.org/jira/browse/PARQUET-963) - [C++] Disallow reading struct types in Arrow reader for now -* [PARQUET-965](https://issues.apache.org/jira/browse/PARQUET-965) - [C++] FIXED\_LEN\_BYTE\_ARRAY types are unhandled in the Arrow reader -* [PARQUET-979](https://issues.apache.org/jira/browse/PARQUET-979) - [C++] Limit size of min, max or disable stats for long binary types -* [PARQUET-992](https://issues.apache.org/jira/browse/PARQUET-992) - [C++] parquet/compression.h leaks zlib.h -* [PARQUET-995](https://issues.apache.org/jira/browse/PARQUET-995) - [C++] Int96 reader in parquet\_arrow uses size of Int96Type instead of Int96 -* [PARQUET-997](https://issues.apache.org/jira/browse/PARQUET-997) - Fix override compiler warnings -* [PARQUET-1002](https://issues.apache.org/jira/browse/PARQUET-1002) - [C++] Compute statistics based on Logical Types -* [PARQUET-1003](https://issues.apache.org/jira/browse/PARQUET-1003) - [C++] Modify DEFAULT\_CREATED\_BY value for every new release version -* [PARQUET-1007](https://issues.apache.org/jira/browse/PARQUET-1007) - [C++ ] Update parquet.thrift from https://github.com/apache/parquet-format -* [PARQUET-1029](https://issues.apache.org/jira/browse/PARQUET-1029) - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported -* [PARQUET-1029](https://issues.apache.org/jira/browse/PARQUET-1029) - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported -* [PARQUET-1033](https://issues.apache.org/jira/browse/PARQUET-1033) - Mismatched Read and Write -* [PARQUET-1038](https://issues.apache.org/jira/browse/PARQUET-1038) - Key value metadata should be nullptr if not set -* [PARQUET-1040](https://issues.apache.org/jira/browse/PARQUET-1040) - Missing writer method implementations -* [PARQUET-1042](https://issues.apache.org/jira/browse/PARQUET-1042) - C++: 
Compilation breaks on GCC 4.8 -* [PARQUET-1048](https://issues.apache.org/jira/browse/PARQUET-1048) - [C++] Static linking of libarrow is no longer supported -* [PARQUET-1048](https://issues.apache.org/jira/browse/PARQUET-1048) - [C++] Static linking of libarrow is no longer supported -* [PARQUET-1054](https://issues.apache.org/jira/browse/PARQUET-1054) - [C++] Account for Arrow API changes in ARROW-1199 -* [PARQUET-1071](https://issues.apache.org/jira/browse/PARQUET-1071) - [C++] parquet::arrow::FileWriter::Close is not idempotent -* [PARQUET-1085](https://issues.apache.org/jira/browse/PARQUET-1085) - [C++] Backwards compatibility from macro cleanup in transitive dependencies in ARROW-1452 -* [PARQUET-1088](https://issues.apache.org/jira/browse/PARQUET-1088) - [CPP] remove parquet\_version.h from version control since it gets auto generated -* [PARQUET-1090](https://issues.apache.org/jira/browse/PARQUET-1090) - [C++] Fix int32 overflow in Arrow table writer, add max row group size property -* [PARQUET-1098](https://issues.apache.org/jira/browse/PARQUET-1098) - [C++] Install new header in parquet/util -* [PARQUET-1100](https://issues.apache.org/jira/browse/PARQUET-1100) - [C++] Reading repeated types should decode number of records rather than number of values -* [PARQUET-1108](https://issues.apache.org/jira/browse/PARQUET-1108) - [C++] Fix Int96 comparators -* [PARQUET-1114](https://issues.apache.org/jira/browse/PARQUET-1114) - Apply fix for ARROW-1601 and ARROW-1611 to parquet-cpp -* [PARQUET-1121](https://issues.apache.org/jira/browse/PARQUET-1121) - C++: DictionaryArrays of NullType cannot be written -* [PARQUET-1123](https://issues.apache.org/jira/browse/PARQUET-1123) - [C++] Update parquet-cpp to use Arrow's AssertArraysEqual -* [PARQUET-1138](https://issues.apache.org/jira/browse/PARQUET-1138) - [C++] Fix compilation with Arrow 0.7.1 -* [PARQUET-1167](https://issues.apache.org/jira/browse/PARQUET-1167) - [C++] FieldToNode function should return a status when throwing an exception -* [PARQUET-1175](https://issues.apache.org/jira/browse/PARQUET-1175) - [C++] Fix usage of deprecated Arrow API -* [PARQUET-1179](https://issues.apache.org/jira/browse/PARQUET-1179) - [C++] Support Apache Thrift 0.11 -* [PARQUET-1180](https://issues.apache.org/jira/browse/PARQUET-1180) - C++: Fix behaviour of num\_children element of primitive nodes -* [PARQUET-1193](https://issues.apache.org/jira/browse/PARQUET-1193) - [CPP] Implement ColumnOrder to support min\_value and max\_value -* [PARQUET-1226](https://issues.apache.org/jira/browse/PARQUET-1226) - [C++] Fix new build warnings with clang 5.0 -* [PARQUET-1233](https://issues.apache.org/jira/browse/PARQUET-1233) - [CPP ]Enable option to switch between stl classes and boost classes for thrift header -* [PARQUET-1245](https://issues.apache.org/jira/browse/PARQUET-1245) - [C++] Segfault when writing Arrow table with duplicate columns -* [PARQUET-1255](https://issues.apache.org/jira/browse/PARQUET-1255) - [C++] Exceptions thrown in some tests -* [PARQUET-1265](https://issues.apache.org/jira/browse/PARQUET-1265) - Segfault on static ApplicationVersion initialization -* [PARQUET-1268](https://issues.apache.org/jira/browse/PARQUET-1268) - [C++] Conversion of Arrow null list columns fails -* [PARQUET-1270](https://issues.apache.org/jira/browse/PARQUET-1270) - [C++] Executable tools do not get installed -* [PARQUET-1272](https://issues.apache.org/jira/browse/PARQUET-1272) - [C++] ScanFileContents reports wrong row count for nested columns -* 
[PARQUET-1273](https://issues.apache.org/jira/browse/PARQUET-1273) - [Python] Error writing to partitioned Parquet dataset -* [PARQUET-1274](https://issues.apache.org/jira/browse/PARQUET-1274) - [Python] SegFault in pyarrow.parquet.write\_table with specific options -* [PARQUET-1283](https://issues.apache.org/jira/browse/PARQUET-1283) - [C++] FormatStatValue appends trailing space to string and int96 -* [PARQUET-1307](https://issues.apache.org/jira/browse/PARQUET-1307) - [C++] memory-test fails with latest Arrow -* [PARQUET-1315](https://issues.apache.org/jira/browse/PARQUET-1315) - [C++] ColumnChunkMetaData.has\_dictionary\_page() should return bool, not int64\_t -* [PARQUET-1333](https://issues.apache.org/jira/browse/PARQUET-1333) - [C++] Reading of files with dictionary size 0 fails on Windows with bad\_alloc -* [PARQUET-1334](https://issues.apache.org/jira/browse/PARQUET-1334) - [C++] memory\_map parameter seems missleading in parquet file opener -* [PARQUET-1357](https://issues.apache.org/jira/browse/PARQUET-1357) - [C++] FormatStatValue truncates binary statistics on zero character -* [PARQUET-1358](https://issues.apache.org/jira/browse/PARQUET-1358) - [C++] index\_page\_offset should be unset as it is not supported. -* [PARQUET-1369](https://issues.apache.org/jira/browse/PARQUET-1369) - [Python] Unavailable Parquet column statistics from Spark-generated file -* [PARQUET-1384](https://issues.apache.org/jira/browse/PARQUET-1384) - [C++] Clang compiler warnings in bloom\_filter-test.cc - - - -# Apache Arrow 0.10.0 (2018-08-06) - -## Bug Fixes - -* [ARROW-198](https://issues.apache.org/jira/browse/ARROW-198) - [Java] OutOfMemoryError for vector test case -* [ARROW-640](https://issues.apache.org/jira/browse/ARROW-640) - [Python] Arrow scalar values should have a sensible \_\_hash\_\_ and comparison -* [ARROW-2020](https://issues.apache.org/jira/browse/ARROW-2020) - [Python] Parquet segfaults if coercing ns timestamps and writing 96-bit timestamps -* [ARROW-2059](https://issues.apache.org/jira/browse/ARROW-2059) - [Python] Possible performance regression in Feather read/write path -* [ARROW-2101](https://issues.apache.org/jira/browse/ARROW-2101) - [Python] from\_pandas reads 'str' type as binary Arrow data with Python 2 -* [ARROW-2122](https://issues.apache.org/jira/browse/ARROW-2122) - [Python] Pyarrow fails to serialize dataframe with timestamp. -* [ARROW-2182](https://issues.apache.org/jira/browse/ARROW-2182) - [Python] ASV benchmark setup does not account for C++ library changing -* [ARROW-2189](https://issues.apache.org/jira/browse/ARROW-2189) - [C++] Seg. 
fault on make\_shared -* [ARROW-2193](https://issues.apache.org/jira/browse/ARROW-2193) - [Plasma] plasma\_store has runtime dependency on Boost shared libraries when ARROW\_BOOST\_USE\_SHARED=on -* [ARROW-2195](https://issues.apache.org/jira/browse/ARROW-2195) - [Plasma] Segfault when retrieving RecordBatch from plasma store -* [ARROW-2247](https://issues.apache.org/jira/browse/ARROW-2247) - [Python] Statically-linking boost\_regex in both libarrow and libparquet results in segfault -* [ARROW-2273](https://issues.apache.org/jira/browse/ARROW-2273) - Cannot deserialize pandas SparseDataFrame -* [ARROW-2300](https://issues.apache.org/jira/browse/ARROW-2300) - [Python] python/testing/test\_hdfs.sh no longer works -* [ARROW-2305](https://issues.apache.org/jira/browse/ARROW-2305) - [Python] Cython 0.25.2 compilation failure -* [ARROW-2314](https://issues.apache.org/jira/browse/ARROW-2314) - [Python] Union array slicing is defective -* [ARROW-2326](https://issues.apache.org/jira/browse/ARROW-2326) - [Python] cannot import pip installed pyarrow on OS X (10.9) -* [ARROW-2328](https://issues.apache.org/jira/browse/ARROW-2328) - Writing a slice with feather ignores the offset -* [ARROW-2331](https://issues.apache.org/jira/browse/ARROW-2331) - [Python] Fix indexing implementations -* [ARROW-2333](https://issues.apache.org/jira/browse/ARROW-2333) - [Python] boost bundling fails in setup.py -* [ARROW-2342](https://issues.apache.org/jira/browse/ARROW-2342) - [Python] Aware timestamp type fails pickling -* [ARROW-2346](https://issues.apache.org/jira/browse/ARROW-2346) - [Python] PYARROW\_CXXFLAGS doesn't accept multiple options -* [ARROW-2349](https://issues.apache.org/jira/browse/ARROW-2349) - [Python] Boost shared library bundling is broken for MSVC -* [ARROW-2351](https://issues.apache.org/jira/browse/ARROW-2351) - [C++] StringBuilder::append(vector...) 
not implemented -* [ARROW-2354](https://issues.apache.org/jira/browse/ARROW-2354) - [C++] PyDecimal\_Check() is much too slow -* [ARROW-2355](https://issues.apache.org/jira/browse/ARROW-2355) - [Python] Unable to import pyarrow [0.9.0] OSX -* [ARROW-2357](https://issues.apache.org/jira/browse/ARROW-2357) - Benchmark PandasObjectIsNull -* [ARROW-2368](https://issues.apache.org/jira/browse/ARROW-2368) - DecimalVector\#setBigEndian is not padding correctly for negative values -* [ARROW-2369](https://issues.apache.org/jira/browse/ARROW-2369) - Large (\>\~20 GB) files written to Parquet via PyArrow are corrupted -* [ARROW-2370](https://issues.apache.org/jira/browse/ARROW-2370) - [GLib] include path is wrong on Meson build -* [ARROW-2371](https://issues.apache.org/jira/browse/ARROW-2371) - [GLib] gio-2.0 isn't required on GNU Autotools build -* [ARROW-2372](https://issues.apache.org/jira/browse/ARROW-2372) - [Python] ArrowIOError: Invalid argument when reading Parquet file -* [ARROW-2375](https://issues.apache.org/jira/browse/ARROW-2375) - [Rust] Buffer should release memory when dropped -* [ARROW-2377](https://issues.apache.org/jira/browse/ARROW-2377) - [GLib] Travis-CI failures -* [ARROW-2380](https://issues.apache.org/jira/browse/ARROW-2380) - [Python] Correct issues in numpy\_to\_arrow conversion routines -* [ARROW-2382](https://issues.apache.org/jira/browse/ARROW-2382) - [Rust] List was not using memory safely -* [ARROW-2383](https://issues.apache.org/jira/browse/ARROW-2383) - [C++] Debian packages need to depend on libprotobuf -* [ARROW-2387](https://issues.apache.org/jira/browse/ARROW-2387) - [Python] negative decimal values get spurious rescaling error -* [ARROW-2391](https://issues.apache.org/jira/browse/ARROW-2391) - [Python] Segmentation fault from PyArrow when mapping Pandas datetime column to pyarrow.date64 -* [ARROW-2393](https://issues.apache.org/jira/browse/ARROW-2393) - [C++] arrow/status.h does not define ARROW\_CHECK needed for ARROW\_CHECK\_OK -* [ARROW-2403](https://issues.apache.org/jira/browse/ARROW-2403) - [C++] arrow::CpuInfo::model\_name\_ destructed twice on exit -* [ARROW-2405](https://issues.apache.org/jira/browse/ARROW-2405) - [C++] is missing in plasma/client.h -* [ARROW-2418](https://issues.apache.org/jira/browse/ARROW-2418) - [Rust] List builder fails due to memory not being reserved correctly -* [ARROW-2419](https://issues.apache.org/jira/browse/ARROW-2419) - [Site] Website generation depends on local timezone -* [ARROW-2420](https://issues.apache.org/jira/browse/ARROW-2420) - [Rust] Memory is never released -* [ARROW-2421](https://issues.apache.org/jira/browse/ARROW-2421) - [C++] Update LLVM version in cpp README -* [ARROW-2423](https://issues.apache.org/jira/browse/ARROW-2423) - [Python] PyArrow datatypes raise ValueError on equality checks against non-PyArrow objects -* [ARROW-2424](https://issues.apache.org/jira/browse/ARROW-2424) - [Rust] Missing import causing broken build -* [ARROW-2425](https://issues.apache.org/jira/browse/ARROW-2425) - [Rust] Array::from missing mapping for u8 type -* [ARROW-2426](https://issues.apache.org/jira/browse/ARROW-2426) - [CI] glib build failure -* [ARROW-2432](https://issues.apache.org/jira/browse/ARROW-2432) - [Python] from\_pandas fails when converting decimals if have None values -* [ARROW-2437](https://issues.apache.org/jira/browse/ARROW-2437) - [C++] Change of arrow::ipc::ReadMessage signature breaks ABI compability -* [ARROW-2438](https://issues.apache.org/jira/browse/ARROW-2438) - [Rust] memory\_pool.rs misses license 
header -* [ARROW-2441](https://issues.apache.org/jira/browse/ARROW-2441) - [Rust] Builder::slice\_mut assertions are too strict -* [ARROW-2443](https://issues.apache.org/jira/browse/ARROW-2443) - [Python] Conversion from pandas of empty categorical fails with ArrowInvalid -* [ARROW-2450](https://issues.apache.org/jira/browse/ARROW-2450) - [Python] Saving to parquet fails for empty lists -* [ARROW-2452](https://issues.apache.org/jira/browse/ARROW-2452) - [TEST] Spark integration test fails with permission error -* [ARROW-2454](https://issues.apache.org/jira/browse/ARROW-2454) - [Python] Empty chunked array slice crashes -* [ARROW-2455](https://issues.apache.org/jira/browse/ARROW-2455) - [C++] The bytes\_allocated\_ in CudaContextImpl isn't initialized -* [ARROW-2457](https://issues.apache.org/jira/browse/ARROW-2457) - garrow\_array\_builder\_append\_values() won't work for large arrays -* [ARROW-2459](https://issues.apache.org/jira/browse/ARROW-2459) - pyarrow: Segfault with pyarrow.deserialize\_pandas -* [ARROW-2462](https://issues.apache.org/jira/browse/ARROW-2462) - [C++] Segfault when writing a parquet table containing a dictionary column from Record Batch Stream -* [ARROW-2465](https://issues.apache.org/jira/browse/ARROW-2465) - [Plasma] plasma\_store fails to find libarrow\_gpu.so -* [ARROW-2466](https://issues.apache.org/jira/browse/ARROW-2466) - [C++] misleading "append" flag to FileOutputStream -* [ARROW-2468](https://issues.apache.org/jira/browse/ARROW-2468) - [Rust] Builder::slice\_mut should take mut self -* [ARROW-2471](https://issues.apache.org/jira/browse/ARROW-2471) - [Rust] Assertion when pushing value to Builder/ListBuilder with zero capacity -* [ARROW-2473](https://issues.apache.org/jira/browse/ARROW-2473) - [Rust] List assertion error with list of zero length -* [ARROW-2474](https://issues.apache.org/jira/browse/ARROW-2474) - [Rust] Add windows support for memory pool abstraction -* [ARROW-2489](https://issues.apache.org/jira/browse/ARROW-2489) - [Plasma] test\_plasma.py crashes -* [ARROW-2491](https://issues.apache.org/jira/browse/ARROW-2491) - [Python] Array.from\_buffers does not work for ListArray -* [ARROW-2492](https://issues.apache.org/jira/browse/ARROW-2492) - [Python] Prevent segfault on accidental call of pyarrow.Array -* [ARROW-2500](https://issues.apache.org/jira/browse/ARROW-2500) - [Java] IPC Writers/readers are not always setting validity bits correctly -* [ARROW-2502](https://issues.apache.org/jira/browse/ARROW-2502) - [Rust] Restore Windows Compatibility -* [ARROW-2503](https://issues.apache.org/jira/browse/ARROW-2503) - [Python] Trailing space character in RowGroup statistics of pyarrow.parquet.ParquetFile -* [ARROW-2509](https://issues.apache.org/jira/browse/ARROW-2509) - [CI] Intermittent npm failures -* [ARROW-2510](https://issues.apache.org/jira/browse/ARROW-2510) - [Python] Segmentation fault when converting empty column as categorical -* [ARROW-2511](https://issues.apache.org/jira/browse/ARROW-2511) - BaseVariableWidthVector.allocateNew is not throwing OOM when it can't allocate memory -* [ARROW-2514](https://issues.apache.org/jira/browse/ARROW-2514) - [Python] Inferring / converting nested Numpy array is very slow -* [ARROW-2515](https://issues.apache.org/jira/browse/ARROW-2515) - Errors with DictionaryArray inside of ListArray or other DictionaryArray -* [ARROW-2518](https://issues.apache.org/jira/browse/ARROW-2518) - [Java] Restore Java unit tests and javadoc test to CI matrix -* [ARROW-2530](https://issues.apache.org/jira/browse/ARROW-2530) - 
[GLib] Out-of-source build is failed -* [ARROW-2534](https://issues.apache.org/jira/browse/ARROW-2534) - [C++] libarrow.so leaks zlib symbols -* [ARROW-2545](https://issues.apache.org/jira/browse/ARROW-2545) - [Python] Arrow fails linking against statically-compiled Python -* [ARROW-2554](https://issues.apache.org/jira/browse/ARROW-2554) - pa.array type inference bug when using NS-timestamp -* [ARROW-2557](https://issues.apache.org/jira/browse/ARROW-2557) - [Rust] Add badge for code coverage in README -* [ARROW-2561](https://issues.apache.org/jira/browse/ARROW-2561) - [C++] Crash in cuda-test shutdown with coverage enabled -* [ARROW-2564](https://issues.apache.org/jira/browse/ARROW-2564) - [C++] Rowwise Tutorial is out of date -* [ARROW-2565](https://issues.apache.org/jira/browse/ARROW-2565) - [Plasma] new subscriber cannot receive notifications about existing objects -* [ARROW-2570](https://issues.apache.org/jira/browse/ARROW-2570) - [Python] Add support for writing parquet files with LZ4 compression -* [ARROW-2571](https://issues.apache.org/jira/browse/ARROW-2571) - [C++] Lz4Codec doesn't properly handle empty data -* [ARROW-2575](https://issues.apache.org/jira/browse/ARROW-2575) - [Python] Exclude hidden files when reading Parquet dataset -* [ARROW-2578](https://issues.apache.org/jira/browse/ARROW-2578) - [Plasma] Valgrind errors related to std::random\_device -* [ARROW-2589](https://issues.apache.org/jira/browse/ARROW-2589) - [Python] test\_parquet.py regression with Pandas 0.23.0 -* [ARROW-2593](https://issues.apache.org/jira/browse/ARROW-2593) - [Python] TypeError: data type "mixed-integer" not understood -* [ARROW-2594](https://issues.apache.org/jira/browse/ARROW-2594) - [Java] Vector reallocation does not properly clear reused buffers -* [ARROW-2599](https://issues.apache.org/jira/browse/ARROW-2599) - [Python] pip install is not working without Arrow C++ being installed -* [ARROW-2601](https://issues.apache.org/jira/browse/ARROW-2601) - [Python] MemoryPool bytes\_allocated causes seg -* [ARROW-2603](https://issues.apache.org/jira/browse/ARROW-2603) - [Python] from pandas raises ArrowInvalid for date(time) subclasses -* [ARROW-2615](https://issues.apache.org/jira/browse/ARROW-2615) - [Rust] Refactor introduced a bug around Arrays of String -* [ARROW-2622](https://issues.apache.org/jira/browse/ARROW-2622) - [C++] Array methods IsNull and IsValid are not complementary -* [ARROW-2629](https://issues.apache.org/jira/browse/ARROW-2629) - [Plasma] Iterator invalidation for pending\_notifications\_ -* [ARROW-2630](https://issues.apache.org/jira/browse/ARROW-2630) - [Java] Typo in the document -* [ARROW-2632](https://issues.apache.org/jira/browse/ARROW-2632) - [Java] ArrowStreamWriter accumulates ArrowBlock but does not use them -* [ARROW-2640](https://issues.apache.org/jira/browse/ARROW-2640) - JS Writer should serialize schema metadata -* [ARROW-2642](https://issues.apache.org/jira/browse/ARROW-2642) - [Python] Fail building parquet binding on Windows -* [ARROW-2643](https://issues.apache.org/jira/browse/ARROW-2643) - [C++] Travis-CI build failure with cpp toolchain enabled -* [ARROW-2644](https://issues.apache.org/jira/browse/ARROW-2644) - [Python] parquet binding fails building on AppVeyor -* [ARROW-2655](https://issues.apache.org/jira/browse/ARROW-2655) - [C++] Failure with -Werror=conversion on gcc 7.3.0 -* [ARROW-2657](https://issues.apache.org/jira/browse/ARROW-2657) - Segfault when importing TensorFlow after Pyarrow -* [ARROW-2668](https://issues.apache.org/jira/browse/ARROW-2668) 
- [C++] -Wnull-pointer-arithmetic warning with dlmalloc.c on clang 6.0, Ubuntu 14.04 -* [ARROW-2669](https://issues.apache.org/jira/browse/ARROW-2669) - [C++] EP\_CXX\_FLAGS not passed on when building gbenchmark -* [ARROW-2675](https://issues.apache.org/jira/browse/ARROW-2675) - Arrow build error with clang-10 (Apple Clang / LLVM) -* [ARROW-2683](https://issues.apache.org/jira/browse/ARROW-2683) - [Python] Resource Warning (Unclosed File) when using pyarrow.parquet.read\_table() -* [ARROW-2690](https://issues.apache.org/jira/browse/ARROW-2690) - [C++] Plasma does not follow style conventions for variable and function names -* [ARROW-2691](https://issues.apache.org/jira/browse/ARROW-2691) - [Rust] Travis fails due to formatting diff -* [ARROW-2693](https://issues.apache.org/jira/browse/ARROW-2693) - [Python] pa.chunked\_array causes a segmentation fault on empty input -* [ARROW-2694](https://issues.apache.org/jira/browse/ARROW-2694) - [Python] ArrayValue string conversion returns the representation instead of the converted python object string -* [ARROW-2698](https://issues.apache.org/jira/browse/ARROW-2698) - [Python] Exception when passing a string to Table.column -* [ARROW-2711](https://issues.apache.org/jira/browse/ARROW-2711) - [Python/C++] Pandas-Arrow doesn't roundtrip when column of lists has empty first element -* [ARROW-2715](https://issues.apache.org/jira/browse/ARROW-2715) - Address apt flakiness with launchpad.net -* [ARROW-2716](https://issues.apache.org/jira/browse/ARROW-2716) - [Python] Make manylinux1 base image independent of Python patch releases -* [ARROW-2721](https://issues.apache.org/jira/browse/ARROW-2721) - [C++] Link error with Arrow C++ build with -DARROW\_ORC=ON on CentOS 7 -* [ARROW-2722](https://issues.apache.org/jira/browse/ARROW-2722) - [Python] ndarray to arrow conversion fails when downcasted from pandas to\_numeric -* [ARROW-2723](https://issues.apache.org/jira/browse/ARROW-2723) - [C++] arrow-orc.pc is missing -* [ARROW-2726](https://issues.apache.org/jira/browse/ARROW-2726) - [C++] The latest Boost version is wrong -* [ARROW-2727](https://issues.apache.org/jira/browse/ARROW-2727) - [Java] Unable to build java/adapters module -* [ARROW-2741](https://issues.apache.org/jira/browse/ARROW-2741) - [Python] pa.array from np.datetime[D] and type=pa.date64 produces invalid results -* [ARROW-2744](https://issues.apache.org/jira/browse/ARROW-2744) - [Python] Writing to parquet crashes when writing a ListArray of empty lists -* [ARROW-2745](https://issues.apache.org/jira/browse/ARROW-2745) - [C++] ORC ExternalProject needs to declare dependency on vendored protobuf -* [ARROW-2747](https://issues.apache.org/jira/browse/ARROW-2747) - [CI] [Plasma] huge tables test failure on Travis -* [ARROW-2754](https://issues.apache.org/jira/browse/ARROW-2754) - [Python] When installing pyarrow via pip, a debug build is created -* [ARROW-2770](https://issues.apache.org/jira/browse/ARROW-2770) - [Packaging] Account for conda-forge compiler migration in conda recipes -* [ARROW-2773](https://issues.apache.org/jira/browse/ARROW-2773) - [Python] Corrected parquet docs partition\_cols parameter name -* [ARROW-2781](https://issues.apache.org/jira/browse/ARROW-2781) - [Python] Download boost using curl in manylinux1 image -* [ARROW-2787](https://issues.apache.org/jira/browse/ARROW-2787) - [Python] Memory Issue passing table from python to c++ via cython -* [ARROW-2795](https://issues.apache.org/jira/browse/ARROW-2795) - [Python] Run TensorFlow import workaround only on Linux -* 
[ARROW-2806](https://issues.apache.org/jira/browse/ARROW-2806) - [Python] Inconsistent handling of np.nan -* [ARROW-2810](https://issues.apache.org/jira/browse/ARROW-2810) - [Plasma] Plasma public headers leak flatbuffers.h -* [ARROW-2812](https://issues.apache.org/jira/browse/ARROW-2812) - [Ruby] StructArray\#[] raises NoMethodError -* [ARROW-2820](https://issues.apache.org/jira/browse/ARROW-2820) - [Python] RecordBatch.from\_arrays does not validate array lengths are all equal -* [ARROW-2823](https://issues.apache.org/jira/browse/ARROW-2823) - [C++] Search for flatbuffers in /lib64 -* [ARROW-2841](https://issues.apache.org/jira/browse/ARROW-2841) - [Go] Fix recent Go build failures in Travis CI -* [ARROW-2850](https://issues.apache.org/jira/browse/ARROW-2850) - [C++/Python] PARQUET\_RPATH\_ORIGIN=ON missing in manylinux1 build -* [ARROW-2851](https://issues.apache.org/jira/browse/ARROW-2851) - [C++] Update RAT excludes for new install file names -* [ARROW-2852](https://issues.apache.org/jira/browse/ARROW-2852) - [Rust] Mark Array as Sync and Send -* [ARROW-2856](https://issues.apache.org/jira/browse/ARROW-2856) - [Python/C++] Array constructor should not truncate floats when casting to int -* [ARROW-2862](https://issues.apache.org/jira/browse/ARROW-2862) - [C++] Ensure thirdparty download directory has been created in thirdparty/download\_thirdparty.sh -* [ARROW-2867](https://issues.apache.org/jira/browse/ARROW-2867) - [Python] Incorrect example for Cython usage -* [ARROW-2871](https://issues.apache.org/jira/browse/ARROW-2871) - [Python] Array.to\_numpy is invalid for boolean arrays -* [ARROW-2872](https://issues.apache.org/jira/browse/ARROW-2872) - [Python] Add pytest mark to opt into TensorFlow-related unit tests -* [ARROW-2876](https://issues.apache.org/jira/browse/ARROW-2876) - [Packaging] Crossbow builds can hang if you cloned using SSH -* [ARROW-2877](https://issues.apache.org/jira/browse/ARROW-2877) - [Packaging] crossbow submit results in duplicate Travis CI build -* [ARROW-2878](https://issues.apache.org/jira/browse/ARROW-2878) - [Packaging] README.md does not mention setting GitHub API token in user's crossbow repo settings -* [ARROW-2883](https://issues.apache.org/jira/browse/ARROW-2883) - [Plasma] Compilation warnings -* [ARROW-2891](https://issues.apache.org/jira/browse/ARROW-2891) - [Python] Preserve schema in write\_to\_dataset -* [ARROW-2894](https://issues.apache.org/jira/browse/ARROW-2894) - [Glib] Format tests broken due to recent refactor -* [ARROW-2895](https://issues.apache.org/jira/browse/ARROW-2895) - [Ruby] CI isn't ran when C++ is changed -* [ARROW-2896](https://issues.apache.org/jira/browse/ARROW-2896) - [GLib] export are missing -* [ARROW-2901](https://issues.apache.org/jira/browse/ARROW-2901) - [Java] Build is failing on Java9 -* [ARROW-2902](https://issues.apache.org/jira/browse/ARROW-2902) - [Python] HDFS Docker integration tests leave around files created by root -* [ARROW-2903](https://issues.apache.org/jira/browse/ARROW-2903) - [C++] Setting -DARROW\_HDFS=OFF breaks arrow build when linking against boost libraries -* [ARROW-2911](https://issues.apache.org/jira/browse/ARROW-2911) - [Python] Parquet binary statistics that end in '\0' truncate last byte -* [ARROW-2917](https://issues.apache.org/jira/browse/ARROW-2917) - [Python] Tensor requiring gradiant cannot be serialized with pyarrow.serialize -* [ARROW-2920](https://issues.apache.org/jira/browse/ARROW-2920) - [Python] Segfault with pytorch 0.4 -* 
[ARROW-2926](https://issues.apache.org/jira/browse/ARROW-2926) - [Python] ParquetWriter segfaults in example where passed schema and table schema do not match -* [ARROW-2930](https://issues.apache.org/jira/browse/ARROW-2930) - [C++] Trying to set target properties on not existing CMake target -* [ARROW-2940](https://issues.apache.org/jira/browse/ARROW-2940) - [Python] Import error with pytorch 0.3 -* [ARROW-2945](https://issues.apache.org/jira/browse/ARROW-2945) - [Packaging] Update argument check for 02-source.sh -* [ARROW-2955](https://issues.apache.org/jira/browse/ARROW-2955) - [Python] Typo in pyarrow's HDFS API result -* [ARROW-2963](https://issues.apache.org/jira/browse/ARROW-2963) - [Python] Deadlock during fork-join and use\_threads=True -* [ARROW-2978](https://issues.apache.org/jira/browse/ARROW-2978) - [Rust] Travis CI build is failing -* [ARROW-2982](https://issues.apache.org/jira/browse/ARROW-2982) - The "--show-progress" option is only supported in wget 1.16 and higher -* [ARROW-3210](https://issues.apache.org/jira/browse/ARROW-3210) - [Python] Creating ParquetDataset creates partitioned ParquetFiles with mismatched Parquet schemas - - -## New Features and Improvements - -* [ARROW-530](https://issues.apache.org/jira/browse/ARROW-530) - C++/Python: Provide subpools for better memory allocation tracking -* [ARROW-564](https://issues.apache.org/jira/browse/ARROW-564) - [Python] Add methods to return vanilla NumPy arrays (plus boolean mask array if there are nulls) -* [ARROW-665](https://issues.apache.org/jira/browse/ARROW-665) - C++: Move zeroing logic for (re)allocations to the Allocator -* [ARROW-889](https://issues.apache.org/jira/browse/ARROW-889) - [C++] Implement arrow::PrettyPrint for ChunkedArray -* [ARROW-902](https://issues.apache.org/jira/browse/ARROW-902) - [C++] Build C++ project including thirdparty dependencies from local tarballs -* [ARROW-906](https://issues.apache.org/jira/browse/ARROW-906) - [C++] Serialize Field metadata to IPC metadata -* [ARROW-1018](https://issues.apache.org/jira/browse/ARROW-1018) - [C++] Add option to create FileOutputStream, ReadableFile from OS file descriptor -* [ARROW-1163](https://issues.apache.org/jira/browse/ARROW-1163) - [Plasma][Java] Java client for Plasma -* [ARROW-1388](https://issues.apache.org/jira/browse/ARROW-1388) - [Python] Add Table.drop method for removing columns -* [ARROW-1454](https://issues.apache.org/jira/browse/ARROW-1454) - [Python] More informative error message when attempting to write an unsupported Arrow type to Parquet format -* [ARROW-1715](https://issues.apache.org/jira/browse/ARROW-1715) - [Python] Implement pickling for Column, ChunkedArray, RecordBatch, Table -* [ARROW-1722](https://issues.apache.org/jira/browse/ARROW-1722) - [C++] Add linting script to look for C++/CLI issues -* [ARROW-1731](https://issues.apache.org/jira/browse/ARROW-1731) - [Python] Provide for selecting a subset of columns to convert in RecordBatch/Table.from\_pandas -* [ARROW-1744](https://issues.apache.org/jira/browse/ARROW-1744) - [Plasma] Provide TensorFlow operator to read tensors from plasma -* [ARROW-1780](https://issues.apache.org/jira/browse/ARROW-1780) - [Java] JDBC Adapter for Apache Arrow -* [ARROW-1858](https://issues.apache.org/jira/browse/ARROW-1858) - [Python] Add documentation about parquet.write\_to\_dataset and related methods -* [ARROW-1868](https://issues.apache.org/jira/browse/ARROW-1868) - [Java] Change vector getMinorType to use MinorType instead of Types.MinorType -* 
[ARROW-1886](https://issues.apache.org/jira/browse/ARROW-1886) - [Python] Add function to "flatten" structs within tables -* [ARROW-1913](https://issues.apache.org/jira/browse/ARROW-1913) - [Java] Fix Javadoc generation bugs with JDK8 -* [ARROW-1928](https://issues.apache.org/jira/browse/ARROW-1928) - [C++] Add benchmarks comparing performance of internal::BitmapReader/Writer with naive approaches -* [ARROW-1954](https://issues.apache.org/jira/browse/ARROW-1954) - [Python] Add metadata accessor to pyarrow.Field -* [ARROW-1964](https://issues.apache.org/jira/browse/ARROW-1964) - [Python] Expose Builder classes -* [ARROW-2014](https://issues.apache.org/jira/browse/ARROW-2014) - [Python] Document read\_pandas method in pyarrow.parquet -* [ARROW-2055](https://issues.apache.org/jira/browse/ARROW-2055) - [Java] Upgrade to Java 8 -* [ARROW-2060](https://issues.apache.org/jira/browse/ARROW-2060) - [Python] Documentation for creating StructArray using from\_arrays or a sequence of dicts -* [ARROW-2061](https://issues.apache.org/jira/browse/ARROW-2061) - [C++] Run ASAN builds in Travis CI -* [ARROW-2074](https://issues.apache.org/jira/browse/ARROW-2074) - [Python] Allow type inference for struct arrays -* [ARROW-2097](https://issues.apache.org/jira/browse/ARROW-2097) - [Python] Suppress valgrind stdout/stderr in Travis CI builds when there are no errors -* [ARROW-2100](https://issues.apache.org/jira/browse/ARROW-2100) - [Python] Drop Python 3.4 support -* [ARROW-2140](https://issues.apache.org/jira/browse/ARROW-2140) - [Python] Conversion from Numpy float16 array unimplemented -* [ARROW-2141](https://issues.apache.org/jira/browse/ARROW-2141) - [Python] Conversion from Numpy object array to varsize binary unimplemented -* [ARROW-2147](https://issues.apache.org/jira/browse/ARROW-2147) - [Python] Type inference doesn't work on lists of Numpy arrays -* [ARROW-2207](https://issues.apache.org/jira/browse/ARROW-2207) - [GLib] Support decimal type -* [ARROW-2222](https://issues.apache.org/jira/browse/ARROW-2222) - [C++] Add option to validate Flatbuffers messages -* [ARROW-2224](https://issues.apache.org/jira/browse/ARROW-2224) - [C++] Get rid of boost regex usage -* [ARROW-2241](https://issues.apache.org/jira/browse/ARROW-2241) - [Python] Simple script for running all current ASV benchmarks at a commit or tag -* [ARROW-2264](https://issues.apache.org/jira/browse/ARROW-2264) - [Python] Efficiently serialize numpy arrays with dtype of unicode fixed length string -* [ARROW-2267](https://issues.apache.org/jira/browse/ARROW-2267) - Rust bindings -* [ARROW-2276](https://issues.apache.org/jira/browse/ARROW-2276) - [Python] Tensor could implement the buffer protocol -* [ARROW-2281](https://issues.apache.org/jira/browse/ARROW-2281) - [Python] Expose MakeArray to construct arrays from buffers -* [ARROW-2285](https://issues.apache.org/jira/browse/ARROW-2285) - [Python] Can't convert Numpy string arrays -* [ARROW-2286](https://issues.apache.org/jira/browse/ARROW-2286) - [Python] Allow subscripting pyarrow.lib.StructValue -* [ARROW-2287](https://issues.apache.org/jira/browse/ARROW-2287) - [Python] chunked array not iterable, not indexable -* [ARROW-2299](https://issues.apache.org/jira/browse/ARROW-2299) - [Go] Go language implementation -* [ARROW-2301](https://issues.apache.org/jira/browse/ARROW-2301) - [Python] Add source distribution publishing instructions to package / release management documentation -* [ARROW-2302](https://issues.apache.org/jira/browse/ARROW-2302) - [GLib] Run autotools and meson Linux builds in 
same Travis CI build entry -* [ARROW-2308](https://issues.apache.org/jira/browse/ARROW-2308) - Serialized tensor data should be 64-byte aligned. -* [ARROW-2315](https://issues.apache.org/jira/browse/ARROW-2315) - [C++/Python] Add method to flatten a struct array -* [ARROW-2319](https://issues.apache.org/jira/browse/ARROW-2319) - [C++] Add buffered output class implementing OutputStream interface -* [ARROW-2322](https://issues.apache.org/jira/browse/ARROW-2322) - Document requirements to run dev/release/01-perform.sh -* [ARROW-2325](https://issues.apache.org/jira/browse/ARROW-2325) - [Python] Update setup.py to use Markdown project description -* [ARROW-2330](https://issues.apache.org/jira/browse/ARROW-2330) - [C++] Optimize delta buffer creation with partially finishable array builders -* [ARROW-2332](https://issues.apache.org/jira/browse/ARROW-2332) - [Python] Provide API for reading multiple Feather files -* [ARROW-2332](https://issues.apache.org/jira/browse/ARROW-2332) - [Python] Provide API for reading multiple Feather files -* [ARROW-2334](https://issues.apache.org/jira/browse/ARROW-2334) - [C++] Update boost to 1.66.0 -* [ARROW-2335](https://issues.apache.org/jira/browse/ARROW-2335) - [Go] Move Go README one directory higher -* [ARROW-2340](https://issues.apache.org/jira/browse/ARROW-2340) - [Website] Add blog post about Go codebase donation -* [ARROW-2341](https://issues.apache.org/jira/browse/ARROW-2341) - [Python] pa.union() mode argument unintuitive -* [ARROW-2343](https://issues.apache.org/jira/browse/ARROW-2343) - [Java/Packaging] Run mvn clean in API doc builds -* [ARROW-2344](https://issues.apache.org/jira/browse/ARROW-2344) - [Go] Run Go unit tests in Travis CI -* [ARROW-2345](https://issues.apache.org/jira/browse/ARROW-2345) - [Documentation] Fix bundle exec and set sphinx nosidebar to True -* [ARROW-2348](https://issues.apache.org/jira/browse/ARROW-2348) - [GLib] Remove Go example -* [ARROW-2350](https://issues.apache.org/jira/browse/ARROW-2350) - Shrink size of spark\_integration Docker container -* [ARROW-2353](https://issues.apache.org/jira/browse/ARROW-2353) - Test correctness of built wheel on AppVeyor -* [ARROW-2361](https://issues.apache.org/jira/browse/ARROW-2361) - [Rust] Start native Rust Implementation -* [ARROW-2364](https://issues.apache.org/jira/browse/ARROW-2364) - [Plasma] PlasmaClient::Get() could take vector of object ids -* [ARROW-2376](https://issues.apache.org/jira/browse/ARROW-2376) - [Rust] Travis should run tests for Rust library -* [ARROW-2378](https://issues.apache.org/jira/browse/ARROW-2378) - [Rust] Use rustfmt to format source code -* [ARROW-2381](https://issues.apache.org/jira/browse/ARROW-2381) - [Rust] Buffer should have an Iterator -* [ARROW-2384](https://issues.apache.org/jira/browse/ARROW-2384) - Rust: Use Traits rather than defining methods directly -* [ARROW-2385](https://issues.apache.org/jira/browse/ARROW-2385) - [Rust] Implement to\_json() for Field and DataType -* [ARROW-2388](https://issues.apache.org/jira/browse/ARROW-2388) - [C++] Arrow::StringBuilder::Append() uses null\_bytes not valid\_bytes -* [ARROW-2389](https://issues.apache.org/jira/browse/ARROW-2389) - [C++] Add StatusCode::OverflowError -* [ARROW-2390](https://issues.apache.org/jira/browse/ARROW-2390) - [C++/Python] CheckPyError() could inspect exception type -* [ARROW-2394](https://issues.apache.org/jira/browse/ARROW-2394) - [Python] Correct flake8 errors in benchmarks -* [ARROW-2395](https://issues.apache.org/jira/browse/ARROW-2395) - [Python] Correct flake8 errors 
outside of pyarrow/ directory -* [ARROW-2396](https://issues.apache.org/jira/browse/ARROW-2396) - Unify Rust Errors -* [ARROW-2397](https://issues.apache.org/jira/browse/ARROW-2397) - Document changes in Tensor encoding in IPC.md. -* [ARROW-2398](https://issues.apache.org/jira/browse/ARROW-2398) - [Rust] Provide a zero-copy builder for type-safe Buffer -* [ARROW-2400](https://issues.apache.org/jira/browse/ARROW-2400) - [C++] Status destructor is expensive -* [ARROW-2401](https://issues.apache.org/jira/browse/ARROW-2401) - Support filters on Hive partitioned Parquet files -* [ARROW-2402](https://issues.apache.org/jira/browse/ARROW-2402) - [C++] FixedSizeBinaryBuilder::Append lacks "const char\*" overload -* [ARROW-2404](https://issues.apache.org/jira/browse/ARROW-2404) - Fix declaration of 'type\_id' hides class member warning in msvc build -* [ARROW-2407](https://issues.apache.org/jira/browse/ARROW-2407) - [GLib] Add garrow\_string\_array\_builder\_append\_values() -* [ARROW-2408](https://issues.apache.org/jira/browse/ARROW-2408) - [Rust] It should be possible to get a &mut[T] from Builder -* [ARROW-2408](https://issues.apache.org/jira/browse/ARROW-2408) - [Rust] It should be possible to get a &mut[T] from Builder -* [ARROW-2411](https://issues.apache.org/jira/browse/ARROW-2411) - [C++] Add method to append batches of null-terminated strings to StringBuilder -* [ARROW-2413](https://issues.apache.org/jira/browse/ARROW-2413) - [Rust] Remove useless use of \`format!\` -* [ARROW-2414](https://issues.apache.org/jira/browse/ARROW-2414) - [Documentation] Fix miscellaneous documentation typos -* [ARROW-2415](https://issues.apache.org/jira/browse/ARROW-2415) - [Rust] Fix using references in pattern matching -* [ARROW-2416](https://issues.apache.org/jira/browse/ARROW-2416) - [C++] Support system libprotobuf -* [ARROW-2417](https://issues.apache.org/jira/browse/ARROW-2417) - [Rust] Review APIs for safety -* [ARROW-2422](https://issues.apache.org/jira/browse/ARROW-2422) - [Python] Support more filter operators on Hive partitioned Parquet files -* [ARROW-2427](https://issues.apache.org/jira/browse/ARROW-2427) - [C++] ReadAt implementations suboptimal -* [ARROW-2430](https://issues.apache.org/jira/browse/ARROW-2430) - MVP for branch based packaging automation -* [ARROW-2433](https://issues.apache.org/jira/browse/ARROW-2433) - [Rust] Add Builder.push\_slice(&[T]) -* [ARROW-2434](https://issues.apache.org/jira/browse/ARROW-2434) - [Rust] Add windows support -* [ARROW-2435](https://issues.apache.org/jira/browse/ARROW-2435) - [Rust] Add memory pool abstraction. -* [ARROW-2436](https://issues.apache.org/jira/browse/ARROW-2436) - [Rust] Add windows CI -* [ARROW-2439](https://issues.apache.org/jira/browse/ARROW-2439) - [Rust] Run license header checks also in Rust CI entry -* [ARROW-2440](https://issues.apache.org/jira/browse/ARROW-2440) - [Rust] Implement ListBuilder -* [ARROW-2442](https://issues.apache.org/jira/browse/ARROW-2442) - [C++] Disambiguate Builder::Append overloads -* [ARROW-2445](https://issues.apache.org/jira/browse/ARROW-2445) - [Rust] Add documentation and make some fields private -* [ARROW-2448](https://issues.apache.org/jira/browse/ARROW-2448) - Segfault when plasma client goes out of scope before buffer. -* [ARROW-2451](https://issues.apache.org/jira/browse/ARROW-2451) - Handle more dtypes efficiently in custom numpy array serializer. 
-* [ARROW-2453](https://issues.apache.org/jira/browse/ARROW-2453) - [Python] Improve Table column access -* [ARROW-2458](https://issues.apache.org/jira/browse/ARROW-2458) - [Plasma] PlasmaClient uses global variable -* [ARROW-2463](https://issues.apache.org/jira/browse/ARROW-2463) - [C++] Update flatbuffers to 1.9.0 -* [ARROW-2464](https://issues.apache.org/jira/browse/ARROW-2464) - [Python] Use a python\_version marker instead of a condition -* [ARROW-2469](https://issues.apache.org/jira/browse/ARROW-2469) - Make out arguments last in ReadMessage API. -* [ARROW-2470](https://issues.apache.org/jira/browse/ARROW-2470) - [C++] FileGetSize() should not seek -* [ARROW-2472](https://issues.apache.org/jira/browse/ARROW-2472) - [Rust] The Schema and Fields types should not have public attributes -* [ARROW-2477](https://issues.apache.org/jira/browse/ARROW-2477) - [Rust] Set up code coverage in CI -* [ARROW-2478](https://issues.apache.org/jira/browse/ARROW-2478) - [C++] Introduce a checked\_cast function that performs a dynamic\_cast in debug mode -* [ARROW-2479](https://issues.apache.org/jira/browse/ARROW-2479) - [C++] Have a global thread pool -* [ARROW-2480](https://issues.apache.org/jira/browse/ARROW-2480) - [C++] Enable casting the value of a decimal to int32\_t or int64\_t -* [ARROW-2481](https://issues.apache.org/jira/browse/ARROW-2481) - [Rust] Move calls to free() into memory.rs -* [ARROW-2482](https://issues.apache.org/jira/browse/ARROW-2482) - [Rust] support nested types -* [ARROW-2484](https://issues.apache.org/jira/browse/ARROW-2484) - [C++] Document ABI compliance checking -* [ARROW-2485](https://issues.apache.org/jira/browse/ARROW-2485) - [C++] Output diff when run\_clang\_format.py reports a change -* [ARROW-2486](https://issues.apache.org/jira/browse/ARROW-2486) - [C++/Python] Provide a Docker image that contains all dependencies for development -* [ARROW-2488](https://issues.apache.org/jira/browse/ARROW-2488) - [C++] List Boost 1.67 as supported version -* [ARROW-2493](https://issues.apache.org/jira/browse/ARROW-2493) - [Python] Add support for pickling to buffers and arrays -* [ARROW-2494](https://issues.apache.org/jira/browse/ARROW-2494) - Return status codes from PlasmaClient::Seal -* [ARROW-2498](https://issues.apache.org/jira/browse/ARROW-2498) - [Java] Upgrade to JDK 1.8 -* [ARROW-2499](https://issues.apache.org/jira/browse/ARROW-2499) - [C++] Add iterator facility for Python sequences -* [ARROW-2505](https://issues.apache.org/jira/browse/ARROW-2505) - [C++] Disable MSVC warning C4800 -* [ARROW-2506](https://issues.apache.org/jira/browse/ARROW-2506) - [Plasma] Build error on macOS -* [ARROW-2507](https://issues.apache.org/jira/browse/ARROW-2507) - [Rust] Don't take a reference when not needed -* [ARROW-2508](https://issues.apache.org/jira/browse/ARROW-2508) - [Python] pytest API changes make tests fail -* [ARROW-2513](https://issues.apache.org/jira/browse/ARROW-2513) - [Python] DictionaryType should give access to index type and dictionary array -* [ARROW-2516](https://issues.apache.org/jira/browse/ARROW-2516) - AppVeyor Build Matrix should be specific to the changes made in a PR -* [ARROW-2521](https://issues.apache.org/jira/browse/ARROW-2521) - [Rust] Refactor Rust API to use traits and generics -* [ARROW-2522](https://issues.apache.org/jira/browse/ARROW-2522) - [C++] Version shared library files -* [ARROW-2525](https://issues.apache.org/jira/browse/ARROW-2525) - [GLib] Add garrow\_struct\_array\_flatten() -* [ARROW-2526](https://issues.apache.org/jira/browse/ARROW-2526) - 
[GLib] Update .gitignore -* [ARROW-2527](https://issues.apache.org/jira/browse/ARROW-2527) - [GLib] Enable GPU document -* [ARROW-2528](https://issues.apache.org/jira/browse/ARROW-2528) - [Rust] Add trait bounds for T in Buffer/List -* [ARROW-2529](https://issues.apache.org/jira/browse/ARROW-2529) - [C++] Update mention of clang-format to 5.0 in the docs -* [ARROW-2531](https://issues.apache.org/jira/browse/ARROW-2531) - [C++] Update clang bits to 6.0 -* [ARROW-2533](https://issues.apache.org/jira/browse/ARROW-2533) - [CI] Fast finish failing AppVeyor builds -* [ARROW-2536](https://issues.apache.org/jira/browse/ARROW-2536) - [Rust] ListBuilder uses wrong initial size for offset builder -* [ARROW-2537](https://issues.apache.org/jira/browse/ARROW-2537) - [Ruby] Import -* [ARROW-2539](https://issues.apache.org/jira/browse/ARROW-2539) - [Plasma] Use unique\_ptr instead of raw pointer -* [ARROW-2540](https://issues.apache.org/jira/browse/ARROW-2540) - [Plasma] add constructor/destructor to make sure dlfree is called automatically -* [ARROW-2541](https://issues.apache.org/jira/browse/ARROW-2541) - [Plasma] Clean up macro usage -* [ARROW-2543](https://issues.apache.org/jira/browse/ARROW-2543) - [Rust] CI should cache dependencies for faster builds -* [ARROW-2544](https://issues.apache.org/jira/browse/ARROW-2544) - [CI] Run C++ tests with two jobs on Travis-CI -* [ARROW-2547](https://issues.apache.org/jira/browse/ARROW-2547) - [Format] Fix off-by-one in List\> example -* [ARROW-2548](https://issues.apache.org/jira/browse/ARROW-2548) - [Format] Clarify \`List\` Array example -* [ARROW-2549](https://issues.apache.org/jira/browse/ARROW-2549) - [GLib] Apply arrow::StatusCodes changes to GArrowError -* [ARROW-2550](https://issues.apache.org/jira/browse/ARROW-2550) - [C++] Add missing status codes into arrow::StatusCode::CodeAsString() -* [ARROW-2551](https://issues.apache.org/jira/browse/ARROW-2551) - [Plasma] Improve notification logic -* [ARROW-2552](https://issues.apache.org/jira/browse/ARROW-2552) - [Plasma] Unit tests are flaky -* [ARROW-2553](https://issues.apache.org/jira/browse/ARROW-2553) - [Python] Set MACOSX\_DEPLOYMENT\_TARGET in wheel build -* [ARROW-2558](https://issues.apache.org/jira/browse/ARROW-2558) - [Plasma] avoid walk through all the objects when a client disconnects -* [ARROW-2562](https://issues.apache.org/jira/browse/ARROW-2562) - [C++] Upload coverage data to codecov.io -* [ARROW-2563](https://issues.apache.org/jira/browse/ARROW-2563) - [Rust] Poor caching in Travis-CI -* [ARROW-2566](https://issues.apache.org/jira/browse/ARROW-2566) - [CI] Add codecov.io badge to README -* [ARROW-2567](https://issues.apache.org/jira/browse/ARROW-2567) - [C++/Python] Unit is ignored on comparison of TimestampArrays -* [ARROW-2568](https://issues.apache.org/jira/browse/ARROW-2568) - [Python] Expose thread pool size setting to Python, and deprecate "nthreads" -* [ARROW-2569](https://issues.apache.org/jira/browse/ARROW-2569) - [C++] Improve thread pool size heuristic -* [ARROW-2574](https://issues.apache.org/jira/browse/ARROW-2574) - [CI] Collect and publish Python coverage -* [ARROW-2576](https://issues.apache.org/jira/browse/ARROW-2576) - [GLib] Add abs functions for Decimal128. 
-* [ARROW-2577](https://issues.apache.org/jira/browse/ARROW-2577) - [Plasma] Add ASV benchmarks -* [ARROW-2580](https://issues.apache.org/jira/browse/ARROW-2580) - [GLib] Fix abs functions for Decimal128 -* [ARROW-2582](https://issues.apache.org/jira/browse/ARROW-2582) - [GLib] Add negate functions for Decimal128 -* [ARROW-2585](https://issues.apache.org/jira/browse/ARROW-2585) - [C++] Add Decimal128::FromBigEndian -* [ARROW-2586](https://issues.apache.org/jira/browse/ARROW-2586) - [C++] Make child builders of ListBuilder and StructBuilder shared\_ptr's -* [ARROW-2595](https://issues.apache.org/jira/browse/ARROW-2595) - [Plasma] operator[] creates entries in map -* [ARROW-2596](https://issues.apache.org/jira/browse/ARROW-2596) - [GLib] Use the default value of GTK-Doc -* [ARROW-2597](https://issues.apache.org/jira/browse/ARROW-2597) - [Plasma] remove UniqueIDHasher -* [ARROW-2604](https://issues.apache.org/jira/browse/ARROW-2604) - [Java] Add method overload for VarCharVector.set(int,String) -* [ARROW-2608](https://issues.apache.org/jira/browse/ARROW-2608) - [Java/Python] Add pyarrow.{Array,Field}.from\_jvm / jvm\_buffer -* [ARROW-2611](https://issues.apache.org/jira/browse/ARROW-2611) - [Python] Python 2 integer serialization -* [ARROW-2612](https://issues.apache.org/jira/browse/ARROW-2612) - [Plasma] Fix deprecated PLASMA\_DEFAULT\_RELEASE\_DELAY -* [ARROW-2613](https://issues.apache.org/jira/browse/ARROW-2613) - [Docs] Update the gen\_apidocs docker script -* [ARROW-2614](https://issues.apache.org/jira/browse/ARROW-2614) - [CI] Remove 'group: deprecated' in Travis -* [ARROW-2626](https://issues.apache.org/jira/browse/ARROW-2626) - [Python] pandas ArrowInvalid message should include failing column name -* [ARROW-2634](https://issues.apache.org/jira/browse/ARROW-2634) - [Go] Add LICENSE additions for Go subproject -* [ARROW-2635](https://issues.apache.org/jira/browse/ARROW-2635) - [Ruby] LICENSE.txt isn't suitable -* [ARROW-2636](https://issues.apache.org/jira/browse/ARROW-2636) - [Ruby] "Unofficial" package note is missing -* [ARROW-2638](https://issues.apache.org/jira/browse/ARROW-2638) - [Python] Prevent calling extension class constructors directly -* [ARROW-2639](https://issues.apache.org/jira/browse/ARROW-2639) - [Python] Remove unnecessary \_check\_nullptr methods -* [ARROW-2641](https://issues.apache.org/jira/browse/ARROW-2641) - [C++] Investigate spurious memset() calls -* [ARROW-2645](https://issues.apache.org/jira/browse/ARROW-2645) - [Java] ArrowStreamWriter accumulates DictionaryBatch ArrowBlocks -* [ARROW-2649](https://issues.apache.org/jira/browse/ARROW-2649) - [C++] Add std::generate()-like function for faster bitmap writing -* [ARROW-2656](https://issues.apache.org/jira/browse/ARROW-2656) - [Python] Improve ParquetManifest creation time -* [ARROW-2660](https://issues.apache.org/jira/browse/ARROW-2660) - [Python] Experiment with zero-copy pickling -* [ARROW-2661](https://issues.apache.org/jira/browse/ARROW-2661) - [Python/C++] Allow passing HDFS Config values via map/dict instead of needing an hdfs-site.xml file -* [ARROW-2662](https://issues.apache.org/jira/browse/ARROW-2662) - [Python] Add to\_pandas / to\_numpy to ChunkedArray -* [ARROW-2663](https://issues.apache.org/jira/browse/ARROW-2663) - [Python] Make dictionary\_encode and unique accessible on Column / ChunkedArray -* [ARROW-2664](https://issues.apache.org/jira/browse/ARROW-2664) - [Python] Implement \_\_getitem\_\_ / slicing on Buffer -* [ARROW-2666](https://issues.apache.org/jira/browse/ARROW-2666) - [Python] 
numpy.asarray should trigger to\_pandas on Array/ChunkedArray -* [ARROW-2672](https://issues.apache.org/jira/browse/ARROW-2672) - [Python] Build ORC extension in manylinux1 wheels -* [ARROW-2674](https://issues.apache.org/jira/browse/ARROW-2674) - [Packaging] Start building nightlies -* [ARROW-2676](https://issues.apache.org/jira/browse/ARROW-2676) - [Packaging] Deploy build artifacts to github releases -* [ARROW-2677](https://issues.apache.org/jira/browse/ARROW-2677) - [Python] Expose Parquet ZSTD compression -* [ARROW-2678](https://issues.apache.org/jira/browse/ARROW-2678) - [GLib] Add extra information to common build problems on macOS -* [ARROW-2680](https://issues.apache.org/jira/browse/ARROW-2680) - [Python] Add documentation about type inference in Table.from\_pandas -* [ARROW-2682](https://issues.apache.org/jira/browse/ARROW-2682) - [CI] Notify in Slack about broken builds -* [ARROW-2689](https://issues.apache.org/jira/browse/ARROW-2689) - [Python] Remove references to timestamps\_to\_ms argument from documentation -* [ARROW-2692](https://issues.apache.org/jira/browse/ARROW-2692) - [Python] Add test for writing dictionary encoded columns to chunked Parquet files -* [ARROW-2695](https://issues.apache.org/jira/browse/ARROW-2695) - [Python] Prevent calling scalar constructors directly -* [ARROW-2696](https://issues.apache.org/jira/browse/ARROW-2696) - [JAVA] enhance AllocationListener with an onFailedAllocation() call -* [ARROW-2699](https://issues.apache.org/jira/browse/ARROW-2699) - [C++/Python] Add Table method that replaces a column with a new supplied column -* [ARROW-2700](https://issues.apache.org/jira/browse/ARROW-2700) - [Python] Add simple examples to Array.cast docstring -* [ARROW-2701](https://issues.apache.org/jira/browse/ARROW-2701) - [C++] Make MemoryMappedFile resizable -* [ARROW-2704](https://issues.apache.org/jira/browse/ARROW-2704) - [Java] IPC stream handling should be more friendly to low level processing -* [ARROW-2713](https://issues.apache.org/jira/browse/ARROW-2713) - [Packaging] Fix linux package builds -* [ARROW-2717](https://issues.apache.org/jira/browse/ARROW-2717) - [Packaging] Postfix conda artifacts with target arch -* [ARROW-2718](https://issues.apache.org/jira/browse/ARROW-2718) - [Packaging] GPG sign downloaded artifacts -* [ARROW-2724](https://issues.apache.org/jira/browse/ARROW-2724) - [Packaging] Determine whether all the expected artifacts are uploaded -* [ARROW-2725](https://issues.apache.org/jira/browse/ARROW-2725) - [JAVA] make Accountant.AllocationOutcome publicly visible -* [ARROW-2729](https://issues.apache.org/jira/browse/ARROW-2729) - [GLib] Add decimal128 array builder -* [ARROW-2731](https://issues.apache.org/jira/browse/ARROW-2731) - Allow usage of external ORC library -* [ARROW-2732](https://issues.apache.org/jira/browse/ARROW-2732) - Update brew packages for macOS -* [ARROW-2733](https://issues.apache.org/jira/browse/ARROW-2733) - [GLib] Cast garrow\_decimal128 to gint64 -* [ARROW-2738](https://issues.apache.org/jira/browse/ARROW-2738) - [GLib] Use Brewfile on installation process -* [ARROW-2739](https://issues.apache.org/jira/browse/ARROW-2739) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE for GArrowDecimalDataType and GArrowDecimal128ArrayBuilder -* [ARROW-2740](https://issues.apache.org/jira/browse/ARROW-2740) - [Python] Add address property to Buffer -* [ARROW-2742](https://issues.apache.org/jira/browse/ARROW-2742) - [Python] Allow Table.from\_batches to use Iterator of ArrowRecordBatches -* 
[ARROW-2748](https://issues.apache.org/jira/browse/ARROW-2748) - [GLib] Add garrow\_decimal\_data\_type\_get\_scale() (and \_precision()) -* [ARROW-2749](https://issues.apache.org/jira/browse/ARROW-2749) - [GLib] Rename \*garrow\_decimal128\_array\_get\_value to \*garrow\_decimal128\_array\_format\_value -* [ARROW-2751](https://issues.apache.org/jira/browse/ARROW-2751) - [GLib] Add garrow\_table\_replace\_column() -* [ARROW-2752](https://issues.apache.org/jira/browse/ARROW-2752) - [GLib] Document garrow\_decimal\_data\_type\_new() -* [ARROW-2753](https://issues.apache.org/jira/browse/ARROW-2753) - [GLib] Add garrow\_schema\_\*\_field() -* [ARROW-2755](https://issues.apache.org/jira/browse/ARROW-2755) - [Python] Allow using Ninja to build extension -* [ARROW-2756](https://issues.apache.org/jira/browse/ARROW-2756) - [Python] Remove redundant imports and minor fixes in parquet tests -* [ARROW-2758](https://issues.apache.org/jira/browse/ARROW-2758) - [Plasma] Use Scope enum in Plasma -* [ARROW-2760](https://issues.apache.org/jira/browse/ARROW-2760) - [Python] Remove legacy property definition syntax from parquet module and test them -* [ARROW-2761](https://issues.apache.org/jira/browse/ARROW-2761) - Support set filter operators on Hive partitioned Parquet files -* [ARROW-2763](https://issues.apache.org/jira/browse/ARROW-2763) - [Python] Make parquet \_metadata file accessible from ParquetDataset -* [ARROW-2780](https://issues.apache.org/jira/browse/ARROW-2780) - [Go] Run code coverage analysis -* [ARROW-2784](https://issues.apache.org/jira/browse/ARROW-2784) - [C++] MemoryMappedFile::WriteAt allow writing past the end -* [ARROW-2790](https://issues.apache.org/jira/browse/ARROW-2790) - [C++] Buffers contain uninitialized memory -* [ARROW-2790](https://issues.apache.org/jira/browse/ARROW-2790) - [C++] Buffers contain uninitialized memory -* [ARROW-2791](https://issues.apache.org/jira/browse/ARROW-2791) - [Packaging] Build Ubuntu 18.04 packages -* [ARROW-2792](https://issues.apache.org/jira/browse/ARROW-2792) - [Packaging] Consider uploading tarballs to avoid naming conflicts -* [ARROW-2794](https://issues.apache.org/jira/browse/ARROW-2794) - [Plasma] Add Delete method for multiple objects -* [ARROW-2798](https://issues.apache.org/jira/browse/ARROW-2798) - [Plasma] Use hashing function that takes into account all UniqueID bytes -* [ARROW-2802](https://issues.apache.org/jira/browse/ARROW-2802) - [Docs] Move release management guide to project wiki -* [ARROW-2804](https://issues.apache.org/jira/browse/ARROW-2804) - [Website] Link to Developer wiki (Confluence) from front page -* [ARROW-2805](https://issues.apache.org/jira/browse/ARROW-2805) - [Python] TensorFlow import workaround not working with tensorflow-gpu if CUDA is not installed -* [ARROW-2809](https://issues.apache.org/jira/browse/ARROW-2809) - [C++] Decrease verbosity of lint checks in Travis CI -* [ARROW-2811](https://issues.apache.org/jira/browse/ARROW-2811) - [Python] Test serialization for determinism -* [ARROW-2815](https://issues.apache.org/jira/browse/ARROW-2815) - [CI] Suppress DEBUG logging when building Java library in C++ CI entries -* [ARROW-2816](https://issues.apache.org/jira/browse/ARROW-2816) - [Python] Add \_\_iter\_\_ method to NativeFile -* [ARROW-2821](https://issues.apache.org/jira/browse/ARROW-2821) - [C++] Only zero memory in BooleanBuilder in one place -* [ARROW-2822](https://issues.apache.org/jira/browse/ARROW-2822) - [C++] Zero padding bytes in PoolBuffer::Resize -* 
[ARROW-2822](https://issues.apache.org/jira/browse/ARROW-2822) - [C++] Zero padding bytes in PoolBuffer::Resize -* [ARROW-2824](https://issues.apache.org/jira/browse/ARROW-2824) - [GLib] Add garrow\_decimal128\_array\_get\_value() -* [ARROW-2825](https://issues.apache.org/jira/browse/ARROW-2825) - [C++] Need AllocateBuffer / AllocateResizableBuffer variant with default memory pool -* [ARROW-2826](https://issues.apache.org/jira/browse/ARROW-2826) - [C++] Clarification needed between ArrayBuilder::Init(), Resize() and Reserve() -* [ARROW-2827](https://issues.apache.org/jira/browse/ARROW-2827) - [C++] LZ4 and Zstd build may be failed in parallel build -* [ARROW-2829](https://issues.apache.org/jira/browse/ARROW-2829) - [GLib] Add GArrowORCFileReader -* [ARROW-2830](https://issues.apache.org/jira/browse/ARROW-2830) - [Packaging] Enable parallel build for deb package build again -* [ARROW-2832](https://issues.apache.org/jira/browse/ARROW-2832) - [Python] Pretty-print schema metadata in Schema.\_\_repr\_\_ -* [ARROW-2833](https://issues.apache.org/jira/browse/ARROW-2833) - [Python] Column.\_\_repr\_\_ will lock up Jupyter with large datasets -* [ARROW-2834](https://issues.apache.org/jira/browse/ARROW-2834) - [GLib] Remove "enable\_" prefix from Meson options -* [ARROW-2836](https://issues.apache.org/jira/browse/ARROW-2836) - [Packaging] Expand build matrices to multiple tasks -* [ARROW-2837](https://issues.apache.org/jira/browse/ARROW-2837) - [C++] ArrayBuilder::null\_bitmap returns PoolBuffer -* [ARROW-2838](https://issues.apache.org/jira/browse/ARROW-2838) - [Python] Speed up null testing with Pandas semantics -* [ARROW-2844](https://issues.apache.org/jira/browse/ARROW-2844) - [Packaging] Test OSX wheels after build -* [ARROW-2845](https://issues.apache.org/jira/browse/ARROW-2845) - [Packaging] Upload additional debian artifacts -* [ARROW-2846](https://issues.apache.org/jira/browse/ARROW-2846) - [Packaging] Update nightly build in crossbow as well as the sample configuration -* [ARROW-2847](https://issues.apache.org/jira/browse/ARROW-2847) - [Packaging] Fix artifact name matching for conda forge packages -* [ARROW-2848](https://issues.apache.org/jira/browse/ARROW-2848) - [Packaging] lib\*.deb package name doesn't match so version -* [ARROW-2849](https://issues.apache.org/jira/browse/ARROW-2849) - [Ruby] Arrow::Table\#load supports ORC -* [ARROW-2855](https://issues.apache.org/jira/browse/ARROW-2855) - [C++] Blog post that outlines the benefits of using jemalloc -* [ARROW-2859](https://issues.apache.org/jira/browse/ARROW-2859) - [Python] Handle objects exporting the buffer protocol in open\_stream, open\_file, and RecordBatch\*Reader APIs -* [ARROW-2861](https://issues.apache.org/jira/browse/ARROW-2861) - [Python] Add extra tips about using Parquet to store index-less pandas data -* [ARROW-2864](https://issues.apache.org/jira/browse/ARROW-2864) - [Plasma] Add deletion cache to delete objects later -* [ARROW-2868](https://issues.apache.org/jira/browse/ARROW-2868) - [Packaging] Fix centos-7 build -* [ARROW-2869](https://issues.apache.org/jira/browse/ARROW-2869) - [Python] Add documentation for Array.to\_numpy -* [ARROW-2874](https://issues.apache.org/jira/browse/ARROW-2874) - [Packaging] Pass job prefix when putting on Queue -* [ARROW-2875](https://issues.apache.org/jira/browse/ARROW-2875) - [Packaging] Don't attempt to download arrow archive in linux builds -* [ARROW-2881](https://issues.apache.org/jira/browse/ARROW-2881) - [Website] Add Community tab to website -* 
[ARROW-2884](https://issues.apache.org/jira/browse/ARROW-2884) - [Packaging] Options to build packages from apache source archive -* [ARROW-2886](https://issues.apache.org/jira/browse/ARROW-2886) - [Release] An unused variable exists -* [ARROW-2890](https://issues.apache.org/jira/browse/ARROW-2890) - [Plasma] Make Python PlasmaClient.release private -* [ARROW-2893](https://issues.apache.org/jira/browse/ARROW-2893) - [C++] Remove PoolBuffer class from public API and hide implementation details behind factory functions -* [ARROW-2897](https://issues.apache.org/jira/browse/ARROW-2897) - Organize supported Ubuntu versions -* [ARROW-2898](https://issues.apache.org/jira/browse/ARROW-2898) - [Packaging] Setuptools\_scm just shipped a new version which fails to parse \`apache-arrow-\` tag -* [ARROW-2906](https://issues.apache.org/jira/browse/ARROW-2906) - [Website] Remove the link to slack channel -* [ARROW-2907](https://issues.apache.org/jira/browse/ARROW-2907) - [GitHub] Improve "How to contribute patches" -* [ARROW-2908](https://issues.apache.org/jira/browse/ARROW-2908) - [Rust] Update version to 0.10.0 -* [ARROW-2914](https://issues.apache.org/jira/browse/ARROW-2914) - [Integration] Add WindowPandasUDFTests to Spark Integration -* [ARROW-2915](https://issues.apache.org/jira/browse/ARROW-2915) - [Packaging] Remove artifact from ubuntu-trusty build -* [ARROW-2918](https://issues.apache.org/jira/browse/ARROW-2918) - [C++] Improve formatting of Struct pretty prints -* [ARROW-2921](https://issues.apache.org/jira/browse/ARROW-2921) - [Release] Update .deb/.rpm changelogs in preparation -* [ARROW-2922](https://issues.apache.org/jira/browse/ARROW-2922) - [Release] Make python command name customizable -* [ARROW-2923](https://issues.apache.org/jira/browse/ARROW-2923) - [Doc] Add instructions for running Spark integration tests -* [ARROW-2924](https://issues.apache.org/jira/browse/ARROW-2924) - [Java] mvn release fails when an older maven javadoc plugin is installed -* [ARROW-2927](https://issues.apache.org/jira/browse/ARROW-2927) - [Packaging] AppVeyor wheel task is failing on initial checkout -* [ARROW-2928](https://issues.apache.org/jira/browse/ARROW-2928) - [Packaging] AppVeyor crossbow conda builds are picking up boost 1.63.0 instead of the installed version -* [ARROW-2929](https://issues.apache.org/jira/browse/ARROW-2929) - [C++] ARROW-2826 Breaks parquet-cpp 1.4.0 builds -* [ARROW-2934](https://issues.apache.org/jira/browse/ARROW-2934) - [Packaging] Add checksums creation to sign subcommand -* [ARROW-2935](https://issues.apache.org/jira/browse/ARROW-2935) - [Packaging] Add verify\_binary\_artifacts function to verify-release-candidate.sh -* [ARROW-2937](https://issues.apache.org/jira/browse/ARROW-2937) - [Java] Follow-up changes to ARROW-2704 -* [ARROW-2943](https://issues.apache.org/jira/browse/ARROW-2943) - [C++] Implement BufferedOutputStream::Flush -* [ARROW-2944](https://issues.apache.org/jira/browse/ARROW-2944) - [Format] Arrow columnar format docs mentions VectorLayout that does not exist anymore -* [ARROW-2946](https://issues.apache.org/jira/browse/ARROW-2946) - [Packaging] Stop to use PWD in debian/rules -* [ARROW-2947](https://issues.apache.org/jira/browse/ARROW-2947) - [Packaging] Remove Ubuntu Artful -* [ARROW-2949](https://issues.apache.org/jira/browse/ARROW-2949) - [CI] repo.continuum.io can be flaky in builds -* [ARROW-2951](https://issues.apache.org/jira/browse/ARROW-2951) - [CI] Changes in format/ should cause Appveyor builds to run -* 
[ARROW-2953](https://issues.apache.org/jira/browse/ARROW-2953) - [Plasma] Store memory usage -* [ARROW-2954](https://issues.apache.org/jira/browse/ARROW-2954) - [Plasma] Store object\_id only once in object table -* [ARROW-2962](https://issues.apache.org/jira/browse/ARROW-2962) - [Packaging] Bintray descriptor files are no longer needed -* [ARROW-2977](https://issues.apache.org/jira/browse/ARROW-2977) - [Packaging] Release verification script should check rust too -* [ARROW-2985](https://issues.apache.org/jira/browse/ARROW-2985) - [Ruby] Run unit tests in verify-release-candidate.sh -* [ARROW-2988](https://issues.apache.org/jira/browse/ARROW-2988) - [Release] More automated release verification on Windows -* [ARROW-2990](https://issues.apache.org/jira/browse/ARROW-2990) - [GLib] Fail to build with rpath-ed Arrow C++ on macOS - - - -# Apache Arrow 0.9.0 (2018-03-19) - -## New Features and Improvements - -* [ARROW-232](https://issues.apache.org/jira/browse/ARROW-232) - C++/Parquet: Support writing chunked arrays as part of a table -* [ARROW-633](https://issues.apache.org/jira/browse/ARROW-633) - [Java] Add support for FixedSizeBinary type -* [ARROW-634](https://issues.apache.org/jira/browse/ARROW-634) - Add integration tests for FixedSizeBinary -* [ARROW-760](https://issues.apache.org/jira/browse/ARROW-760) - [Python] document differences w.r.t. fastparquet -* [ARROW-764](https://issues.apache.org/jira/browse/ARROW-764) - [C++] Improve performance of CopyBitmap, add benchmarks -* [ARROW-969](https://issues.apache.org/jira/browse/ARROW-969) - [C++/Python] Add add/remove field functions for RecordBatch -* [ARROW-1021](https://issues.apache.org/jira/browse/ARROW-1021) - [Python] Add documentation about using pyarrow from other Cython and C++ projects -* [ARROW-1035](https://issues.apache.org/jira/browse/ARROW-1035) - [Python] Add ASV benchmarks for streaming columnar deserialization -* [ARROW-1394](https://issues.apache.org/jira/browse/ARROW-1394) - [Plasma] Add optional extension for allocating memory on GPUs -* [ARROW-1463](https://issues.apache.org/jira/browse/ARROW-1463) - [JAVA] Restructure ValueVector hierarchy to minimize compile-time generated code -* [ARROW-1579](https://issues.apache.org/jira/browse/ARROW-1579) - [Java] Add dockerized test setup to validate Spark integration -* [ARROW-1580](https://issues.apache.org/jira/browse/ARROW-1580) - [Python] Instructions for setting up nightly builds on Linux -* [ARROW-1623](https://issues.apache.org/jira/browse/ARROW-1623) - [C++] Add convenience method to construct Buffer from a string that owns its memory -* [ARROW-1632](https://issues.apache.org/jira/browse/ARROW-1632) - [Python] Permit categorical conversions in Table.to\_pandas on a per-column basis -* [ARROW-1643](https://issues.apache.org/jira/browse/ARROW-1643) - [Python] Accept hdfs:// prefixes in parquet.read\_table and attempt to connect to HDFS -* [ARROW-1705](https://issues.apache.org/jira/browse/ARROW-1705) - [Python] Create StructArray from sequence of dicts given a known data type -* [ARROW-1706](https://issues.apache.org/jira/browse/ARROW-1706) - [Python] StructArray.from\_arrays should handle sequences that are coercible to arrays -* [ARROW-1712](https://issues.apache.org/jira/browse/ARROW-1712) - [C++] Add method to BinaryBuilder to reserve space for value data -* [ARROW-1757](https://issues.apache.org/jira/browse/ARROW-1757) - [C++] Add DictionaryArray::FromArrays alternate ctor that can check or sanitized "untrusted" indices -* 
[ARROW-1815](https://issues.apache.org/jira/browse/ARROW-1815) - [Java] Rename MapVector to StructVector -* [ARROW-1832](https://issues.apache.org/jira/browse/ARROW-1832) - [JS] Implement JSON reader for integration tests -* [ARROW-1835](https://issues.apache.org/jira/browse/ARROW-1835) - [C++] Create Arrow schema from std::tuple types -* [ARROW-1861](https://issues.apache.org/jira/browse/ARROW-1861) - [Python] Fix up ASV setup, add developer instructions for writing new benchmarks and running benchmark suite locally -* [ARROW-1872](https://issues.apache.org/jira/browse/ARROW-1872) - [Website] Populate hard-coded fields for current release from a YAML file -* [ARROW-1899](https://issues.apache.org/jira/browse/ARROW-1899) - [Python] Refactor handling of null sentinels in python/numpy\_to\_arrow.cc -* [ARROW-1920](https://issues.apache.org/jira/browse/ARROW-1920) - Add support for reading ORC files -* [ARROW-1926](https://issues.apache.org/jira/browse/ARROW-1926) - [GLib] Add garrow\_timestamp\_data\_type\_get\_unit() -* [ARROW-1927](https://issues.apache.org/jira/browse/ARROW-1927) - [Plasma] Implement delete function -* [ARROW-1929](https://issues.apache.org/jira/browse/ARROW-1929) - [C++] Move various Arrow testing utility code from Parquet to Arrow codebase -* [ARROW-1930](https://issues.apache.org/jira/browse/ARROW-1930) - [C++] Implement Slice for ChunkedArray and Column -* [ARROW-1931](https://issues.apache.org/jira/browse/ARROW-1931) - [C++] w4996 warning due to std::tr1 failing builds on Visual Studio 2017 -* [ARROW-1937](https://issues.apache.org/jira/browse/ARROW-1937) - [Python] Add documentation for different forms of constructing nested arrays from Python data structures -* [ARROW-1942](https://issues.apache.org/jira/browse/ARROW-1942) - [C++] Hash table specializations for small integers -* [ARROW-1947](https://issues.apache.org/jira/browse/ARROW-1947) - [Plasma] Change Client Create and Get to use Buffers -* [ARROW-1951](https://issues.apache.org/jira/browse/ARROW-1951) - Add memcopy\_threads to serialization context -* [ARROW-1962](https://issues.apache.org/jira/browse/ARROW-1962) - [Java] Add reset() to ValueVector interface -* [ARROW-1965](https://issues.apache.org/jira/browse/ARROW-1965) - [GLib] Add garrow\_array\_builder\_get\_value\_data\_type() and garrow\_array\_builder\_get\_value\_type() -* [ARROW-1969](https://issues.apache.org/jira/browse/ARROW-1969) - [C++] Do not build ORC adapter by default -* [ARROW-1970](https://issues.apache.org/jira/browse/ARROW-1970) - [GLib] Add garrow\_chunked\_array\_get\_value\_data\_type() and garrow\_chunked\_array\_get\_value\_type() -* [ARROW-1977](https://issues.apache.org/jira/browse/ARROW-1977) - [C++] Update windows dev docs -* [ARROW-1978](https://issues.apache.org/jira/browse/ARROW-1978) - [Website] Add more visible link to "Powered By" page to front page, simplify Powered By -* [ARROW-2004](https://issues.apache.org/jira/browse/ARROW-2004) - [C++] Add shrink\_to\_fit option in BufferBuilder::Resize -* [ARROW-2007](https://issues.apache.org/jira/browse/ARROW-2007) - [Python] Sequence converter for float32 not implemented -* [ARROW-2011](https://issues.apache.org/jira/browse/ARROW-2011) - Allow setting the pickler to use in pyarrow serialization. 
-* [ARROW-2012](https://issues.apache.org/jira/browse/ARROW-2012) - [GLib] Support "make distclean" -* [ARROW-2018](https://issues.apache.org/jira/browse/ARROW-2018) - [C++] Build instruction on macOS and Homebrew is incomplete -* [ARROW-2019](https://issues.apache.org/jira/browse/ARROW-2019) - Control the memory allocated for inner vector in LIST -* [ARROW-2024](https://issues.apache.org/jira/browse/ARROW-2024) - [Python] Remove global SerializationContext variables -* [ARROW-2028](https://issues.apache.org/jira/browse/ARROW-2028) - [Python] extra\_cmake\_args needs to be passed through shlex.split -* [ARROW-2031](https://issues.apache.org/jira/browse/ARROW-2031) - HadoopFileSystem isn't pickleable -* [ARROW-2035](https://issues.apache.org/jira/browse/ARROW-2035) - [C++] Update vendored cpplint.py to a Py3-compatible one -* [ARROW-2036](https://issues.apache.org/jira/browse/ARROW-2036) - NativeFile should support standard IOBase methods -* [ARROW-2042](https://issues.apache.org/jira/browse/ARROW-2042) - [Plasma] Revert API change of plasma::Create to output a MutableBuffer -* [ARROW-2043](https://issues.apache.org/jira/browse/ARROW-2043) - [C++] Change description from OS X to macOS -* [ARROW-2046](https://issues.apache.org/jira/browse/ARROW-2046) - [Python] Add support for PEP519 - pathlib and similar objects -* [ARROW-2048](https://issues.apache.org/jira/browse/ARROW-2048) - [Python/C++] Update Thrift pin to 0.11 -* [ARROW-2050](https://issues.apache.org/jira/browse/ARROW-2050) - Support \`setup.py pytest\` to automatically fetch the test dependencies -* [ARROW-2052](https://issues.apache.org/jira/browse/ARROW-2052) - Unify OwnedRef and ScopedRef -* [ARROW-2053](https://issues.apache.org/jira/browse/ARROW-2053) - [C++] Build instruction is incomplete -* [ARROW-2054](https://issues.apache.org/jira/browse/ARROW-2054) - Compilation warnings -* [ARROW-2064](https://issues.apache.org/jira/browse/ARROW-2064) - [GLib] Add common build problems link to the install section -* [ARROW-2065](https://issues.apache.org/jira/browse/ARROW-2065) - Fix bug in SerializationContext.clone(). 
-* [ARROW-2066](https://issues.apache.org/jira/browse/ARROW-2066) - [Python] Document reading Parquet files from Azure Blob Store -* [ARROW-2068](https://issues.apache.org/jira/browse/ARROW-2068) - [Python] Expose Array's buffers to Python users -* [ARROW-2069](https://issues.apache.org/jira/browse/ARROW-2069) - [Python] Document that Plasma is not (yet) supported on Windows -* [ARROW-2071](https://issues.apache.org/jira/browse/ARROW-2071) - [Python] Reduce runtime of builds in Travis CI -* [ARROW-2071](https://issues.apache.org/jira/browse/ARROW-2071) - [Python] Reduce runtime of builds in Travis CI -* [ARROW-2073](https://issues.apache.org/jira/browse/ARROW-2073) - [Python] Create StructArray from sequence of tuples given a known data type -* [ARROW-2076](https://issues.apache.org/jira/browse/ARROW-2076) - [Python] Display slowest test durations -* [ARROW-2083](https://issues.apache.org/jira/browse/ARROW-2083) - Support skipping builds -* [ARROW-2084](https://issues.apache.org/jira/browse/ARROW-2084) - [C++] Support newer Brotli static library names -* [ARROW-2086](https://issues.apache.org/jira/browse/ARROW-2086) - [Python] Shrink size of arrow\_manylinux1\_x86\_64\_base docker image -* [ARROW-2087](https://issues.apache.org/jira/browse/ARROW-2087) - [Python] Binaries of 3rdparty are not stripped in manylinux1 base image -* [ARROW-2088](https://issues.apache.org/jira/browse/ARROW-2088) - [GLib] Add GArrowNumericArray -* [ARROW-2089](https://issues.apache.org/jira/browse/ARROW-2089) - [GLib] Rename to GARROW\_TYPE\_BOOLEAN for consistency -* [ARROW-2090](https://issues.apache.org/jira/browse/ARROW-2090) - [Python] Add context manager methods to ParquetWriter -* [ARROW-2093](https://issues.apache.org/jira/browse/ARROW-2093) - [Python] Possibly do not test pytorch serialization in Travis CI -* [ARROW-2094](https://issues.apache.org/jira/browse/ARROW-2094) - [Python] Use toolchain libraries and PROTOBUF\_HOME for protocol buffers -* [ARROW-2095](https://issues.apache.org/jira/browse/ARROW-2095) - [C++] Suppress ORC EP build logging by default -* [ARROW-2096](https://issues.apache.org/jira/browse/ARROW-2096) - [C++] Turn off Boost\_DEBUG to trim build output -* [ARROW-2099](https://issues.apache.org/jira/browse/ARROW-2099) - [Python] Support DictionaryArray::FromArrays in Python bindings -* [ARROW-2107](https://issues.apache.org/jira/browse/ARROW-2107) - [GLib] Follow arrow::gpu::CudaIpcMemHandle API change -* [ARROW-2108](https://issues.apache.org/jira/browse/ARROW-2108) - [Python] Update instructions for ASV -* [ARROW-2110](https://issues.apache.org/jira/browse/ARROW-2110) - [Python] Only require pytest-runner on test commands -* [ARROW-2111](https://issues.apache.org/jira/browse/ARROW-2111) - [C++] Linting could be faster -* [ARROW-2114](https://issues.apache.org/jira/browse/ARROW-2114) - [Python] Pull latest docker manylinux1 image -* [ARROW-2117](https://issues.apache.org/jira/browse/ARROW-2117) - [C++] Pin clang to version 5.0 -* [ARROW-2118](https://issues.apache.org/jira/browse/ARROW-2118) - [Python] Improve error message when calling parquet.read\_table on an empty file -* [ARROW-2120](https://issues.apache.org/jira/browse/ARROW-2120) - Add possibility to use empty \_MSVC\_STATIC\_LIB\_SUFFIX for Thirdparties -* [ARROW-2121](https://issues.apache.org/jira/browse/ARROW-2121) - [Python] Consider special casing object arrays in pandas serializers. 
-* [ARROW-2123](https://issues.apache.org/jira/browse/ARROW-2123) - [JS] Upgrade to TS 2.7.1 -* [ARROW-2132](https://issues.apache.org/jira/browse/ARROW-2132) - [Doc] Add links / mentions of Plasma store to main README -* [ARROW-2134](https://issues.apache.org/jira/browse/ARROW-2134) - [CI] Make Travis commit inspection more robust -* [ARROW-2137](https://issues.apache.org/jira/browse/ARROW-2137) - [Python] Don't print paths that are ignored when reading Parquet files -* [ARROW-2138](https://issues.apache.org/jira/browse/ARROW-2138) - [C++] Have FatalLog abort instead of exiting -* [ARROW-2142](https://issues.apache.org/jira/browse/ARROW-2142) - [Python] Conversion from Numpy struct array unimplemented -* [ARROW-2143](https://issues.apache.org/jira/browse/ARROW-2143) - [Python] Provide a manylinux1 wheel for cp27m -* [ARROW-2146](https://issues.apache.org/jira/browse/ARROW-2146) - [GLib] Implement Slice for ChunkedArray -* [ARROW-2149](https://issues.apache.org/jira/browse/ARROW-2149) - [Python] reorganize test\_convert\_pandas.py -* [ARROW-2154](https://issues.apache.org/jira/browse/ARROW-2154) - [Python] \_\_eq\_\_ unimplemented on Buffer -* [ARROW-2155](https://issues.apache.org/jira/browse/ARROW-2155) - [Python] pa.frombuffer(bytearray) returns immutable Buffer -* [ARROW-2156](https://issues.apache.org/jira/browse/ARROW-2156) - [CI] Isolate Sphinx dependencies -* [ARROW-2163](https://issues.apache.org/jira/browse/ARROW-2163) - Install apt dependencies separate from built-in Travis commands, retry on flakiness -* [ARROW-2166](https://issues.apache.org/jira/browse/ARROW-2166) - [GLib] Implement Slice for Column -* [ARROW-2168](https://issues.apache.org/jira/browse/ARROW-2168) - [C++] Build toolchain builds with jemalloc -* [ARROW-2169](https://issues.apache.org/jira/browse/ARROW-2169) - [C++] MSVC is complaining about uncaptured variables -* [ARROW-2174](https://issues.apache.org/jira/browse/ARROW-2174) - [JS] Export format and schema enums -* [ARROW-2176](https://issues.apache.org/jira/browse/ARROW-2176) - [C++] Extend DictionaryBuilder to support delta dictionaries -* [ARROW-2177](https://issues.apache.org/jira/browse/ARROW-2177) - [C++] Remove support for specifying negative scale values in DecimalType -* [ARROW-2180](https://issues.apache.org/jira/browse/ARROW-2180) - [C++] Remove APIs deprecated in 0.8.0 release -* [ARROW-2181](https://issues.apache.org/jira/browse/ARROW-2181) - [Python] Add concat\_tables to API reference, add documentation on use -* [ARROW-2184](https://issues.apache.org/jira/browse/ARROW-2184) - [C++] Add static constructor for FileOutputStream returning shared\_ptr to base OutputStream -* [ARROW-2185](https://issues.apache.org/jira/browse/ARROW-2185) - Remove CI directives from squashed commit messages -* [ARROW-2190](https://issues.apache.org/jira/browse/ARROW-2190) - [GLib] Add add/remove field functions for RecordBatch. -* [ARROW-2191](https://issues.apache.org/jira/browse/ARROW-2191) - [C++] Only use specific version of jemalloc -* [ARROW-2197](https://issues.apache.org/jira/browse/ARROW-2197) - Document "undefined symbol" issue and workaround -* [ARROW-2198](https://issues.apache.org/jira/browse/ARROW-2198) - [Python] Docstring for parquet.read\_table is misleading or incorrect -* [ARROW-2199](https://issues.apache.org/jira/browse/ARROW-2199) - [JAVA] Follow up fixes for ARROW-2019. 
Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree -* [ARROW-2203](https://issues.apache.org/jira/browse/ARROW-2203) - [C++] StderrStream class -* [ARROW-2204](https://issues.apache.org/jira/browse/ARROW-2204) - [C++] Build fails with TLS error on parquet-cpp clone -* [ARROW-2205](https://issues.apache.org/jira/browse/ARROW-2205) - [Python] Option for integer object nulls -* [ARROW-2206](https://issues.apache.org/jira/browse/ARROW-2206) - [JS] Add Perspective as a community project -* [ARROW-2218](https://issues.apache.org/jira/browse/ARROW-2218) - [Python] PythonFile should infer mode when not given -* [ARROW-2231](https://issues.apache.org/jira/browse/ARROW-2231) - [CI] Use clcache on AppVeyor -* [ARROW-2238](https://issues.apache.org/jira/browse/ARROW-2238) - [C++] Detect clcache in cmake configuration -* [ARROW-2239](https://issues.apache.org/jira/browse/ARROW-2239) - [C++] Update build docs for Windows -* [ARROW-2250](https://issues.apache.org/jira/browse/ARROW-2250) - plasma\_store process should cleanup on INT and TERM signals -* [ARROW-2252](https://issues.apache.org/jira/browse/ARROW-2252) - [Python] Create buffer from address, size and base -* [ARROW-2253](https://issues.apache.org/jira/browse/ARROW-2253) - [Python] Support \_\_eq\_\_ on scalar values -* [ARROW-2257](https://issues.apache.org/jira/browse/ARROW-2257) - [C++] Add high-level option to toggle CXX11 ABI -* [ARROW-2261](https://issues.apache.org/jira/browse/ARROW-2261) - [GLib] Can't share the same memory in GArrowBuffer safely -* [ARROW-2262](https://issues.apache.org/jira/browse/ARROW-2262) - [Python] Support slicing on pyarrow.ChunkedArray -* [ARROW-2279](https://issues.apache.org/jira/browse/ARROW-2279) - [Python] Better error message if lib cannot be found -* [ARROW-2282](https://issues.apache.org/jira/browse/ARROW-2282) - [Python] Create StringArray from buffers -* [ARROW-2283](https://issues.apache.org/jira/browse/ARROW-2283) - [C++] Support Arrow C++ installed in /usr detection by pkg-config -* [ARROW-2289](https://issues.apache.org/jira/browse/ARROW-2289) - [GLib] Add Numeric, Integer and FloatingPoint data types -* [ARROW-2291](https://issues.apache.org/jira/browse/ARROW-2291) - [C++] README missing instructions for libboost-regex-dev -* [ARROW-2292](https://issues.apache.org/jira/browse/ARROW-2292) - [Python] More consistent / intuitive name for pyarrow.frombuffer -* [ARROW-2309](https://issues.apache.org/jira/browse/ARROW-2309) - [C++] Use std::make\_unsigned -* [ARROW-2321](https://issues.apache.org/jira/browse/ARROW-2321) - [C++] Release verification script fails with if CMAKE\_INSTALL\_LIBDIR is not $ARROW\_HOME/lib -* [ARROW-2329](https://issues.apache.org/jira/browse/ARROW-2329) - [Website]: 0.9.0 release update -* [ARROW-2336](https://issues.apache.org/jira/browse/ARROW-2336) - [Website] Blog post for 0.9.0 release -* [ARROW-2768](https://issues.apache.org/jira/browse/ARROW-2768) - [Packaging] Support Ubuntu 18.04 -* [ARROW-2783](https://issues.apache.org/jira/browse/ARROW-2783) - Importing conda-forge pyarrow fails - - -## Bug Fixes - -* [ARROW-1345](https://issues.apache.org/jira/browse/ARROW-1345) - [Python] Conversion from nested NumPy arrays fails on integers other than int64, float32 -* [ARROW-1589](https://issues.apache.org/jira/browse/ARROW-1589) - [C++] Fuzzing for certain input formats -* [ARROW-1646](https://issues.apache.org/jira/browse/ARROW-1646) - [Python] pyarrow.array cannot handle NumPy scalar types -* 
[ARROW-1856](https://issues.apache.org/jira/browse/ARROW-1856) - [Python] Auto-detect Parquet ABI version when using PARQUET\_HOME -* [ARROW-1909](https://issues.apache.org/jira/browse/ARROW-1909) - [C++] Bug: Build fails on windows with "-DARROW\_BUILD\_BENCHMARKS=ON" -* [ARROW-1912](https://issues.apache.org/jira/browse/ARROW-1912) - [Website] Add org affiliations to committers.html -* [ARROW-1919](https://issues.apache.org/jira/browse/ARROW-1919) - Plasma hanging if object id is not 20 bytes -* [ARROW-1924](https://issues.apache.org/jira/browse/ARROW-1924) - [Python] Bring back pickle=True option for serialization -* [ARROW-1933](https://issues.apache.org/jira/browse/ARROW-1933) - [GLib] Build failure with --with-arrow-cpp-build-dir and GPU enabled Arrow C++ -* [ARROW-1940](https://issues.apache.org/jira/browse/ARROW-1940) - [Python] Extra metadata gets added after multiple conversions between pd.DataFrame and pa.Table -* [ARROW-1941](https://issues.apache.org/jira/browse/ARROW-1941) - Table <–\> DataFrame roundtrip failing -* [ARROW-1943](https://issues.apache.org/jira/browse/ARROW-1943) - Handle setInitialCapacity() for deeply nested lists of lists -* [ARROW-1944](https://issues.apache.org/jira/browse/ARROW-1944) - FindArrow has wrong ARROW\_STATIC\_LIB -* [ARROW-1945](https://issues.apache.org/jira/browse/ARROW-1945) - [C++] Fix doxygen documentation of array.h -* [ARROW-1946](https://issues.apache.org/jira/browse/ARROW-1946) - Add APIs to decimal vector for writing big endian data -* [ARROW-1948](https://issues.apache.org/jira/browse/ARROW-1948) - [Java] ListVector does not handle ipc with all non-null values with none set -* [ARROW-1950](https://issues.apache.org/jira/browse/ARROW-1950) - [Python] pandas\_type in pandas metadata incorrect for List types -* [ARROW-1953](https://issues.apache.org/jira/browse/ARROW-1953) - [JS] JavaScript builds broken on master -* [ARROW-1955](https://issues.apache.org/jira/browse/ARROW-1955) - MSVC generates "attempting to reference a deleted function" during build. -* [ARROW-1958](https://issues.apache.org/jira/browse/ARROW-1958) - [Python] Error in pandas conversion for datetimetz row index -* [ARROW-1961](https://issues.apache.org/jira/browse/ARROW-1961) - [Python] Writing Parquet file with flavor='spark' loses pandas schema metadata -* [ARROW-1966](https://issues.apache.org/jira/browse/ARROW-1966) - [C++] Support JAVA\_HOME paths in HDFS libjvm loading that include the jre directory -* [ARROW-1967](https://issues.apache.org/jira/browse/ARROW-1967) - Python: AssertionError w.r.t Pandas conversion on Parquet files in 0.8.0 dev version -* [ARROW-1971](https://issues.apache.org/jira/browse/ARROW-1971) - [Python] Add pandas serialization to the default -* [ARROW-1972](https://issues.apache.org/jira/browse/ARROW-1972) - Deserialization of buffer objects (and pandas dataframes) segfaults on different processes. -* [ARROW-1973](https://issues.apache.org/jira/browse/ARROW-1973) - [Python] Memory leak when converting Arrow tables with array columns to Pandas dataframes. 
-* [ARROW-1976](https://issues.apache.org/jira/browse/ARROW-1976) - [Python] Handling unicode pandas columns on parquet.read\_table -* [ARROW-1979](https://issues.apache.org/jira/browse/ARROW-1979) - [JS] JS builds hanging in es2015:umd tests -* [ARROW-1980](https://issues.apache.org/jira/browse/ARROW-1980) - [Python] Race condition in \`write\_to\_dataset\` -* [ARROW-1982](https://issues.apache.org/jira/browse/ARROW-1982) - [Python] Return parquet statistics min/max as values instead of strings -* [ARROW-1986](https://issues.apache.org/jira/browse/ARROW-1986) - [Python] HadoopFileSystem is not picklable and cannot currently be used with multiprocessing -* [ARROW-1991](https://issues.apache.org/jira/browse/ARROW-1991) - [GLib] Docker-based documentation build is broken -* [ARROW-1992](https://issues.apache.org/jira/browse/ARROW-1992) - [Python] to\_pandas crashes when using strings\_to\_categoricals on empty string cols on 0.8.0 -* [ARROW-1997](https://issues.apache.org/jira/browse/ARROW-1997) - [Python] to\_pandas with strings\_to\_categorical fails -* [ARROW-1998](https://issues.apache.org/jira/browse/ARROW-1998) - [Python] Table.from\_pandas crashes when data frame is empty -* [ARROW-1999](https://issues.apache.org/jira/browse/ARROW-1999) - [Python] from\_numpy\_dtype returns wrong types -* [ARROW-2000](https://issues.apache.org/jira/browse/ARROW-2000) - Deduplicate file descriptors when plasma store replies to get request. -* [ARROW-2002](https://issues.apache.org/jira/browse/ARROW-2002) - use pyarrow download file will raise queue.Full exceptions sometimes -* [ARROW-2003](https://issues.apache.org/jira/browse/ARROW-2003) - [Python] Do not use deprecated kwarg in pandas.core.internals.make\_block -* [ARROW-2005](https://issues.apache.org/jira/browse/ARROW-2005) - [Python] pyflakes warnings on Cython files not failing build -* [ARROW-2008](https://issues.apache.org/jira/browse/ARROW-2008) - [Python] Type inference for int32 NumPy arrays (expecting list) returns int64 and then conversion fails -* [ARROW-2010](https://issues.apache.org/jira/browse/ARROW-2010) - [C++] Compiler warnings with CHECKIN warning level in ORC adapter -* [ARROW-2017](https://issues.apache.org/jira/browse/ARROW-2017) - Array initialization with large (\>2\*\*31-1) uint64 values fails -* [ARROW-2023](https://issues.apache.org/jira/browse/ARROW-2023) - [C++] Test opening IPC stream reader or file reader on an empty InputStream -* [ARROW-2025](https://issues.apache.org/jira/browse/ARROW-2025) - [Python/C++] HDFS Client disconnect closes all open clients -* [ARROW-2029](https://issues.apache.org/jira/browse/ARROW-2029) - [Python] Program crash on \`HdfsFile.tell\` if file is closed -* [ARROW-2032](https://issues.apache.org/jira/browse/ARROW-2032) - [C++] ORC ep installs on each call to ninja build (even if no work to do) -* [ARROW-2033](https://issues.apache.org/jira/browse/ARROW-2033) - pa.array() doesn't work with iterators -* [ARROW-2039](https://issues.apache.org/jira/browse/ARROW-2039) - [Python] pyarrow.Buffer().to\_pybytes() segfaults -* [ARROW-2040](https://issues.apache.org/jira/browse/ARROW-2040) - [Python] Deserialized Numpy array must keep ref to underlying tensor -* [ARROW-2047](https://issues.apache.org/jira/browse/ARROW-2047) - [Python] test\_serialization.py uses a python executable in PATH rather than that used for a test run -* [ARROW-2049](https://issues.apache.org/jira/browse/ARROW-2049) - [Python] Use python -m cython to run Cython, instead of CYTHON\_EXECUTABLE -* 
[ARROW-2062](https://issues.apache.org/jira/browse/ARROW-2062) - [C++] Stalled builds in test\_serialization.py in Travis CI -* [ARROW-2070](https://issues.apache.org/jira/browse/ARROW-2070) - [Python] chdir logic in setup.py buggy -* [ARROW-2072](https://issues.apache.org/jira/browse/ARROW-2072) - [Python] decimal128.byte\_width crashes -* [ARROW-2080](https://issues.apache.org/jira/browse/ARROW-2080) - [Python] Update documentation after ARROW-2024 -* [ARROW-2085](https://issues.apache.org/jira/browse/ARROW-2085) - HadoopFileSystem.isdir and .isfile should return False if the path doesn't exist -* [ARROW-2106](https://issues.apache.org/jira/browse/ARROW-2106) - [Python] pyarrow.array can't take a pandas Series of python datetime objects. -* [ARROW-2109](https://issues.apache.org/jira/browse/ARROW-2109) - [C++] Boost 1.66 compilation fails on Windows on linkage stage -* [ARROW-2124](https://issues.apache.org/jira/browse/ARROW-2124) - [Python] ArrowInvalid raised if the first item of a nested list of numpy arrays is empty -* [ARROW-2128](https://issues.apache.org/jira/browse/ARROW-2128) - [Python] Cannot serialize array of empty lists -* [ARROW-2129](https://issues.apache.org/jira/browse/ARROW-2129) - [Python] Segmentation fault on conversion of empty array to Pandas -* [ARROW-2131](https://issues.apache.org/jira/browse/ARROW-2131) - [Python] Serialization test fails on Windows when library has been built in place / not installed -* [ARROW-2133](https://issues.apache.org/jira/browse/ARROW-2133) - [Python] Segmentation fault on conversion of empty nested arrays to Pandas -* [ARROW-2135](https://issues.apache.org/jira/browse/ARROW-2135) - [Python] NaN values silently casted to int64 when passing explicit schema for conversion in Table.from\_pandas -* [ARROW-2139](https://issues.apache.org/jira/browse/ARROW-2139) - [Python] Address Sphinx deprecation warning when building docs -* [ARROW-2145](https://issues.apache.org/jira/browse/ARROW-2145) - [Python] Decimal conversion not working for NaN values -* [ARROW-2150](https://issues.apache.org/jira/browse/ARROW-2150) - [Python] array equality defaults to identity -* [ARROW-2151](https://issues.apache.org/jira/browse/ARROW-2151) - [Python] Error when converting from list of uint64 arrays -* [ARROW-2153](https://issues.apache.org/jira/browse/ARROW-2153) - [C++/Python] Decimal conversion not working for exponential notation -* [ARROW-2157](https://issues.apache.org/jira/browse/ARROW-2157) - [Python] Decimal arrays cannot be constructed from Python lists -* [ARROW-2158](https://issues.apache.org/jira/browse/ARROW-2158) - [Python] Construction of Decimal array with None or np.nan fails -* [ARROW-2160](https://issues.apache.org/jira/browse/ARROW-2160) - [C++/Python] Fix decimal precision inference -* [ARROW-2161](https://issues.apache.org/jira/browse/ARROW-2161) - [Python] Skip test\_cython\_api if ARROW\_HOME isn't defined -* [ARROW-2162](https://issues.apache.org/jira/browse/ARROW-2162) - [Python/C++] Decimal Values with too-high precision are multiplied by 100 -* [ARROW-2167](https://issues.apache.org/jira/browse/ARROW-2167) - [C++] Building Orc extensions fails with the default BUILD\_WARNING\_LEVEL=Production -* [ARROW-2170](https://issues.apache.org/jira/browse/ARROW-2170) - [Python] construct\_metadata fails on reading files where no index was preserved -* [ARROW-2171](https://issues.apache.org/jira/browse/ARROW-2171) - [Python] OwnedRef is fragile -* [ARROW-2172](https://issues.apache.org/jira/browse/ARROW-2172) - [Python] Incorrect conversion 
from Numpy array when stride % itemsize != 0 -* [ARROW-2173](https://issues.apache.org/jira/browse/ARROW-2173) - [Python] NumPyBuffer destructor should hold the GIL -* [ARROW-2175](https://issues.apache.org/jira/browse/ARROW-2175) - [Python] arrow\_ep build is triggering during parquet-cpp build in Travis CI -* [ARROW-2178](https://issues.apache.org/jira/browse/ARROW-2178) - [JS] Fix JS html FileReader example -* [ARROW-2179](https://issues.apache.org/jira/browse/ARROW-2179) - [C++] arrow/util/io-util.h missing from libarrow-dev -* [ARROW-2192](https://issues.apache.org/jira/browse/ARROW-2192) - Commits to master should run all builds in CI matrix -* [ARROW-2194](https://issues.apache.org/jira/browse/ARROW-2194) - [Python] Pandas columns metadata incorrect for empty string columns -* [ARROW-2208](https://issues.apache.org/jira/browse/ARROW-2208) - [Python] install issues with jemalloc -* [ARROW-2209](https://issues.apache.org/jira/browse/ARROW-2209) - [Python] Partition columns are not correctly loaded in schema of ParquetDataset -* [ARROW-2210](https://issues.apache.org/jira/browse/ARROW-2210) - [C++] TestBuffer\_ResizeOOM has a memory leak with jemalloc -* [ARROW-2212](https://issues.apache.org/jira/browse/ARROW-2212) - [C++/Python] Build Protobuf in base manylinux 1 docker image -* [ARROW-2223](https://issues.apache.org/jira/browse/ARROW-2223) - [JS] installing umd release throws an error -* [ARROW-2227](https://issues.apache.org/jira/browse/ARROW-2227) - [Python] Table.from\_pandas does not create chunked\_arrays. -* [ARROW-2228](https://issues.apache.org/jira/browse/ARROW-2228) - [Python] Unsigned int type for arrow Table not supported -* [ARROW-2230](https://issues.apache.org/jira/browse/ARROW-2230) - [Python] JS version number is sometimes picked up -* [ARROW-2232](https://issues.apache.org/jira/browse/ARROW-2232) - [Python] pyarrow.Tensor constructor segfaults -* [ARROW-2234](https://issues.apache.org/jira/browse/ARROW-2234) - [JS] Read timestamp low bits as Uint32s -* [ARROW-2240](https://issues.apache.org/jira/browse/ARROW-2240) - [Python] Array initialization with leading numpy nan fails with exception -* [ARROW-2244](https://issues.apache.org/jira/browse/ARROW-2244) - [C++] Slicing NullArray should not cause the null count on the internal data to be unknown -* [ARROW-2245](https://issues.apache.org/jira/browse/ARROW-2245) - [Python] Revert static linkage of parquet-cpp in manylinux1 wheel -* [ARROW-2246](https://issues.apache.org/jira/browse/ARROW-2246) - [Python] Use namespaced boost in manylinux1 package -* [ARROW-2251](https://issues.apache.org/jira/browse/ARROW-2251) - [GLib] Destroying GArrowBuffer while GArrowTensor that uses the buffer causes a crash -* [ARROW-2254](https://issues.apache.org/jira/browse/ARROW-2254) - [Python] Local in-place dev versions picking up JS tags -* [ARROW-2258](https://issues.apache.org/jira/browse/ARROW-2258) - [C++] Appveyor builds failing on master -* [ARROW-2263](https://issues.apache.org/jira/browse/ARROW-2263) - [Python] test\_cython.py fails if pyarrow is not in import path (e.g. with inplace builds) -* [ARROW-2265](https://issues.apache.org/jira/browse/ARROW-2265) - [Python] Serializing subclasses of np.ndarray returns a np.ndarray. 
-* [ARROW-2268](https://issues.apache.org/jira/browse/ARROW-2268) - Remove MD5 checksums from release process -* [ARROW-2269](https://issues.apache.org/jira/browse/ARROW-2269) - [Python] Cannot build bdist\_wheel for Python -* [ARROW-2270](https://issues.apache.org/jira/browse/ARROW-2270) - [Python] ForeignBuffer doesn't tie Python object lifetime to C++ buffer lifetime -* [ARROW-2272](https://issues.apache.org/jira/browse/ARROW-2272) - [Python] test\_plasma spams /tmp -* [ARROW-2275](https://issues.apache.org/jira/browse/ARROW-2275) - [C++] Buffer::mutable\_data\_ member uninitialized -* [ARROW-2280](https://issues.apache.org/jira/browse/ARROW-2280) - [Python] pyarrow.Array.buffers should also include the offsets -* [ARROW-2284](https://issues.apache.org/jira/browse/ARROW-2284) - [Python] test\_plasma error on plasma\_store error -* [ARROW-2288](https://issues.apache.org/jira/browse/ARROW-2288) - [Python] slicing logic defective -* [ARROW-2297](https://issues.apache.org/jira/browse/ARROW-2297) - [JS] babel-jest is not listed as a dev dependency -* [ARROW-2304](https://issues.apache.org/jira/browse/ARROW-2304) - [C++] MultipleClients test in io-hdfs-test fails on trunk -* [ARROW-2306](https://issues.apache.org/jira/browse/ARROW-2306) - [Python] HDFS test failures -* [ARROW-2307](https://issues.apache.org/jira/browse/ARROW-2307) - [Python] Unable to read arrow stream containing 0 record batches -* [ARROW-2311](https://issues.apache.org/jira/browse/ARROW-2311) - [Python] Struct array slicing defective -* [ARROW-2312](https://issues.apache.org/jira/browse/ARROW-2312) - [JS] verify-release-candidate-sh must be updated to include JS in integration tests -* [ARROW-2313](https://issues.apache.org/jira/browse/ARROW-2313) - [GLib] Release builds must define NDEBUG -* [ARROW-2316](https://issues.apache.org/jira/browse/ARROW-2316) - [C++] Revert Buffer::mutable\_data member to always inline -* [ARROW-2318](https://issues.apache.org/jira/browse/ARROW-2318) - [C++] TestPlasmaStore.MultipleClientTest is flaky (hangs) in release builds -* [ARROW-2320](https://issues.apache.org/jira/browse/ARROW-2320) - [C++] Vendored Boost build does not build regex library -* [ARROW-2406](https://issues.apache.org/jira/browse/ARROW-2406) - [Python] Segfault when creating PyArrow table from Pandas for empty string column when schema provided - - - -# Apache Arrow 0.8.0 (2017-12-18) - -## Bug Fixes - -* [ARROW-226](https://issues.apache.org/jira/browse/ARROW-226) - [C++] libhdfs: feedback to help determining cause of failure in opening file path -* [ARROW-641](https://issues.apache.org/jira/browse/ARROW-641) - [C++] Do not build/run io-hdfs-test if ARROW\_HDFS=off -* [ARROW-1282](https://issues.apache.org/jira/browse/ARROW-1282) - Large memory reallocation by Arrow causes hang in jemalloc -* [ARROW-1298](https://issues.apache.org/jira/browse/ARROW-1298) - C++: Add prefix to jemalloc functions to guard against issues when using multiple allocators in the same process -* [ARROW-1341](https://issues.apache.org/jira/browse/ARROW-1341) - [C++] Deprecate arrow::MakeTable in favor of new ctor from ARROW-1334 -* [ARROW-1347](https://issues.apache.org/jira/browse/ARROW-1347) - [JAVA] List null type should use consistent name for inner field -* [ARROW-1398](https://issues.apache.org/jira/browse/ARROW-1398) - [Python] No support reading columns of type decimal(19,4) -* [ARROW-1409](https://issues.apache.org/jira/browse/ARROW-1409) - [Format] Use for "page" attribute in Buffer in metadata -* 
[ARROW-1431](https://issues.apache.org/jira/browse/ARROW-1431) - [Java] JsonFileReader doesn't initialize some vectors appropriately -* [ARROW-1436](https://issues.apache.org/jira/browse/ARROW-1436) - PyArrow Timestamps written to Parquet as INT96 appear in Spark as 'bigint' -* [ARROW-1540](https://issues.apache.org/jira/browse/ARROW-1540) - [C++] Fix valgrind warnings in cuda-test if possible -* [ARROW-1541](https://issues.apache.org/jira/browse/ARROW-1541) - [C++] Race condition with arrow\_gpu -* [ARROW-1543](https://issues.apache.org/jira/browse/ARROW-1543) - [C++] row\_wise\_conversion example doesn't correspond to ListBuilder constructor arguments -* [ARROW-1549](https://issues.apache.org/jira/browse/ARROW-1549) - [JS] Integrate auto-generated Arrow test files -* [ARROW-1555](https://issues.apache.org/jira/browse/ARROW-1555) - [Python] write\_to\_dataset on s3 -* [ARROW-1584](https://issues.apache.org/jira/browse/ARROW-1584) - [PYTHON] serialize\_pandas on empty dataframe -* [ARROW-1585](https://issues.apache.org/jira/browse/ARROW-1585) - serialize\_pandas round trip fails on integer columns -* [ARROW-1586](https://issues.apache.org/jira/browse/ARROW-1586) - [PYTHON] serialize\_pandas roundtrip loses columns name -* [ARROW-1609](https://issues.apache.org/jira/browse/ARROW-1609) - Plasma: Build fails with Xcode 9.0 -* [ARROW-1615](https://issues.apache.org/jira/browse/ARROW-1615) - CXX flags for development more permissive than Travis CI builds -* [ARROW-1617](https://issues.apache.org/jira/browse/ARROW-1617) - [Python] Do not use symlinks in python/cmake\_modules -* [ARROW-1620](https://issues.apache.org/jira/browse/ARROW-1620) - Python: Download Boost in manylinux1 build from bintray -* [ARROW-1622](https://issues.apache.org/jira/browse/ARROW-1622) - [Plasma] Plasma doesn't compile with XCode 9 -* [ARROW-1624](https://issues.apache.org/jira/browse/ARROW-1624) - [C++] Follow up fixes / tweaks to compiler warnings for Plasma / LLVM 4.0, add to readme -* [ARROW-1625](https://issues.apache.org/jira/browse/ARROW-1625) - [Serialization] Support OrderedDict properly -* [ARROW-1629](https://issues.apache.org/jira/browse/ARROW-1629) - [C++] Fix problematic code paths identified by infer tool -* [ARROW-1633](https://issues.apache.org/jira/browse/ARROW-1633) - [Python] numpy "unicode" arrays not understood -* [ARROW-1640](https://issues.apache.org/jira/browse/ARROW-1640) - Resolve OpenSSL issues in Travis CI -* [ARROW-1647](https://issues.apache.org/jira/browse/ARROW-1647) - [Plasma] Potential bug when reading/writing messages. -* [ARROW-1653](https://issues.apache.org/jira/browse/ARROW-1653) - [Plasma] Use static cast to avoid compiler warning. 
-* [ARROW-1655](https://issues.apache.org/jira/browse/ARROW-1655) - [Java] Add Scale and Precision to ValueVectorTypes.tdd for Decimals -* [ARROW-1656](https://issues.apache.org/jira/browse/ARROW-1656) - [C++] Endianness Macro is Incorrect on Windows And Mac -* [ARROW-1657](https://issues.apache.org/jira/browse/ARROW-1657) - [C++] Multithreaded Read Test Failing on Arch Linux -* [ARROW-1658](https://issues.apache.org/jira/browse/ARROW-1658) - [Python] Out of bounds dictionary indices causes segfault after converting to pandas -* [ARROW-1663](https://issues.apache.org/jira/browse/ARROW-1663) - [Java] Follow up on ARROW-1347 and make schema backward compatible -* [ARROW-1670](https://issues.apache.org/jira/browse/ARROW-1670) - [Python] Speed up deserialization code path -* [ARROW-1672](https://issues.apache.org/jira/browse/ARROW-1672) - [Python] Failure to write Feather bytes column -* [ARROW-1673](https://issues.apache.org/jira/browse/ARROW-1673) - [Python] NumPy boolean arrays get converted to uint8 arrays on NdarrayToTensor roundtrip -* [ARROW-1676](https://issues.apache.org/jira/browse/ARROW-1676) - [C++] Correctly truncate oversized validity bitmaps when writing Feather format -* [ARROW-1678](https://issues.apache.org/jira/browse/ARROW-1678) - [Python] Incorrect serialization of numpy.float16 -* [ARROW-1680](https://issues.apache.org/jira/browse/ARROW-1680) - [Python] Timestamp unit change not done in from\_pandas() conversion -* [ARROW-1681](https://issues.apache.org/jira/browse/ARROW-1681) - [Python] Error writing with nulls in lists -* [ARROW-1686](https://issues.apache.org/jira/browse/ARROW-1686) - Documentation generation script creates "apidocs" directory under site/java -* [ARROW-1693](https://issues.apache.org/jira/browse/ARROW-1693) - [JS] Error reading dictionary-encoded integration test files -* [ARROW-1694](https://issues.apache.org/jira/browse/ARROW-1694) - [Java] Unclosed VectorSchemaRoot in JsonFileReader\#readDictionaryBatches() -* [ARROW-1695](https://issues.apache.org/jira/browse/ARROW-1695) - [Serialization] Fix reference counting of numpy arrays created in custom serialializer -* [ARROW-1698](https://issues.apache.org/jira/browse/ARROW-1698) - [JS] File reader attempts to load the same dictionary batch more than once -* [ARROW-1704](https://issues.apache.org/jira/browse/ARROW-1704) - [GLib] Go example in test suite is broken -* [ARROW-1708](https://issues.apache.org/jira/browse/ARROW-1708) - [JS] Linter problem breaks master build -* [ARROW-1709](https://issues.apache.org/jira/browse/ARROW-1709) - [C++] Decimal.ToString is incorrect for negative scale -* [ARROW-1711](https://issues.apache.org/jira/browse/ARROW-1711) - [Python] flake8 checks still not failing builds -* [ARROW-1714](https://issues.apache.org/jira/browse/ARROW-1714) - [Python] No named pd.Series name serialized as u'None' -* [ARROW-1720](https://issues.apache.org/jira/browse/ARROW-1720) - [Python] Segmentation fault while trying to access an out-of-bound chunk -* [ARROW-1723](https://issues.apache.org/jira/browse/ARROW-1723) - Windows: \_\_declspec(dllexport) specified when building arrow static library -* [ARROW-1730](https://issues.apache.org/jira/browse/ARROW-1730) - [Python] Incorrect result from pyarrow.array when passing timestamp type -* [ARROW-1732](https://issues.apache.org/jira/browse/ARROW-1732) - [Python] RecordBatch.from\_pandas fails on DataFrame with no columns when preserve\_index=False -* [ARROW-1735](https://issues.apache.org/jira/browse/ARROW-1735) - [C++] Cast kernels cannot write 
into sliced output array -* [ARROW-1738](https://issues.apache.org/jira/browse/ARROW-1738) - [Python] Wrong datetime conversion when pa.array with unit -* [ARROW-1739](https://issues.apache.org/jira/browse/ARROW-1739) - [Python] Fix usages of assertRaises causing broken build -* [ARROW-1742](https://issues.apache.org/jira/browse/ARROW-1742) - C++: clang-format is not detected correct on OSX anymore -* [ARROW-1743](https://issues.apache.org/jira/browse/ARROW-1743) - [Python] Table to\_pandas fails when index contains categorical column -* [ARROW-1745](https://issues.apache.org/jira/browse/ARROW-1745) - Compilation failure on Mac OS in plasma tests -* [ARROW-1749](https://issues.apache.org/jira/browse/ARROW-1749) - [C++] Handle range of Decimal128 values that require 39 digits to be displayed -* [ARROW-1751](https://issues.apache.org/jira/browse/ARROW-1751) - [Python] Pandas 0.21.0 introduces a breaking API change for MultiIndex construction -* [ARROW-1754](https://issues.apache.org/jira/browse/ARROW-1754) - [Python] Fix buggy Parquet roundtrip when an index name is the same as a column name -* [ARROW-1756](https://issues.apache.org/jira/browse/ARROW-1756) - [Python] Observed int32 overflow in Feather write/read path -* [ARROW-1762](https://issues.apache.org/jira/browse/ARROW-1762) - [C++] unittest failure for language environment -* [ARROW-1764](https://issues.apache.org/jira/browse/ARROW-1764) - [Python] Add -c conda-forge for Windows dev installation instructions -* [ARROW-1766](https://issues.apache.org/jira/browse/ARROW-1766) - [GLib] Fix failing builds on OSX -* [ARROW-1768](https://issues.apache.org/jira/browse/ARROW-1768) - [Python] Fix suppressed exception in ParquetWriter.\_\_del\_\_ -* [ARROW-1769](https://issues.apache.org/jira/browse/ARROW-1769) - Python: pyarrow.parquet.write\_to\_dataset creates cyclic references -* [ARROW-1770](https://issues.apache.org/jira/browse/ARROW-1770) - [GLib] Fix GLib compiler warning -* [ARROW-1771](https://issues.apache.org/jira/browse/ARROW-1771) - [C++] ARROW-1749 Breaks Public API test in parquet-cpp -* [ARROW-1776](https://issues.apache.org/jira/browse/ARROW-1776) - [C++[ arrow::gpu::CudaContext::bytes\_allocated() isn't defined -* [ARROW-1778](https://issues.apache.org/jira/browse/ARROW-1778) - [Python] Link parquet-cpp statically, privately in manylinux1 wheels -* [ARROW-1781](https://issues.apache.org/jira/browse/ARROW-1781) - [CI] OSX Builds on Travis-CI time out often -* [ARROW-1788](https://issues.apache.org/jira/browse/ARROW-1788) - Plasma store crashes when trying to abort objects for disconnected client -* [ARROW-1791](https://issues.apache.org/jira/browse/ARROW-1791) - Integration tests generate date[DAY] values outside of reasonable range -* [ARROW-1793](https://issues.apache.org/jira/browse/ARROW-1793) - [Integration] fix a typo for README.md -* [ARROW-1800](https://issues.apache.org/jira/browse/ARROW-1800) - [C++] Fix and simplify random\_decimals -* [ARROW-1805](https://issues.apache.org/jira/browse/ARROW-1805) - [Python] ignore non-parquet files when exploring dataset -* [ARROW-1811](https://issues.apache.org/jira/browse/ARROW-1811) - [C++/Python] Rename all Decimal based APIs to Decimal128 -* [ARROW-1812](https://issues.apache.org/jira/browse/ARROW-1812) - Plasma store modifies hash table while iterating during client disconnect -* [ARROW-1813](https://issues.apache.org/jira/browse/ARROW-1813) - Enforce checkstyle failure in JAVA build and fix all checkstyle -* [ARROW-1821](https://issues.apache.org/jira/browse/ARROW-1821) - 
Add integration test case to explicitly check for optional validity buffer -* [ARROW-1829](https://issues.apache.org/jira/browse/ARROW-1829) - [Plasma] Clean up eviction policy bookkeeping -* [ARROW-1830](https://issues.apache.org/jira/browse/ARROW-1830) - [Python] Error when loading all the files in a dictionary -* [ARROW-1831](https://issues.apache.org/jira/browse/ARROW-1831) - [Python] Docker-based documentation build does not properly set LD\_LIBRARY\_PATH -* [ARROW-1836](https://issues.apache.org/jira/browse/ARROW-1836) - [C++] Fix C4996 warning from arrow/util/variant.h on MSVC builds -* [ARROW-1839](https://issues.apache.org/jira/browse/ARROW-1839) - [C++/Python] Add Decimal Parquet Read/Write Tests -* [ARROW-1840](https://issues.apache.org/jira/browse/ARROW-1840) - [Website] The installation command failed on Windows10 anaconda environment. -* [ARROW-1845](https://issues.apache.org/jira/browse/ARROW-1845) - [Python] Expose Decimal128Type -* [ARROW-1852](https://issues.apache.org/jira/browse/ARROW-1852) - [Plasma] Make retrieving manager file descriptor const -* [ARROW-1853](https://issues.apache.org/jira/browse/ARROW-1853) - [Plasma] Fix off-by-one error in retry processing -* [ARROW-1863](https://issues.apache.org/jira/browse/ARROW-1863) - [Python] PyObjectStringify could render bytes-like output for more types of objects -* [ARROW-1865](https://issues.apache.org/jira/browse/ARROW-1865) - [C++] Adding a column to an empty Table fails -* [ARROW-1869](https://issues.apache.org/jira/browse/ARROW-1869) - Fix typo in LowCostIdentityHashMap -* [ARROW-1871](https://issues.apache.org/jira/browse/ARROW-1871) - [Python/C++] Appending Python Decimals with different scales requires rescaling -* [ARROW-1873](https://issues.apache.org/jira/browse/ARROW-1873) - [Python] Segmentation fault when loading total 2GB of parquet files -* [ARROW-1877](https://issues.apache.org/jira/browse/ARROW-1877) - Incorrect comparison in JsonStringArrayList.equals -* [ARROW-1879](https://issues.apache.org/jira/browse/ARROW-1879) - [Python] Dask integration tests are not skipped if dask is not installed -* [ARROW-1881](https://issues.apache.org/jira/browse/ARROW-1881) - [Python] setuptools\_scm picks up JS version tags -* [ARROW-1882](https://issues.apache.org/jira/browse/ARROW-1882) - [C++] Reintroduce DictionaryBuilder -* [ARROW-1883](https://issues.apache.org/jira/browse/ARROW-1883) - [Python] BUG: Table.to\_pandas metadata checking fails if columns are not present -* [ARROW-1889](https://issues.apache.org/jira/browse/ARROW-1889) - [Python] --exclude is not available in older git versions -* [ARROW-1890](https://issues.apache.org/jira/browse/ARROW-1890) - [Python] Masking for date32 arrays not working -* [ARROW-1891](https://issues.apache.org/jira/browse/ARROW-1891) - [Python] NaT date32 values are only converted to nulls if from\_pandas is used -* [ARROW-1892](https://issues.apache.org/jira/browse/ARROW-1892) - [Python] Unknown list item type: binary -* [ARROW-1893](https://issues.apache.org/jira/browse/ARROW-1893) - [Python] test\_primitive\_serialization fails on Python 2.7.3 -* [ARROW-1895](https://issues.apache.org/jira/browse/ARROW-1895) - [Python] Add field\_name to pandas index metadata -* [ARROW-1897](https://issues.apache.org/jira/browse/ARROW-1897) - [Python] Incorrect numpy\_type for pandas metadata of Categoricals -* [ARROW-1904](https://issues.apache.org/jira/browse/ARROW-1904) - [C++] Deprecate PrimitiveArray::raw\_values -* [ARROW-1906](https://issues.apache.org/jira/browse/ARROW-1906) - [Python] 
Creating a pyarrow.Array with timestamp of different unit is not casted -* [ARROW-1908](https://issues.apache.org/jira/browse/ARROW-1908) - [Python] Construction of arrow table from pandas DataFrame with duplicate column names crashes -* [ARROW-1910](https://issues.apache.org/jira/browse/ARROW-1910) - CPP README Brewfile link incorrect -* [ARROW-1914](https://issues.apache.org/jira/browse/ARROW-1914) - [C++] make -j may fail to build with -DARROW\_GPU=on -* [ARROW-1915](https://issues.apache.org/jira/browse/ARROW-1915) - [Python] Parquet tests should be optional -* [ARROW-1916](https://issues.apache.org/jira/browse/ARROW-1916) - [Java] Do not exclude java/dev/checkstyle from source releases -* [ARROW-1917](https://issues.apache.org/jira/browse/ARROW-1917) - [GLib] Must set GI\_TYPELIB\_PATH in verify-release-candidate.sh -* [ARROW-1935](https://issues.apache.org/jira/browse/ARROW-1935) - Download page must not link to snapshots / nightly builds -* [ARROW-1936](https://issues.apache.org/jira/browse/ARROW-1936) - Broken links to signatures/hashes etc -* [ARROW-1939](https://issues.apache.org/jira/browse/ARROW-1939) - Correct links in release 0.8 blog post - - -## New Features and Improvements - -* [ARROW-480](https://issues.apache.org/jira/browse/ARROW-480) - [Python] Add accessors for Parquet column statistics -* [ARROW-504](https://issues.apache.org/jira/browse/ARROW-504) - [Python] Add adapter to write pandas.DataFrame in user-selected chunk size to streaming format -* [ARROW-507](https://issues.apache.org/jira/browse/ARROW-507) - [C++/Python] Construct List container from offsets and values subarrays -* [ARROW-541](https://issues.apache.org/jira/browse/ARROW-541) - [JS] Implement JavaScript-compatible implementation -* [ARROW-571](https://issues.apache.org/jira/browse/ARROW-571) - [Python] Add APIs to build Parquet files incrementally from Arrow tables -* [ARROW-587](https://issues.apache.org/jira/browse/ARROW-587) - Add JIRA fix version to merge tool -* [ARROW-609](https://issues.apache.org/jira/browse/ARROW-609) - [C++] Function for casting from days since UNIX epoch to int64 date -* [ARROW-838](https://issues.apache.org/jira/browse/ARROW-838) - [Python] Efficient construction of arrays from non-pandas 1D NumPy arrays -* [ARROW-905](https://issues.apache.org/jira/browse/ARROW-905) - [Docs] Add Dockerfile for reproducible documentation generation -* [ARROW-911](https://issues.apache.org/jira/browse/ARROW-911) - [Python] Expand development.rst with build instructions without conda -* [ARROW-942](https://issues.apache.org/jira/browse/ARROW-942) - Support integration testing on Python 2.7 -* [ARROW-950](https://issues.apache.org/jira/browse/ARROW-950) - [Site] Add Google Analytics tag -* [ARROW-972](https://issues.apache.org/jira/browse/ARROW-972) - [Python] Add test cases and basic APIs for UnionArray -* [ARROW-1032](https://issues.apache.org/jira/browse/ARROW-1032) - [JS] Support custom\_metadata -* [ARROW-1047](https://issues.apache.org/jira/browse/ARROW-1047) - [Java] Add generalized stream writer and reader interfaces that are decoupled from IO / message framing -* [ARROW-1047](https://issues.apache.org/jira/browse/ARROW-1047) - [Java] Add generalized stream writer and reader interfaces that are decoupled from IO / message framing -* [ARROW-1087](https://issues.apache.org/jira/browse/ARROW-1087) - [Python] add get\_include to expose directory containing header files -* [ARROW-1114](https://issues.apache.org/jira/browse/ARROW-1114) - [C++] Create Record Batch Builder class as a 
reusable and efficient way to transpose row-by-row data to columns -* [ARROW-1134](https://issues.apache.org/jira/browse/ARROW-1134) - [C++] Allow C++/CLI projects to build with Arrow​ -* [ARROW-1178](https://issues.apache.org/jira/browse/ARROW-1178) - [Python] Create alternative to Table.from\_pandas that yields a list of RecordBatch objects with a given chunk size -* [ARROW-1226](https://issues.apache.org/jira/browse/ARROW-1226) - [C++] Improve / correct doxygen function documentation in arrow::ipc -* [ARROW-1250](https://issues.apache.org/jira/browse/ARROW-1250) - [Python] Define API for user type checking of array types -* [ARROW-1362](https://issues.apache.org/jira/browse/ARROW-1362) - [Integration] Validate vector type layout in IPC messages -* [ARROW-1367](https://issues.apache.org/jira/browse/ARROW-1367) - [Website] Divide CHANGELOG issues by component and add subheaders -* [ARROW-1369](https://issues.apache.org/jira/browse/ARROW-1369) - Support boolean types in the javascript arrow reader library -* [ARROW-1371](https://issues.apache.org/jira/browse/ARROW-1371) - [Website] Add "Powered By" page to the website -* [ARROW-1455](https://issues.apache.org/jira/browse/ARROW-1455) - [Python] Add Dockerfile for validating Dask integration outside of usual CI -* [ARROW-1471](https://issues.apache.org/jira/browse/ARROW-1471) - [JAVA] Document requirements and non/requirements for ValueVector updates -* [ARROW-1472](https://issues.apache.org/jira/browse/ARROW-1472) - [JAVA] Design updated ValueVector Object Hierarchy -* [ARROW-1473](https://issues.apache.org/jira/browse/ARROW-1473) - [JAVA] Create Prototype Code Hierarchy (Implementation Phase 1) -* [ARROW-1474](https://issues.apache.org/jira/browse/ARROW-1474) - [JAVA] ValueVector hierarchy (Implementation Phase 2) -* [ARROW-1476](https://issues.apache.org/jira/browse/ARROW-1476) - [JAVA] Implement final ValueVector updates -* [ARROW-1482](https://issues.apache.org/jira/browse/ARROW-1482) - [C++] Implement casts between date32 and date64 -* [ARROW-1483](https://issues.apache.org/jira/browse/ARROW-1483) - [C++] Implement casts between time32 and time64 -* [ARROW-1484](https://issues.apache.org/jira/browse/ARROW-1484) - [C++] Implement (safe and unsafe) casts between timestamps and times of different units -* [ARROW-1485](https://issues.apache.org/jira/browse/ARROW-1485) - [C++] Implement union-like data type for accommodating kernel arguments which may be scalars or arrays -* [ARROW-1486](https://issues.apache.org/jira/browse/ARROW-1486) - [C++] Decide if arrow::RecordBatch needs to be copyable -* [ARROW-1487](https://issues.apache.org/jira/browse/ARROW-1487) - [C++] Implement casts from List to List, where a cast function is defined from any A to B -* [ARROW-1488](https://issues.apache.org/jira/browse/ARROW-1488) - [C++] Implement ArrayBuilder::Finish in terms of internal::ArrayData -* [ARROW-1498](https://issues.apache.org/jira/browse/ARROW-1498) - [GitHub] Add CONTRIBUTING.md and ISSUE\_TEMPLATE.md -* [ARROW-1503](https://issues.apache.org/jira/browse/ARROW-1503) - [Python] Add serialization callbacks for pandas objects in pyarrow.serialize -* [ARROW-1522](https://issues.apache.org/jira/browse/ARROW-1522) - [C++] Support pyarrow.Buffer as built-in type in pyarrow.serialize -* [ARROW-1523](https://issues.apache.org/jira/browse/ARROW-1523) - [C++] Add helper data struct with methods for reading a validity bitmap possibly having a non-zero offset -* [ARROW-1524](https://issues.apache.org/jira/browse/ARROW-1524) - [C++] More graceful solution 
for handling non-zero offsets on inputs and outputs in compute library -* [ARROW-1525](https://issues.apache.org/jira/browse/ARROW-1525) - [C++] Change functions in arrow/compare.h to not return Status -* [ARROW-1526](https://issues.apache.org/jira/browse/ARROW-1526) - [Python] Unit tests to exercise code path in PARQUET-1100 -* [ARROW-1535](https://issues.apache.org/jira/browse/ARROW-1535) - [Python] Enable sdist source tarballs to build assuming that Arrow C++ libraries are available on the host system -* [ARROW-1538](https://issues.apache.org/jira/browse/ARROW-1538) - [C++] Support Ubuntu 14.04 in .deb packaging automation -* [ARROW-1539](https://issues.apache.org/jira/browse/ARROW-1539) - [C++] Remove functions deprecated as of 0.7.0 and prior releases -* [ARROW-1556](https://issues.apache.org/jira/browse/ARROW-1556) - [C++] Incorporate AssertArraysEqual function from PARQUET-1100 patch -* [ARROW-1559](https://issues.apache.org/jira/browse/ARROW-1559) - [C++] Kernel implementations for "unique" (compute distinct elements of array) -* [ARROW-1573](https://issues.apache.org/jira/browse/ARROW-1573) - [C++] Implement stateful kernel function that uses DictionaryBuilder to compute dictionary indices -* [ARROW-1575](https://issues.apache.org/jira/browse/ARROW-1575) - [Python] Add pyarrow.column factory function -* [ARROW-1576](https://issues.apache.org/jira/browse/ARROW-1576) - [Python] Add utility functions (or a richer type hierachy) for checking whether data type instances are members of various type classes -* [ARROW-1577](https://issues.apache.org/jira/browse/ARROW-1577) - [JS] Package release script for NPM modules -* [ARROW-1588](https://issues.apache.org/jira/browse/ARROW-1588) - [C++/Format] Harden Decimal Format -* [ARROW-1593](https://issues.apache.org/jira/browse/ARROW-1593) - [PYTHON] serialize\_pandas should pass through the preserve\_index keyword -* [ARROW-1594](https://issues.apache.org/jira/browse/ARROW-1594) - [Python] Enable multi-threaded conversions in Table.from\_pandas -* [ARROW-1600](https://issues.apache.org/jira/browse/ARROW-1600) - [C++] Zero-copy Buffer constructor from std::string -* [ARROW-1602](https://issues.apache.org/jira/browse/ARROW-1602) - [C++] Add IsValid/IsNotNull method to arrow::Array -* [ARROW-1603](https://issues.apache.org/jira/browse/ARROW-1603) - [C++] Add BinaryArray method to get a value as a std::string -* [ARROW-1604](https://issues.apache.org/jira/browse/ARROW-1604) - [Python] Support common type aliases in cast(...) and various type= arguments -* [ARROW-1605](https://issues.apache.org/jira/browse/ARROW-1605) - [Python] pyarrow.array should be able to yield smaller integer types without an explicit cast -* [ARROW-1607](https://issues.apache.org/jira/browse/ARROW-1607) - [C++] Implement DictionaryBuilder for Decimals -* [ARROW-1613](https://issues.apache.org/jira/browse/ARROW-1613) - [Java] ArrowReader should not close the input ReadChannel -* [ARROW-1616](https://issues.apache.org/jira/browse/ARROW-1616) - [Python] Add "write" method to RecordBatchStreamWriter that dispatches to write\_table/write\_back as appropriate -* [ARROW-1626](https://issues.apache.org/jira/browse/ARROW-1626) - Add make targets to run the inter-procedural static analysis tool called "infer". 
-* [ARROW-1627](https://issues.apache.org/jira/browse/ARROW-1627) - [JAVA] Reduce heap usage(Phase 2) - memory footprint in AllocationManager.BufferLedger -* [ARROW-1630](https://issues.apache.org/jira/browse/ARROW-1630) - [Serialization] Support Python datetime objects -* [ARROW-1631](https://issues.apache.org/jira/browse/ARROW-1631) - [C++] Add GRPC to ThirdpartyToolchain.cmake -* [ARROW-1635](https://issues.apache.org/jira/browse/ARROW-1635) - Add release management guide for PMCs -* [ARROW-1637](https://issues.apache.org/jira/browse/ARROW-1637) - [C++] IPC round-trip for null type -* [ARROW-1641](https://issues.apache.org/jira/browse/ARROW-1641) - [C++] Do not include in public headers -* [ARROW-1648](https://issues.apache.org/jira/browse/ARROW-1648) - C++: Add cast from Dictionary[NullType] to NullType -* [ARROW-1649](https://issues.apache.org/jira/browse/ARROW-1649) - C++: Print number of nulls in PrettyPrint for NullArray -* [ARROW-1651](https://issues.apache.org/jira/browse/ARROW-1651) - [JS] Lazy row accessor in Table -* [ARROW-1652](https://issues.apache.org/jira/browse/ARROW-1652) - [JS] Separate Vector into BatchVector and CompositeVector -* [ARROW-1654](https://issues.apache.org/jira/browse/ARROW-1654) - [Python] pa.DataType cannot be pickled -* [ARROW-1662](https://issues.apache.org/jira/browse/ARROW-1662) - Move OSX Dependency management into brew bundle Brewfiles -* [ARROW-1665](https://issues.apache.org/jira/browse/ARROW-1665) - [Serialization] Support more custom datatypes in the default serialization context -* [ARROW-1666](https://issues.apache.org/jira/browse/ARROW-1666) - [GLib] Enable gtk-doc on Travis CI Mac environment -* [ARROW-1667](https://issues.apache.org/jira/browse/ARROW-1667) - [GLib] Support Meson -* [ARROW-1671](https://issues.apache.org/jira/browse/ARROW-1671) - [C++] Change arrow::MakeArray to not return Status -* [ARROW-1675](https://issues.apache.org/jira/browse/ARROW-1675) - [Python] Use RecordBatch.from\_pandas in FeatherWriter.write -* [ARROW-1677](https://issues.apache.org/jira/browse/ARROW-1677) - [Blog] Add blog post on Ray and Arrow Python serialization -* [ARROW-1679](https://issues.apache.org/jira/browse/ARROW-1679) - [GLib] Add garrow\_record\_batch\_reader\_read\_next() -* [ARROW-1683](https://issues.apache.org/jira/browse/ARROW-1683) - [Python] Restore "TimestampType" to pyarrow namespace -* [ARROW-1684](https://issues.apache.org/jira/browse/ARROW-1684) - [Python] Simplify user API for reading nested Parquet columns -* [ARROW-1685](https://issues.apache.org/jira/browse/ARROW-1685) - [GLib] Add GArrowTableReader -* [ARROW-1687](https://issues.apache.org/jira/browse/ARROW-1687) - [Python] Expose UnionArray to pyarrow -* [ARROW-1689](https://issues.apache.org/jira/browse/ARROW-1689) - [Python] Categorical Indices Should Be Zero-Copy -* [ARROW-1689](https://issues.apache.org/jira/browse/ARROW-1689) - [Python] Categorical Indices Should Be Zero-Copy -* [ARROW-1690](https://issues.apache.org/jira/browse/ARROW-1690) - [GLib] Add garrow\_array\_is\_valid() -* [ARROW-1691](https://issues.apache.org/jira/browse/ARROW-1691) - [Java] Conform Java Decimal type implementation to format decisions in ARROW-1588 -* [ARROW-1697](https://issues.apache.org/jira/browse/ARROW-1697) - [GitHub] Add ISSUE\_TEMPLATE.md -* [ARROW-1701](https://issues.apache.org/jira/browse/ARROW-1701) - [Serialization] Support zero copy PyTorch Tensor serialization -* [ARROW-1702](https://issues.apache.org/jira/browse/ARROW-1702) - Update jemalloc in manylinux1 build -* 
[ARROW-1703](https://issues.apache.org/jira/browse/ARROW-1703) - [C++] Vendor exact version of jemalloc we depend on -* [ARROW-1707](https://issues.apache.org/jira/browse/ARROW-1707) - Update dev README after movement to GitBox -* [ARROW-1710](https://issues.apache.org/jira/browse/ARROW-1710) - [Java] Remove non-nullable vectors in new vector class hierarchy -* [ARROW-1716](https://issues.apache.org/jira/browse/ARROW-1716) - [Format/JSON] Use string integer value for Decimals in JSON -* [ARROW-1717](https://issues.apache.org/jira/browse/ARROW-1717) - [Java] Remove public static helper method in vector classes for JSONReader/Writer -* [ARROW-1718](https://issues.apache.org/jira/browse/ARROW-1718) - [Python] Implement casts from timestamp to date32/date64 and support in Array.from\_pandas -* [ARROW-1719](https://issues.apache.org/jira/browse/ARROW-1719) - [Java] Remove accessor/mutator -* [ARROW-1721](https://issues.apache.org/jira/browse/ARROW-1721) - [Python] Support null mask in places where it isn't supported in numpy\_to\_arrow.cc -* [ARROW-1724](https://issues.apache.org/jira/browse/ARROW-1724) - [Packaging] Support Ubuntu 17.10 -* [ARROW-1725](https://issues.apache.org/jira/browse/ARROW-1725) - [Packaging] Upload .deb for Ubuntu 17.10 -* [ARROW-1726](https://issues.apache.org/jira/browse/ARROW-1726) - [GLib] Add setup description to verify C GLib build -* [ARROW-1727](https://issues.apache.org/jira/browse/ARROW-1727) - [Format] Expand Arrow streaming format to permit new dictionaries and deltas / additions to existing dictionaries -* [ARROW-1728](https://issues.apache.org/jira/browse/ARROW-1728) - [C++] Run clang-format checks in Travis CI -* [ARROW-1734](https://issues.apache.org/jira/browse/ARROW-1734) - C++/Python: Add cast function on Column-level -* [ARROW-1736](https://issues.apache.org/jira/browse/ARROW-1736) - [GLib] Add GArrowCastOptions:allow-time-truncate -* [ARROW-1737](https://issues.apache.org/jira/browse/ARROW-1737) - [GLib] Use G\_DECLARE\_DERIVABLE\_TYPE -* [ARROW-1740](https://issues.apache.org/jira/browse/ARROW-1740) - C++: Kernel to get unique values of an Array/Column -* [ARROW-1746](https://issues.apache.org/jira/browse/ARROW-1746) - [Python] Add build dependencies for Arch Linux -* [ARROW-1747](https://issues.apache.org/jira/browse/ARROW-1747) - [C++] Don't export symbols of statically linked libraries -* [ARROW-1748](https://issues.apache.org/jira/browse/ARROW-1748) - [GLib] Add GArrowRecordBatchBuilder -* [ARROW-1750](https://issues.apache.org/jira/browse/ARROW-1750) - [C++] Remove the need for arrow/util/random.h -* [ARROW-1752](https://issues.apache.org/jira/browse/ARROW-1752) - [Packaging] Add GPU packages for Debian and Ubuntu -* [ARROW-1753](https://issues.apache.org/jira/browse/ARROW-1753) - [Python] Provide for matching subclasses with register\_type in serialization context -* [ARROW-1755](https://issues.apache.org/jira/browse/ARROW-1755) - [C++] Add build options for MSVC to use static runtime libraries -* [ARROW-1758](https://issues.apache.org/jira/browse/ARROW-1758) - [Python] Remove pickle=True option for object serialization -* [ARROW-1759](https://issues.apache.org/jira/browse/ARROW-1759) - [Python] Add function / property to get implied Arrow schema from Parquet file -* [ARROW-1763](https://issues.apache.org/jira/browse/ARROW-1763) - [Python] DataType should be hashable -* [ARROW-1765](https://issues.apache.org/jira/browse/ARROW-1765) - [Doc] Use dependencies from conda in C++ docker build -* 
[ARROW-1767](https://issues.apache.org/jira/browse/ARROW-1767) - [C++] Support file reads and writes over 2GB on Windows -* [ARROW-1772](https://issues.apache.org/jira/browse/ARROW-1772) - [C++] Add public-api-test module in style of parquet-cpp -* [ARROW-1773](https://issues.apache.org/jira/browse/ARROW-1773) - [C++] Add casts from date/time types to compatible signed integers -* [ARROW-1775](https://issues.apache.org/jira/browse/ARROW-1775) - Ability to abort created but unsealed Plasma objects -* [ARROW-1777](https://issues.apache.org/jira/browse/ARROW-1777) - [C++] Add static ctor ArrayData::Make for nicer syntax in places -* [ARROW-1779](https://issues.apache.org/jira/browse/ARROW-1779) - [Java] Integration test breaks without zeroing out validity vectors -* [ARROW-1782](https://issues.apache.org/jira/browse/ARROW-1782) - [Python] Expose compressors as pyarrow.compress, pyarrow.decompress -* [ARROW-1783](https://issues.apache.org/jira/browse/ARROW-1783) - [Python] Convert SerializedPyObject to/from sequence of component buffers with minimal memory allocation / copying -* [ARROW-1784](https://issues.apache.org/jira/browse/ARROW-1784) - [Python] Read and write pandas.DataFrame in pyarrow.serialize by decomposing the BlockManager rather than coercing to Arrow format -* [ARROW-1785](https://issues.apache.org/jira/browse/ARROW-1785) - [Format/C++/Java] Remove VectorLayout metadata from Flatbuffers metadata -* [ARROW-1787](https://issues.apache.org/jira/browse/ARROW-1787) - [Python] Support reading parquet files into DataFrames in a backward compatible way -* [ARROW-1794](https://issues.apache.org/jira/browse/ARROW-1794) - [C++/Python] Rename DecimalArray to Decimal128Array -* [ARROW-1795](https://issues.apache.org/jira/browse/ARROW-1795) - [Plasma C++] change evict policy -* [ARROW-1801](https://issues.apache.org/jira/browse/ARROW-1801) - [Docs] Update install instructions to use red-data-tools repos -* [ARROW-1802](https://issues.apache.org/jira/browse/ARROW-1802) - [GLib] Add Arrow GPU support -* [ARROW-1806](https://issues.apache.org/jira/browse/ARROW-1806) - [GLib] Add garrow\_record\_batch\_writer\_write\_table() -* [ARROW-1808](https://issues.apache.org/jira/browse/ARROW-1808) - [C++] Make RecordBatch interface virtual to permit record batches that lazy-materialize columns -* [ARROW-1809](https://issues.apache.org/jira/browse/ARROW-1809) - [GLib] Use .xml instead of .sgml for GTK-Doc main file -* [ARROW-1810](https://issues.apache.org/jira/browse/ARROW-1810) - [Plasma] Remove test shell scripts -* [ARROW-1816](https://issues.apache.org/jira/browse/ARROW-1816) - [Java] Resolve new vector classes structure for timestamp, date and maybe interval -* [ARROW-1817](https://issues.apache.org/jira/browse/ARROW-1817) - Configure JsonFileReader to read NaN for floats -* [ARROW-1818](https://issues.apache.org/jira/browse/ARROW-1818) - Examine Java Dependencies -* [ARROW-1819](https://issues.apache.org/jira/browse/ARROW-1819) - [Java] Remove legacy vector classes -* [ARROW-1820](https://issues.apache.org/jira/browse/ARROW-1820) - [C++] Create arrow\_compute shared library subcomponent -* [ARROW-1826](https://issues.apache.org/jira/browse/ARROW-1826) - [JAVA] Avoid branching at cell level (copyFrom) -* [ARROW-1827](https://issues.apache.org/jira/browse/ARROW-1827) - [Java] Add checkstyle config file and header file -* [ARROW-1828](https://issues.apache.org/jira/browse/ARROW-1828) - [C++] Implement hash kernel specialization for BooleanType -* 
[ARROW-1834](https://issues.apache.org/jira/browse/ARROW-1834) - [Doc] Build documentation in separate build folders -* [ARROW-1838](https://issues.apache.org/jira/browse/ARROW-1838) - [C++] Use compute::Datum uniformly for input argument to kernels -* [ARROW-1841](https://issues.apache.org/jira/browse/ARROW-1841) - [JS] Update text-encoding-utf-8 and tslib for node ESModules support -* [ARROW-1844](https://issues.apache.org/jira/browse/ARROW-1844) - [C++] Basic benchmark suite for hash kernels -* [ARROW-1849](https://issues.apache.org/jira/browse/ARROW-1849) - [GLib] Add input checks to GArrowRecordBatch -* [ARROW-1850](https://issues.apache.org/jira/browse/ARROW-1850) - [C++] Use const void\* in Writable::Write instead of const uint8\_t\* -* [ARROW-1854](https://issues.apache.org/jira/browse/ARROW-1854) - [Python] Improve performance of serializing object dtype ndarrays -* [ARROW-1855](https://issues.apache.org/jira/browse/ARROW-1855) - [GLib] Add workaround for build failure on macOS -* [ARROW-1857](https://issues.apache.org/jira/browse/ARROW-1857) - [Python] Add switch for boost linkage with static parquet in wheels -* [ARROW-1859](https://issues.apache.org/jira/browse/ARROW-1859) - [GLib] Add GArrowDictionaryDataType -* [ARROW-1862](https://issues.apache.org/jira/browse/ARROW-1862) - [GLib] Add GArrowDictionaryArray -* [ARROW-1864](https://issues.apache.org/jira/browse/ARROW-1864) - [Java] Upgrade Netty to 4.1.x -* [ARROW-1866](https://issues.apache.org/jira/browse/ARROW-1866) - [Java] Combine MapVector and NonNullableMapVector Classes -* [ARROW-1867](https://issues.apache.org/jira/browse/ARROW-1867) - [Java] Add BitVector APIs from old vector class -* [ARROW-1874](https://issues.apache.org/jira/browse/ARROW-1874) - [GLib] Add garrow\_array\_unique() -* [ARROW-1878](https://issues.apache.org/jira/browse/ARROW-1878) - [GLib] Add garrow\_array\_dictionary\_encode() -* [ARROW-1884](https://issues.apache.org/jira/browse/ARROW-1884) - [C++] Make JsonReader/JsonWriter classes internal APIs -* [ARROW-1885](https://issues.apache.org/jira/browse/ARROW-1885) - [Java] Restore previous MapVector class names -* [ARROW-1901](https://issues.apache.org/jira/browse/ARROW-1901) - [Python] Support recursive mkdir for DaskFilesystem -* [ARROW-1902](https://issues.apache.org/jira/browse/ARROW-1902) - [Python] Remove mkdir race condition from write\_to\_dataset -* [ARROW-1905](https://issues.apache.org/jira/browse/ARROW-1905) - [Python] Add more functions for checking exact types in pyarrow.types -* [ARROW-1911](https://issues.apache.org/jira/browse/ARROW-1911) - Add Graphistry to Arrow JS proof points -* [ARROW-1922](https://issues.apache.org/jira/browse/ARROW-1922) - Blog post on recent improvements/changes in JAVA Vectors -* [ARROW-1932](https://issues.apache.org/jira/browse/ARROW-1932) - [Website] Update site for 0.8.0 -* [ARROW-1934](https://issues.apache.org/jira/browse/ARROW-1934) - [Website] Blog post summarizing highlights of 0.8.0 release - - - -# Apache Arrow 0.7.1 (2017-10-01) - -## New Features and Improvements - -* [ARROW-559](https://issues.apache.org/jira/browse/ARROW-559) - Script to easily verify release in all languages -* [ARROW-1464](https://issues.apache.org/jira/browse/ARROW-1464) - [GLib] Documentation for troubleshooting of build errors -* [ARROW-1537](https://issues.apache.org/jira/browse/ARROW-1537) - [C++] Support building with full path install\_name on macOS -* [ARROW-1546](https://issues.apache.org/jira/browse/ARROW-1546) - [GLib] Support GLib 2.40 again -* 
[ARROW-1548](https://issues.apache.org/jira/browse/ARROW-1548) - [GLib] Support build append in builder -* [ARROW-1578](https://issues.apache.org/jira/browse/ARROW-1578) - [C++/Python] Run lint checks in Travis CI to fail for linting issues as early as possible -* [ARROW-1592](https://issues.apache.org/jira/browse/ARROW-1592) - [GLib] Add GArrowUIntArrayBuilder -* [ARROW-1608](https://issues.apache.org/jira/browse/ARROW-1608) - Support Release verification script on macOS -* [ARROW-1612](https://issues.apache.org/jira/browse/ARROW-1612) - [GLib] add how to install for mac os to README -* [ARROW-1618](https://issues.apache.org/jira/browse/ARROW-1618) - [JAVA] Reduce Heap Usage (Phase 1): move release listener logic to Allocation Manager -* [ARROW-1634](https://issues.apache.org/jira/browse/ARROW-1634) - [Website] Updates for 0.7.1 release - - -## Bug Fixes - -* [ARROW-1497](https://issues.apache.org/jira/browse/ARROW-1497) - [Java] JsonFileReader doesn't set value count for some vectors -* [ARROW-1500](https://issues.apache.org/jira/browse/ARROW-1500) - [C++] Result of ftruncate ignored in MemoryMappedFile::Create -* [ARROW-1529](https://issues.apache.org/jira/browse/ARROW-1529) - [GLib] Fix failure on macOS on Travis CI -* [ARROW-1533](https://issues.apache.org/jira/browse/ARROW-1533) - [JAVA] realloc should consider the existing buffer capacity for computing target memory requirement -* [ARROW-1536](https://issues.apache.org/jira/browse/ARROW-1536) - [C++] Do not transitively depend on libboost\_system -* [ARROW-1542](https://issues.apache.org/jira/browse/ARROW-1542) - [C++] Windows release verification script should not modify conda environment -* [ARROW-1544](https://issues.apache.org/jira/browse/ARROW-1544) - [JS] Export Vector type definitions -* [ARROW-1545](https://issues.apache.org/jira/browse/ARROW-1545) - Int64Builder should not need int64() as arg -* [ARROW-1547](https://issues.apache.org/jira/browse/ARROW-1547) - [JAVA] Fix 8x memory over-allocation in BitVector -* [ARROW-1550](https://issues.apache.org/jira/browse/ARROW-1550) - [Python] Fix flaky test on Windows -* [ARROW-1553](https://issues.apache.org/jira/browse/ARROW-1553) - [JAVA] Implement setInitialCapacity for MapWriter and pass on this capacity during lazy creation of child vectors -* [ARROW-1554](https://issues.apache.org/jira/browse/ARROW-1554) - [Python] Document that pip wheels depend on MSVC14 runtime -* [ARROW-1557](https://issues.apache.org/jira/browse/ARROW-1557) - [PYTHON] pyarrow.Table.from\_arrays doesn't validate names length -* [ARROW-1590](https://issues.apache.org/jira/browse/ARROW-1590) - Flow TS Table method generics -* [ARROW-1591](https://issues.apache.org/jira/browse/ARROW-1591) - C++: Xcode 9 is not correctly detected -* [ARROW-1595](https://issues.apache.org/jira/browse/ARROW-1595) - [Python] Fix package dependency issues causing build failures -* [ARROW-1598](https://issues.apache.org/jira/browse/ARROW-1598) - [C++/Tutorials] Mismatch code comment and actual code about Object ID -* [ARROW-1601](https://issues.apache.org/jira/browse/ARROW-1601) - [C++] READ\_NEXT\_BITSET reads one byte past the last byte on last iteration -* [ARROW-1606](https://issues.apache.org/jira/browse/ARROW-1606) - Python: Windows wheels don't include .lib files. 
-* [ARROW-1610](https://issues.apache.org/jira/browse/ARROW-1610) - C++/Python: Only call python-prefix if the default PYTHON\_LIBRARY is not present -* [ARROW-1611](https://issues.apache.org/jira/browse/ARROW-1611) - Crash in BitmapReader when length is zero -* [ARROW-1619](https://issues.apache.org/jira/browse/ARROW-1619) - [Java] Correctly set "lastSet" for variable vectors in JsonReader - - - -# Apache Arrow 0.7.0 (2017-09-17) - -## Bug Fixes - -* [ARROW-12](https://issues.apache.org/jira/browse/ARROW-12) - Get Github activity mirrored to JIRA -* [ARROW-248](https://issues.apache.org/jira/browse/ARROW-248) - UnionVector.close() should call clear() -* [ARROW-269](https://issues.apache.org/jira/browse/ARROW-269) - UnionVector getBuffers method does not include typevector -* [ARROW-407](https://issues.apache.org/jira/browse/ARROW-407) - BitVector.copyFromSafe() should re-allocate if necessary instead of returning false -* [ARROW-801](https://issues.apache.org/jira/browse/ARROW-801) - [JAVA] Provide direct access to underlying buffer memory addresses in consistent way without generating garbage or large amount indirections -* [ARROW-1302](https://issues.apache.org/jira/browse/ARROW-1302) - C++: ${MAKE} variable not set sometimes on older MacOS installations -* [ARROW-1332](https://issues.apache.org/jira/browse/ARROW-1332) - [Packaging] Building Windows wheels in Apache repos -* [ARROW-1354](https://issues.apache.org/jira/browse/ARROW-1354) - [Python] Segfault in Table.from\_pandas with Mixed-Type Categories -* [ARROW-1357](https://issues.apache.org/jira/browse/ARROW-1357) - [Python] Data corruption in reading multi-file parquet dataset -* [ARROW-1363](https://issues.apache.org/jira/browse/ARROW-1363) - [C++] IPC writer sends buffer layout for dictionary rather than indices -* [ARROW-1365](https://issues.apache.org/jira/browse/ARROW-1365) - [Python] Remove usage of removed jemalloc\_memory\_pool in Python API docs -* [ARROW-1373](https://issues.apache.org/jira/browse/ARROW-1373) - [Java] Implement getBuffer() methods at the ValueVector interface -* [ARROW-1375](https://issues.apache.org/jira/browse/ARROW-1375) - [C++] Visual Studio 2017 Appveyor builds failing -* [ARROW-1378](https://issues.apache.org/jira/browse/ARROW-1378) - [Python] whl is not a supported wheel on this platform on Debian/Jessie -* [ARROW-1379](https://issues.apache.org/jira/browse/ARROW-1379) - [Java] maven dependency issues - both unused and undeclared -* [ARROW-1390](https://issues.apache.org/jira/browse/ARROW-1390) - [Python] Extend tests for python serialization -* [ARROW-1407](https://issues.apache.org/jira/browse/ARROW-1407) - Dictionaries can only hold a maximum of 4096 indices -* [ARROW-1411](https://issues.apache.org/jira/browse/ARROW-1411) - [Python] Booleans in Float Columns cause Segfault -* [ARROW-1414](https://issues.apache.org/jira/browse/ARROW-1414) - [GLib] Cast after status check -* [ARROW-1421](https://issues.apache.org/jira/browse/ARROW-1421) - [Python] pyarrow.serialize cannot serialize a Python dict input -* [ARROW-1426](https://issues.apache.org/jira/browse/ARROW-1426) - [Website] The title element of the top page is empty -* [ARROW-1429](https://issues.apache.org/jira/browse/ARROW-1429) - [Python] Error loading parquet file with \_metadata from HDFS -* [ARROW-1430](https://issues.apache.org/jira/browse/ARROW-1430) - [Python] flake8 warnings are not failing CI builds -* [ARROW-1434](https://issues.apache.org/jira/browse/ARROW-1434) - [C++/Python] pyarrow.Array.from\_pandas does not support 
datetime64[D] arrays -* [ARROW-1435](https://issues.apache.org/jira/browse/ARROW-1435) - [Python] PyArrow not propagating timezone information from Parquet to Python -* [ARROW-1437](https://issues.apache.org/jira/browse/ARROW-1437) - [Python] pa.Array.from\_pandas segfaults when given a mixed-type array -* [ARROW-1439](https://issues.apache.org/jira/browse/ARROW-1439) - [Packaging] Automate updating RPM in RPM build -* [ARROW-1443](https://issues.apache.org/jira/browse/ARROW-1443) - [Java] Bug on ArrowBuf.setBytes with unsliced ByteBuffers -* [ARROW-1444](https://issues.apache.org/jira/browse/ARROW-1444) - [JAVA] BitVector.splitAndTransfer copies last byte incorrectly -* [ARROW-1446](https://issues.apache.org/jira/browse/ARROW-1446) - Python: Writing more than 2^31 rows from pandas dataframe causes row count overflow error -* [ARROW-1450](https://issues.apache.org/jira/browse/ARROW-1450) - [Python] Raise proper error if custom serialization handler fails -* [ARROW-1452](https://issues.apache.org/jira/browse/ARROW-1452) - [C++] Make UNUSED macro name more unique so it does not conflict with thirdparty projects -* [ARROW-1452](https://issues.apache.org/jira/browse/ARROW-1452) - [C++] Make UNUSED macro name more unique so it does not conflict with thirdparty projects -* [ARROW-1453](https://issues.apache.org/jira/browse/ARROW-1453) - [Python] Implement WriteTensor for non-contiguous tensors -* [ARROW-1457](https://issues.apache.org/jira/browse/ARROW-1457) - [C++] Optimize strided WriteTensor -* [ARROW-1458](https://issues.apache.org/jira/browse/ARROW-1458) - [Python] Document that HadoopFileSystem.mkdir with create\_parents=False has no effect -* [ARROW-1459](https://issues.apache.org/jira/browse/ARROW-1459) - [Python] PyArrow fails to load partitioned parquet files with non-primitive types -* [ARROW-1461](https://issues.apache.org/jira/browse/ARROW-1461) - [C++] Disable builds using LLVM apt packages temporarily -* [ARROW-1461](https://issues.apache.org/jira/browse/ARROW-1461) - [C++] Disable builds using LLVM apt packages temporarily -* [ARROW-1467](https://issues.apache.org/jira/browse/ARROW-1467) - [JAVA]: Fix reset() and allocateNew() in Nullable Value Vectors template -* [ARROW-1469](https://issues.apache.org/jira/browse/ARROW-1469) - Segfault when serialize Pandas series with mixed object type -* [ARROW-1490](https://issues.apache.org/jira/browse/ARROW-1490) - [Java] Allow Travis CI failures for JDK9 for now -* [ARROW-1493](https://issues.apache.org/jira/browse/ARROW-1493) - [C++] Flush the output stream at the end of each PrettyPrint function -* [ARROW-1495](https://issues.apache.org/jira/browse/ARROW-1495) - [C++] Store shared\_ptr to boxed arrays in RecordBatch -* [ARROW-1507](https://issues.apache.org/jira/browse/ARROW-1507) - [C++] arrow/compute/api.h can't be used without arrow/array.h -* [ARROW-1512](https://issues.apache.org/jira/browse/ARROW-1512) - [Docs] NumericArray has no member named 'raw\_data' -* [ARROW-1514](https://issues.apache.org/jira/browse/ARROW-1514) - [C++] Fix a typo in document -* [ARROW-1527](https://issues.apache.org/jira/browse/ARROW-1527) - Fix Travis JDK9 build -* [ARROW-1531](https://issues.apache.org/jira/browse/ARROW-1531) - [C++] Return ToBytes by value from Decimal128 -* [ARROW-1532](https://issues.apache.org/jira/browse/ARROW-1532) - [Python] Referencing an Empty Schema causes a SegFault - - -## New Features and Improvements - -* [ARROW-34](https://issues.apache.org/jira/browse/ARROW-34) - C++: establish a basic function evaluation model -* 
[ARROW-229](https://issues.apache.org/jira/browse/ARROW-229) - [C++] Implement safe casts for primitive types -* [ARROW-592](https://issues.apache.org/jira/browse/ARROW-592) - [C++] Provide .deb and .rpm packages -* [ARROW-594](https://issues.apache.org/jira/browse/ARROW-594) - [Python] Provide interface to write pyarrow.Table to a stream -* [ARROW-695](https://issues.apache.org/jira/browse/ARROW-695) - Integration tests for Decimal types -* [ARROW-696](https://issues.apache.org/jira/browse/ARROW-696) - [C++] Add JSON read/write support for decimals for integration tests -* [ARROW-759](https://issues.apache.org/jira/browse/ARROW-759) - [Python] Implement a transient list serialization function that can handle a mix of scalars, lists, ndarrays, dicts -* [ARROW-786](https://issues.apache.org/jira/browse/ARROW-786) - [Format] In-memory format for 128-bit Decimals, handling of sign bit -* [ARROW-837](https://issues.apache.org/jira/browse/ARROW-837) - [Python] Expose buffer allocation, FixedSizeBufferWriter -* [ARROW-941](https://issues.apache.org/jira/browse/ARROW-941) - [Docs] Improve "cold start" integration testing instructions -* [ARROW-989](https://issues.apache.org/jira/browse/ARROW-989) - [Python] Write pyarrow.Table to FileWriter or StreamWriter -* [ARROW-1156](https://issues.apache.org/jira/browse/ARROW-1156) - [Python] pyarrow.Array.from\_pandas should take a type parameter -* [ARROW-1238](https://issues.apache.org/jira/browse/ARROW-1238) - [Java] Add JSON read/write support for decimals for integration tests -* [ARROW-1286](https://issues.apache.org/jira/browse/ARROW-1286) - PYTHON: support Categorical serialization to/from parquet -* [ARROW-1307](https://issues.apache.org/jira/browse/ARROW-1307) - [Python] Add pandas serialization section + Feather API to Sphinx docs -* [ARROW-1317](https://issues.apache.org/jira/browse/ARROW-1317) - [Python] Add function to set Hadoop CLASSPATH -* [ARROW-1331](https://issues.apache.org/jira/browse/ARROW-1331) - [Java] Refactor tests -* [ARROW-1331](https://issues.apache.org/jira/browse/ARROW-1331) - [Java] Refactor tests -* [ARROW-1339](https://issues.apache.org/jira/browse/ARROW-1339) - [C++] Use boost::filesystem for handling of platform-specific file path encodings -* [ARROW-1344](https://issues.apache.org/jira/browse/ARROW-1344) - [C++] Calling BufferOutputStream::Write after calling Finish crashes -* [ARROW-1348](https://issues.apache.org/jira/browse/ARROW-1348) - [C++/Python] Add release verification script for Windows -* [ARROW-1351](https://issues.apache.org/jira/browse/ARROW-1351) - Automate updating CHANGELOG.md as part of release scripts -* [ARROW-1352](https://issues.apache.org/jira/browse/ARROW-1352) - [Integration] Improve print formatting for producer, consumer line -* [ARROW-1355](https://issues.apache.org/jira/browse/ARROW-1355) - Make arrow buildable with java9 -* [ARROW-1356](https://issues.apache.org/jira/browse/ARROW-1356) - [Website] Add new committers -* [ARROW-1358](https://issues.apache.org/jira/browse/ARROW-1358) - Update source release scripts to account for new SHA checksum policy -* [ARROW-1359](https://issues.apache.org/jira/browse/ARROW-1359) - [Python] Add Parquet writer option to normalize field names for use in Spark -* [ARROW-1364](https://issues.apache.org/jira/browse/ARROW-1364) - [C++] IPC reader and writer specialized for GPU device memory -* [ARROW-1366](https://issues.apache.org/jira/browse/ARROW-1366) - [Python] Add instructions for starting the Plasma store when installing pyarrow from wheels -* 
[ARROW-1372](https://issues.apache.org/jira/browse/ARROW-1372) - [Plasma] Support for storing data in huge pages -* [ARROW-1376](https://issues.apache.org/jira/browse/ARROW-1376) - [C++] RecordBatchStreamReader::Open API is inconsistent with writer -* [ARROW-1377](https://issues.apache.org/jira/browse/ARROW-1377) - [Python] Add function to assist with benchmarking Parquet scan performance -* [ARROW-1381](https://issues.apache.org/jira/browse/ARROW-1381) - [Python] Improve performance of SerializedPyObject.to\_buffer -* [ARROW-1383](https://issues.apache.org/jira/browse/ARROW-1383) - [C++] Support std::vector in builder vector appends -* [ARROW-1384](https://issues.apache.org/jira/browse/ARROW-1384) - [C++] Add convenience function for serializing a record batch to an IPC message -* [ARROW-1386](https://issues.apache.org/jira/browse/ARROW-1386) - [C++] Unpin CMake version in MSVC build toolchain -* [ARROW-1387](https://issues.apache.org/jira/browse/ARROW-1387) - [C++] Set up GPU leaf library build toolchain -* [ARROW-1392](https://issues.apache.org/jira/browse/ARROW-1392) - [C++] Implement reader and writer IO interfaces for GPU buffers -* [ARROW-1395](https://issues.apache.org/jira/browse/ARROW-1395) - [C++] Remove APIs deprecated as of 0.5.0 and later versions -* [ARROW-1396](https://issues.apache.org/jira/browse/ARROW-1396) - [C++] Add PrettyPrint function for Schemas, which also outputs any dictionaries -* [ARROW-1397](https://issues.apache.org/jira/browse/ARROW-1397) - [Packaging] Use Docker instead of Vagrant -* [ARROW-1399](https://issues.apache.org/jira/browse/ARROW-1399) - [C++] Add CUDA build version in a public header to help prevent ABI conflicts -* [ARROW-1400](https://issues.apache.org/jira/browse/ARROW-1400) - [Python] Ability to create partitions when writing to Parquet -* [ARROW-1401](https://issues.apache.org/jira/browse/ARROW-1401) - [C++] Add extra debugging context to failures in RETURN\_NOT\_OK in debug builds -* [ARROW-1401](https://issues.apache.org/jira/browse/ARROW-1401) - [C++] Add extra debugging context to failures in RETURN\_NOT\_OK in debug builds -* [ARROW-1402](https://issues.apache.org/jira/browse/ARROW-1402) - [C++] Possibly deprecate public APIs that use MutableBuffer -* [ARROW-1404](https://issues.apache.org/jira/browse/ARROW-1404) - [Packaging] Build .deb and .rpm on Travis CI -* [ARROW-1405](https://issues.apache.org/jira/browse/ARROW-1405) - [Python] Add logging option for verbose memory allocations -* [ARROW-1406](https://issues.apache.org/jira/browse/ARROW-1406) - [Python] Harden user API for generating serialized schema and record batch messages as memoryview-compatible objects -* [ARROW-1408](https://issues.apache.org/jira/browse/ARROW-1408) - [C++] Refactor and make IPC read / write APIs more consistent, add appropriate deprecations -* [ARROW-1410](https://issues.apache.org/jira/browse/ARROW-1410) - Plasma object store occasionally pauses for a long time -* [ARROW-1412](https://issues.apache.org/jira/browse/ARROW-1412) - [Plasma] Add higher level API for putting and getting Python objects -* [ARROW-1413](https://issues.apache.org/jira/browse/ARROW-1413) - [C++] Add include-what-you-use configuration -* [ARROW-1415](https://issues.apache.org/jira/browse/ARROW-1415) - [GLib] Support date32 and date64 -* [ARROW-1416](https://issues.apache.org/jira/browse/ARROW-1416) - [Format] Clarify example array in memory layout documentation -* [ARROW-1417](https://issues.apache.org/jira/browse/ARROW-1417) - [Python] Allow more generic filesystem objects to be 
passed to ParquetDataset -* [ARROW-1418](https://issues.apache.org/jira/browse/ARROW-1418) - [Python] Introduce SerializationContext to register custom serialization callbacks -* [ARROW-1419](https://issues.apache.org/jira/browse/ARROW-1419) - [GLib] Suppress sign-conversion warning on Clang -* [ARROW-1427](https://issues.apache.org/jira/browse/ARROW-1427) - [GLib] Add a link to readme of Arrow GLib -* [ARROW-1428](https://issues.apache.org/jira/browse/ARROW-1428) - [C++] Append steps to clone source code to README.mb -* [ARROW-1432](https://issues.apache.org/jira/browse/ARROW-1432) - [C++] Build bundled jemalloc functions with private prefix -* [ARROW-1433](https://issues.apache.org/jira/browse/ARROW-1433) - [C++] Simplify implementation of Array::Slice -* [ARROW-1438](https://issues.apache.org/jira/browse/ARROW-1438) - [Plasma] Pull SerializationContext through PlasmaClient put and get -* [ARROW-1441](https://issues.apache.org/jira/browse/ARROW-1441) - [Site] Add Ruby to Flexible section -* [ARROW-1442](https://issues.apache.org/jira/browse/ARROW-1442) - [Website] Add pointer to nightly conda packages on /install -* [ARROW-1447](https://issues.apache.org/jira/browse/ARROW-1447) - [C++] Round of include-what-you-use include cleanups -* [ARROW-1448](https://issues.apache.org/jira/browse/ARROW-1448) - [Packaging] Support uploading built .deb and .rpm to Bintray -* [ARROW-1449](https://issues.apache.org/jira/browse/ARROW-1449) - Implement Decimal using only Int128 -* [ARROW-1451](https://issues.apache.org/jira/browse/ARROW-1451) - [C++] Create arrow/io/api.h -* [ARROW-1460](https://issues.apache.org/jira/browse/ARROW-1460) - [C++] Upgrade clang-format used to LLVM 4.0 -* [ARROW-1462](https://issues.apache.org/jira/browse/ARROW-1462) - [GLib] Support time array -* [ARROW-1466](https://issues.apache.org/jira/browse/ARROW-1466) - [C++] Support DecimalArray in arrow::PrettyPrint -* [ARROW-1468](https://issues.apache.org/jira/browse/ARROW-1468) - [C++] Append to PrimitiveBuilder from std::vector -* [ARROW-1479](https://issues.apache.org/jira/browse/ARROW-1479) - [JS] Expand JavaScript implementation -* [ARROW-1480](https://issues.apache.org/jira/browse/ARROW-1480) - [Python] Improve performance of serializing sets -* [ARROW-1481](https://issues.apache.org/jira/browse/ARROW-1481) - [C++] Expose type casts as generic callable object that can write into pre-allocated memory -* [ARROW-1494](https://issues.apache.org/jira/browse/ARROW-1494) - [C++] Document that shared\_ptr returned by RecordBatch::column needs to be retained -* [ARROW-1499](https://issues.apache.org/jira/browse/ARROW-1499) - [Python] Consider adding option to parquet.write\_table that sets options for maximum Spark compatibility -* [ARROW-1504](https://issues.apache.org/jira/browse/ARROW-1504) - [GLib] Support timestamp -* [ARROW-1505](https://issues.apache.org/jira/browse/ARROW-1505) - [GLib] Simplify arguments check -* [ARROW-1506](https://issues.apache.org/jira/browse/ARROW-1506) - [C++] Support pkg-config for compute modules -* [ARROW-1508](https://issues.apache.org/jira/browse/ARROW-1508) - C++: Add support for FixedSizeBinaryType in DictionaryBuilder -* [ARROW-1510](https://issues.apache.org/jira/browse/ARROW-1510) - [C++] Support cast -* [ARROW-1511](https://issues.apache.org/jira/browse/ARROW-1511) - [C++] Deprecate arrow::MakePrimitiveArray -* [ARROW-1513](https://issues.apache.org/jira/browse/ARROW-1513) - C++: Add cast from Dictionary to plain arrays -* [ARROW-1515](https://issues.apache.org/jira/browse/ARROW-1515) - 
[GLib] Detect version directly -* [ARROW-1516](https://issues.apache.org/jira/browse/ARROW-1516) - [GLib] Update document -* [ARROW-1517](https://issues.apache.org/jira/browse/ARROW-1517) - Remove unnecessary temporary in DecimalUtil::ToString function -* [ARROW-1519](https://issues.apache.org/jira/browse/ARROW-1519) - [C++] Move DecimalUtil functions to methods on the Int128 class -* [ARROW-1528](https://issues.apache.org/jira/browse/ARROW-1528) - [GLib] Resolve include dependency -* [ARROW-1530](https://issues.apache.org/jira/browse/ARROW-1530) - [C++] Install arrow/util/parallel.h -* [ARROW-1551](https://issues.apache.org/jira/browse/ARROW-1551) - [Website] Updates for 0.7.0 release -* [ARROW-1597](https://issues.apache.org/jira/browse/ARROW-1597) - [Packaging] arrow-compute.pc is missing in .deb/.rpm file list - - - -# Apache Arrow 0.6.0 (2017-08-14) - -## Bug Fixes - -* [ARROW-187](https://issues.apache.org/jira/browse/ARROW-187) - [C++] Decide on how pedantic we want to be about exceptions -* [ARROW-276](https://issues.apache.org/jira/browse/ARROW-276) - [JAVA] Nullable Value Vectors should extend BaseValueVector instead of BaseDataValueVector -* [ARROW-573](https://issues.apache.org/jira/browse/ARROW-573) - [Python/C++] Support ordered dictionaries data, pandas Categorical -* [ARROW-884](https://issues.apache.org/jira/browse/ARROW-884) - [C++] Exclude internal classes from documentation -* [ARROW-932](https://issues.apache.org/jira/browse/ARROW-932) - [Python] Fix compiler warnings on MSVC -* [ARROW-968](https://issues.apache.org/jira/browse/ARROW-968) - [Python] RecordBatch [i:j] syntax is incomplete -* [ARROW-1192](https://issues.apache.org/jira/browse/ARROW-1192) - [JAVA] Improve splitAndTransfer performance for List and Union vectors -* [ARROW-1195](https://issues.apache.org/jira/browse/ARROW-1195) - [C++] CpuInfo doesn't get cache size on Windows -* [ARROW-1204](https://issues.apache.org/jira/browse/ARROW-1204) - [C++] lz4 ExternalProject fails in Visual Studio 2015 -* [ARROW-1225](https://issues.apache.org/jira/browse/ARROW-1225) - [Python] pyarrow.array does not attempt to convert bytes to UTF8 when passed a StringType -* [ARROW-1237](https://issues.apache.org/jira/browse/ARROW-1237) - [JAVA] Expose the ability to set lastSet -* [ARROW-1239](https://issues.apache.org/jira/browse/ARROW-1239) - issue with current version of git-commit-id-plugin -* [ARROW-1240](https://issues.apache.org/jira/browse/ARROW-1240) - security: upgrade logback to address CVE-2017-5929 -* [ARROW-1240](https://issues.apache.org/jira/browse/ARROW-1240) - security: upgrade logback to address CVE-2017-5929 -* [ARROW-1241](https://issues.apache.org/jira/browse/ARROW-1241) - [C++] Visual Studio 2017 Appveyor build job -* [ARROW-1242](https://issues.apache.org/jira/browse/ARROW-1242) - [Java] security - upgrade Jackson to mitigate 3 CVE vulnerabilities -* [ARROW-1242](https://issues.apache.org/jira/browse/ARROW-1242) - [Java] security - upgrade Jackson to mitigate 3 CVE vulnerabilities -* [ARROW-1245](https://issues.apache.org/jira/browse/ARROW-1245) - [Integration] Java Integration Tests Disabled -* [ARROW-1248](https://issues.apache.org/jira/browse/ARROW-1248) - [Python] C linkage warnings in Clang with public Cython API -* [ARROW-1249](https://issues.apache.org/jira/browse/ARROW-1249) - [JAVA] Expose the fillEmpties function from NullableVector.mutator -* [ARROW-1263](https://issues.apache.org/jira/browse/ARROW-1263) - [C++] CpuInfo should be able to get CPU features on Windows -* 
[ARROW-1265](https://issues.apache.org/jira/browse/ARROW-1265) - [Plasma] Plasma store memory leak warnings in Python test suite -* [ARROW-1267](https://issues.apache.org/jira/browse/ARROW-1267) - [Java] Handle zero length case in BitVector.splitAndTransfer -* [ARROW-1269](https://issues.apache.org/jira/browse/ARROW-1269) - [Packaging] Add Windows wheel build scripts from ARROW-1068 to arrow-dist -* [ARROW-1275](https://issues.apache.org/jira/browse/ARROW-1275) - [C++] Default static library prefix for Snappy should be "\_static" -* [ARROW-1276](https://issues.apache.org/jira/browse/ARROW-1276) - Cannot serializer empty DataFrame to parquet -* [ARROW-1283](https://issues.apache.org/jira/browse/ARROW-1283) - [Java] VectorSchemaRoot should be able to be closed() more than once -* [ARROW-1285](https://issues.apache.org/jira/browse/ARROW-1285) - PYTHON: NotImplemented exception creates empty parquet file -* [ARROW-1287](https://issues.apache.org/jira/browse/ARROW-1287) - [Python] Emulate "whence" argument of seek in NativeFile -* [ARROW-1290](https://issues.apache.org/jira/browse/ARROW-1290) - [C++] Use array capacity doubling in arrow::BufferBuilder -* [ARROW-1291](https://issues.apache.org/jira/browse/ARROW-1291) - [Python] pa.RecordBatch.from\_pandas doesn't accept DataFrame with numeric column names -* [ARROW-1294](https://issues.apache.org/jira/browse/ARROW-1294) - [C++] New Appveyor build failures -* [ARROW-1296](https://issues.apache.org/jira/browse/ARROW-1296) - [Java] templates/FixValueVectors reset() method doesn't set allocationSizeInBytes correctly -* [ARROW-1300](https://issues.apache.org/jira/browse/ARROW-1300) - [JAVA] Fix ListVector Tests -* [ARROW-1306](https://issues.apache.org/jira/browse/ARROW-1306) - [Python] Encoding? issue with error reporting for parquet.read\_table -* [ARROW-1308](https://issues.apache.org/jira/browse/ARROW-1308) - [C++] ld tries to link 'arrow\_static' even when -DARROW\_BUILD\_STATIC=off -* [ARROW-1309](https://issues.apache.org/jira/browse/ARROW-1309) - [Python] Error inferring List type in Array.from\_pandas when inner values are all None -* [ARROW-1310](https://issues.apache.org/jira/browse/ARROW-1310) - [JAVA] Revert ARROW-886 -* [ARROW-1311](https://issues.apache.org/jira/browse/ARROW-1311) - python hangs after write a few parquet tables -* [ARROW-1312](https://issues.apache.org/jira/browse/ARROW-1312) - [C++] Set default value to ARROW\_JEMALLOC to OFF until ARROW-1282 is resolved -* [ARROW-1312](https://issues.apache.org/jira/browse/ARROW-1312) - [C++] Set default value to ARROW\_JEMALLOC to OFF until ARROW-1282 is resolved -* [ARROW-1326](https://issues.apache.org/jira/browse/ARROW-1326) - [Python] Fix Sphinx build in Travis CI -* [ARROW-1327](https://issues.apache.org/jira/browse/ARROW-1327) - [Python] Failing to release GIL in MemoryMappedFile.\_open causes deadlock -* [ARROW-1328](https://issues.apache.org/jira/browse/ARROW-1328) - [Python] pyarrow.Table.from\_pandas option timestamps\_to\_ms changes column values -* [ARROW-1330](https://issues.apache.org/jira/browse/ARROW-1330) - [Plasma] Turn on plasma tests on manylinux1 -* [ARROW-1335](https://issues.apache.org/jira/browse/ARROW-1335) - [C++] PrimitiveArray::raw\_values has inconsistent semantics re: offsets compared with subclasses -* [ARROW-1338](https://issues.apache.org/jira/browse/ARROW-1338) - [Python] Investigate non-deterministic core dump on Python 2.7, Travis CI builds -* [ARROW-1340](https://issues.apache.org/jira/browse/ARROW-1340) - [Java] NullableMapVector field doesn't 
maintain metadata -* [ARROW-1342](https://issues.apache.org/jira/browse/ARROW-1342) - [Python] Support strided array of lists -* [ARROW-1343](https://issues.apache.org/jira/browse/ARROW-1343) - [Format/Java/C++] Ensuring encapsulated stream / IPC message sizes are always a multiple of 8 -* [ARROW-1350](https://issues.apache.org/jira/browse/ARROW-1350) - [C++] Include Plasma source tree in source distribution - - -## New Features and Improvements - -* [ARROW-439](https://issues.apache.org/jira/browse/ARROW-439) - [Python] Add option in "to\_pandas" conversions to yield Categorical from String/Binary arrays -* [ARROW-622](https://issues.apache.org/jira/browse/ARROW-622) - [Python] Investigate alternatives to timestamps\_to\_ms argument in pandas conversion -* [ARROW-1076](https://issues.apache.org/jira/browse/ARROW-1076) - [Python] Handle nanosecond timestamps more gracefully when writing to Parquet format -* [ARROW-1093](https://issues.apache.org/jira/browse/ARROW-1093) - [Python] Fail Python builds if flake8 yields warnings -* [ARROW-1104](https://issues.apache.org/jira/browse/ARROW-1104) - Integrate in-memory object store from Ray -* [ARROW-1116](https://issues.apache.org/jira/browse/ARROW-1116) - [Python] Create single external GitHub repo building for building wheels for all platforms in one shot -* [ARROW-1121](https://issues.apache.org/jira/browse/ARROW-1121) - [C++] Improve error message when opening OS file fails -* [ARROW-1140](https://issues.apache.org/jira/browse/ARROW-1140) - [C++] Allow optional build of plasma -* [ARROW-1149](https://issues.apache.org/jira/browse/ARROW-1149) - [Plasma] Create Cython client library for Plasma -* [ARROW-1173](https://issues.apache.org/jira/browse/ARROW-1173) - [Plasma] Blog post for Plasma -* [ARROW-1211](https://issues.apache.org/jira/browse/ARROW-1211) - [C++] Consider making default\_memory\_pool() the default for builder classes -* [ARROW-1213](https://issues.apache.org/jira/browse/ARROW-1213) - [Python] Enable s3fs to be used with ParquetDataset and reader/writer functions -* [ARROW-1219](https://issues.apache.org/jira/browse/ARROW-1219) - [C++] Use more vanilla Google C++ formatting -* [ARROW-1224](https://issues.apache.org/jira/browse/ARROW-1224) - [Format] Clarify language around buffer padding and alignment in IPC -* [ARROW-1230](https://issues.apache.org/jira/browse/ARROW-1230) - [Plasma] Install libraries and headers -* [ARROW-1243](https://issues.apache.org/jira/browse/ARROW-1243) - [Java] security: upgrade all libraries to latest stable versions -* [ARROW-1246](https://issues.apache.org/jira/browse/ARROW-1246) - [Format] Add Map logical type to metadata -* [ARROW-1251](https://issues.apache.org/jira/browse/ARROW-1251) - [Python/C++] Revise build documentation to account for latest build toolchain -* [ARROW-1253](https://issues.apache.org/jira/browse/ARROW-1253) - [C++] Use pre-built toolchain libraries where prudent to speed up CI builds -* [ARROW-1255](https://issues.apache.org/jira/browse/ARROW-1255) - [Plasma] Check plasma flatbuffer messages with the flatbuffer verifier -* [ARROW-1256](https://issues.apache.org/jira/browse/ARROW-1256) - [Plasma] Fix compile warnings on macOS -* [ARROW-1257](https://issues.apache.org/jira/browse/ARROW-1257) - [Plasma] Plasma documentation -* [ARROW-1258](https://issues.apache.org/jira/browse/ARROW-1258) - [C++] Suppress dlmalloc warnings on Clang -* [ARROW-1259](https://issues.apache.org/jira/browse/ARROW-1259) - [Plasma] Speed up Plasma tests -* 
[ARROW-1260](https://issues.apache.org/jira/browse/ARROW-1260) - [Plasma] Use factory method to create Python PlasmaClient -* [ARROW-1264](https://issues.apache.org/jira/browse/ARROW-1264) - [Plasma] Don't exit the Python interpreter if the plasma client can't connect to the store -* [ARROW-1268](https://issues.apache.org/jira/browse/ARROW-1268) - [Website] Blog post on Arrow integration with Spark -* [ARROW-1270](https://issues.apache.org/jira/browse/ARROW-1270) - [Packaging] Add Python wheel build scripts for macOS to arrow-dist -* [ARROW-1272](https://issues.apache.org/jira/browse/ARROW-1272) - [Python] Add script to arrow-dist to generate and upload manylinux1 Python wheels -* [ARROW-1273](https://issues.apache.org/jira/browse/ARROW-1273) - [Python] Add convenience functions for reading only Parquet metadata or effective Arrow schema from a particular Parquet file -* [ARROW-1274](https://issues.apache.org/jira/browse/ARROW-1274) - [C++] add\_compiler\_export\_flags() throws warning with CMake \>= 3.3 -* [ARROW-1281](https://issues.apache.org/jira/browse/ARROW-1281) - [C++/Python] Add Docker setup for running HDFS tests and other tests we may not run in Travis CI -* [ARROW-1288](https://issues.apache.org/jira/browse/ARROW-1288) - Clean up many ASF license headers -* [ARROW-1289](https://issues.apache.org/jira/browse/ARROW-1289) - [Python] Add PYARROW\_BUILD\_PLASMA option like Parquet -* [ARROW-1297](https://issues.apache.org/jira/browse/ARROW-1297) - 0.6.0 Release -* [ARROW-1301](https://issues.apache.org/jira/browse/ARROW-1301) - [C++/Python] Add remaining supported libhdfs UNIX-like filesystem APIs -* [ARROW-1303](https://issues.apache.org/jira/browse/ARROW-1303) - [C++] Support downloading Boost -* [ARROW-1304](https://issues.apache.org/jira/browse/ARROW-1304) - [Java] Fix checkstyle checks warning -* [ARROW-1305](https://issues.apache.org/jira/browse/ARROW-1305) - [GLib] Add GArrowIntArrayBuilder -* [ARROW-1315](https://issues.apache.org/jira/browse/ARROW-1315) - [GLib] Status check of arrow::ArrayBuilder::Finish() is missing -* [ARROW-1323](https://issues.apache.org/jira/browse/ARROW-1323) - [GLib] Add garrow\_boolean\_array\_get\_values() -* [ARROW-1333](https://issues.apache.org/jira/browse/ARROW-1333) - [Plasma] Sorting example for DataFrames in plasma -* [ARROW-1334](https://issues.apache.org/jira/browse/ARROW-1334) - [C++] Instantiate arrow::Table from vector of Array objects (instead of Columns) -* [ARROW-1336](https://issues.apache.org/jira/browse/ARROW-1336) - [C++] Add arrow::schema factory function -* [ARROW-1353](https://issues.apache.org/jira/browse/ARROW-1353) - [Website] Updates + blog post for 0.6.0 release - - - -# Apache Arrow 0.5.0 (2017-07-23) - -## New Features and Improvements - -* [ARROW-111](https://issues.apache.org/jira/browse/ARROW-111) - [C++] Add static analyzer to tool chain to verify checking of Status returns -* [ARROW-195](https://issues.apache.org/jira/browse/ARROW-195) - [C++] Upgrade clang bits to clang-3.8 and move back to trusty. 
-* [ARROW-460](https://issues.apache.org/jira/browse/ARROW-460) - [C++] Implement JSON round trip for DictionaryArray -* [ARROW-462](https://issues.apache.org/jira/browse/ARROW-462) - [C++] Implement in-memory conversions between non-nested primitive types and DictionaryArray equivalent -* [ARROW-575](https://issues.apache.org/jira/browse/ARROW-575) - Python: Auto-detect nested lists and nested numpy arrays in Pandas -* [ARROW-597](https://issues.apache.org/jira/browse/ARROW-597) - [Python] Add convenience function to yield DataFrame from any object that a StreamReader or FileReader can read from -* [ARROW-599](https://issues.apache.org/jira/browse/ARROW-599) - [C++] Add LZ4 codec to 3rd-party toolchain -* [ARROW-599](https://issues.apache.org/jira/browse/ARROW-599) - [C++] Add LZ4 codec to 3rd-party toolchain -* [ARROW-600](https://issues.apache.org/jira/browse/ARROW-600) - [C++] Add ZSTD codec to 3rd-party toolchain -* [ARROW-692](https://issues.apache.org/jira/browse/ARROW-692) - Java<-\>C++ Integration tests for dictionary-encoded vectors -* [ARROW-693](https://issues.apache.org/jira/browse/ARROW-693) - [Java] Add JSON support for dictionary vectors -* [ARROW-742](https://issues.apache.org/jira/browse/ARROW-742) - Handling exceptions during execution of std::wstring\_convert -* [ARROW-742](https://issues.apache.org/jira/browse/ARROW-742) - Handling exceptions during execution of std::wstring\_convert -* [ARROW-834](https://issues.apache.org/jira/browse/ARROW-834) - [Python] Support creating Arrow arrays from Python iterables -* [ARROW-915](https://issues.apache.org/jira/browse/ARROW-915) - Struct Array reads limited support -* [ARROW-935](https://issues.apache.org/jira/browse/ARROW-935) - [Java] Build Javadoc in Travis CI -* [ARROW-960](https://issues.apache.org/jira/browse/ARROW-960) - [Python] Add source build guide for macOS + Homebrew -* [ARROW-962](https://issues.apache.org/jira/browse/ARROW-962) - [Python] Add schema attribute to FileReader -* [ARROW-964](https://issues.apache.org/jira/browse/ARROW-964) - [Python] Improve api docs -* [ARROW-966](https://issues.apache.org/jira/browse/ARROW-966) - [Python] pyarrow.list\_ should also accept Field instance -* [ARROW-978](https://issues.apache.org/jira/browse/ARROW-978) - [Python] Use sphinx-bootstrap-theme for Sphinx documentation -* [ARROW-1041](https://issues.apache.org/jira/browse/ARROW-1041) - [Python] Support read\_pandas on a directory of Parquet files -* [ARROW-1048](https://issues.apache.org/jira/browse/ARROW-1048) - Allow user LD\_LIBRARY\_PATH to be used with source release script -* [ARROW-1052](https://issues.apache.org/jira/browse/ARROW-1052) - Arrow 0.5.0 release -* [ARROW-1071](https://issues.apache.org/jira/browse/ARROW-1071) - [Python] RecordBatchFileReader does not have a schema property -* [ARROW-1073](https://issues.apache.org/jira/browse/ARROW-1073) - C++: Adapative integer builder -* [ARROW-1095](https://issues.apache.org/jira/browse/ARROW-1095) - [Website] Add Arrow icon asset -* [ARROW-1100](https://issues.apache.org/jira/browse/ARROW-1100) - [Python] Add "mode" property to NativeFile instances -* [ARROW-1102](https://issues.apache.org/jira/browse/ARROW-1102) - Make MessageSerializer.serializeMessage() public -* [ARROW-1120](https://issues.apache.org/jira/browse/ARROW-1120) - [Python] Write support for int96 -* [ARROW-1122](https://issues.apache.org/jira/browse/ARROW-1122) - [Website] Guest blog post on Arrow + ODBC from turbodbc -* [ARROW-1122](https://issues.apache.org/jira/browse/ARROW-1122) - [Website] 
Guest blog post on Arrow + ODBC from turbodbc -* [ARROW-1123](https://issues.apache.org/jira/browse/ARROW-1123) - C++: Make jemalloc the default allocator -* [ARROW-1135](https://issues.apache.org/jira/browse/ARROW-1135) - Upgrade Travis CI clang builds to use LLVM 4.0 -* [ARROW-1137](https://issues.apache.org/jira/browse/ARROW-1137) - Python: Ensure Pandas roundtrip of all-None column -* [ARROW-1142](https://issues.apache.org/jira/browse/ARROW-1142) - [C++] Move over compression library toolchain from parquet-cpp -* [ARROW-1145](https://issues.apache.org/jira/browse/ARROW-1145) - [GLib] Add get\_values() -* [ARROW-1146](https://issues.apache.org/jira/browse/ARROW-1146) - Add .gitignore for \*\_generated.h files in src/plasma/format -* [ARROW-1148](https://issues.apache.org/jira/browse/ARROW-1148) - [C++] Raise minimum CMake version to 3.2 -* [ARROW-1151](https://issues.apache.org/jira/browse/ARROW-1151) - [C++] Add gcc branch prediction to status check macro -* [ARROW-1154](https://issues.apache.org/jira/browse/ARROW-1154) - [C++] Migrate more computational utility code from parquet-cpp -* [ARROW-1160](https://issues.apache.org/jira/browse/ARROW-1160) - C++: Implement DictionaryBuilder -* [ARROW-1165](https://issues.apache.org/jira/browse/ARROW-1165) - [C++] Refactor PythonDecimalToArrowDecimal to not use templates -* [ARROW-1172](https://issues.apache.org/jira/browse/ARROW-1172) - [C++] Use unique\_ptr with array builder classes -* [ARROW-1183](https://issues.apache.org/jira/browse/ARROW-1183) - [Python] Implement time type conversions in to\_pandas -* [ARROW-1185](https://issues.apache.org/jira/browse/ARROW-1185) - [C++] Clean up arrow::Status implementation, add warn\_unused\_result attribute for clang -* [ARROW-1187](https://issues.apache.org/jira/browse/ARROW-1187) - Serialize a DataFrame with None column -* [ARROW-1193](https://issues.apache.org/jira/browse/ARROW-1193) - [C++] Support pkg-config for arrow\_python.so -* [ARROW-1196](https://issues.apache.org/jira/browse/ARROW-1196) - [C++] Appveyor separate jobs for Debug/Release builds from sources; Build with conda toolchain; Build with NMake Makefiles Generator -* [ARROW-1198](https://issues.apache.org/jira/browse/ARROW-1198) - Python: Add public C++ API to unwrap PyArrow object -* [ARROW-1199](https://issues.apache.org/jira/browse/ARROW-1199) - [C++] Introduce mutable POD struct for generic array data -* [ARROW-1202](https://issues.apache.org/jira/browse/ARROW-1202) - Remove semicolons from status macros -* [ARROW-1212](https://issues.apache.org/jira/browse/ARROW-1212) - [GLib] Add garrow\_binary\_array\_get\_offsets\_buffer() -* [ARROW-1214](https://issues.apache.org/jira/browse/ARROW-1214) - [Python] Add classes / functions to enable stream message components to be handled outside of the stream reader class -* [ARROW-1217](https://issues.apache.org/jira/browse/ARROW-1217) - [GLib] Add GInputStream based arrow::io::RandomAccessFile -* [ARROW-1220](https://issues.apache.org/jira/browse/ARROW-1220) - [C++] Standardize usage of \*\_HOME cmake script variables for 3rd party libs -* [ARROW-1221](https://issues.apache.org/jira/browse/ARROW-1221) - [C++] Pin clang-format version -* [ARROW-1227](https://issues.apache.org/jira/browse/ARROW-1227) - [GLib] Support GOutputStream -* [ARROW-1229](https://issues.apache.org/jira/browse/ARROW-1229) - [GLib] Follow Reader API change (get -\> read) -* [ARROW-1244](https://issues.apache.org/jira/browse/ARROW-1244) - [C++] Do not include cpp/src/plasma in source release pending IP clearance -* 
[ARROW-1252](https://issues.apache.org/jira/browse/ARROW-1252) - [Website] Update for 0.5.0 release, add blog post summarizing changes from 0.4.x - - -## Bug Fixes - -* [ARROW-288](https://issues.apache.org/jira/browse/ARROW-288) - Implement Arrow adapter for Spark Datasets -* [ARROW-601](https://issues.apache.org/jira/browse/ARROW-601) - Some logical types not supported when loading Parquet -* [ARROW-784](https://issues.apache.org/jira/browse/ARROW-784) - Cleaning up thirdparty toolchain support in Arrow on Windows -* [ARROW-785](https://issues.apache.org/jira/browse/ARROW-785) - possible issue on writing parquet via pyarrow, subsequently read in Hive -* [ARROW-924](https://issues.apache.org/jira/browse/ARROW-924) - Setting GTEST\_HOME Fails on CMake run -* [ARROW-992](https://issues.apache.org/jira/browse/ARROW-992) - [Python] In place development builds do not have a \_\_version\_\_ -* [ARROW-1043](https://issues.apache.org/jira/browse/ARROW-1043) - [Python] Make sure pandas metadata created by arrow conforms to the pandas spec -* [ARROW-1074](https://issues.apache.org/jira/browse/ARROW-1074) - from\_pandas doesnt convert ndarray to list -* [ARROW-1079](https://issues.apache.org/jira/browse/ARROW-1079) - [Python] Empty "private" directories should be ignored by Parquet interface -* [ARROW-1081](https://issues.apache.org/jira/browse/ARROW-1081) - C++: arrow::test::TestBase::MakePrimitive doesn't fill null\_bitmap -* [ARROW-1096](https://issues.apache.org/jira/browse/ARROW-1096) - [C++] Memory mapping file over 4GB fails on Windows -* [ARROW-1097](https://issues.apache.org/jira/browse/ARROW-1097) - Reading tensor needs file to be opened in writeable mode -* [ARROW-1098](https://issues.apache.org/jira/browse/ARROW-1098) - Document Error? -* [ARROW-1101](https://issues.apache.org/jira/browse/ARROW-1101) - UnionListWriter is not implementing all methods on interface ScalarWriter -* [ARROW-1103](https://issues.apache.org/jira/browse/ARROW-1103) - [Python] Utilize pandas metadata from common \_metadata Parquet file if it exists -* [ARROW-1107](https://issues.apache.org/jira/browse/ARROW-1107) - [JAVA] NullableMapVector getField() should return nullable type -* [ARROW-1108](https://issues.apache.org/jira/browse/ARROW-1108) - Check if ArrowBuf is empty buffer in getActualConsumedMemory() and getPossibleConsumedMemory() -* [ARROW-1109](https://issues.apache.org/jira/browse/ARROW-1109) - [JAVA] transferOwnership fails when readerIndex is not 0 -* [ARROW-1110](https://issues.apache.org/jira/browse/ARROW-1110) - [JAVA] make union vector naming consistent -* [ARROW-1111](https://issues.apache.org/jira/browse/ARROW-1111) - [JAVA] Make aligning buffers optional, and allow -1 for unknown null count -* [ARROW-1112](https://issues.apache.org/jira/browse/ARROW-1112) - [JAVA] Set lastSet for VarLength and List vectors when loading -* [ARROW-1113](https://issues.apache.org/jira/browse/ARROW-1113) - [C++] gflags EP build gets triggered (as a no-op) on subsequent calls to make or ninja build -* [ARROW-1115](https://issues.apache.org/jira/browse/ARROW-1115) - [C++] Use absolute path for ccache -* [ARROW-1117](https://issues.apache.org/jira/browse/ARROW-1117) - [Docs] Minor issues in GLib README -* [ARROW-1124](https://issues.apache.org/jira/browse/ARROW-1124) - [Python] pyarrow needs to depend on numpy\>=1.10 (not 1.9) -* [ARROW-1125](https://issues.apache.org/jira/browse/ARROW-1125) - Python: Table.from\_pandas doesn't work anymore on partial schemas -* 
[ARROW-1125](https://issues.apache.org/jira/browse/ARROW-1125) - Python: Table.from\_pandas doesn't work anymore on partial schemas -* [ARROW-1128](https://issues.apache.org/jira/browse/ARROW-1128) - [Docs] command to build a wheel is not properly rendered -* [ARROW-1129](https://issues.apache.org/jira/browse/ARROW-1129) - [C++] Fix Linux toolchain build regression from ARROW-742 -* [ARROW-1130](https://issues.apache.org/jira/browse/ARROW-1130) - io-hdfs-test failure -* [ARROW-1131](https://issues.apache.org/jira/browse/ARROW-1131) - Python: Parquet unit tests are always skipped -* [ARROW-1132](https://issues.apache.org/jira/browse/ARROW-1132) - [Python] Unable to write pandas DataFrame w/MultiIndex containing duplicate values to parquet -* [ARROW-1136](https://issues.apache.org/jira/browse/ARROW-1136) - [C++/Python] Segfault on empty stream -* [ARROW-1138](https://issues.apache.org/jira/browse/ARROW-1138) - Travis: Use OpenJDK7 instead of OracleJDK7 -* [ARROW-1139](https://issues.apache.org/jira/browse/ARROW-1139) - [C++] dlmalloc doesn't allow arrow to be built with clang 4 or gcc 7.1.1 -* [ARROW-1141](https://issues.apache.org/jira/browse/ARROW-1141) - on import get libjemalloc.so.2: cannot allocate memory in static TLS block -* [ARROW-1143](https://issues.apache.org/jira/browse/ARROW-1143) - C++: Fix comparison of NullArray -* [ARROW-1144](https://issues.apache.org/jira/browse/ARROW-1144) - [C++] Remove unused variable -* [ARROW-1147](https://issues.apache.org/jira/browse/ARROW-1147) - [C++] Allow optional vendoring of flatbuffers in plasma -* [ARROW-1150](https://issues.apache.org/jira/browse/ARROW-1150) - [C++] AdaptiveIntBuilder compiler warning on MSVC -* [ARROW-1152](https://issues.apache.org/jira/browse/ARROW-1152) - [Cython] read\_tensor should work with a readable file -* [ARROW-1153](https://issues.apache.org/jira/browse/ARROW-1153) - All non-Pandas column throws NotImplemented: unhandled type -* [ARROW-1155](https://issues.apache.org/jira/browse/ARROW-1155) - segmentation fault when run pa.Int16Value() -* [ARROW-1157](https://issues.apache.org/jira/browse/ARROW-1157) - C++/Python: Decimal templates are not correctly exported on OSX -* [ARROW-1159](https://issues.apache.org/jira/browse/ARROW-1159) - [C++] Static data members cannot be accessed from inline functions in Arrow headers by thirdparty users -* [ARROW-1162](https://issues.apache.org/jira/browse/ARROW-1162) - Transfer Between Empty Lists Should Not Invoke Callback -* [ARROW-1164](https://issues.apache.org/jira/browse/ARROW-1164) - C++: Templated functions need ARROW\_EXPORT instead of ARROW\_TEMPLATE\_EXPORT -* [ARROW-1166](https://issues.apache.org/jira/browse/ARROW-1166) - Errors in Struct type's example and missing reference in Layout.md -* [ARROW-1167](https://issues.apache.org/jira/browse/ARROW-1167) - [Python] Create chunked BinaryArray in Table.from\_pandas when a column's data exceeds 2GB -* [ARROW-1168](https://issues.apache.org/jira/browse/ARROW-1168) - [Python] pandas metadata may contain "mixed" data types -* [ARROW-1169](https://issues.apache.org/jira/browse/ARROW-1169) - C++: jemalloc externalproject doesn't build with CMake's ninja generator -* [ARROW-1170](https://issues.apache.org/jira/browse/ARROW-1170) - C++: ARROW\_JEMALLOC=OFF breaks linking on unittest -* [ARROW-1174](https://issues.apache.org/jira/browse/ARROW-1174) - [GLib] Investigate root cause of ListArray glib test failure -* [ARROW-1177](https://issues.apache.org/jira/browse/ARROW-1177) - [C++] Detect int32 overflow in ListBuilder::Append 
-* [ARROW-1179](https://issues.apache.org/jira/browse/ARROW-1179) - C++: Add missing virtual destructors -* [ARROW-1180](https://issues.apache.org/jira/browse/ARROW-1180) - [GLib] garrow\_tensor\_get\_dimension\_name() returns invalid address -* [ARROW-1181](https://issues.apache.org/jira/browse/ARROW-1181) - [Python] Parquet test fail if not enabled -* [ARROW-1182](https://issues.apache.org/jira/browse/ARROW-1182) - C++: Specify BUILD\_BYPRODUCTS for zlib and zstd -* [ARROW-1186](https://issues.apache.org/jira/browse/ARROW-1186) - [C++] Enable option to build arrow with minimal dependencies needed to build Parquet library -* [ARROW-1188](https://issues.apache.org/jira/browse/ARROW-1188) - Segfault when trying to serialize a DataFrame with Null-only Categorical Column -* [ARROW-1190](https://issues.apache.org/jira/browse/ARROW-1190) - VectorLoader corrupts vectors with duplicate names -* [ARROW-1191](https://issues.apache.org/jira/browse/ARROW-1191) - [JAVA] Implement getField() method for the complex readers -* [ARROW-1194](https://issues.apache.org/jira/browse/ARROW-1194) - Getting record batch size with pa.get\_record\_batch\_size returns a size that is too small for pandas DataFrame. -* [ARROW-1197](https://issues.apache.org/jira/browse/ARROW-1197) - [GLib] record\_batch.hpp Inclusion is missing -* [ARROW-1200](https://issues.apache.org/jira/browse/ARROW-1200) - [C++] DictionaryBuilder should use signed integers for indices -* [ARROW-1201](https://issues.apache.org/jira/browse/ARROW-1201) - [Python] Incomplete Python types cause a core dump when repr-ing -* [ARROW-1203](https://issues.apache.org/jira/browse/ARROW-1203) - [C++] Disallow BinaryBuilder to append byte strings larger than the maximum value of int32\_t -* [ARROW-1205](https://issues.apache.org/jira/browse/ARROW-1205) - C++: Reference to type objects in ArrayLoader may cause segmentation faults. 
-* [ARROW-1206](https://issues.apache.org/jira/browse/ARROW-1206) - [C++] Enable MSVC builds to work with some compression library support disabled -* [ARROW-1208](https://issues.apache.org/jira/browse/ARROW-1208) - [C++] Toolchain build with ZSTD library from conda-forge failure -* [ARROW-1208](https://issues.apache.org/jira/browse/ARROW-1208) - [C++] Toolchain build with ZSTD library from conda-forge failure -* [ARROW-1215](https://issues.apache.org/jira/browse/ARROW-1215) - [Python] Class methods in API reference -* [ARROW-1216](https://issues.apache.org/jira/browse/ARROW-1216) - Numpy arrays cannot be created from Arrow Buffers on Python 2 -* [ARROW-1218](https://issues.apache.org/jira/browse/ARROW-1218) - Arrow doesn't compile if all compression libraries are deactivated -* [ARROW-1222](https://issues.apache.org/jira/browse/ARROW-1222) - [Python] pyarrow.array returns NullArray for array of unsupported Python objects -* [ARROW-1223](https://issues.apache.org/jira/browse/ARROW-1223) - [GLib] Fix function name that returns wrapped object -* [ARROW-1228](https://issues.apache.org/jira/browse/ARROW-1228) - [GLib] Test file name should be the same name as target class -* [ARROW-1233](https://issues.apache.org/jira/browse/ARROW-1233) - [C++] Validate cmake script resolving of 3rd party linked libs from correct location in toolchain build -* [ARROW-1235](https://issues.apache.org/jira/browse/ARROW-1235) - [C++] macOS linker failure with operator<< and std::ostream -* [ARROW-1236](https://issues.apache.org/jira/browse/ARROW-1236) - Library paths in exported pkg-config file are incorrect -* [ARROW-1284](https://issues.apache.org/jira/browse/ARROW-1284) - Windows can't install pyarrow 0.4.1 and 0.5.0 - - - -# Apache Arrow 0.4.1 (2017-06-09) - -## Bug Fixes - -* [ARROW-424](https://issues.apache.org/jira/browse/ARROW-424) - [C++] Threadsafety in arrow/io/hdfs.h -* [ARROW-1039](https://issues.apache.org/jira/browse/ARROW-1039) - Python: pyarrow.Filesystem.read\_parquet causing error if nthreads\>1 -* [ARROW-1050](https://issues.apache.org/jira/browse/ARROW-1050) - [C++] Export arrow::ValidateArray -* [ARROW-1051](https://issues.apache.org/jira/browse/ARROW-1051) - [Python] If pyarrow.parquet fails to import due to a shared library ABI conflict, the test\_parquet.py tests silently do not run -* [ARROW-1056](https://issues.apache.org/jira/browse/ARROW-1056) - [Python] Parquet+HDFS test failure due to writing pandas index -* [ARROW-1057](https://issues.apache.org/jira/browse/ARROW-1057) - Fix cmake warning and msvc debug asserts -* [ARROW-1060](https://issues.apache.org/jira/browse/ARROW-1060) - [Python] Add unit test for ARROW-1053 -* [ARROW-1062](https://issues.apache.org/jira/browse/ARROW-1062) - [GLib] Examples use old API -* [ARROW-1066](https://issues.apache.org/jira/browse/ARROW-1066) - remove warning on feather for pandas \>= 0.20.1 -* [ARROW-1070](https://issues.apache.org/jira/browse/ARROW-1070) - [C++] Feather files for date/time types should be written with the physical types -* [ARROW-1075](https://issues.apache.org/jira/browse/ARROW-1075) - [GLib] Build error on macOS -* [ARROW-1082](https://issues.apache.org/jira/browse/ARROW-1082) - [GLib] Add CI on macOS -* [ARROW-1085](https://issues.apache.org/jira/browse/ARROW-1085) - [java] Follow up on template cleanup. 
Missing method for IntervalYear -* [ARROW-1086](https://issues.apache.org/jira/browse/ARROW-1086) - [Python] pyarrow 0.4.0 on pypi is missing pxd files -* [ARROW-1088](https://issues.apache.org/jira/browse/ARROW-1088) - [Python] test\_unicode\_filename test fails when unicode filenames aren't supported by system -* [ARROW-1090](https://issues.apache.org/jira/browse/ARROW-1090) - [Python] build\_ext usability -* [ARROW-1091](https://issues.apache.org/jira/browse/ARROW-1091) - Decimal scale and precision are flipped -* [ARROW-1092](https://issues.apache.org/jira/browse/ARROW-1092) - More Decimal and scale flipped follow-up -* [ARROW-1094](https://issues.apache.org/jira/browse/ARROW-1094) - [C++] Incomplete buffer reads in arrow::io::ReadableFile should exactly truncate returned buffer -* [ARROW-1127](https://issues.apache.org/jira/browse/ARROW-1127) - pyarrow 4.1 import failure on Travis - - -## New Features and Improvements - -* [ARROW-897](https://issues.apache.org/jira/browse/ARROW-897) - [GLib] Build arrow-glib as a separate build in the Travis CI build matrix -* [ARROW-986](https://issues.apache.org/jira/browse/ARROW-986) - [Format] Update IPC.md to account for dictionary batches -* [ARROW-990](https://issues.apache.org/jira/browse/ARROW-990) - [JS] Add tslint support for linting TypeScript -* [ARROW-1020](https://issues.apache.org/jira/browse/ARROW-1020) - [Format] Add additional language to Schema.fbs to clarify naive vs. localized Timestamp values -* [ARROW-1034](https://issues.apache.org/jira/browse/ARROW-1034) - [Python] Enable creation of binary wheels on Windows / MSVC -* [ARROW-1049](https://issues.apache.org/jira/browse/ARROW-1049) - [java] vector template cleanup -* [ARROW-1063](https://issues.apache.org/jira/browse/ARROW-1063) - [Website] Blog post and website updates for 0.4.0 release -* [ARROW-1068](https://issues.apache.org/jira/browse/ARROW-1068) - [Python] Create external repo with appveyor.yml configured for building Python wheel installers -* [ARROW-1069](https://issues.apache.org/jira/browse/ARROW-1069) - Add instructions for publishing maven artifacts -* [ARROW-1078](https://issues.apache.org/jira/browse/ARROW-1078) - [Python] Account for PARQUET-967 -* [ARROW-1080](https://issues.apache.org/jira/browse/ARROW-1080) - C++: Add tutorial about converting to/from row-wise representation -* [ARROW-1084](https://issues.apache.org/jira/browse/ARROW-1084) - Implementations of BufferAllocator should handle Netty's OutOfDirectMemoryError -* [ARROW-1118](https://issues.apache.org/jira/browse/ARROW-1118) - [Website] Site updates for 0.4.1 - - - -# Apache Arrow 0.4.0 (2017-05-22) - -## Bug Fixes - -* [ARROW-813](https://issues.apache.org/jira/browse/ARROW-813) - [Python] setup.py sdist must also bundle dependent cmake modules -* [ARROW-824](https://issues.apache.org/jira/browse/ARROW-824) - Date and Time Vectors should reflect timezone-less semantics -* [ARROW-856](https://issues.apache.org/jira/browse/ARROW-856) - CmakeError by Unknown compiler. 
-* [ARROW-909](https://issues.apache.org/jira/browse/ARROW-909) - libjemalloc.so.2: cannot open shared object file: -* [ARROW-939](https://issues.apache.org/jira/browse/ARROW-939) - Fix division by zero for zero-dimensional Tensors -* [ARROW-940](https://issues.apache.org/jira/browse/ARROW-940) - [JS] Generate multiple sets of artifacts -* [ARROW-944](https://issues.apache.org/jira/browse/ARROW-944) - Python: Compat broken for pandas==0.18.1 -* [ARROW-948](https://issues.apache.org/jira/browse/ARROW-948) - [GLib] Update C++ header file list -* [ARROW-952](https://issues.apache.org/jira/browse/ARROW-952) - Compilation error on macOS with clang-802.0.42 -* [ARROW-958](https://issues.apache.org/jira/browse/ARROW-958) - [Python] Conda build guide still needs ARROW\_HOME, PARQUET\_HOME -* [ARROW-979](https://issues.apache.org/jira/browse/ARROW-979) - [Python] Fix setuptools\_scm version when release tag is not in the master timeline -* [ARROW-991](https://issues.apache.org/jira/browse/ARROW-991) - [Python] PyArray\_SimpleNew should not be used with NPY\_DATETIME -* [ARROW-995](https://issues.apache.org/jira/browse/ARROW-995) - [Website] 0.3 release announce has a typo in reference -* [ARROW-998](https://issues.apache.org/jira/browse/ARROW-998) - [Doc] File format documents incorrect schema location -* [ARROW-1003](https://issues.apache.org/jira/browse/ARROW-1003) - [C++] Hdfs and java dlls fail to load when built for Windows with MSVC -* [ARROW-1004](https://issues.apache.org/jira/browse/ARROW-1004) - ArrowInvalid: Invalid: Python object of type float is not None and is not a string, bool, or date object -* [ARROW-1017](https://issues.apache.org/jira/browse/ARROW-1017) - Python: Table.to\_pandas leaks memory -* [ARROW-1023](https://issues.apache.org/jira/browse/ARROW-1023) - Python: Fix bundling of arrow-cpp for macOS -* [ARROW-1033](https://issues.apache.org/jira/browse/ARROW-1033) - [Python] pytest discovers scripts/test\_leak.py -* [ARROW-1045](https://issues.apache.org/jira/browse/ARROW-1045) - [JAVA] Add support for custom metadata in org.apache.arrow.vector.types.pojo.\* -* [ARROW-1046](https://issues.apache.org/jira/browse/ARROW-1046) - [Python] Conform DataFrame metadata to pandas spec -* [ARROW-1053](https://issues.apache.org/jira/browse/ARROW-1053) - [Python] Memory leak with RecordBatchFileReader -* [ARROW-1054](https://issues.apache.org/jira/browse/ARROW-1054) - [Python] Test suite fails on pandas 0.19.2 -* [ARROW-1061](https://issues.apache.org/jira/browse/ARROW-1061) - [C++] Harden decimal parsing against invalid strings -* [ARROW-1064](https://issues.apache.org/jira/browse/ARROW-1064) - ModuleNotFoundError: No module named 'pyarrow.\_parquet' - - -## New Features and Improvements - -* [ARROW-29](https://issues.apache.org/jira/browse/ARROW-29) - C++: Add re2 as optional 3rd-party toolchain dependency -* [ARROW-182](https://issues.apache.org/jira/browse/ARROW-182) - [C++] Remove Array::Validate virtual function and make a separate method -* [ARROW-376](https://issues.apache.org/jira/browse/ARROW-376) - Python: Convert non-range Pandas indices (optionally) to Arrow -* [ARROW-446](https://issues.apache.org/jira/browse/ARROW-446) - [Python] Document NativeFile interfaces, HDFS client in Sphinx -* [ARROW-482](https://issues.apache.org/jira/browse/ARROW-482) - [Java] Provide API access to "custom\_metadata" Field attribute in IPC setting -* [ARROW-532](https://issues.apache.org/jira/browse/ARROW-532) - [Python] Expand pyarrow.parquet documentation for 0.3 release -* 
[ARROW-579](https://issues.apache.org/jira/browse/ARROW-579) - Python: Provide redistributable pyarrow wheels on OSX -* [ARROW-596](https://issues.apache.org/jira/browse/ARROW-596) - [Python] Add convenience function to convert pandas.DataFrame to pyarrow.Buffer containing a file or stream representation -* [ARROW-629](https://issues.apache.org/jira/browse/ARROW-629) - [JS] Add unit test suite -* [ARROW-714](https://issues.apache.org/jira/browse/ARROW-714) - [C++] Add import\_pyarrow C API in the style of NumPy for thirdparty C++ users -* [ARROW-819](https://issues.apache.org/jira/browse/ARROW-819) - [Python] Define public Cython API -* [ARROW-872](https://issues.apache.org/jira/browse/ARROW-872) - [JS] Read streaming format -* [ARROW-873](https://issues.apache.org/jira/browse/ARROW-873) - [JS] Implement fixed width list type -* [ARROW-874](https://issues.apache.org/jira/browse/ARROW-874) - [JS] Read dictionary-encoded vectors -* [ARROW-881](https://issues.apache.org/jira/browse/ARROW-881) - [Python] Reconstruct Pandas DataFrame indexes using custom\_metadata -* [ARROW-891](https://issues.apache.org/jira/browse/ARROW-891) - [Python] Expand Windows build instructions to not require looking at separate C++ docs -* [ARROW-899](https://issues.apache.org/jira/browse/ARROW-899) - [Docs] Add CHANGELOG for 0.3.0 -* [ARROW-901](https://issues.apache.org/jira/browse/ARROW-901) - [Python] Write FixedSizeBinary to Parquet -* [ARROW-913](https://issues.apache.org/jira/browse/ARROW-913) - [Python] Only link jemalloc to the Cython extension where it's needed -* [ARROW-923](https://issues.apache.org/jira/browse/ARROW-923) - [Docs] Generate Changelog for website with JIRA links -* [ARROW-929](https://issues.apache.org/jira/browse/ARROW-929) - Move KEYS file to SVN, remove from git -* [ARROW-943](https://issues.apache.org/jira/browse/ARROW-943) - [GLib] Support running unit tests with source archive -* [ARROW-945](https://issues.apache.org/jira/browse/ARROW-945) - [GLib] Add a Lua example to show Torch integration -* [ARROW-946](https://issues.apache.org/jira/browse/ARROW-946) - [GLib] Use "new" instead of "open" for constructor name -* [ARROW-947](https://issues.apache.org/jira/browse/ARROW-947) - [Python] Improve execution time of manylinux1 build -* [ARROW-953](https://issues.apache.org/jira/browse/ARROW-953) - Use cmake / curl from conda-forge in CI builds -* [ARROW-954](https://issues.apache.org/jira/browse/ARROW-954) - Make it possible to compile Arrow with header-only boost -* [ARROW-956](https://issues.apache.org/jira/browse/ARROW-956) - remove pandas pre-0.20.0 compat -* [ARROW-957](https://issues.apache.org/jira/browse/ARROW-957) - [Doc] Add HDFS and Windows documents to doxygen output -* [ARROW-961](https://issues.apache.org/jira/browse/ARROW-961) - [Python] Rename InMemoryOutputStream to BufferOutputStream -* [ARROW-963](https://issues.apache.org/jira/browse/ARROW-963) - [GLib] Add equal -* [ARROW-967](https://issues.apache.org/jira/browse/ARROW-967) - [GLib] Support initializing array with buffer -* [ARROW-970](https://issues.apache.org/jira/browse/ARROW-970) - [Python] Accidentally calling pyarrow.Table() should not segfault process -* [ARROW-977](https://issues.apache.org/jira/browse/ARROW-977) - [java] Add Timezone aware timestamp vectors -* [ARROW-980](https://issues.apache.org/jira/browse/ARROW-980) - Fix detection of "msvc" COMPILER\_FAMILY -* [ARROW-982](https://issues.apache.org/jira/browse/ARROW-982) - [Website] Improve website front copy to highlight serialization efficiency benefits 
-* [ARROW-984](https://issues.apache.org/jira/browse/ARROW-984) - [GLib] Add Go examples -* [ARROW-985](https://issues.apache.org/jira/browse/ARROW-985) - [GLib] Update package information -* [ARROW-988](https://issues.apache.org/jira/browse/ARROW-988) - [JS] Add entry to Travis CI matrix -* [ARROW-993](https://issues.apache.org/jira/browse/ARROW-993) - [GLib] Add missing error checks in Go examples -* [ARROW-996](https://issues.apache.org/jira/browse/ARROW-996) - [Website] Add 0.3 release announce in Japanese -* [ARROW-997](https://issues.apache.org/jira/browse/ARROW-997) - [Java] Implement transfer in FixedSizeListVector -* [ARROW-1000](https://issues.apache.org/jira/browse/ARROW-1000) - [GLib] Move install document to Website -* [ARROW-1001](https://issues.apache.org/jira/browse/ARROW-1001) - [GLib] Unify writer files -* [ARROW-1002](https://issues.apache.org/jira/browse/ARROW-1002) - [C++] It is not necessary to add padding after the magic header in the FileWriter implementation -* [ARROW-1008](https://issues.apache.org/jira/browse/ARROW-1008) - [C++] Define abstract interface for stream iteration -* [ARROW-1010](https://issues.apache.org/jira/browse/ARROW-1010) - [Website] Only show English posts in /blog/ -* [ARROW-1011](https://issues.apache.org/jira/browse/ARROW-1011) - [Format] Clarify requirements around buffer padding in validity bitmaps -* [ARROW-1014](https://issues.apache.org/jira/browse/ARROW-1014) - 0.4.0 release -* [ARROW-1015](https://issues.apache.org/jira/browse/ARROW-1015) - [Java] Implement schema-level metadata -* [ARROW-1016](https://issues.apache.org/jira/browse/ARROW-1016) - Python: Include C++ headers (optionally) in wheels -* [ARROW-1022](https://issues.apache.org/jira/browse/ARROW-1022) - [Python] Add nthreads option to Feather read method -* [ARROW-1024](https://issues.apache.org/jira/browse/ARROW-1024) - Python: Update build time numpy version to 1.10.1 -* [ARROW-1025](https://issues.apache.org/jira/browse/ARROW-1025) - [Website] Improve changelog on website -* [ARROW-1027](https://issues.apache.org/jira/browse/ARROW-1027) - [Python] Allow negative indexing in fields/columns on pyarrow Table and Schema objects -* [ARROW-1028](https://issues.apache.org/jira/browse/ARROW-1028) - [Python] Documentation updates after ARROW-1008 -* [ARROW-1029](https://issues.apache.org/jira/browse/ARROW-1029) - [Python] Fix --with-parquet build on Windows, add unit tests to Appveyor -* [ARROW-1030](https://issues.apache.org/jira/browse/ARROW-1030) - Python: Account for library versioning in parquet-cpp -* [ARROW-1031](https://issues.apache.org/jira/browse/ARROW-1031) - [GLib] Support pretty print -* [ARROW-1037](https://issues.apache.org/jira/browse/ARROW-1037) - [GLib] Follow reader name change -* [ARROW-1038](https://issues.apache.org/jira/browse/ARROW-1038) - [GLib] Follow writer name change -* [ARROW-1040](https://issues.apache.org/jira/browse/ARROW-1040) - [GLib] Follow tensor IO -* [ARROW-1044](https://issues.apache.org/jira/browse/ARROW-1044) - [GLib] Support Feather -* [ARROW-1126](https://issues.apache.org/jira/browse/ARROW-1126) - Python: Add function to convert NumPy/Pandas dtypes to Arrow DataTypes - - - -# Apache Arrow 0.3.0 (2017-05-05) - -## Bug Fixes - -* [ARROW-109](https://issues.apache.org/jira/browse/ARROW-109) - [C++] Investigate recursive data types limit in flatbuffers -* [ARROW-208](https://issues.apache.org/jira/browse/ARROW-208) - Add checkstyle policy to java project -* [ARROW-347](https://issues.apache.org/jira/browse/ARROW-347) - Add method to pass 
CallBack when creating a transfer pair -* [ARROW-413](https://issues.apache.org/jira/browse/ARROW-413) - DATE type is not specified clearly -* [ARROW-431](https://issues.apache.org/jira/browse/ARROW-431) - [Python] Review GIL release and acquisition in to\_pandas conversion -* [ARROW-443](https://issues.apache.org/jira/browse/ARROW-443) - [Python] Support for converting from strided pandas data in Table.from\_pandas -* [ARROW-451](https://issues.apache.org/jira/browse/ARROW-451) - [C++] Override DataType::Equals for other types with additional metadata -* [ARROW-454](https://issues.apache.org/jira/browse/ARROW-454) - pojo.Field doesn't implement hashCode() -* [ARROW-526](https://issues.apache.org/jira/browse/ARROW-526) - [Format] Update IPC.md to account for File format changes and Streaming format -* [ARROW-565](https://issues.apache.org/jira/browse/ARROW-565) - [C++] Examine "Field::dictionary" member -* [ARROW-570](https://issues.apache.org/jira/browse/ARROW-570) - Determine Java tools JAR location from project metadata -* [ARROW-584](https://issues.apache.org/jira/browse/ARROW-584) - [C++] Fix compiler warnings exposed with -Wconversion -* [ARROW-586](https://issues.apache.org/jira/browse/ARROW-586) - Problem with reading parquet files saved by Apache Spark -* [ARROW-588](https://issues.apache.org/jira/browse/ARROW-588) - [C++] Fix compiler warnings on 32-bit platforms -* [ARROW-595](https://issues.apache.org/jira/browse/ARROW-595) - [Python] StreamReader.schema returns None -* [ARROW-604](https://issues.apache.org/jira/browse/ARROW-604) - Python: boxed Field instances are missing the reference to DataType -* [ARROW-611](https://issues.apache.org/jira/browse/ARROW-611) - [Java] TimeVector TypeLayout is incorrectly specified as 64 bit width -* [ARROW-613](https://issues.apache.org/jira/browse/ARROW-613) - [JS] Implement random-access file format -* [ARROW-617](https://issues.apache.org/jira/browse/ARROW-617) - Time type is not specified clearly -* [ARROW-619](https://issues.apache.org/jira/browse/ARROW-619) - Python: Fix typos in setup.py args and LD\_LIBRARY\_PATH -* [ARROW-619](https://issues.apache.org/jira/browse/ARROW-619) - Python: Fix typos in setup.py args and LD\_LIBRARY\_PATH -* [ARROW-623](https://issues.apache.org/jira/browse/ARROW-623) - segfault with \_\_repr\_\_ of empty Field -* [ARROW-624](https://issues.apache.org/jira/browse/ARROW-624) - [C++] Restore MakePrimitiveArray function -* [ARROW-627](https://issues.apache.org/jira/browse/ARROW-627) - [C++] Compatibility macros for exported extern template class declarations -* [ARROW-628](https://issues.apache.org/jira/browse/ARROW-628) - [Python] Install nomkl metapackage when building parquet-cpp for faster Travis builds -* [ARROW-630](https://issues.apache.org/jira/browse/ARROW-630) - [C++] IPC unloading for BooleanArray does not account for offset -* [ARROW-636](https://issues.apache.org/jira/browse/ARROW-636) - [C++] Add Boost / other system requirements to C++ README -* [ARROW-639](https://issues.apache.org/jira/browse/ARROW-639) - [C++] Invalid offset in slices -* [ARROW-642](https://issues.apache.org/jira/browse/ARROW-642) - [Java] Remove temporary file in java/tools -* [ARROW-644](https://issues.apache.org/jira/browse/ARROW-644) - Python: Cython should be a setup-only requirement -* [ARROW-652](https://issues.apache.org/jira/browse/ARROW-652) - Remove trailing f in merge script output -* [ARROW-654](https://issues.apache.org/jira/browse/ARROW-654) - [C++] Support timezone metadata in file/stream formats -* 
[ARROW-666](https://issues.apache.org/jira/browse/ARROW-666) - [Python] Error in DictionaryArray \_\_repr\_\_ -* [ARROW-667](https://issues.apache.org/jira/browse/ARROW-667) - build of arrow-master/cpp fails with altivec error? -* [ARROW-668](https://issues.apache.org/jira/browse/ARROW-668) - [Python] Convert nanosecond timestamps to pandas.Timestamp when converting from TimestampValue -* [ARROW-671](https://issues.apache.org/jira/browse/ARROW-671) - [GLib] License file isn't installed -* [ARROW-673](https://issues.apache.org/jira/browse/ARROW-673) - [Java] Support additional Time metadata -* [ARROW-677](https://issues.apache.org/jira/browse/ARROW-677) - [java] Fix checkstyle jcl-over-slf4j conflict issue -* [ARROW-678](https://issues.apache.org/jira/browse/ARROW-678) - [GLib] Fix dependencies -* [ARROW-680](https://issues.apache.org/jira/browse/ARROW-680) - [C++] Multiarch support impacts user-supplied install prefix -* [ARROW-682](https://issues.apache.org/jira/browse/ARROW-682) - Add self-validation checks in integration tests -* [ARROW-683](https://issues.apache.org/jira/browse/ARROW-683) - [C++] Support date32 (DateUnit::DAY) in IPC metadata, rename date to date64 -* [ARROW-685](https://issues.apache.org/jira/browse/ARROW-685) - [GLib] AX\_CXX\_COMPILE\_STDCXX\_11 error running ./configure -* [ARROW-686](https://issues.apache.org/jira/browse/ARROW-686) - [C++] Account for time metadata changes, add time32 and time64 types -* [ARROW-689](https://issues.apache.org/jira/browse/ARROW-689) - [GLib] Install header files and documents to wrong directories -* [ARROW-691](https://issues.apache.org/jira/browse/ARROW-691) - [Java] Encode dictionary Int type in message format -* [ARROW-697](https://issues.apache.org/jira/browse/ARROW-697) - [Java] Raise appropriate exceptions when encountering large (\> INT32\_MAX) record batches -* [ARROW-699](https://issues.apache.org/jira/browse/ARROW-699) - [C++] Arrow dynamic libraries are missed on run of unit tests on Windows -* [ARROW-702](https://issues.apache.org/jira/browse/ARROW-702) - Fix BitVector.copyFromSafe to reAllocate instead of returning false -* [ARROW-703](https://issues.apache.org/jira/browse/ARROW-703) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire -* [ARROW-704](https://issues.apache.org/jira/browse/ARROW-704) - Fix bad import caused by conflicting changes -* [ARROW-709](https://issues.apache.org/jira/browse/ARROW-709) - [C++] Restore type comparator for DecimalType -* [ARROW-713](https://issues.apache.org/jira/browse/ARROW-713) - [C++] Fix linking issue with ipc benchmark -* [ARROW-715](https://issues.apache.org/jira/browse/ARROW-715) - Python: Explicit pandas import makes it a hard requirement -* [ARROW-716](https://issues.apache.org/jira/browse/ARROW-716) - error building arrow/python -* [ARROW-720](https://issues.apache.org/jira/browse/ARROW-720) - [java] arrow should not have a dependency on slf4j bridges in compile -* [ARROW-723](https://issues.apache.org/jira/browse/ARROW-723) - Arrow freezes on write if chunk\_size=0 -* [ARROW-726](https://issues.apache.org/jira/browse/ARROW-726) - [C++] PyBuffer dtor may segfault if constructor passed an object not exporting buffer protocol -* [ARROW-732](https://issues.apache.org/jira/browse/ARROW-732) - Schema comparison bugs in struct and union types -* [ARROW-736](https://issues.apache.org/jira/browse/ARROW-736) - [Python] Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default -* 
[ARROW-738](https://issues.apache.org/jira/browse/ARROW-738) - [Python] Fix manylinux1 packaging -* [ARROW-739](https://issues.apache.org/jira/browse/ARROW-739) - Parallel build fails non-deterministically. -* [ARROW-740](https://issues.apache.org/jira/browse/ARROW-740) - FileReader fails for large objects -* [ARROW-747](https://issues.apache.org/jira/browse/ARROW-747) - [C++] Fix spurious warning caused by passing dl to add\_dependencies -* [ARROW-749](https://issues.apache.org/jira/browse/ARROW-749) - [Python] Delete incomplete binary files when writing fails -* [ARROW-753](https://issues.apache.org/jira/browse/ARROW-753) - [Python] Unit tests in arrow/python fail to link on some OS X platforms -* [ARROW-756](https://issues.apache.org/jira/browse/ARROW-756) - [C++] Do not pass -fPIC when compiling with MSVC -* [ARROW-757](https://issues.apache.org/jira/browse/ARROW-757) - [C++] MSVC build fails on googletest when using NMake -* [ARROW-762](https://issues.apache.org/jira/browse/ARROW-762) - Kerberos Problem with PyArrow -* [ARROW-776](https://issues.apache.org/jira/browse/ARROW-776) - [GLib] Cast type is wrong -* [ARROW-777](https://issues.apache.org/jira/browse/ARROW-777) - [Java] Resolve getObject behavior per changes / discussion in ARROW-729 -* [ARROW-778](https://issues.apache.org/jira/browse/ARROW-778) - Modify merge tool to work on Windows -* [ARROW-780](https://issues.apache.org/jira/browse/ARROW-780) - PYTHON\_EXECUTABLE Required to be set during build -* [ARROW-781](https://issues.apache.org/jira/browse/ARROW-781) - [Python/C++] Increase reference count for base object? -* [ARROW-783](https://issues.apache.org/jira/browse/ARROW-783) - Integration tests fail for length-0 record batch -* [ARROW-787](https://issues.apache.org/jira/browse/ARROW-787) - [GLib] Fix compilation errors caused by ARROW-758 -* [ARROW-789](https://issues.apache.org/jira/browse/ARROW-789) - Fix issue where setValueCount(0) doesn’t work in the case that we’ve shipped vectors across the wire -* [ARROW-793](https://issues.apache.org/jira/browse/ARROW-793) - [GLib] Wrong indent -* [ARROW-794](https://issues.apache.org/jira/browse/ARROW-794) - [C++] Check whether data is contiguous in ipc::WriteTensor -* [ARROW-796](https://issues.apache.org/jira/browse/ARROW-796) - [Java] Checkstyle additions causing build failure in some environments -* [ARROW-797](https://issues.apache.org/jira/browse/ARROW-797) - [Python] Add updated pyarrow.\* public API listing in Sphinx docs -* [ARROW-800](https://issues.apache.org/jira/browse/ARROW-800) - [C++] Boost headers being transitively included in pyarrow -* [ARROW-805](https://issues.apache.org/jira/browse/ARROW-805) - listing empty HDFS directory returns an error instead of returning empty list -* [ARROW-809](https://issues.apache.org/jira/browse/ARROW-809) - C++: Writing sliced record batch to IPC writes the entire array -* [ARROW-812](https://issues.apache.org/jira/browse/ARROW-812) - Pip install pyarrow on mac failed. 
-* [ARROW-817](https://issues.apache.org/jira/browse/ARROW-817) - [C++] Fix incorrect code comment from ARROW-722 -* [ARROW-821](https://issues.apache.org/jira/browse/ARROW-821) - [Python] Extra file \_table\_api.h generated during Python build process -* [ARROW-822](https://issues.apache.org/jira/browse/ARROW-822) - [Python] StreamWriter fails to open with socket as sink -* [ARROW-826](https://issues.apache.org/jira/browse/ARROW-826) - Compilation error on Mac with -DARROW\_PYTHON=on -* [ARROW-829](https://issues.apache.org/jira/browse/ARROW-829) - Python: Parquet: Dictionary encoding is deactivated if column-wise compression was selected -* [ARROW-830](https://issues.apache.org/jira/browse/ARROW-830) - Python: jemalloc is not anymore publicly exposed -* [ARROW-836](https://issues.apache.org/jira/browse/ARROW-836) - Test for timedelta compat with pandas -* [ARROW-839](https://issues.apache.org/jira/browse/ARROW-839) - [C++] Portable alternative to PyDate\_to\_ms function -* [ARROW-847](https://issues.apache.org/jira/browse/ARROW-847) - C++: BUILD\_BYPRODUCTS not specified anymore for gtest -* [ARROW-852](https://issues.apache.org/jira/browse/ARROW-852) - Python: Also set Arrow Library PATHS when detection was done through pkg-config -* [ARROW-853](https://issues.apache.org/jira/browse/ARROW-853) - [Python] It is no longer necessary to modify the RPATH of the Cython extensions on many environments -* [ARROW-858](https://issues.apache.org/jira/browse/ARROW-858) - Remove dependency on boost regex -* [ARROW-866](https://issues.apache.org/jira/browse/ARROW-866) - [Python] Error from file object destructor -* [ARROW-867](https://issues.apache.org/jira/browse/ARROW-867) - [Python] Miscellaneous pyarrow MSVC fixes -* [ARROW-875](https://issues.apache.org/jira/browse/ARROW-875) - Nullable variable length vector fillEmpties() fills an extra value -* [ARROW-879](https://issues.apache.org/jira/browse/ARROW-879) - compat with pandas 0.20.0 -* [ARROW-882](https://issues.apache.org/jira/browse/ARROW-882) - [C++] On Windows statically built lib file overwrites lib file of shared build -* [ARROW-883](https://issues.apache.org/jira/browse/ARROW-883) - [JAVA] Introduction of new types has shifted Enumerations -* [ARROW-885](https://issues.apache.org/jira/browse/ARROW-885) - [Python/C++] Decimal test failure on MSVC -* [ARROW-886](https://issues.apache.org/jira/browse/ARROW-886) - VariableLengthVectors don't reAlloc offsets -* [ARROW-887](https://issues.apache.org/jira/browse/ARROW-887) - [format] For backward compatibility, new unit fields must have default values matching previous implied unit -* [ARROW-888](https://issues.apache.org/jira/browse/ARROW-888) - BitVector transfer() does not transfer ownership -* [ARROW-895](https://issues.apache.org/jira/browse/ARROW-895) - Nullable variable length vector lastSet not set correctly -* [ARROW-900](https://issues.apache.org/jira/browse/ARROW-900) - [Python] UnboundLocalError in ParquetDatasetPiece -* [ARROW-903](https://issues.apache.org/jira/browse/ARROW-903) - [GLib] Remove a needless "." 
-* [ARROW-914](https://issues.apache.org/jira/browse/ARROW-914) - [C++/Python] Fix Decimal ToBytes -* [ARROW-922](https://issues.apache.org/jira/browse/ARROW-922) - Allow Flatbuffers and RapidJSON to be used locally on Windows -* [ARROW-927](https://issues.apache.org/jira/browse/ARROW-927) - C++/Python: Add manylinux1 builds to Travis matrix -* [ARROW-928](https://issues.apache.org/jira/browse/ARROW-928) - Update CMAKE script to detect unsupported msvc compilers versions -* [ARROW-933](https://issues.apache.org/jira/browse/ARROW-933) - [Python] arrow\_python bindings have debug print statement -* [ARROW-934](https://issues.apache.org/jira/browse/ARROW-934) - [GLib] Glib sources missing from result of 02-source.sh -* [ARROW-936](https://issues.apache.org/jira/browse/ARROW-936) - Fix release README -* [ARROW-936](https://issues.apache.org/jira/browse/ARROW-936) - Fix release README -* [ARROW-938](https://issues.apache.org/jira/browse/ARROW-938) - Fix Apache Rat errors from source release build - - -## New Features and Improvements - -* [ARROW-6](https://issues.apache.org/jira/browse/ARROW-6) - Hope to add development document -* [ARROW-39](https://issues.apache.org/jira/browse/ARROW-39) - C++: Logical chunked arrays / columns: conforming to fixed chunk sizes -* [ARROW-52](https://issues.apache.org/jira/browse/ARROW-52) - Set up project blog -* [ARROW-95](https://issues.apache.org/jira/browse/ARROW-95) - Scaffold Main Documentation using asciidoc -* [ARROW-98](https://issues.apache.org/jira/browse/ARROW-98) - Java: API documentation -* [ARROW-99](https://issues.apache.org/jira/browse/ARROW-99) - C++: Explore if RapidCheck may be helpful for testing / worth adding to toolchain -* [ARROW-183](https://issues.apache.org/jira/browse/ARROW-183) - C++: Add storage type to DecimalType -* [ARROW-231](https://issues.apache.org/jira/browse/ARROW-231) - C++: Add typed Resize to PoolBuffer -* [ARROW-281](https://issues.apache.org/jira/browse/ARROW-281) - [C++] IPC/RPC support on Win32 platforms -* [ARROW-316](https://issues.apache.org/jira/browse/ARROW-316) - Finalize Date type -* [ARROW-341](https://issues.apache.org/jira/browse/ARROW-341) - [Python] Making libpyarrow available to third parties -* [ARROW-452](https://issues.apache.org/jira/browse/ARROW-452) - [C++/Python] Merge "Feather" file format implementation -* [ARROW-459](https://issues.apache.org/jira/browse/ARROW-459) - [C++] Implement IPC round trip for DictionaryArray, dictionaries shared across record batches -* [ARROW-483](https://issues.apache.org/jira/browse/ARROW-483) - [C++/Python] Provide access to "custom\_metadata" Field attribute in IPC setting -* [ARROW-491](https://issues.apache.org/jira/browse/ARROW-491) - [C++] Add FixedWidthBinary type -* [ARROW-492](https://issues.apache.org/jira/browse/ARROW-492) - [C++] Add arrow/arrow.h public API -* [ARROW-493](https://issues.apache.org/jira/browse/ARROW-493) - [C++] Allow in-memory array over 2^31 -1 elements but require splitting at IPC / RPC boundaries -* [ARROW-502](https://issues.apache.org/jira/browse/ARROW-502) - [C++/Python] Add MemoryPool implementation that logs allocation activity to std::cout -* [ARROW-510](https://issues.apache.org/jira/browse/ARROW-510) - Add integration tests for date and time types -* [ARROW-518](https://issues.apache.org/jira/browse/ARROW-518) - C++: Make Status::OK method constexpr -* [ARROW-520](https://issues.apache.org/jira/browse/ARROW-520) - [C++] Add STL-compliant allocator that hooks into an arrow::MemoryPool -* 
[ARROW-528](https://issues.apache.org/jira/browse/ARROW-528) - [Python] Support \_metadata or \_common\_metadata files when reading Parquet directories -* [ARROW-534](https://issues.apache.org/jira/browse/ARROW-534) - [C++] Add IPC tests for date/time types -* [ARROW-539](https://issues.apache.org/jira/browse/ARROW-539) - [Python] Support reading Parquet datasets with standard partition directory schemes -* [ARROW-542](https://issues.apache.org/jira/browse/ARROW-542) - [Java] Implement dictionaries in stream/file encoding -* [ARROW-550](https://issues.apache.org/jira/browse/ARROW-550) - [Format] Add a TensorMessage type -* [ARROW-552](https://issues.apache.org/jira/browse/ARROW-552) - [Python] Add scalar value support for Dictionary type -* [ARROW-557](https://issues.apache.org/jira/browse/ARROW-557) - [Python] Explicitly opt in to HDFS unit tests -* [ARROW-563](https://issues.apache.org/jira/browse/ARROW-563) - C++: Support non-standard gcc version strings -* [ARROW-566](https://issues.apache.org/jira/browse/ARROW-566) - Python: Deterministic position of libarrow in manylinux1 wheels -* [ARROW-568](https://issues.apache.org/jira/browse/ARROW-568) - [C++] Add default implementations for TypeVisitor, ArrayVisitor methods that return NotImplemented -* [ARROW-569](https://issues.apache.org/jira/browse/ARROW-569) - [C++] Set version for \*.pc -* [ARROW-574](https://issues.apache.org/jira/browse/ARROW-574) - Python: Add support for nested Python lists in Pandas conversion -* [ARROW-576](https://issues.apache.org/jira/browse/ARROW-576) - [C++] Complete round trip Union file/stream IPC tests -* [ARROW-577](https://issues.apache.org/jira/browse/ARROW-577) - [C++] Refactor StreamWriter and FileWriter to have private implementations -* [ARROW-578](https://issues.apache.org/jira/browse/ARROW-578) - [C++] Add CMake option to add custom $CXXFLAGS -* [ARROW-580](https://issues.apache.org/jira/browse/ARROW-580) - C++: Also provide jemalloc\_X targets if only a static or shared version is found -* [ARROW-582](https://issues.apache.org/jira/browse/ARROW-582) - [Java] Add Date/Time Support to JSON File -* [ARROW-589](https://issues.apache.org/jira/browse/ARROW-589) - C++: Use system provided shared jemalloc if static is unavailable -* [ARROW-591](https://issues.apache.org/jira/browse/ARROW-591) - [C++] Add round trip testing fixture for JSON format -* [ARROW-593](https://issues.apache.org/jira/browse/ARROW-593) - [C++] Rename ReadableFileInterface to RandomAccessFile -* [ARROW-598](https://issues.apache.org/jira/browse/ARROW-598) - [Python] Add support for converting pyarrow.Buffer to a memoryview with zero copy -* [ARROW-603](https://issues.apache.org/jira/browse/ARROW-603) - [C++] Add RecordBatch::Validate method that at least checks that schema matches the array metadata -* [ARROW-605](https://issues.apache.org/jira/browse/ARROW-605) - [C++] Refactor generic ArrayLoader class, support work for Feather merge -* [ARROW-606](https://issues.apache.org/jira/browse/ARROW-606) - [C++] Upgrade to flatbuffers 1.6.0 -* [ARROW-608](https://issues.apache.org/jira/browse/ARROW-608) - [Format] Days since epoch date type -* [ARROW-610](https://issues.apache.org/jira/browse/ARROW-610) - [C++] Win32 compatibility in file.cc -* [ARROW-612](https://issues.apache.org/jira/browse/ARROW-612) - [Java] Field toString should show nullable flag status -* [ARROW-615](https://issues.apache.org/jira/browse/ARROW-615) - Move ByteArrayReadableSeekableByteChannel to vector.util package -* 
[ARROW-616](https://issues.apache.org/jira/browse/ARROW-616) - [C++] Remove -g flag in release builds -* [ARROW-618](https://issues.apache.org/jira/browse/ARROW-618) - [Python] Implement support for DatetimeTZ custom type from pandas -* [ARROW-620](https://issues.apache.org/jira/browse/ARROW-620) - [C++] Add date/time support to JSON reader/writer for integration testing -* [ARROW-621](https://issues.apache.org/jira/browse/ARROW-621) - [C++] Implement an "inline visitor" template that enables visitor-pattern-like code without virtual function dispatch -* [ARROW-625](https://issues.apache.org/jira/browse/ARROW-625) - [C++] Add time unit to TimeType::ToString -* [ARROW-626](https://issues.apache.org/jira/browse/ARROW-626) - [Python] Enable pyarrow.BufferReader to read from any Python object implementing the buffer/memoryview protocol -* [ARROW-631](https://issues.apache.org/jira/browse/ARROW-631) - [GLib] Import C API (C++ API wrapper) based on GLib from https://github.com/kou/arrow-glib -* [ARROW-632](https://issues.apache.org/jira/browse/ARROW-632) - [Python] Add support for FixedWidthBinary type -* [ARROW-635](https://issues.apache.org/jira/browse/ARROW-635) - [C++] Add JSON read/write support for FixedWidthBinary -* [ARROW-637](https://issues.apache.org/jira/browse/ARROW-637) - [Format] Add time zone metadata to Timestamp type -* [ARROW-646](https://issues.apache.org/jira/browse/ARROW-646) - Cache miniconda packages -* [ARROW-647](https://issues.apache.org/jira/browse/ARROW-647) - [C++] Don't require Boost static libraries to support CentOS 7 -* [ARROW-648](https://issues.apache.org/jira/browse/ARROW-648) - [C++] Support multiarch on Debian -* [ARROW-650](https://issues.apache.org/jira/browse/ARROW-650) - [GLib] Follow ReadableFileInterface -\> RandomAccessFile change -* [ARROW-651](https://issues.apache.org/jira/browse/ARROW-651) - [C++] Set shared library version for .deb packages -* [ARROW-655](https://issues.apache.org/jira/browse/ARROW-655) - Implement DecimalArray -* [ARROW-656](https://issues.apache.org/jira/browse/ARROW-656) - [C++] Implement IO interface that can read and write to a fixed-size mutable buffer -* [ARROW-657](https://issues.apache.org/jira/browse/ARROW-657) - [Python] Write and read tensors (with zero copy) into shared memory -* [ARROW-658](https://issues.apache.org/jira/browse/ARROW-658) - [C++] Implement in-memory arrow::Tensor objects -* [ARROW-659](https://issues.apache.org/jira/browse/ARROW-659) - [C++] Add multithreaded memcpy implementation (for hardware where it helps) -* [ARROW-660](https://issues.apache.org/jira/browse/ARROW-660) - [C++] Restore function that can read a complete encapsulated record batch message -* [ARROW-661](https://issues.apache.org/jira/browse/ARROW-661) - [C++] Add a Flatbuffer metadata type that supports array data over 2^31 - 1 elements -* [ARROW-662](https://issues.apache.org/jira/browse/ARROW-662) - [Format] Factor Flatbuffer schema metadata into a Schema.fbs -* [ARROW-663](https://issues.apache.org/jira/browse/ARROW-663) - [Java] Support additional Time metadata + vector value accessors -* [ARROW-664](https://issues.apache.org/jira/browse/ARROW-664) - Make C++ Arrow serialization deterministic -* [ARROW-669](https://issues.apache.org/jira/browse/ARROW-669) - [Python] Attach proper tzinfo when computing boxed scalars for TimestampArray -* [ARROW-670](https://issues.apache.org/jira/browse/ARROW-670) - Arrow 0.3 release -* [ARROW-672](https://issues.apache.org/jira/browse/ARROW-672) - [Format] Bump metadata version for 0.3 release 
-* [ARROW-674](https://issues.apache.org/jira/browse/ARROW-674) - [Java] Support additional Timestamp timezone metadata -* [ARROW-675](https://issues.apache.org/jira/browse/ARROW-675) - [GLib] Update package metadata -* [ARROW-676](https://issues.apache.org/jira/browse/ARROW-676) - [java] move from MinorType to FieldType in ValueVectors to carry all the relevant type bits -* [ARROW-679](https://issues.apache.org/jira/browse/ARROW-679) - [Format] Change RecordBatch and Field length members from int to long -* [ARROW-681](https://issues.apache.org/jira/browse/ARROW-681) - [C++] Build Arrow on Windows with dynamically linked boost -* [ARROW-684](https://issues.apache.org/jira/browse/ARROW-684) - Python: More informative message when parquet-cpp but not parquet-arrow is available -* [ARROW-687](https://issues.apache.org/jira/browse/ARROW-687) - [C++] Build and run full test suite in Appveyor -* [ARROW-688](https://issues.apache.org/jira/browse/ARROW-688) - [C++] Use CMAKE\_INSTALL\_INCLUDEDIR for consistency -* [ARROW-690](https://issues.apache.org/jira/browse/ARROW-690) - Only send JIRA updates to issues@arrow.apache.org -* [ARROW-698](https://issues.apache.org/jira/browse/ARROW-698) - [C++] Add options to StreamWriter/FileWriter to permit large record batches -* [ARROW-700](https://issues.apache.org/jira/browse/ARROW-700) - Add headroom interface for allocator. -* [ARROW-701](https://issues.apache.org/jira/browse/ARROW-701) - [Java] Support additional Date metadata -* [ARROW-706](https://issues.apache.org/jira/browse/ARROW-706) - [GLib] Add package install document -* [ARROW-707](https://issues.apache.org/jira/browse/ARROW-707) - Python: All none-Pandas column should be converted to NullArray -* [ARROW-708](https://issues.apache.org/jira/browse/ARROW-708) - [C++] Some IPC code simplification, perf analysis -* [ARROW-710](https://issues.apache.org/jira/browse/ARROW-710) - [Python] Enable Feather APIs to read and write using Python file-like objects -* [ARROW-711](https://issues.apache.org/jira/browse/ARROW-711) - [C++] Remove extern template declarations for NumericArray types -* [ARROW-712](https://issues.apache.org/jira/browse/ARROW-712) - [C++] Implement Array::Accept as inline visitor -* [ARROW-717](https://issues.apache.org/jira/browse/ARROW-717) - [C++] IPC zero-copy round trips for arrow::Tensor -* [ARROW-718](https://issues.apache.org/jira/browse/ARROW-718) - [Python] Expose arrow::Tensor with conversions to/from NumPy arrays -* [ARROW-719](https://issues.apache.org/jira/browse/ARROW-719) - [GLib] Support prepared source archive release -* [ARROW-722](https://issues.apache.org/jira/browse/ARROW-722) - [Python] pandas conversions for new date and time types/metadata -* [ARROW-724](https://issues.apache.org/jira/browse/ARROW-724) - Add "How to Contribute" section to README -* [ARROW-725](https://issues.apache.org/jira/browse/ARROW-725) - [Format] Constant length list type -* [ARROW-727](https://issues.apache.org/jira/browse/ARROW-727) - [Python] Write memoryview-compatible objects in NativeFile.write with zero copy -* [ARROW-728](https://issues.apache.org/jira/browse/ARROW-728) - [C++/Python] Add arrow::Table function for removing a column -* [ARROW-729](https://issues.apache.org/jira/browse/ARROW-729) - [Java] Add vector type for 32-bit date as days since UNIX epoch -* [ARROW-731](https://issues.apache.org/jira/browse/ARROW-731) - [C++] Add shared library related versions to .pc -* [ARROW-733](https://issues.apache.org/jira/browse/ARROW-733) - [C++/Format] Change name of Fixed Width 
Binary to Fixed \*Size\* Binary for consistency -* [ARROW-734](https://issues.apache.org/jira/browse/ARROW-734) - [Python] Support for pyarrow on Windows / MSVC -* [ARROW-735](https://issues.apache.org/jira/browse/ARROW-735) - [C++] Developer instruction document for MSVC on Windows -* [ARROW-737](https://issues.apache.org/jira/browse/ARROW-737) - [C++] Support obtaining mutable slices of mutable buffers -* [ARROW-741](https://issues.apache.org/jira/browse/ARROW-741) - [Python] Add Python 3.6 to Travis CI -* [ARROW-743](https://issues.apache.org/jira/browse/ARROW-743) - [C++] Consolidate unit tests for code in array.h -* [ARROW-744](https://issues.apache.org/jira/browse/ARROW-744) - [GLib] Re-add an assertion to garrow\_table\_new() test -* [ARROW-745](https://issues.apache.org/jira/browse/ARROW-745) - [C++] Allow use of system cpplint -* [ARROW-746](https://issues.apache.org/jira/browse/ARROW-746) - [GLib] Add garrow\_array\_get\_data\_type() -* [ARROW-748](https://issues.apache.org/jira/browse/ARROW-748) - [Python] Pin runtime library versions in conda-forge packages to force upgrades -* [ARROW-751](https://issues.apache.org/jira/browse/ARROW-751) - [Python] Rename all Cython extensions to "private" status with leading underscore -* [ARROW-752](https://issues.apache.org/jira/browse/ARROW-752) - [Python] Construct pyarrow.DictionaryArray from boxed pyarrow array objects -* [ARROW-754](https://issues.apache.org/jira/browse/ARROW-754) - [GLib] Add garrow\_array\_is\_null() -* [ARROW-755](https://issues.apache.org/jira/browse/ARROW-755) - [GLib] Add garrow\_array\_get\_value\_type() -* [ARROW-758](https://issues.apache.org/jira/browse/ARROW-758) - [C++] Fix compiler warnings on MSVC x64 -* [ARROW-761](https://issues.apache.org/jira/browse/ARROW-761) - [Python] Add function to compute the total size of tensor payloads, including metadata and padding -* [ARROW-763](https://issues.apache.org/jira/browse/ARROW-763) - C++: Use \`python-config\` to find libpythonX.X.dylib -* [ARROW-765](https://issues.apache.org/jira/browse/ARROW-765) - [Python] Make generic ArrowException subclass value error -* [ARROW-768](https://issues.apache.org/jira/browse/ARROW-768) - [Java] Change the "boxed" object representation of date and time types -* [ARROW-769](https://issues.apache.org/jira/browse/ARROW-769) - [GLib] Support building without installed Arrow C++ -* [ARROW-770](https://issues.apache.org/jira/browse/ARROW-770) - [C++] Move clang-tidy/format config files back to C++ source tree -* [ARROW-771](https://issues.apache.org/jira/browse/ARROW-771) - [Python] Add APIs for reading individual Parquet row groups -* [ARROW-773](https://issues.apache.org/jira/browse/ARROW-773) - [C++] Add function to create arrow::Table with column appended to existing table -* [ARROW-774](https://issues.apache.org/jira/browse/ARROW-774) - [GLib] Remove needless LICENSE.txt copy -* [ARROW-775](https://issues.apache.org/jira/browse/ARROW-775) - [Java] add simple constructors to value vectors -* [ARROW-779](https://issues.apache.org/jira/browse/ARROW-779) - [C++/Python] Raise exception if old metadata encountered -* [ARROW-782](https://issues.apache.org/jira/browse/ARROW-782) - [C++] Change struct to class for objects that meet the criteria in the Google style guide -* [ARROW-788](https://issues.apache.org/jira/browse/ARROW-788) - Possible nondeterminism in Tensor serialization code -* [ARROW-795](https://issues.apache.org/jira/browse/ARROW-795) - [C++] Combine libarrow/libarrow\_io/libarrow\_ipc -* 
[ARROW-798](https://issues.apache.org/jira/browse/ARROW-798) - [Docs] Publish Format Markdown documents somehow on arrow.apache.org -* [ARROW-802](https://issues.apache.org/jira/browse/ARROW-802) - [GLib] Add read examples -* [ARROW-803](https://issues.apache.org/jira/browse/ARROW-803) - [GLib] Update package repository URL -* [ARROW-804](https://issues.apache.org/jira/browse/ARROW-804) - [GLib] Update build document -* [ARROW-806](https://issues.apache.org/jira/browse/ARROW-806) - [GLib] Support add/remove a column from table -* [ARROW-807](https://issues.apache.org/jira/browse/ARROW-807) - [GLib] Update "Since" tag -* [ARROW-808](https://issues.apache.org/jira/browse/ARROW-808) - [GLib] Remove needless ignore entries -* [ARROW-810](https://issues.apache.org/jira/browse/ARROW-810) - [GLib] Remove io/ipc prefix -* [ARROW-811](https://issues.apache.org/jira/browse/ARROW-811) - [GLib] Add GArrowBuffer -* [ARROW-815](https://issues.apache.org/jira/browse/ARROW-815) - [Java] Allow for expanding underlying buffer size after allocation -* [ARROW-816](https://issues.apache.org/jira/browse/ARROW-816) - [C++] Use conda packages for RapidJSON, Flatbuffers to speed up builds -* [ARROW-818](https://issues.apache.org/jira/browse/ARROW-818) - [Python] Review public pyarrow.\* API completeness and update docs -* [ARROW-820](https://issues.apache.org/jira/browse/ARROW-820) - [C++] Build dependencies for Parquet library without arrow support -* [ARROW-825](https://issues.apache.org/jira/browse/ARROW-825) - [Python] Generalize pyarrow.from\_pylist to accept any object implementing the PySequence protocol -* [ARROW-827](https://issues.apache.org/jira/browse/ARROW-827) - [Python] Variety of Parquet improvements to support Dask integration -* [ARROW-828](https://issues.apache.org/jira/browse/ARROW-828) - [CPP] Document new requirement (libboost-regex-dev) in README.md -* [ARROW-831](https://issues.apache.org/jira/browse/ARROW-831) - Switch from boost::regex to std::regex -* [ARROW-832](https://issues.apache.org/jira/browse/ARROW-832) - [C++] Upgrade thirdparty gtest to 1.8.0 -* [ARROW-833](https://issues.apache.org/jira/browse/ARROW-833) - [Python] "Quickstart" build / environment setup guide for Python developers -* [ARROW-841](https://issues.apache.org/jira/browse/ARROW-841) - [Python] Add pyarrow build to Appveyor -* [ARROW-844](https://issues.apache.org/jira/browse/ARROW-844) - [Format] Revise format/README.md to reflect progress reaching a more complete specification -* [ARROW-845](https://issues.apache.org/jira/browse/ARROW-845) - [Python] Sync FindArrow.cmake changes from parquet-cpp -* [ARROW-846](https://issues.apache.org/jira/browse/ARROW-846) - [GLib] Add GArrowTensor, GArrowInt8Tensor and GArrowUInt8Tensor -* [ARROW-848](https://issues.apache.org/jira/browse/ARROW-848) - [Python] Improvements / fixes to conda quickstart guide -* [ARROW-849](https://issues.apache.org/jira/browse/ARROW-849) - [C++] Add optional $ARROW\_BUILD\_TOOLCHAIN environment variable option for configuring build environment -* [ARROW-857](https://issues.apache.org/jira/browse/ARROW-857) - [Python] Automate publishing Python documentation to arrow-site -* [ARROW-859](https://issues.apache.org/jira/browse/ARROW-859) - [C++] Do not build unit tests by default? 
-* [ARROW-860](https://issues.apache.org/jira/browse/ARROW-860) - [C++] Decide if typed Tensor subclasses are worthwhile -* [ARROW-861](https://issues.apache.org/jira/browse/ARROW-861) - [Python] Move DEVELOPMENT.md to Sphinx docs -* [ARROW-862](https://issues.apache.org/jira/browse/ARROW-862) - [Python] Improve source build instructions in README -* [ARROW-863](https://issues.apache.org/jira/browse/ARROW-863) - [GLib] Use GBytes to implement zero-copy -* [ARROW-864](https://issues.apache.org/jira/browse/ARROW-864) - [GLib] Unify Array files -* [ARROW-865](https://issues.apache.org/jira/browse/ARROW-865) - [Python] Verify Parquet roundtrips for new date/time types -* [ARROW-868](https://issues.apache.org/jira/browse/ARROW-868) - [GLib] Use GBytes to reduce copy -* [ARROW-869](https://issues.apache.org/jira/browse/ARROW-869) - [JS] Rename directory to js/ -* [ARROW-871](https://issues.apache.org/jira/browse/ARROW-871) - [GLib] Unify DataType files -* [ARROW-876](https://issues.apache.org/jira/browse/ARROW-876) - [GLib] Unify ArrayBuffer files -* [ARROW-877](https://issues.apache.org/jira/browse/ARROW-877) - [GLib] Add garrow\_array\_get\_null\_bitmap() -* [ARROW-878](https://issues.apache.org/jira/browse/ARROW-878) - [GLib] Add garrow\_binary\_array\_get\_buffer() -* [ARROW-880](https://issues.apache.org/jira/browse/ARROW-880) - [GLib] Add garrow\_primitive\_array\_get\_buffer() -* [ARROW-890](https://issues.apache.org/jira/browse/ARROW-890) - [GLib] Add GArrowMutableBuffer -* [ARROW-892](https://issues.apache.org/jira/browse/ARROW-892) - [GLib] Fix GArrowTensor document -* [ARROW-893](https://issues.apache.org/jira/browse/ARROW-893) - Add GLib document to Web site -* [ARROW-894](https://issues.apache.org/jira/browse/ARROW-894) - [GLib] Add GArrowPoolBuffer -* [ARROW-896](https://issues.apache.org/jira/browse/ARROW-896) - [Docs] Add Jekyll plugin for including rendered Jupyter notebooks on website -* [ARROW-898](https://issues.apache.org/jira/browse/ARROW-898) - [C++] Expand metadata support to field level, provide for sharing instances of KeyValueMetadata -* [ARROW-904](https://issues.apache.org/jira/browse/ARROW-904) - [GLib] Simplify error check codes -* [ARROW-907](https://issues.apache.org/jira/browse/ARROW-907) - C++: Convenience construct Table from schema and arrays -* [ARROW-908](https://issues.apache.org/jira/browse/ARROW-908) - [GLib] Unify OutputStream files -* [ARROW-910](https://issues.apache.org/jira/browse/ARROW-910) - [C++] Write 0-length EOS indicator at end of stream -* [ARROW-916](https://issues.apache.org/jira/browse/ARROW-916) - [GLib] Add GArrowBufferOutputStream -* [ARROW-917](https://issues.apache.org/jira/browse/ARROW-917) - [GLib] Add GArrowBufferReader -* [ARROW-918](https://issues.apache.org/jira/browse/ARROW-918) - [GLib] Use GArrowBuffer for read -* [ARROW-919](https://issues.apache.org/jira/browse/ARROW-919) - [GLib] Use "id" to get type enum value from GArrowDataType -* [ARROW-920](https://issues.apache.org/jira/browse/ARROW-920) - [GLib] Add Lua examples -* [ARROW-925](https://issues.apache.org/jira/browse/ARROW-925) - [GLib] Fix GArrowBufferReader test -* [ARROW-926](https://issues.apache.org/jira/browse/ARROW-926) - Update KEYS to include wesm -* [ARROW-930](https://issues.apache.org/jira/browse/ARROW-930) - javadoc generation fails with java 8 -* [ARROW-931](https://issues.apache.org/jira/browse/ARROW-931) - [GLib] Reconstruct input stream -* [ARROW-965](https://issues.apache.org/jira/browse/ARROW-965) - Website updates for 0.3.0 release - - - -# Apache 
Arrow 0.2.0 (2017-02-18) - -## Bug Fixes - -* [ARROW-112](https://issues.apache.org/jira/browse/ARROW-112) - [C++] Style fix for constants/enums -* [ARROW-202](https://issues.apache.org/jira/browse/ARROW-202) - [C++] Integrate with appveyor ci for windows support and get arrow building on windows -* [ARROW-220](https://issues.apache.org/jira/browse/ARROW-220) - [C++] Build conda artifacts in a build environment with better cross-linux ABI compatibility -* [ARROW-224](https://issues.apache.org/jira/browse/ARROW-224) - [C++] Address static linking of boost dependencies -* [ARROW-230](https://issues.apache.org/jira/browse/ARROW-230) - Python: Do not name modules like native ones (i.e. rename pyarrow.io) -* [ARROW-239](https://issues.apache.org/jira/browse/ARROW-239) - [Python] HdfsFile.read called with no arguments should read remainder of file -* [ARROW-261](https://issues.apache.org/jira/browse/ARROW-261) - [C++] Refactor BinaryArray/StringArray classes to not inherit from ListArray -* [ARROW-273](https://issues.apache.org/jira/browse/ARROW-273) - Lists use unsigned offset vectors instead of signed (as defined in the spec) -* [ARROW-275](https://issues.apache.org/jira/browse/ARROW-275) - Add tests for UnionVector in Arrow File -* [ARROW-294](https://issues.apache.org/jira/browse/ARROW-294) - [C++] Do not use fopen / fclose / etc. methods for memory mapped file implementation -* [ARROW-322](https://issues.apache.org/jira/browse/ARROW-322) - [C++] Do not build HDFS IO interface optionally -* [ARROW-323](https://issues.apache.org/jira/browse/ARROW-323) - [Python] Opt-in to PyArrow parquet build rather than skipping silently on failure -* [ARROW-334](https://issues.apache.org/jira/browse/ARROW-334) - [Python] OS X rpath issues on some configurations -* [ARROW-337](https://issues.apache.org/jira/browse/ARROW-337) - UnionListWriter.list() is doing more than it should, this can cause data corruption -* [ARROW-339](https://issues.apache.org/jira/browse/ARROW-339) - Make merge\_arrow\_pr script work with Python 3 -* [ARROW-339](https://issues.apache.org/jira/browse/ARROW-339) - Make merge\_arrow\_pr script work with Python 3 -* [ARROW-340](https://issues.apache.org/jira/browse/ARROW-340) - [C++] Opening a writeable file on disk that already exists does not truncate to zero -* [ARROW-342](https://issues.apache.org/jira/browse/ARROW-342) - Set Python version on release -* [ARROW-345](https://issues.apache.org/jira/browse/ARROW-345) - libhdfs integration doesn't work for Mac -* [ARROW-346](https://issues.apache.org/jira/browse/ARROW-346) - Python API Documentation -* [ARROW-348](https://issues.apache.org/jira/browse/ARROW-348) - [Python] CMake build type should be configurable on the command line -* [ARROW-349](https://issues.apache.org/jira/browse/ARROW-349) - Six is missing as a requirement in the python setup.py -* [ARROW-351](https://issues.apache.org/jira/browse/ARROW-351) - Time type has no unit -* [ARROW-354](https://issues.apache.org/jira/browse/ARROW-354) - Cannot compare an array of empty strings to another -* [ARROW-357](https://issues.apache.org/jira/browse/ARROW-357) - Default Parquet chunk\_size of 64k is too small -* [ARROW-358](https://issues.apache.org/jira/browse/ARROW-358) - [C++] libhdfs can be in non-standard locations in some Hadoop distributions -* [ARROW-362](https://issues.apache.org/jira/browse/ARROW-362) - Python: Calling to\_pandas on a table read from Parquet leaks memory -* [ARROW-371](https://issues.apache.org/jira/browse/ARROW-371) - Python: Table with null timestamp 
becomes float in pandas -* [ARROW-375](https://issues.apache.org/jira/browse/ARROW-375) - columns parameter in parquet.read\_table() raises KeyError for valid column -* [ARROW-384](https://issues.apache.org/jira/browse/ARROW-384) - Align Java and C++ RecordBatch data and metadata layout -* [ARROW-386](https://issues.apache.org/jira/browse/ARROW-386) - [Java] Respect case of struct / map field names -* [ARROW-387](https://issues.apache.org/jira/browse/ARROW-387) - [C++] arrow::io::BufferReader does not permit shared memory ownership in zero-copy reads -* [ARROW-390](https://issues.apache.org/jira/browse/ARROW-390) - C++: CMake fails on json-integration-test with ARROW\_BUILD\_TESTS=OFF -* [ARROW-392](https://issues.apache.org/jira/browse/ARROW-392) - Fix string/binary integration tests -* [ARROW-393](https://issues.apache.org/jira/browse/ARROW-393) - [JAVA] JSON file reader fails to set the buffer size on String data vector -* [ARROW-395](https://issues.apache.org/jira/browse/ARROW-395) - Arrow file format writes record batches in reverse order. -* [ARROW-398](https://issues.apache.org/jira/browse/ARROW-398) - [Java] Java file format requires bitmaps of all 1's to be written when there are no nulls -* [ARROW-399](https://issues.apache.org/jira/browse/ARROW-399) - [Java] ListVector.loadFieldBuffers ignores the ArrowFieldNode length metadata -* [ARROW-400](https://issues.apache.org/jira/browse/ARROW-400) - [Java] ArrowWriter writes length 0 for Struct types -* [ARROW-401](https://issues.apache.org/jira/browse/ARROW-401) - [Java] Floating point vectors should do an approximate comparison in integration tests -* [ARROW-402](https://issues.apache.org/jira/browse/ARROW-402) - [Java] "refCnt gone negative" error in integration tests -* [ARROW-403](https://issues.apache.org/jira/browse/ARROW-403) - [JAVA] UnionVector: Creating a transfer pair doesn't transfer the schema to destination vector -* [ARROW-404](https://issues.apache.org/jira/browse/ARROW-404) - [Python] Closing an HdfsClient while there are still open file handles results in a crash -* [ARROW-405](https://issues.apache.org/jira/browse/ARROW-405) - [C++] Be less stringent about finding include/hdfs.h in HADOOP\_HOME -* [ARROW-406](https://issues.apache.org/jira/browse/ARROW-406) - [C++] Large HDFS reads must utilize the set file buffer size when making RPCs -* [ARROW-408](https://issues.apache.org/jira/browse/ARROW-408) - [C++/Python] Remove defunct conda recipes -* [ARROW-414](https://issues.apache.org/jira/browse/ARROW-414) - [Java] "Buffer too large to resize to ..." 
error -* [ARROW-420](https://issues.apache.org/jira/browse/ARROW-420) - Align Date implementation between Java and C++ -* [ARROW-421](https://issues.apache.org/jira/browse/ARROW-421) - [Python] Zero-copy buffers read by pyarrow::PyBytesReader must retain a reference to the parent PyBytes to avoid premature garbage collection issues -* [ARROW-422](https://issues.apache.org/jira/browse/ARROW-422) - C++: IPC should depend on rapidjson\_ep if RapidJSON is vendored -* [ARROW-429](https://issues.apache.org/jira/browse/ARROW-429) - git-archive SHA-256 checksums are changing -* [ARROW-433](https://issues.apache.org/jira/browse/ARROW-433) - [Python] Date conversion is locale-dependent -* [ARROW-434](https://issues.apache.org/jira/browse/ARROW-434) - Segfaults and encoding issues in Python Parquet reads -* [ARROW-435](https://issues.apache.org/jira/browse/ARROW-435) - C++: Spelling mistake in if(RAPIDJSON\_VENDORED) -* [ARROW-437](https://issues.apache.org/jira/browse/ARROW-437) - [C++] clang compiler warnings from overridden virtual functions -* [ARROW-445](https://issues.apache.org/jira/browse/ARROW-445) - C++: arrow\_ipc is built before arrow/ipc/Message\_generated.h was generated -* [ARROW-447](https://issues.apache.org/jira/browse/ARROW-447) - Python: Align scalar/pylist string encoding with pandas' one. -* [ARROW-455](https://issues.apache.org/jira/browse/ARROW-455) - [C++] BufferOutputStream dtor does not call Close() -* [ARROW-469](https://issues.apache.org/jira/browse/ARROW-469) - C++: Add option so that resize doesn't decrease the capacity -* [ARROW-481](https://issues.apache.org/jira/browse/ARROW-481) - [Python] Fix Python 2.7 regression in patch for PARQUET-472 -* [ARROW-486](https://issues.apache.org/jira/browse/ARROW-486) - [C++] arrow::io::MemoryMappedFile can't be casted to arrow::io::FileInterface -* [ARROW-487](https://issues.apache.org/jira/browse/ARROW-487) - Python: ConvertTableToPandas segfaults if ObjectBlock::Write fails -* [ARROW-494](https://issues.apache.org/jira/browse/ARROW-494) - [C++] When MemoryMappedFile is destructed, memory is unmapped even if buffer references still exist -* [ARROW-499](https://issues.apache.org/jira/browse/ARROW-499) - Update file serialization to use streaming serialization format -* [ARROW-505](https://issues.apache.org/jira/browse/ARROW-505) - [C++] Fix compiler warnings in release mode -* [ARROW-511](https://issues.apache.org/jira/browse/ARROW-511) - [Python] List[T] conversions not implemented for single arrays -* [ARROW-513](https://issues.apache.org/jira/browse/ARROW-513) - [C++] Fix Appveyor build -* [ARROW-516](https://issues.apache.org/jira/browse/ARROW-516) - Building pyarrow with parquet -* [ARROW-519](https://issues.apache.org/jira/browse/ARROW-519) - [C++] Missing vtable in libarrow.dylib on Xcode 6.4 -* [ARROW-523](https://issues.apache.org/jira/browse/ARROW-523) - Python: Account for changes in PARQUET-834 -* [ARROW-533](https://issues.apache.org/jira/browse/ARROW-533) - [C++] arrow::TimestampArray / TimeArray has a broken constructor -* [ARROW-535](https://issues.apache.org/jira/browse/ARROW-535) - [Python] Add type mapping for NPY\_LONGLONG -* [ARROW-537](https://issues.apache.org/jira/browse/ARROW-537) - [C++] StringArray/BinaryArray comparisons may be incorrect when values with non-zero length are null -* [ARROW-540](https://issues.apache.org/jira/browse/ARROW-540) - [C++] Fix build in aftermath of ARROW-33 -* [ARROW-543](https://issues.apache.org/jira/browse/ARROW-543) - C++: Lazily computed null\_counts counts number of 
non-null entries -* [ARROW-544](https://issues.apache.org/jira/browse/ARROW-544) - [C++] ArrayLoader::LoadBinary fails for length-0 arrays -* [ARROW-545](https://issues.apache.org/jira/browse/ARROW-545) - [Python] Ignore files without .parq or .parquet prefix when reading directory of files -* [ARROW-548](https://issues.apache.org/jira/browse/ARROW-548) - [Python] Add nthreads option to pyarrow.Filesystem.read\_parquet -* [ARROW-551](https://issues.apache.org/jira/browse/ARROW-551) - C++: Construction of Column with nullptr Array segfaults -* [ARROW-556](https://issues.apache.org/jira/browse/ARROW-556) - [Integration] Can not run Integration tests if different cpp build path -* [ARROW-561](https://issues.apache.org/jira/browse/ARROW-561) - Update java & python dependencies to improve downstream packaging experience -* [ARROW-562](https://issues.apache.org/jira/browse/ARROW-562) - Mockito should be in test scope - - -## New Features and Improvements - -* [ARROW-33](https://issues.apache.org/jira/browse/ARROW-33) - C++: Implement zero-copy array slicing -* [ARROW-81](https://issues.apache.org/jira/browse/ARROW-81) - [Format] Add a Category logical type (distinct from dictionary-encoding) -* [ARROW-96](https://issues.apache.org/jira/browse/ARROW-96) - C++: API documentation using Doxygen -* [ARROW-97](https://issues.apache.org/jira/browse/ARROW-97) - Python: API documentation via sphinx-apidoc -* [ARROW-108](https://issues.apache.org/jira/browse/ARROW-108) - [C++] Add IPC round trip for union types -* [ARROW-189](https://issues.apache.org/jira/browse/ARROW-189) - C++: Use ExternalProject to build thirdparty dependencies -* [ARROW-191](https://issues.apache.org/jira/browse/ARROW-191) - Python: Provide infrastructure for manylinux1 wheels -* [ARROW-221](https://issues.apache.org/jira/browse/ARROW-221) - Add switch for writing Parquet 1.0 compatible logical types -* [ARROW-227](https://issues.apache.org/jira/browse/ARROW-227) - [C++/Python] Hook arrow\_io generic reader / writer interface into arrow\_parquet -* [ARROW-228](https://issues.apache.org/jira/browse/ARROW-228) - [Python] Create an Arrow-cpp-compatible interface for reading bytes from Python file-like objects -* [ARROW-240](https://issues.apache.org/jira/browse/ARROW-240) - Installation instructions for pyarrow -* [ARROW-243](https://issues.apache.org/jira/browse/ARROW-243) - [C++] Add "driver" option to HdfsClient to choose between libhdfs and libhdfs3 at runtime -* [ARROW-268](https://issues.apache.org/jira/browse/ARROW-268) - [C++] Flesh out union implementation to have all required methods for IPC -* [ARROW-303](https://issues.apache.org/jira/browse/ARROW-303) - [C++] Also build static libraries for leaf libraries -* [ARROW-312](https://issues.apache.org/jira/browse/ARROW-312) - [Python] Provide Python API to read/write the Arrow IPC file format -* [ARROW-312](https://issues.apache.org/jira/browse/ARROW-312) - [Python] Provide Python API to read/write the Arrow IPC file format -* [ARROW-317](https://issues.apache.org/jira/browse/ARROW-317) - [C++] Implement zero-copy Slice method on arrow::Buffer that retains reference to parent -* [ARROW-327](https://issues.apache.org/jira/browse/ARROW-327) - [Python] Remove conda builds from Travis CI processes -* [ARROW-328](https://issues.apache.org/jira/browse/ARROW-328) - [C++] Return shared\_ptr by value instead of const-ref? 
-* [ARROW-330](https://issues.apache.org/jira/browse/ARROW-330) - [C++] CMake functions to simplify shared / static library configuration -* [ARROW-332](https://issues.apache.org/jira/browse/ARROW-332) - [Python] Add helper function to convert RecordBatch to pandas.DataFrame -* [ARROW-333](https://issues.apache.org/jira/browse/ARROW-333) - Make writers update their internal schema even when no data is written. -* [ARROW-335](https://issues.apache.org/jira/browse/ARROW-335) - Improve Type apis and toString() by encapsulating flatbuffers better -* [ARROW-336](https://issues.apache.org/jira/browse/ARROW-336) - Run Apache Rat in Travis builds -* [ARROW-338](https://issues.apache.org/jira/browse/ARROW-338) - [C++] Refactor IPC vector "loading" and "unloading" to be based on cleaner visitor pattern -* [ARROW-344](https://issues.apache.org/jira/browse/ARROW-344) - Instructions for building with conda -* [ARROW-350](https://issues.apache.org/jira/browse/ARROW-350) - Add Kerberos support to HDFS shim -* [ARROW-353](https://issues.apache.org/jira/browse/ARROW-353) - Arrow release 0.2 -* [ARROW-355](https://issues.apache.org/jira/browse/ARROW-355) - Add tests for serialising arrays of empty strings to Parquet -* [ARROW-356](https://issues.apache.org/jira/browse/ARROW-356) - Add documentation about reading Parquet -* [ARROW-359](https://issues.apache.org/jira/browse/ARROW-359) - Need to document ARROW\_LIBHDFS\_DIR -* [ARROW-360](https://issues.apache.org/jira/browse/ARROW-360) - C++: Add method to shrink PoolBuffer using realloc -* [ARROW-361](https://issues.apache.org/jira/browse/ARROW-361) - Python: Support reading a column-selection from Parquet files -* [ARROW-363](https://issues.apache.org/jira/browse/ARROW-363) - Set up Java/C++ integration test harness -* [ARROW-365](https://issues.apache.org/jira/browse/ARROW-365) - Python: Provide Array.to\_pandas() -* [ARROW-366](https://issues.apache.org/jira/browse/ARROW-366) - [java] implement Dictionary vector -* [ARROW-367](https://issues.apache.org/jira/browse/ARROW-367) - [java] converter csv/json <=\> Arrow file format for Integration tests -* [ARROW-368](https://issues.apache.org/jira/browse/ARROW-368) - Document use of LD\_LIBRARY\_PATH when using Python -* [ARROW-369](https://issues.apache.org/jira/browse/ARROW-369) - [Python] Add ability to convert multiple record batches at once to pandas -* [ARROW-370](https://issues.apache.org/jira/browse/ARROW-370) - Python: Pandas conversion from \`datetime.date\` columns -* [ARROW-372](https://issues.apache.org/jira/browse/ARROW-372) - Create JSON arrow file format for integration tests -* [ARROW-373](https://issues.apache.org/jira/browse/ARROW-373) - [C++] Implement C++ version of JSON file format for testing -* [ARROW-374](https://issues.apache.org/jira/browse/ARROW-374) - Python: clarify unicode vs. binary in API -* [ARROW-377](https://issues.apache.org/jira/browse/ARROW-377) - Python: Add support for conversion of Pandas.Categorical -* [ARROW-379](https://issues.apache.org/jira/browse/ARROW-379) - Python: Use setuptools\_scm/setuptools\_scm\_git\_archive to provide the version number -* [ARROW-380](https://issues.apache.org/jira/browse/ARROW-380) - [Java] optimize null count when serializing vectors. 
-* [ARROW-381](https://issues.apache.org/jira/browse/ARROW-381) - [C++] Simplify primitive array type builders to use a default type singleton -* [ARROW-382](https://issues.apache.org/jira/browse/ARROW-382) - Python: Extend API documentation -* [ARROW-383](https://issues.apache.org/jira/browse/ARROW-383) - [C++] Implement C++ version of ARROW-367 integration test validator -* [ARROW-389](https://issues.apache.org/jira/browse/ARROW-389) - Python: Write Parquet files to pyarrow.io.NativeFile objects -* [ARROW-394](https://issues.apache.org/jira/browse/ARROW-394) - Add integration tests for boolean, list, struct, and other basic types -* [ARROW-396](https://issues.apache.org/jira/browse/ARROW-396) - Python: Add pyarrow.schema.Schema.equals -* [ARROW-409](https://issues.apache.org/jira/browse/ARROW-409) - Python: Change pyarrow.Table.dataframe\_from\_batches API to create Table instead -* [ARROW-410](https://issues.apache.org/jira/browse/ARROW-410) - [C++] Add Flush method to arrow::io::OutputStream -* [ARROW-411](https://issues.apache.org/jira/browse/ARROW-411) - [Java] Move Integration.compare and Integration.compareSchemas to a public utils class -* [ARROW-415](https://issues.apache.org/jira/browse/ARROW-415) - C++: Add Equals implementation to compare Tables -* [ARROW-416](https://issues.apache.org/jira/browse/ARROW-416) - C++: Add Equals implementation to compare Columns -* [ARROW-417](https://issues.apache.org/jira/browse/ARROW-417) - C++: Add Equals implementation to compare ChunkedArrays -* [ARROW-418](https://issues.apache.org/jira/browse/ARROW-418) - [C++] Consolidate array container and builder code, remove arrow/types -* [ARROW-419](https://issues.apache.org/jira/browse/ARROW-419) - [C++] Promote util/{status.h, buffer.h, memory-pool.h} to top level of arrow/ source directory -* [ARROW-423](https://issues.apache.org/jira/browse/ARROW-423) - C++: Define BUILD\_BYPRODUCTS in external project to support non-make CMake generators -* [ARROW-425](https://issues.apache.org/jira/browse/ARROW-425) - Python: Expose a C function to convert arrow::Table to pyarrow.Table -* [ARROW-426](https://issues.apache.org/jira/browse/ARROW-426) - Python: Conversion from pyarrow.Array to a Python list -* [ARROW-427](https://issues.apache.org/jira/browse/ARROW-427) - [C++] Implement dictionary-encoded array container -* [ARROW-428](https://issues.apache.org/jira/browse/ARROW-428) - [Python] Deserialize from Arrow record batches to pandas in parallel using a thread pool -* [ARROW-430](https://issues.apache.org/jira/browse/ARROW-430) - Python: Better version handling -* [ARROW-432](https://issues.apache.org/jira/browse/ARROW-432) - [Python] Avoid unnecessary memory copy in to\_pandas conversion by using low-level pandas internals APIs -* [ARROW-438](https://issues.apache.org/jira/browse/ARROW-438) - [Python] Concatenate Table instances with equal schemas -* [ARROW-440](https://issues.apache.org/jira/browse/ARROW-440) - [C++] Support pkg-config -* [ARROW-441](https://issues.apache.org/jira/browse/ARROW-441) - [Python] Expose Arrow's file and memory map classes as NativeFile subclasses -* [ARROW-442](https://issues.apache.org/jira/browse/ARROW-442) - [Python] Add public Python API to inspect Parquet file metadata -* [ARROW-444](https://issues.apache.org/jira/browse/ARROW-444) - [Python] Avoid unnecessary memory copies from use of PyBytes\_\* C APIs -* [ARROW-449](https://issues.apache.org/jira/browse/ARROW-449) - Python: Conversion from pyarrow.{Table,RecordBatch} to a Python dict -* 
[ARROW-450](https://issues.apache.org/jira/browse/ARROW-450) - Python: Fixes for PARQUET-818 -* [ARROW-456](https://issues.apache.org/jira/browse/ARROW-456) - C++: Add jemalloc based MemoryPool -* [ARROW-457](https://issues.apache.org/jira/browse/ARROW-457) - Python: Better control over memory pool -* [ARROW-458](https://issues.apache.org/jira/browse/ARROW-458) - [Python] Expose jemalloc MemoryPool -* [ARROW-461](https://issues.apache.org/jira/browse/ARROW-461) - [Python] Implement conversion between arrow::DictionaryArray and pandas.Categorical -* [ARROW-463](https://issues.apache.org/jira/browse/ARROW-463) - C++: Support jemalloc 4.x -* [ARROW-466](https://issues.apache.org/jira/browse/ARROW-466) - C++: ExternalProject for jemalloc -* [ARROW-467](https://issues.apache.org/jira/browse/ARROW-467) - [Python] Run parquet-cpp unit tests in Travis CI -* [ARROW-468](https://issues.apache.org/jira/browse/ARROW-468) - Python: Conversion of nested data in pd.DataFrames to/from Arrow structures -* [ARROW-470](https://issues.apache.org/jira/browse/ARROW-470) - [Python] Add "FileSystem" abstraction to access directories of files in a uniform way -* [ARROW-471](https://issues.apache.org/jira/browse/ARROW-471) - [Python] Enable ParquetFile to pass down separately-obtained file metadata -* [ARROW-472](https://issues.apache.org/jira/browse/ARROW-472) - [Python] Expose parquet::{SchemaDescriptor, ColumnDescriptor}::Equals -* [ARROW-474](https://issues.apache.org/jira/browse/ARROW-474) - Create an Arrow streaming file format -* [ARROW-475](https://issues.apache.org/jira/browse/ARROW-475) - [Python] High level support for reading directories of Parquet files (as a single Arrow table) from supported file system interfaces -* [ARROW-476](https://issues.apache.org/jira/browse/ARROW-476) - [Integration] Add integration tests for Binary / Varbytes type -* [ARROW-477](https://issues.apache.org/jira/browse/ARROW-477) - [Java] Add support for second/microsecond/nanosecond timestamps in-memory and in IPC/JSON layer -* [ARROW-478](https://issues.apache.org/jira/browse/ARROW-478) - [Python] Accept a PyBytes object in the pyarrow.io.BufferReader ctor -* [ARROW-479](https://issues.apache.org/jira/browse/ARROW-479) - Python: Test for expected schema in Pandas conversion -* [ARROW-484](https://issues.apache.org/jira/browse/ARROW-484) - Add more detail about what technology can be found in the Arrow implementations to README -* [ARROW-485](https://issues.apache.org/jira/browse/ARROW-485) - [Java] Users are required to initialize VariableLengthVectors.offsetVector before calling VariableLengthVectors.mutator.getSafe -* [ARROW-490](https://issues.apache.org/jira/browse/ARROW-490) - Python: Update manylinux1 build scripts -* [ARROW-495](https://issues.apache.org/jira/browse/ARROW-495) - [C++] Add C++ implementation of streaming serialized format -* [ARROW-497](https://issues.apache.org/jira/browse/ARROW-497) - [Java] Integration test harness for streaming format -* [ARROW-498](https://issues.apache.org/jira/browse/ARROW-498) - [C++] Integration test harness for streaming format -* [ARROW-503](https://issues.apache.org/jira/browse/ARROW-503) - [Python] Interface to streaming binary format -* [ARROW-506](https://issues.apache.org/jira/browse/ARROW-506) - Implement Arrow Echo server for integration testing -* [ARROW-508](https://issues.apache.org/jira/browse/ARROW-508) - [C++] Make file/memory-mapped file interfaces threadsafe -* [ARROW-509](https://issues.apache.org/jira/browse/ARROW-509) - [Python] Add support for 
PARQUET-835 (parallel column reads) -* [ARROW-512](https://issues.apache.org/jira/browse/ARROW-512) - C++: Add method to check for primitive types -* [ARROW-514](https://issues.apache.org/jira/browse/ARROW-514) - [Python] Accept pyarrow.io.Buffer as input to StreamReader, FileReader classes -* [ARROW-515](https://issues.apache.org/jira/browse/ARROW-515) - [Python] Add StreamReader/FileReader methods that read all record batches as a Table -* [ARROW-521](https://issues.apache.org/jira/browse/ARROW-521) - [C++/Python] Track peak memory use in default MemoryPool -* [ARROW-524](https://issues.apache.org/jira/browse/ARROW-524) - [java] provide apis to access nested vectors and buffers -* [ARROW-525](https://issues.apache.org/jira/browse/ARROW-525) - Python: Add more documentation to the package -* [ARROW-527](https://issues.apache.org/jira/browse/ARROW-527) - clean drill-module.conf file -* [ARROW-529](https://issues.apache.org/jira/browse/ARROW-529) - Python: Add jemalloc and Python 3.6 to manylinux1 build -* [ARROW-531](https://issues.apache.org/jira/browse/ARROW-531) - Python: Document jemalloc, extend Pandas section, add Getting Involved -* [ARROW-538](https://issues.apache.org/jira/browse/ARROW-538) - [C++] Set up AddressSanitizer (ASAN) builds -* [ARROW-546](https://issues.apache.org/jira/browse/ARROW-546) - Python: Account for changes in PARQUET-867 -* [ARROW-547](https://issues.apache.org/jira/browse/ARROW-547) - [Python] Expose Array::Slice and RecordBatch::Slice -* [ARROW-553](https://issues.apache.org/jira/browse/ARROW-553) - C++: Faster valid bitmap building -* [ARROW-558](https://issues.apache.org/jira/browse/ARROW-558) - Add KEYS files - - - -# Apache Arrow 0.1.0 (2016-10-10) - -## New Features and Improvements - -* [ARROW-1](https://issues.apache.org/jira/browse/ARROW-1) - Import Initial Codebase -* [ARROW-2](https://issues.apache.org/jira/browse/ARROW-2) - Post Simple Website -* [ARROW-3](https://issues.apache.org/jira/browse/ARROW-3) - Post Initial Arrow Format Spec -* [ARROW-4](https://issues.apache.org/jira/browse/ARROW-4) - Initial Arrow CPP Implementation -* [ARROW-7](https://issues.apache.org/jira/browse/ARROW-7) - Add Python library build toolchain -* [ARROW-8](https://issues.apache.org/jira/browse/ARROW-8) - Set up Travis CI -* [ARROW-9](https://issues.apache.org/jira/browse/ARROW-9) - Rename some unchanged "Drill" to "Arrow" -* [ARROW-9](https://issues.apache.org/jira/browse/ARROW-9) - Rename some unchanged "Drill" to "Arrow" -* [ARROW-10](https://issues.apache.org/jira/browse/ARROW-10) - Fix mismatch of javadoc names and method parameters -* [ARROW-11](https://issues.apache.org/jira/browse/ARROW-11) - Mirror JIRA activity to dev@arrow.apache.org -* [ARROW-13](https://issues.apache.org/jira/browse/ARROW-13) - Add PR merge tool similar to that used in Parquet -* [ARROW-14](https://issues.apache.org/jira/browse/ARROW-14) - Add JIRA components -* [ARROW-15](https://issues.apache.org/jira/browse/ARROW-15) - Fix a naming typo for memory.AllocationManager.AllocationOutcome -* [ARROW-19](https://issues.apache.org/jira/browse/ARROW-19) - C++: Externalize memory allocations and add a MemoryPool abstract interface to builder classes -* [ARROW-20](https://issues.apache.org/jira/browse/ARROW-20) - C++: Add null count member to Array containers, remove nullable member -* [ARROW-21](https://issues.apache.org/jira/browse/ARROW-21) - C++: Add in-memory schema metadata container -* [ARROW-22](https://issues.apache.org/jira/browse/ARROW-22) - C++: Add schema adapter routines for 
converting flat Parquet schemas to in-memory Arrow schemas -* [ARROW-23](https://issues.apache.org/jira/browse/ARROW-23) - C++: Add logical "Column" container for chunked data -* [ARROW-24](https://issues.apache.org/jira/browse/ARROW-24) - C++: Add logical "Table" container -* [ARROW-26](https://issues.apache.org/jira/browse/ARROW-26) - C++: Add developer instructions for building parquet-cpp integration -* [ARROW-28](https://issues.apache.org/jira/browse/ARROW-28) - C++: Add google/benchmark to the 3rd-party build toolchain -* [ARROW-30](https://issues.apache.org/jira/browse/ARROW-30) - Python: pandas/NumPy to/from Arrow conversion routines -* [ARROW-31](https://issues.apache.org/jira/browse/ARROW-31) - Python: basic PyList <-\> Arrow marshaling code -* [ARROW-35](https://issues.apache.org/jira/browse/ARROW-35) - Add a short call-to-action / how-to-get-involved to the main README.md -* [ARROW-37](https://issues.apache.org/jira/browse/ARROW-37) - C++: Represent boolean array data in bit-packed form -* [ARROW-42](https://issues.apache.org/jira/browse/ARROW-42) - Python: Add to Travis CI build -* [ARROW-43](https://issues.apache.org/jira/browse/ARROW-43) - Python: Add rudimentary console \_\_repr\_\_ for array types -* [ARROW-44](https://issues.apache.org/jira/browse/ARROW-44) - Python: Implement basic object model for scalar values (i.e. results of arrow\_arr[i]) -* [ARROW-48](https://issues.apache.org/jira/browse/ARROW-48) - Python: Add Schema object wrapper -* [ARROW-49](https://issues.apache.org/jira/browse/ARROW-49) - Python: Add Column and Table wrapper interface -* [ARROW-50](https://issues.apache.org/jira/browse/ARROW-50) - C++: Enable library builds for 3rd-party users without having to build thirdparty googletest -* [ARROW-53](https://issues.apache.org/jira/browse/ARROW-53) - Python: Fix RPATH and add source installation instructions -* [ARROW-54](https://issues.apache.org/jira/browse/ARROW-54) - Python: rename package to "pyarrow" -* [ARROW-56](https://issues.apache.org/jira/browse/ARROW-56) - Format: Specify LSB bit ordering in bit arrays -* [ARROW-57](https://issues.apache.org/jira/browse/ARROW-57) - Format: Draft data headers IDL for data interchange -* [ARROW-58](https://issues.apache.org/jira/browse/ARROW-58) - Format: Draft type metadata ("schemas") IDL -* [ARROW-59](https://issues.apache.org/jira/browse/ARROW-59) - Python: Boolean data support for builtin data structures -* [ARROW-60](https://issues.apache.org/jira/browse/ARROW-60) - C++: Struct type builder API -* [ARROW-64](https://issues.apache.org/jira/browse/ARROW-64) - Add zsh support to C++ build scripts -* [ARROW-66](https://issues.apache.org/jira/browse/ARROW-66) - Maybe some missing steps in installation guide -* [ARROW-67](https://issues.apache.org/jira/browse/ARROW-67) - C++: Draft type metadata conversion to/from IPC representation -* [ARROW-68](https://issues.apache.org/jira/browse/ARROW-68) - Update setup\_build\_env and third-party script to be more user-friendly -* [ARROW-70](https://issues.apache.org/jira/browse/ARROW-70) - C++: Add "lite" DCHECK macros used in parquet-cpp -* [ARROW-71](https://issues.apache.org/jira/browse/ARROW-71) - C++: Add script to run clang-tidy on codebase -* [ARROW-73](https://issues.apache.org/jira/browse/ARROW-73) - Support CMake 2.8 -* [ARROW-76](https://issues.apache.org/jira/browse/ARROW-76) - Revise format document to include null count, defer non-nullable arrays to the domain of metadata -* [ARROW-78](https://issues.apache.org/jira/browse/ARROW-78) - C++: Add constructor for 
DecimalType -* [ARROW-79](https://issues.apache.org/jira/browse/ARROW-79) - Python: Add benchmarks -* [ARROW-82](https://issues.apache.org/jira/browse/ARROW-82) - C++: Implement IPC exchange for List types -* [ARROW-85](https://issues.apache.org/jira/browse/ARROW-85) - C++: memcmp can be avoided in Equal when comparing with the same Buffer -* [ARROW-86](https://issues.apache.org/jira/browse/ARROW-86) - Python: Implement zero-copy Arrow-to-Pandas conversion -* [ARROW-87](https://issues.apache.org/jira/browse/ARROW-87) - Implement Decimal schema conversion for all ways supported in Parquet -* [ARROW-89](https://issues.apache.org/jira/browse/ARROW-89) - Python: Add benchmarks for Arrow<-\>Pandas conversion -* [ARROW-90](https://issues.apache.org/jira/browse/ARROW-90) - Apache Arrow cpp code does not support power architecture -* [ARROW-91](https://issues.apache.org/jira/browse/ARROW-91) - C++: First draft of an adapter class for parquet-cpp's ParquetFileReader that produces Arrow table/row batch objects -* [ARROW-92](https://issues.apache.org/jira/browse/ARROW-92) - C++: Arrow to Parquet Schema conversion -* [ARROW-100](https://issues.apache.org/jira/browse/ARROW-100) - [C++] Computing RowBatch size -* [ARROW-101](https://issues.apache.org/jira/browse/ARROW-101) - Fix java warnings emitted by java compiler -* [ARROW-102](https://issues.apache.org/jira/browse/ARROW-102) - travis-ci support for java project -* [ARROW-106](https://issues.apache.org/jira/browse/ARROW-106) - Add IPC round trip for string types (string, char, varchar, binary) -* [ARROW-107](https://issues.apache.org/jira/browse/ARROW-107) - [C++] add ipc round trip for struct types -* [ARROW-190](https://issues.apache.org/jira/browse/ARROW-190) - Python: Provide installable sdist builds -* [ARROW-196](https://issues.apache.org/jira/browse/ARROW-196) - [C++] Add conda dev recipe for libarrow and libarrow\_parquet -* [ARROW-197](https://issues.apache.org/jira/browse/ARROW-197) - [Python] Add conda dev recipe for pyarrow -* [ARROW-199](https://issues.apache.org/jira/browse/ARROW-199) - [C++] Refine third party dependency -* [ARROW-201](https://issues.apache.org/jira/browse/ARROW-201) - C++: Initial ParquetWriter implementation -* [ARROW-203](https://issues.apache.org/jira/browse/ARROW-203) - Python: Basic filename based Parquet read/write -* [ARROW-204](https://issues.apache.org/jira/browse/ARROW-204) - [Python] Automate uploading conda build artifacts for libarrow and pyarrow -* [ARROW-206](https://issues.apache.org/jira/browse/ARROW-206) - [C++] Expose an equality API for arrays that compares a range of slots on two arrays -* [ARROW-207](https://issues.apache.org/jira/browse/ARROW-207) - Extend BufferAllocator interface to allow decorators around BufferAllocator -* [ARROW-212](https://issues.apache.org/jira/browse/ARROW-212) - [C++] Clarify the fact that PrimitiveArray is now abstract class -* [ARROW-213](https://issues.apache.org/jira/browse/ARROW-213) - Exposing static arrow build -* [ARROW-214](https://issues.apache.org/jira/browse/ARROW-214) - C++: Add String support to Parquet I/O -* [ARROW-215](https://issues.apache.org/jira/browse/ARROW-215) - C++: Support other integer types in Parquet I/O -* [ARROW-218](https://issues.apache.org/jira/browse/ARROW-218) - Add option to use GitHub API token via environment variable when merging PRs -* [ARROW-222](https://issues.apache.org/jira/browse/ARROW-222) - [C++] Create prototype file-like interface to HDFS (via libhdfs) and begin defining more general IO interface for Arrow data 
adapters -* [ARROW-233](https://issues.apache.org/jira/browse/ARROW-233) - [C++] Add visibility defines for limiting shared library symbol visibility -* [ARROW-234](https://issues.apache.org/jira/browse/ARROW-234) - [C++] Build with libhdfs support in arrow\_io in conda builds -* [ARROW-236](https://issues.apache.org/jira/browse/ARROW-236) - [Python] Enable Parquet read/write to work with HDFS file objects -* [ARROW-237](https://issues.apache.org/jira/browse/ARROW-237) - [C++] Create Arrow specializations of Parquet allocator and read interfaces -* [ARROW-238](https://issues.apache.org/jira/browse/ARROW-238) - C++: InternalMemoryPool::Free() should throw an error when there is insufficient allocated memory -* [ARROW-242](https://issues.apache.org/jira/browse/ARROW-242) - C++/Python: Support Timestamp Data Type -* [ARROW-245](https://issues.apache.org/jira/browse/ARROW-245) - [Format] Clarify Arrow's relationship with big endian platforms -* [ARROW-251](https://issues.apache.org/jira/browse/ARROW-251) - [C++] Expose APIs for getting code and message of the status -* [ARROW-252](https://issues.apache.org/jira/browse/ARROW-252) - Add implementation guidelines to the documentation -* [ARROW-253](https://issues.apache.org/jira/browse/ARROW-253) - Int types should only have width of 8\*2^n (8, 16, 32, 64) -* [ARROW-254](https://issues.apache.org/jira/browse/ARROW-254) - Remove Bit type as it is redundant with boolean -* [ARROW-255](https://issues.apache.org/jira/browse/ARROW-255) - Finalize Dictionary representation -* [ARROW-256](https://issues.apache.org/jira/browse/ARROW-256) - Add versioning to the arrow spec. -* [ARROW-257](https://issues.apache.org/jira/browse/ARROW-257) - Add a typeids Vector to Union type -* [ARROW-262](https://issues.apache.org/jira/browse/ARROW-262) - [Format] Add a new format document for metadata and logical types for messaging and IPC / on-wire/file representations -* [ARROW-264](https://issues.apache.org/jira/browse/ARROW-264) - Create an Arrow File format -* [ARROW-267](https://issues.apache.org/jira/browse/ARROW-267) - [C++] C++ implementation of file-like layout for RPC / IPC -* [ARROW-270](https://issues.apache.org/jira/browse/ARROW-270) - [Format] Define more generic Interval logical type -* [ARROW-271](https://issues.apache.org/jira/browse/ARROW-271) - Update Field structure to be more explicit -* [ARROW-272](https://issues.apache.org/jira/browse/ARROW-272) - Arrow release 0.1 -* [ARROW-279](https://issues.apache.org/jira/browse/ARROW-279) - rename vector module to arrow-vector for consistency -* [ARROW-280](https://issues.apache.org/jira/browse/ARROW-280) - [C++] Consolidate file and shared memory IO interfaces -* [ARROW-282](https://issues.apache.org/jira/browse/ARROW-282) - Make parquet-cpp an optional dependency of pyarrow -* [ARROW-285](https://issues.apache.org/jira/browse/ARROW-285) - Allow for custom flatc compiler -* [ARROW-286](https://issues.apache.org/jira/browse/ARROW-286) - Build thirdparty dependencies in parallel -* [ARROW-289](https://issues.apache.org/jira/browse/ARROW-289) - Install test-util.h -* [ARROW-290](https://issues.apache.org/jira/browse/ARROW-290) - Specialize alloc() in ArrowBuf -* [ARROW-291](https://issues.apache.org/jira/browse/ARROW-291) - [Python] Update NOTICE file for Python codebase -* [ARROW-292](https://issues.apache.org/jira/browse/ARROW-292) - [Java] Upgrade Netty to 4.041 -* [ARROW-293](https://issues.apache.org/jira/browse/ARROW-293) - [C++] Implementations of IO interfaces for operating system files -* 
[ARROW-296](https://issues.apache.org/jira/browse/ARROW-296) - [C++] Remove arrow\_parquet C++ module and related parts of build system -* [ARROW-298](https://issues.apache.org/jira/browse/ARROW-298) - create release scripts -* [ARROW-299](https://issues.apache.org/jira/browse/ARROW-299) - Use absolute namespace in macros -* [ARROW-301](https://issues.apache.org/jira/browse/ARROW-301) - [Format] Add some form of user field metadata to IPC schemas -* [ARROW-302](https://issues.apache.org/jira/browse/ARROW-302) - [Python] Add support to use the Arrow file format with file-like objects -* [ARROW-305](https://issues.apache.org/jira/browse/ARROW-305) - Add compression and use\_dictionary options to Parquet interface -* [ARROW-306](https://issues.apache.org/jira/browse/ARROW-306) - Add option to pass cmake arguments via environment variable -* [ARROW-315](https://issues.apache.org/jira/browse/ARROW-315) - Finalize timestamp type -* [ARROW-318](https://issues.apache.org/jira/browse/ARROW-318) - [Python] Revise README to reflect current state of project -* [ARROW-319](https://issues.apache.org/jira/browse/ARROW-319) - Add canonical Arrow Schema json representation -* [ARROW-324](https://issues.apache.org/jira/browse/ARROW-324) - Update arrow metadata diagram -* [ARROW-325](https://issues.apache.org/jira/browse/ARROW-325) - make TestArrowFile not dependent on timezone - - -## Bug Fixes - -* [ARROW-5](https://issues.apache.org/jira/browse/ARROW-5) - Error when run maven install -* [ARROW-5](https://issues.apache.org/jira/browse/ARROW-5) - Error when run maven install -* [ARROW-16](https://issues.apache.org/jira/browse/ARROW-16) - Building cpp issues on XCode 7.2.1 -* [ARROW-17](https://issues.apache.org/jira/browse/ARROW-17) - Set some vector fields to default access level for Drill compatibility -* [ARROW-18](https://issues.apache.org/jira/browse/ARROW-18) - Fix bug with decimal precision and scale -* [ARROW-36](https://issues.apache.org/jira/browse/ARROW-36) - Remove fixVersions from patch tool (until we have them) -* [ARROW-46](https://issues.apache.org/jira/browse/ARROW-46) - Port DRILL-4410 to Arrow -* [ARROW-51](https://issues.apache.org/jira/browse/ARROW-51) - Move ValueVector test from Drill project -* [ARROW-55](https://issues.apache.org/jira/browse/ARROW-55) - Python: fix legacy Python (2.7) tests and add to Travis CI -* [ARROW-62](https://issues.apache.org/jira/browse/ARROW-62) - Format: Are the nulls bits 0 or 1 for null values? 
-* [ARROW-63](https://issues.apache.org/jira/browse/ARROW-63) - C++: ctest fails if Python 3 is the active Python interpreter -* [ARROW-65](https://issues.apache.org/jira/browse/ARROW-65) - Python: FindPythonLibsNew does not work in a virtualenv -* [ARROW-69](https://issues.apache.org/jira/browse/ARROW-69) - Change permissions for assignable users -* [ARROW-72](https://issues.apache.org/jira/browse/ARROW-72) - FindParquet searches for non-existent header -* [ARROW-75](https://issues.apache.org/jira/browse/ARROW-75) - C++: Fix handling of empty strings -* [ARROW-77](https://issues.apache.org/jira/browse/ARROW-77) - C++: conform null bit interpretation to match ARROW-62 -* [ARROW-80](https://issues.apache.org/jira/browse/ARROW-80) - Segmentation fault on len(Array) for empty arrays -* [ARROW-83](https://issues.apache.org/jira/browse/ARROW-83) - Add basic test infrastructure for DecimalType -* [ARROW-84](https://issues.apache.org/jira/browse/ARROW-84) - C++: separate test codes -* [ARROW-88](https://issues.apache.org/jira/browse/ARROW-88) - C++: Refactor given PARQUET-572 -* [ARROW-93](https://issues.apache.org/jira/browse/ARROW-93) - XCode 7.3 breaks builds -* [ARROW-94](https://issues.apache.org/jira/browse/ARROW-94) - Expand list example to clarify null vs empty list -* [ARROW-103](https://issues.apache.org/jira/browse/ARROW-103) - Missing patterns from .gitignore -* [ARROW-104](https://issues.apache.org/jira/browse/ARROW-104) - Update Layout.md based on discussion on the mailing list -* [ARROW-105](https://issues.apache.org/jira/browse/ARROW-105) - Unit tests fail if assertions are disabled -* [ARROW-113](https://issues.apache.org/jira/browse/ARROW-113) - TestValueVector test fails if cannot allocate 2GB of memory -* [ARROW-185](https://issues.apache.org/jira/browse/ARROW-185) - [C++] Make sure alignment and memory padding conform to spec -* [ARROW-188](https://issues.apache.org/jira/browse/ARROW-188) - Python: Add numpy as install requirement -* [ARROW-193](https://issues.apache.org/jira/browse/ARROW-193) - For the instruction, typos "int his" should be "in this" -* [ARROW-194](https://issues.apache.org/jira/browse/ARROW-194) - C++: Allow read-only memory mapped source -* [ARROW-200](https://issues.apache.org/jira/browse/ARROW-200) - [Python] Convert Values String looks like it has incorrect error handling -* [ARROW-205](https://issues.apache.org/jira/browse/ARROW-205) - builds failing on master branch with apt-get error -* [ARROW-209](https://issues.apache.org/jira/browse/ARROW-209) - [C++] Broken builds: llvm.org apt repos are unavailable -* [ARROW-210](https://issues.apache.org/jira/browse/ARROW-210) - [C++] Tidy up the type system a little bit -* [ARROW-211](https://issues.apache.org/jira/browse/ARROW-211) - Several typos/errors in Layout.md examples -* [ARROW-217](https://issues.apache.org/jira/browse/ARROW-217) - Fix Travis w.r.t conda 4.1.0 changes -* [ARROW-219](https://issues.apache.org/jira/browse/ARROW-219) - [C++] Passed CMAKE\_CXX\_FLAGS are being dropped, fix compiler warnings -* [ARROW-223](https://issues.apache.org/jira/browse/ARROW-223) - Do not link against libpython -* [ARROW-225](https://issues.apache.org/jira/browse/ARROW-225) - [C++/Python] master Travis CI build is broken -* [ARROW-244](https://issues.apache.org/jira/browse/ARROW-244) - [C++] Some global APIs of IPC module should be visible to the outside -* [ARROW-246](https://issues.apache.org/jira/browse/ARROW-246) - [Java] UnionVector doesn't call allocateNew() when creating it's vectorType -* 
[ARROW-247](https://issues.apache.org/jira/browse/ARROW-247) - [C++] Missing explicit destructor in RowBatchReader causes an incomplete type error -* [ARROW-250](https://issues.apache.org/jira/browse/ARROW-250) - Fix for ARROW-246 may cause memory leaks -* [ARROW-259](https://issues.apache.org/jira/browse/ARROW-259) - Use flatbuffer fields in java implementation -* [ARROW-260](https://issues.apache.org/jira/browse/ARROW-260) - TestValueVector.testFixedVectorReallocation and testVariableVectorReallocation are flaky -* [ARROW-265](https://issues.apache.org/jira/browse/ARROW-265) - Negative decimal values have wrong padding -* [ARROW-265](https://issues.apache.org/jira/browse/ARROW-265) - Negative decimal values have wrong padding -* [ARROW-266](https://issues.apache.org/jira/browse/ARROW-266) - [C++] Fix the broken build -* [ARROW-274](https://issues.apache.org/jira/browse/ARROW-274) - Make the MapVector nullable -* [ARROW-277](https://issues.apache.org/jira/browse/ARROW-277) - Flatbuf serialization fails for Timestamp type -* [ARROW-278](https://issues.apache.org/jira/browse/ARROW-278) - [Format] Struct type name consistency in implementations and metadata -* [ARROW-283](https://issues.apache.org/jira/browse/ARROW-283) - [C++] Update arrow\_parquet to account for API changes in PARQUET-573 -* [ARROW-284](https://issues.apache.org/jira/browse/ARROW-284) - [C++] Triage builds by disabling Arrow-Parquet module -* [ARROW-287](https://issues.apache.org/jira/browse/ARROW-287) - [java] Make nullable vectors use a BitVecor instead of UInt1Vector for bits -* [ARROW-297](https://issues.apache.org/jira/browse/ARROW-297) - Fix Arrow pom for release -* [ARROW-304](https://issues.apache.org/jira/browse/ARROW-304) - NullableMapReaderImpl.isSet() always returns true -* [ARROW-308](https://issues.apache.org/jira/browse/ARROW-308) - UnionListWriter.setPosition() should not call startList() -* [ARROW-309](https://issues.apache.org/jira/browse/ARROW-309) - Types.getMinorTypeForArrowType() does not work for Union type -* [ARROW-313](https://issues.apache.org/jira/browse/ARROW-313) - XCode 8.0 breaks builds -* [ARROW-314](https://issues.apache.org/jira/browse/ARROW-314) - JSONScalar is unnecessary and unused. -* [ARROW-320](https://issues.apache.org/jira/browse/ARROW-320) - ComplexCopier.copy(FieldReader, FieldWriter) should not start a list if reader is not set -* [ARROW-321](https://issues.apache.org/jira/browse/ARROW-321) - Fix Arrow licences -* [ARROW-855](https://issues.apache.org/jira/browse/ARROW-855) - Arrow Memory Leak - - + + +Changelogs are maintained separately for each subproject. Please check out the +changelog file within each subproject folder for more details: + +* [Datafusion CHANGELOG](./datafusion/CHANGELOG.md) +* [Datafusion Python Binding CHANGELOG](./python/CHANGELOG.md) +* [Ballista CHANGELOG](./ballista/CHANGELOG.md) + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md). 
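[Editor's note] The README hunk in the patch just below bumps the library quick-start from `datafusion = "4.0.0-SNAPSHOT"` to the released `datafusion = "5.0.0"`. For context, here is a minimal sketch of how that dependency was typically consumed at the time. This sketch is illustrative only and is not part of the patch: it assumes a DataFusion 5.0-era API (`ExecutionContext`, `register_csv`, `CsvReadOptions`, `DataFrame::collect`), a `tokio = "1.0"` dependency with the `macros` and `rt-multi-thread` features, and a placeholder `tests/example.csv` file with columns `a` and `b`. Whether calls such as `register_csv` and `sql` are synchronous or async shifted across releases around this time, so the `?` vs `.await?` details may need adjusting for a specific version.

```rust
// Illustrative sketch only (not from the patch): query a CSV file with a
// DataFusion 5.0-era API. Assumes tests/example.csv exists with columns
// `a` and `b`, and that Cargo.toml lists datafusion = "5.0.0" and
// tokio = "1.0" with the "macros" and "rt-multi-thread" features.
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    // Register a CSV file as a table named "example".
    let mut ctx = ExecutionContext::new();
    ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;

    // Build a logical plan from SQL against the registered table.
    let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;

    // Execute the plan; `collect` gathers all output partitions into RecordBatches.
    let results = df.collect().await?;
    for batch in &results {
        println!("read a batch with {} rows", batch.num_rows());
    }
    Ok(())
}
```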
diff --git a/README.md b/README.md index 4157130ec38b0..0b1b679b21690 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ To get started, add the following to your `Cargo.toml` file: ```toml [dependencies] -datafusion = "4.0.0-SNAPSHOT" +datafusion = "5.0.0" ``` ## Using DataFusion as a binary diff --git a/ballista-examples/Cargo.toml b/ballista-examples/Cargo.toml index dbcfad44f62fe..c272225d5e7e8 100644 --- a/ballista-examples/Cargo.toml +++ b/ballista-examples/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "ballista-examples" description = "Ballista usage examples" -version = "0.5.0-SNAPSHOT" +version = "0.5.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] diff --git a/ballista/CHANGELOG.md b/ballista/CHANGELOG.md new file mode 100644 index 0000000000000..287229b05faa1 --- /dev/null +++ b/ballista/CHANGELOG.md @@ -0,0 +1,180 @@ + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + +# Changelog + +## [ballista-0.5.0](https://github.com/apache/arrow-datafusion/tree/ballista-0.5.0) (2021-08-10) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...ballista-0.5.0) + +**Breaking changes:** + +- \[ballista\] support date\_part and date\_turnc ser/de, pass tpch 7 [\#840](https://github.com/apache/arrow-datafusion/pull/840) ([houqp](https://github.com/houqp)) +- Box ScalarValue:Lists, reduce size by half size [\#788](https://github.com/apache/arrow-datafusion/pull/788) ([alamb](https://github.com/alamb)) +- Support DataFrame.collect for Ballista DataFrames [\#785](https://github.com/apache/arrow-datafusion/pull/785) ([andygrove](https://github.com/andygrove)) +- JOIN conditions are order dependent [\#778](https://github.com/apache/arrow-datafusion/pull/778) ([seddonm1](https://github.com/seddonm1)) +- UnresolvedShuffleExec should represent a single shuffle [\#727](https://github.com/apache/arrow-datafusion/pull/727) ([andygrove](https://github.com/andygrove)) +- Ballista: Make shuffle partitions configurable in benchmarks [\#702](https://github.com/apache/arrow-datafusion/pull/702) ([andygrove](https://github.com/andygrove)) +- Rename MergeExec to CoalescePartitionsExec [\#635](https://github.com/apache/arrow-datafusion/pull/635) ([andygrove](https://github.com/andygrove)) +- Ballista: Rename QueryStageExec to ShuffleWriterExec [\#633](https://github.com/apache/arrow-datafusion/pull/633) ([andygrove](https://github.com/andygrove)) +- fix 593, reduce cloning by taking ownership in logical planner's `from` fn [\#610](https://github.com/apache/arrow-datafusion/pull/610) ([Jimexist](https://github.com/Jimexist)) +- fix join column handling logic for `On` and `Using` constraints [\#605](https://github.com/apache/arrow-datafusion/pull/605) ([houqp](https://github.com/houqp)) +- Move ballista standalone mode to client [\#589](https://github.com/apache/arrow-datafusion/pull/589) ([edrevo](https://github.com/edrevo)) +- Ballista: Implement map-side shuffle [\#543](https://github.com/apache/arrow-datafusion/pull/543) ([andygrove](https://github.com/andygrove)) +- ShuffleReaderExec now supports multiple locations per partition [\#541](https://github.com/apache/arrow-datafusion/pull/541) ([andygrove](https://github.com/andygrove)) +- Make external hostname in executor optional [\#232](https://github.com/apache/arrow-datafusion/pull/232) ([edrevo](https://github.com/edrevo)) +- Remove namespace from executors 
[\#75](https://github.com/apache/arrow-datafusion/pull/75) ([edrevo](https://github.com/edrevo)) +- Support qualified columns in queries [\#55](https://github.com/apache/arrow-datafusion/pull/55) ([houqp](https://github.com/houqp)) +- Read CSV format text from stdin or memory [\#54](https://github.com/apache/arrow-datafusion/pull/54) ([heymind](https://github.com/heymind)) +- Remove Ballista DataFrame [\#48](https://github.com/apache/arrow-datafusion/pull/48) ([andygrove](https://github.com/andygrove)) +- Use atomics for SQLMetric implementation, remove unused name field [\#25](https://github.com/apache/arrow-datafusion/pull/25) ([returnString](https://github.com/returnString)) + +**Implemented enhancements:** + +- Add crate documentation for Ballista crates [\#830](https://github.com/apache/arrow-datafusion/issues/830) +- Support DataFrame.collect for Ballista DataFrames [\#787](https://github.com/apache/arrow-datafusion/issues/787) +- Ballista: Prep for supporting shuffle correctly, part one [\#736](https://github.com/apache/arrow-datafusion/issues/736) +- Ballista: Implement physical plan serde for ShuffleWriterExec [\#710](https://github.com/apache/arrow-datafusion/issues/710) +- Ballista: Finish implementing shuffle mechanism [\#707](https://github.com/apache/arrow-datafusion/issues/707) +- Rename QueryStageExec to ShuffleWriterExec [\#542](https://github.com/apache/arrow-datafusion/issues/542) +- Ballista ShuffleReaderExec should be able to read from multiple locations per partition [\#540](https://github.com/apache/arrow-datafusion/issues/540) +- \[Ballista\] Use deployments in k8s user guide [\#473](https://github.com/apache/arrow-datafusion/issues/473) +- Ballista refactor QueryStageExec in preparation for map-side shuffle [\#458](https://github.com/apache/arrow-datafusion/issues/458) +- Ballista: Implement map-side of shuffle [\#456](https://github.com/apache/arrow-datafusion/issues/456) +- Refactor Ballista to separate Flight logic from execution logic [\#449](https://github.com/apache/arrow-datafusion/issues/449) +- Use published versions of arrow rather than github shas [\#393](https://github.com/apache/arrow-datafusion/issues/393) +- BallistaContext::collect\(\) logging is too noisy [\#352](https://github.com/apache/arrow-datafusion/issues/352) +- Update Ballista to use new physical plan formatter utility [\#343](https://github.com/apache/arrow-datafusion/issues/343) +- Add Ballista Getting Started documentation [\#329](https://github.com/apache/arrow-datafusion/issues/329) +- Remove references to ballistacompute Docker Hub repo [\#325](https://github.com/apache/arrow-datafusion/issues/325) +- Implement scalable distributed joins [\#63](https://github.com/apache/arrow-datafusion/issues/63) +- Remove hard-coded Ballista version from scripts [\#32](https://github.com/apache/arrow-datafusion/issues/32) +- Implement streaming versions of Dataframe.collect methods [\#789](https://github.com/apache/arrow-datafusion/pull/789) ([andygrove](https://github.com/andygrove)) +- Ballista shuffle is finally working as intended, providing scalable distributed joins [\#750](https://github.com/apache/arrow-datafusion/pull/750) ([andygrove](https://github.com/andygrove)) +- Update to use arrow 5.0 [\#721](https://github.com/apache/arrow-datafusion/pull/721) ([alamb](https://github.com/alamb)) +- Implement serde for ShuffleWriterExec [\#712](https://github.com/apache/arrow-datafusion/pull/712) ([andygrove](https://github.com/andygrove)) +- dedup using join column in wildcard expansion 
[\#678](https://github.com/apache/arrow-datafusion/pull/678) ([houqp](https://github.com/houqp)) +- Implement metrics for shuffle read and write [\#676](https://github.com/apache/arrow-datafusion/pull/676) ([andygrove](https://github.com/andygrove)) +- Remove hard-coded PartitionMode from Ballista serde [\#637](https://github.com/apache/arrow-datafusion/pull/637) ([andygrove](https://github.com/andygrove)) +- Ballista: Implement scalable distributed joins [\#634](https://github.com/apache/arrow-datafusion/pull/634) ([andygrove](https://github.com/andygrove)) +- Add Keda autoscaling for ballista in k8s [\#586](https://github.com/apache/arrow-datafusion/pull/586) ([edrevo](https://github.com/edrevo)) +- Add some resiliency to lost executors [\#568](https://github.com/apache/arrow-datafusion/pull/568) ([edrevo](https://github.com/edrevo)) +- Add `partition by` constructs in window functions and modify logical planning [\#501](https://github.com/apache/arrow-datafusion/pull/501) ([Jimexist](https://github.com/Jimexist)) +- Support anti join [\#482](https://github.com/apache/arrow-datafusion/pull/482) ([Dandandan](https://github.com/Dandandan)) +- add `order by` construct in window function and logical plans [\#463](https://github.com/apache/arrow-datafusion/pull/463) ([Jimexist](https://github.com/Jimexist)) +- Refactor Ballista executor so that FlightService delegates to an Executor struct [\#450](https://github.com/apache/arrow-datafusion/pull/450) ([andygrove](https://github.com/andygrove)) +- implement lead and lag built-in window function [\#429](https://github.com/apache/arrow-datafusion/pull/429) ([Jimexist](https://github.com/Jimexist)) +- Implement fmt\_as for ShuffleReaderExec [\#400](https://github.com/apache/arrow-datafusion/pull/400) ([andygrove](https://github.com/andygrove)) +- Add window expression part 1 - logical and physical planning, structure, to/from proto, and explain, for empty over clause only [\#334](https://github.com/apache/arrow-datafusion/pull/334) ([Jimexist](https://github.com/Jimexist)) +- \[breaking change\] fix 265, log should be log10, and add ln [\#271](https://github.com/apache/arrow-datafusion/pull/271) ([Jimexist](https://github.com/Jimexist)) +- Allow table providers to indicate their type for catalog metadata [\#205](https://github.com/apache/arrow-datafusion/pull/205) ([returnString](https://github.com/returnString)) +- Add query 19 to TPC-H regression tests [\#59](https://github.com/apache/arrow-datafusion/pull/59) ([Dandandan](https://github.com/Dandandan)) +- Use arrow eq kernels in CaseWhen expression evaluation [\#52](https://github.com/apache/arrow-datafusion/pull/52) ([Dandandan](https://github.com/Dandandan)) +- Add option param for standalone mode [\#42](https://github.com/apache/arrow-datafusion/pull/42) ([djKooks](https://github.com/djKooks)) +- \[DataFusion\] Optimize hash join inner workings, null handling fix [\#24](https://github.com/apache/arrow-datafusion/pull/24) ([Dandandan](https://github.com/Dandandan)) +- \[Ballista\] Docker files for ui [\#22](https://github.com/apache/arrow-datafusion/pull/22) ([msathis](https://github.com/msathis)) + +**Fixed bugs:** + +- Ballista: TPC-H q3 @ SF=1000 never completes [\#835](https://github.com/apache/arrow-datafusion/issues/835) +- Ballista does not support MIN/MAX aggregate functions [\#832](https://github.com/apache/arrow-datafusion/issues/832) +- Ballista docker images fail to build [\#828](https://github.com/apache/arrow-datafusion/issues/828) +- Ballista: UnresolvedShuffleExec should only 
have a single stage\_id [\#726](https://github.com/apache/arrow-datafusion/issues/726) +- Ballista integration tests are failing [\#623](https://github.com/apache/arrow-datafusion/issues/623) +- Integration test build failure due to arrow-rs using unstable feature [\#596](https://github.com/apache/arrow-datafusion/issues/596) +- `cargo build` cannot build the project [\#531](https://github.com/apache/arrow-datafusion/issues/531) +- ShuffleReaderExec does not get formatted correctly in displayable physical plan [\#399](https://github.com/apache/arrow-datafusion/issues/399) +- Implement serde for MIN and MAX [\#833](https://github.com/apache/arrow-datafusion/pull/833) ([andygrove](https://github.com/andygrove)) +- Ballista: Prep for fixing shuffle mechansim, part 1 [\#738](https://github.com/apache/arrow-datafusion/pull/738) ([andygrove](https://github.com/andygrove)) +- Ballista: Shuffle write bug fix [\#714](https://github.com/apache/arrow-datafusion/pull/714) ([andygrove](https://github.com/andygrove)) +- honor table name for csv/parquet scan in ballista plan serde [\#629](https://github.com/apache/arrow-datafusion/pull/629) ([houqp](https://github.com/houqp)) +- MINOR: Fix integration tests by adding datafusion-cli module to docker image [\#322](https://github.com/apache/arrow-datafusion/pull/322) ([andygrove](https://github.com/andygrove)) + +**Documentation updates:** + +- Add minimal crate documentation for Ballista crates [\#831](https://github.com/apache/arrow-datafusion/pull/831) ([andygrove](https://github.com/andygrove)) +- Add Ballista examples [\#775](https://github.com/apache/arrow-datafusion/pull/775) ([andygrove](https://github.com/andygrove)) +- Update ballista.proto link in architecture doc [\#502](https://github.com/apache/arrow-datafusion/pull/502) ([terrycorley](https://github.com/terrycorley)) +- Update k8s user guide to use deployments [\#474](https://github.com/apache/arrow-datafusion/pull/474) ([edrevo](https://github.com/edrevo)) +- use prettier to format md files [\#367](https://github.com/apache/arrow-datafusion/pull/367) ([Jimexist](https://github.com/Jimexist)) +- Make it easier for developers to find Ballista documentation [\#330](https://github.com/apache/arrow-datafusion/pull/330) ([andygrove](https://github.com/andygrove)) +- Instructions for cross-compiling Ballista to the Raspberry Pi [\#263](https://github.com/apache/arrow-datafusion/pull/263) ([andygrove](https://github.com/andygrove)) +- Add install guide in README [\#236](https://github.com/apache/arrow-datafusion/pull/236) ([djKooks](https://github.com/djKooks)) + +**Performance improvements:** + +- Ballista: Avoid sleeping between polling for tasks [\#698](https://github.com/apache/arrow-datafusion/pull/698) ([Dandandan](https://github.com/Dandandan)) +- Make BallistaContext::collect streaming [\#535](https://github.com/apache/arrow-datafusion/pull/535) ([edrevo](https://github.com/edrevo)) + +**Closed issues:** + +- Confirm git tagging strategy for releases [\#770](https://github.com/apache/arrow-datafusion/issues/770) +- arrow::util::pretty::pretty\_format\_batches missing [\#769](https://github.com/apache/arrow-datafusion/issues/769) +- move the `assert_batches_eq!` macros to a non part of datafusion [\#745](https://github.com/apache/arrow-datafusion/issues/745) +- fix an issue where aliases are not respected in generating downstream schemas in window expr [\#592](https://github.com/apache/arrow-datafusion/issues/592) +- make the planner to print more succinct and useful information in window 
function explain clause [\#526](https://github.com/apache/arrow-datafusion/issues/526) +- move window frame module to be in `logical_plan` [\#517](https://github.com/apache/arrow-datafusion/issues/517) +- use a more rust idiomatic way of handling nth\_value [\#448](https://github.com/apache/arrow-datafusion/issues/448) +- Make Ballista not depend on arrow directly [\#446](https://github.com/apache/arrow-datafusion/issues/446) +- create a test with more than one partition for window functions [\#435](https://github.com/apache/arrow-datafusion/issues/435) +- Implement hash-partitioned hash aggregate [\#27](https://github.com/apache/arrow-datafusion/issues/27) +- Consider using GitHub pages for DataFusion/Ballista documentation [\#18](https://github.com/apache/arrow-datafusion/issues/18) +- Add Ballista to default cargo workspace [\#17](https://github.com/apache/arrow-datafusion/issues/17) +- Update "repository" in Cargo.toml [\#16](https://github.com/apache/arrow-datafusion/issues/16) +- Consolidate TPC-H benchmarks [\#6](https://github.com/apache/arrow-datafusion/issues/6) +- \[Ballista\] Fix integration test script [\#4](https://github.com/apache/arrow-datafusion/issues/4) +- Ballista should not have separate DataFrame implementation [\#2](https://github.com/apache/arrow-datafusion/issues/2) + +**Merged pull requests:** + +- Change datatype of tpch keys from Int32 to UInt64 to support sf=1000 [\#836](https://github.com/apache/arrow-datafusion/pull/836) ([andygrove](https://github.com/andygrove)) +- Add ballista-examples to docker build [\#829](https://github.com/apache/arrow-datafusion/pull/829) ([andygrove](https://github.com/andygrove)) +- Update dependencies: prost to 0.8 and tonic to 0.5 [\#818](https://github.com/apache/arrow-datafusion/pull/818) ([alamb](https://github.com/alamb)) +- Move `hash_array` into hash\_utils.rs [\#807](https://github.com/apache/arrow-datafusion/pull/807) ([alamb](https://github.com/alamb)) +- Fix: Update clippy lints for Rust 1.54 [\#794](https://github.com/apache/arrow-datafusion/pull/794) ([alamb](https://github.com/alamb)) +- MINOR: Remove unused Ballista query execution code path [\#732](https://github.com/apache/arrow-datafusion/pull/732) ([andygrove](https://github.com/andygrove)) +- \[fix\] benchmark run with compose [\#666](https://github.com/apache/arrow-datafusion/pull/666) ([rdettai](https://github.com/rdettai)) +- bring back dev scripts for ballista [\#648](https://github.com/apache/arrow-datafusion/pull/648) ([Jimexist](https://github.com/Jimexist)) +- Remove unnecessary mutex [\#639](https://github.com/apache/arrow-datafusion/pull/639) ([edrevo](https://github.com/edrevo)) +- round trip TPCH queries in tests [\#630](https://github.com/apache/arrow-datafusion/pull/630) ([houqp](https://github.com/houqp)) +- Fix build [\#627](https://github.com/apache/arrow-datafusion/pull/627) ([andygrove](https://github.com/andygrove)) +- in ballista also check for UI prettier changes [\#578](https://github.com/apache/arrow-datafusion/pull/578) ([Jimexist](https://github.com/Jimexist)) +- turn on clippy rule for needless borrow [\#545](https://github.com/apache/arrow-datafusion/pull/545) ([Jimexist](https://github.com/Jimexist)) +- reuse datafusion physical planner in ballista building from protobuf [\#532](https://github.com/apache/arrow-datafusion/pull/532) ([Jimexist](https://github.com/Jimexist)) +- update cargo.toml in python crate and fix unit test due to hash joins [\#483](https://github.com/apache/arrow-datafusion/pull/483) 
([Jimexist](https://github.com/Jimexist)) +- make `VOLUME` declaration in tpch datagen docker absolute [\#466](https://github.com/apache/arrow-datafusion/pull/466) ([crepererum](https://github.com/crepererum)) +- Refactor QueryStageExec in preparation for implementing map-side shuffle [\#459](https://github.com/apache/arrow-datafusion/pull/459) ([andygrove](https://github.com/andygrove)) +- Simplified usage of `use arrow` in ballista. [\#447](https://github.com/apache/arrow-datafusion/pull/447) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- Benchmark subcommand to distinguish between DataFusion and Ballista [\#402](https://github.com/apache/arrow-datafusion/pull/402) ([jgoday](https://github.com/jgoday)) +- \#352: BallistaContext::collect\(\) logging is too noisy [\#394](https://github.com/apache/arrow-datafusion/pull/394) ([jgoday](https://github.com/jgoday)) +- cleanup function return type fn [\#350](https://github.com/apache/arrow-datafusion/pull/350) ([Jimexist](https://github.com/Jimexist)) +- Update Ballista to use new physical plan formatter utility [\#344](https://github.com/apache/arrow-datafusion/pull/344) ([andygrove](https://github.com/andygrove)) +- Update arrow dependencies again [\#341](https://github.com/apache/arrow-datafusion/pull/341) ([alamb](https://github.com/alamb)) +- Remove references to Ballista Docker images published to ballistacompute Docker Hub repo [\#326](https://github.com/apache/arrow-datafusion/pull/326) ([andygrove](https://github.com/andygrove)) +- Update arrow-rs deps [\#317](https://github.com/apache/arrow-datafusion/pull/317) ([alamb](https://github.com/alamb)) +- Update arrow deps [\#269](https://github.com/apache/arrow-datafusion/pull/269) ([alamb](https://github.com/alamb)) +- Enable redundant\_field\_names clippy lint [\#261](https://github.com/apache/arrow-datafusion/pull/261) ([Dandandan](https://github.com/Dandandan)) +- Update arrow-rs deps \(to fix build due to flatbuffers update\) [\#224](https://github.com/apache/arrow-datafusion/pull/224) ([alamb](https://github.com/alamb)) +- update arrow-rs deps to latest master [\#216](https://github.com/apache/arrow-datafusion/pull/216) ([alamb](https://github.com/alamb)) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml index 5c7eb3802a104..3507a7b22a45d 100644 --- a/ballista/rust/client/Cargo.toml +++ b/ballista/rust/client/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista" description = "Ballista Distributed Compute" license = "Apache-2.0" -version = "0.5.0-SNAPSHOT" +version = "0.5.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] @@ -37,4 +37,4 @@ datafusion = { path = "../../../datafusion" } [features] default = [] -standalone = ["ballista-executor", "ballista-scheduler"] \ No newline at end of file +standalone = ["ballista-executor", "ballista-scheduler"] diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml index 2495343b3e878..f61f32259f30e 100644 --- a/ballista/rust/core/Cargo.toml +++ b/ballista/rust/core/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista-core" description = "Ballista Distributed Compute" license = "Apache-2.0" -version = "0.5.0-SNAPSHOT" +version = "0.5.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" 
authors = ["Apache Arrow "] diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml index 5d6ecb986a15e..f30015b884a44 100644 --- a/ballista/rust/executor/Cargo.toml +++ b/ballista/rust/executor/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista-executor" description = "Ballista Distributed Compute - Executor" license = "Apache-2.0" -version = "0.5.0-SNAPSHOT" +version = "0.5.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml index 382f7c62af10e..fb6286669b93e 100644 --- a/ballista/rust/scheduler/Cargo.toml +++ b/ballista/rust/scheduler/Cargo.toml @@ -19,7 +19,7 @@ name = "ballista-scheduler" description = "Ballista Distributed Compute - Scheduler" license = "Apache-2.0" -version = "0.5.0-SNAPSHOT" +version = "0.5.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" authors = ["Apache Arrow "] diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md new file mode 100644 index 0000000000000..41afa286b7965 --- /dev/null +++ b/datafusion/CHANGELOG.md @@ -0,0 +1,318 @@ + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + +# Changelog + +## [5.0.0](https://github.com/apache/arrow-datafusion/tree/5.0.0) (2021-08-10) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...5.0.0) + +**Breaking changes:** + +- Box ScalarValue:Lists, reduce size by half size [\#788](https://github.com/apache/arrow-datafusion/pull/788) ([alamb](https://github.com/alamb)) +- JOIN conditions are order dependent [\#778](https://github.com/apache/arrow-datafusion/pull/778) ([seddonm1](https://github.com/seddonm1)) +- Show the result of all optimizer passes in EXPLAIN VERBOSE [\#759](https://github.com/apache/arrow-datafusion/pull/759) ([alamb](https://github.com/alamb)) +- \#723 Datafusion add option in ExecutionConfig to enable/disable parquet pruning [\#749](https://github.com/apache/arrow-datafusion/pull/749) ([lvheyang](https://github.com/lvheyang)) +- Update API for extension planning to include logical plan [\#643](https://github.com/apache/arrow-datafusion/pull/643) ([alamb](https://github.com/alamb)) +- Rename MergeExec to CoalescePartitionsExec [\#635](https://github.com/apache/arrow-datafusion/pull/635) ([andygrove](https://github.com/andygrove)) +- fix 593, reduce cloning by taking ownership in logical planner's `from` fn [\#610](https://github.com/apache/arrow-datafusion/pull/610) ([Jimexist](https://github.com/Jimexist)) +- fix join column handling logic for `On` and `Using` constraints [\#605](https://github.com/apache/arrow-datafusion/pull/605) ([houqp](https://github.com/houqp)) +- Rewrite pruning logic in terms of PruningStatistics using Array trait \(option 2\) [\#426](https://github.com/apache/arrow-datafusion/pull/426) ([alamb](https://github.com/alamb)) +- Support reading from NdJson formatted data sources [\#404](https://github.com/apache/arrow-datafusion/pull/404) ([heymind](https://github.com/heymind)) +- Add metrics to RepartitionExec [\#398](https://github.com/apache/arrow-datafusion/pull/398) ([andygrove](https://github.com/andygrove)) +- Use 4.x arrow-rs from crates.io rather than git sha [\#395](https://github.com/apache/arrow-datafusion/pull/395) ([alamb](https://github.com/alamb)) +- Return Vec\ from PredicateBuilder rather than an `Fn` 
[\#370](https://github.com/apache/arrow-datafusion/pull/370) ([alamb](https://github.com/alamb)) +- Refactor: move RowGroupPredicateBuilder into its own module, rename to PruningPredicateBuilder [\#365](https://github.com/apache/arrow-datafusion/pull/365) ([alamb](https://github.com/alamb)) +- \[Datafusion\] NOW\(\) function support [\#288](https://github.com/apache/arrow-datafusion/pull/288) ([msathis](https://github.com/msathis)) +- Implement select distinct [\#262](https://github.com/apache/arrow-datafusion/pull/262) ([Dandandan](https://github.com/Dandandan)) +- Refactor datafusion/src/physical\_plan/common.rs build\_file\_list to take less param and reuse code [\#253](https://github.com/apache/arrow-datafusion/pull/253) ([Jimexist](https://github.com/Jimexist)) +- Support qualified columns in queries [\#55](https://github.com/apache/arrow-datafusion/pull/55) ([houqp](https://github.com/houqp)) +- Read CSV format text from stdin or memory [\#54](https://github.com/apache/arrow-datafusion/pull/54) ([heymind](https://github.com/heymind)) +- Use atomics for SQLMetric implementation, remove unused name field [\#25](https://github.com/apache/arrow-datafusion/pull/25) ([returnString](https://github.com/returnString)) + +**Implemented enhancements:** + +- Allow extension nodes to correctly plan physical expressions with relations [\#642](https://github.com/apache/arrow-datafusion/issues/642) +- Filters aren't passed down to table scans in a union [\#557](https://github.com/apache/arrow-datafusion/issues/557) +- Support pruning for `boolean` columns [\#490](https://github.com/apache/arrow-datafusion/issues/490) +- Implement SQLMetrics for RepartitionExec [\#397](https://github.com/apache/arrow-datafusion/issues/397) +- DataFusion benchmarks should show executed plan with metrics after query completes [\#396](https://github.com/apache/arrow-datafusion/issues/396) +- Use published versions of arrow rather than github shas [\#393](https://github.com/apache/arrow-datafusion/issues/393) +- Add Compare to GroupByScalar [\#364](https://github.com/apache/arrow-datafusion/issues/364) +- Reusable "row group pruning" logic [\#363](https://github.com/apache/arrow-datafusion/issues/363) +- Add an Order Preserving merge operator [\#362](https://github.com/apache/arrow-datafusion/issues/362) +- Implement Postgres compatible `now()` function [\#251](https://github.com/apache/arrow-datafusion/issues/251) +- COUNT DISTINCT does not support dictionary types [\#249](https://github.com/apache/arrow-datafusion/issues/249) +- Use standard make\_null\_array for CASE [\#222](https://github.com/apache/arrow-datafusion/issues/222) +- Implement date\_trunc\(\) function [\#203](https://github.com/apache/arrow-datafusion/issues/203) +- COUNT DISTINCT does not support for `Float64` [\#199](https://github.com/apache/arrow-datafusion/issues/199) +- Update SQLMetric to use atomics rather than a Mutex [\#30](https://github.com/apache/arrow-datafusion/issues/30) +- Implement PartialOrd for ScalarValue [\#838](https://github.com/apache/arrow-datafusion/pull/838) ([viirya](https://github.com/viirya)) +- Support date datatypes in max/min [\#820](https://github.com/apache/arrow-datafusion/pull/820) ([viirya](https://github.com/viirya)) +- Implement vectorized hashing for DictionaryArray types [\#812](https://github.com/apache/arrow-datafusion/pull/812) ([alamb](https://github.com/alamb)) +- Convert unsupported conditions in left right join to filters [\#796](https://github.com/apache/arrow-datafusion/pull/796) 
[[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([Dandandan](https://github.com/Dandandan)) +- Implement streaming versions of Dataframe.collect methods [\#789](https://github.com/apache/arrow-datafusion/pull/789) ([andygrove](https://github.com/andygrove)) +- impl from str for column and scalar [\#762](https://github.com/apache/arrow-datafusion/pull/762) ([Jimexist](https://github.com/Jimexist)) +- impl fmt::Display for PlanType [\#752](https://github.com/apache/arrow-datafusion/pull/752) ([Jimexist](https://github.com/Jimexist)) +- Remove unnecessary projection in logical plan optimization phase [\#747](https://github.com/apache/arrow-datafusion/pull/747) ([waynexia](https://github.com/waynexia)) +- Support table columns alias [\#735](https://github.com/apache/arrow-datafusion/pull/735) ([Dandandan](https://github.com/Dandandan)) +- Derive PartialEq for datasource enums [\#734](https://github.com/apache/arrow-datafusion/pull/734) ([alamb](https://github.com/alamb)) +- Allow filetype to be lowercase, Implement FromStr for FileType [\#728](https://github.com/apache/arrow-datafusion/pull/728) ([Jimexist](https://github.com/Jimexist)) +- Update to use arrow 5.0 [\#721](https://github.com/apache/arrow-datafusion/pull/721) ([alamb](https://github.com/alamb)) +- \#554: Lead/lag window function with offset and default value arguments [\#687](https://github.com/apache/arrow-datafusion/pull/687) ([jgoday](https://github.com/jgoday)) +- dedup using join column in wildcard expansion [\#678](https://github.com/apache/arrow-datafusion/pull/678) ([houqp](https://github.com/houqp)) +- Implement metrics for HashJoinExec [\#664](https://github.com/apache/arrow-datafusion/pull/664) ([andygrove](https://github.com/andygrove)) +- Show physical plan with metrics in benchmark [\#662](https://github.com/apache/arrow-datafusion/pull/662) ([andygrove](https://github.com/andygrove)) +- Allow non-equijoin filters in join condition [\#660](https://github.com/apache/arrow-datafusion/pull/660) ([Dandandan](https://github.com/Dandandan)) +- Add End-to-end test for parquet pruning + metrics for ParquetExec [\#657](https://github.com/apache/arrow-datafusion/pull/657) ([alamb](https://github.com/alamb)) +- Add support for leading field in interval [\#647](https://github.com/apache/arrow-datafusion/pull/647) ([Dandandan](https://github.com/Dandandan)) +- Remove hard-coded PartitionMode from Ballista serde [\#637](https://github.com/apache/arrow-datafusion/pull/637) ([andygrove](https://github.com/andygrove)) +- Ballista: Implement scalable distributed joins [\#634](https://github.com/apache/arrow-datafusion/pull/634) ([andygrove](https://github.com/andygrove)) +- implement rank and dense\_rank function and refactor built-in window function evaluation [\#631](https://github.com/apache/arrow-datafusion/pull/631) ([Jimexist](https://github.com/Jimexist)) +- Improve "field not found" error messages [\#625](https://github.com/apache/arrow-datafusion/pull/625) ([andygrove](https://github.com/andygrove)) +- Support modulus op [\#577](https://github.com/apache/arrow-datafusion/pull/577) ([gangliao](https://github.com/gangliao)) +- implement `std::default::Default` for execution config [\#570](https://github.com/apache/arrow-datafusion/pull/570) ([Jimexist](https://github.com/Jimexist)) +- `to_timestamp_millis()`, `to_timestamp_micros()`, `to_timestamp_seconds()` [\#567](https://github.com/apache/arrow-datafusion/pull/567) ([velvia](https://github.com/velvia)) +- Filter push down for Union 
[\#559](https://github.com/apache/arrow-datafusion/pull/559) ([Dandandan](https://github.com/Dandandan)) +- Implement window functions with `partition_by` clause [\#558](https://github.com/apache/arrow-datafusion/pull/558) ([Jimexist](https://github.com/Jimexist)) +- support table alias in join clause [\#547](https://github.com/apache/arrow-datafusion/pull/547) ([houqp](https://github.com/houqp)) +- Not equal predicate in physical\_planning pruning [\#544](https://github.com/apache/arrow-datafusion/pull/544) ([jgoday](https://github.com/jgoday)) +- add error handling and boundary checking for window frames [\#530](https://github.com/apache/arrow-datafusion/pull/530) ([Jimexist](https://github.com/Jimexist)) +- Implement window functions with `order_by` clause [\#520](https://github.com/apache/arrow-datafusion/pull/520) ([Jimexist](https://github.com/Jimexist)) +- support group by column positions [\#519](https://github.com/apache/arrow-datafusion/pull/519) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([jychen7](https://github.com/jychen7)) +- Implement constant folding for CAST [\#513](https://github.com/apache/arrow-datafusion/pull/513) ([msathis](https://github.com/msathis)) +- Add window frame constructs - alternative [\#506](https://github.com/apache/arrow-datafusion/pull/506) ([Jimexist](https://github.com/Jimexist)) +- Add `partition by` constructs in window functions and modify logical planning [\#501](https://github.com/apache/arrow-datafusion/pull/501) ([Jimexist](https://github.com/Jimexist)) +- Add support for boolean columns in pruning logic [\#500](https://github.com/apache/arrow-datafusion/pull/500) ([alamb](https://github.com/alamb)) +- \#215 resolve aliases for group by exprs [\#485](https://github.com/apache/arrow-datafusion/pull/485) ([jychen7](https://github.com/jychen7)) +- Support anti join [\#482](https://github.com/apache/arrow-datafusion/pull/482) ([Dandandan](https://github.com/Dandandan)) +- Support semi join [\#470](https://github.com/apache/arrow-datafusion/pull/470) ([Dandandan](https://github.com/Dandandan)) +- add `order by` construct in window function and logical plans [\#463](https://github.com/apache/arrow-datafusion/pull/463) ([Jimexist](https://github.com/Jimexist)) +- Remove reundant filters \(e.g. 
c\> 5 AND c\>5 --\> c\>5\) [\#436](https://github.com/apache/arrow-datafusion/pull/436) ([jgoday](https://github.com/jgoday)) +- fix: display the content of debug explain [\#434](https://github.com/apache/arrow-datafusion/pull/434) ([NGA-TRAN](https://github.com/NGA-TRAN)) +- implement lead and lag built-in window function [\#429](https://github.com/apache/arrow-datafusion/pull/429) ([Jimexist](https://github.com/Jimexist)) +- add support for ndjson for datafusion-cli [\#427](https://github.com/apache/arrow-datafusion/pull/427) ([Jimexist](https://github.com/Jimexist)) +- add `first_value`, `last_value`, and `nth_value` built-in window functions [\#403](https://github.com/apache/arrow-datafusion/pull/403) ([Jimexist](https://github.com/Jimexist)) +- export both `now` and `random` functions [\#389](https://github.com/apache/arrow-datafusion/pull/389) ([Jimexist](https://github.com/Jimexist)) +- Function to create `ArrayRef` from an iterator of ScalarValues [\#381](https://github.com/apache/arrow-datafusion/pull/381) ([alamb](https://github.com/alamb)) +- Sort preserving merge \(\#362\) [\#379](https://github.com/apache/arrow-datafusion/pull/379) ([tustvold](https://github.com/tustvold)) +- Add support for multiple partitions with SortExec \(\#362\) [\#378](https://github.com/apache/arrow-datafusion/pull/378) ([tustvold](https://github.com/tustvold)) +- add window expression stream, delegated window aggregation to aggregate functions, and implement `row_number` [\#375](https://github.com/apache/arrow-datafusion/pull/375) ([Jimexist](https://github.com/Jimexist)) +- Add PartialOrd and Ord to GroupByScalar \(\#364\) [\#368](https://github.com/apache/arrow-datafusion/pull/368) ([tustvold](https://github.com/tustvold)) +- Implement readable explain plans for physical plans [\#337](https://github.com/apache/arrow-datafusion/pull/337) ([alamb](https://github.com/alamb)) +- Add window expression part 1 - logical and physical planning, structure, to/from proto, and explain, for empty over clause only [\#334](https://github.com/apache/arrow-datafusion/pull/334) ([Jimexist](https://github.com/Jimexist)) +- Use NullArray to Pass row count to ScalarFunctions that take 0 arguments [\#328](https://github.com/apache/arrow-datafusion/pull/328) ([Jimexist](https://github.com/Jimexist)) +- add --quiet/-q flag and allow timing info to be turned on/off [\#323](https://github.com/apache/arrow-datafusion/pull/323) ([Jimexist](https://github.com/Jimexist)) +- Implement hash partitioned aggregation [\#320](https://github.com/apache/arrow-datafusion/pull/320) ([Dandandan](https://github.com/Dandandan)) +- Support COUNT\(DISTINCT timestamps\) [\#319](https://github.com/apache/arrow-datafusion/pull/319) ([charlibot](https://github.com/charlibot)) +- add random SQL function [\#303](https://github.com/apache/arrow-datafusion/pull/303) ([Jimexist](https://github.com/Jimexist)) +- allow datafusion cli to take -- comments [\#296](https://github.com/apache/arrow-datafusion/pull/296) ([Jimexist](https://github.com/Jimexist)) +- Add json print format mode to datafusion cli [\#295](https://github.com/apache/arrow-datafusion/pull/295) ([Jimexist](https://github.com/Jimexist)) +- Add print format param with support for tsv print format to datafusion cli [\#292](https://github.com/apache/arrow-datafusion/pull/292) ([Jimexist](https://github.com/Jimexist)) +- Add print format param and support for csv print format to datafusion cli [\#289](https://github.com/apache/arrow-datafusion/pull/289) 
([Jimexist](https://github.com/Jimexist)) +- allow datafusion-cli to take a file param [\#285](https://github.com/apache/arrow-datafusion/pull/285) ([Jimexist](https://github.com/Jimexist)) +- add param validation for datafusion-cli [\#284](https://github.com/apache/arrow-datafusion/pull/284) ([Jimexist](https://github.com/Jimexist)) +- \[breaking change\] fix 265, log should be log10, and add ln [\#271](https://github.com/apache/arrow-datafusion/pull/271) ([Jimexist](https://github.com/Jimexist)) +- Implement count distinct for dictionary arrays [\#256](https://github.com/apache/arrow-datafusion/pull/256) ([alamb](https://github.com/alamb)) +- Count distinct floats [\#252](https://github.com/apache/arrow-datafusion/pull/252) ([pjmore](https://github.com/pjmore)) +- Add rule to eliminate `LIMIT 0` and replace it with an `EmptyRelation` [\#213](https://github.com/apache/arrow-datafusion/pull/213) ([Dandandan](https://github.com/Dandandan)) +- Allow table providers to indicate their type for catalog metadata [\#205](https://github.com/apache/arrow-datafusion/pull/205) ([returnString](https://github.com/returnString)) +- Use arrow eq kernels in CaseWhen expression evaluation [\#52](https://github.com/apache/arrow-datafusion/pull/52) ([Dandandan](https://github.com/Dandandan)) +- Re-export Arrow and Parquet crates from DataFusion [\#39](https://github.com/apache/arrow-datafusion/pull/39) ([returnString](https://github.com/returnString)) +- \[DataFusion\] Optimize hash join inner workings, null handling fix [\#24](https://github.com/apache/arrow-datafusion/pull/24) ([Dandandan](https://github.com/Dandandan)) +- \[ARROW-12441\] \[DataFusion\] Cross join implementation [\#11](https://github.com/apache/arrow-datafusion/pull/11) ([Dandandan](https://github.com/Dandandan)) + +**Fixed bugs:** + +- Projection pushdown removes unqualified column names even when they are used [\#617](https://github.com/apache/arrow-datafusion/issues/617) +- Panic while running join datatypes/schema.rs:165:10 [\#601](https://github.com/apache/arrow-datafusion/issues/601) +- Indentation is incorrect for joins in formatted physical plans [\#345](https://github.com/apache/arrow-datafusion/issues/345) +- Error while running `COUNT DISTINCT (timestamp)`: 'Unexpected DataType for list [\#314](https://github.com/apache/arrow-datafusion/issues/314) +- When joining two tables, get Error: Plan\("Schema contains duplicate unqualified field name \'xxx\'"\) [\#311](https://github.com/apache/arrow-datafusion/issues/311) +- Incorrect answers with SELECT DISTINCT queries [\#250](https://github.com/apache/arrow-datafusion/issues/250) +- Intermitent failure in CI join\_with\_hash\_collision [\#227](https://github.com/apache/arrow-datafusion/issues/227) +- `Concat` from Dataframe API no longer accepts multiple expressions [\#226](https://github.com/apache/arrow-datafusion/issues/226) +- Fix right, full join handling when having multiple non-matching rows at the left side [\#845](https://github.com/apache/arrow-datafusion/pull/845) ([Dandandan](https://github.com/Dandandan)) +- Qualified field resolution too strict [\#810](https://github.com/apache/arrow-datafusion/pull/810) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([seddonm1](https://github.com/seddonm1)) +- Better join order resolution logic [\#797](https://github.com/apache/arrow-datafusion/pull/797) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([seddonm1](https://github.com/seddonm1)) +- Produce correct answers for Group BY NULL \(Option 1\) 
[\#793](https://github.com/apache/arrow-datafusion/pull/793) ([alamb](https://github.com/alamb)) +- Use consistent version of string\_to\_timestamp\_nanos in DataFusion [\#767](https://github.com/apache/arrow-datafusion/pull/767) ([alamb](https://github.com/alamb)) +- \#723 limit pruning rule to simple expression [\#764](https://github.com/apache/arrow-datafusion/pull/764) ([lvheyang](https://github.com/lvheyang)) +- \#699 fix return type conflict when calling builtin math fuctions [\#716](https://github.com/apache/arrow-datafusion/pull/716) ([lvheyang](https://github.com/lvheyang)) +- Fix Date32 and Date64 parquet row group pruning [\#690](https://github.com/apache/arrow-datafusion/pull/690) ([alamb](https://github.com/alamb)) +- Remove qualifiers on pushed down predicates / Fix parquet pruning [\#689](https://github.com/apache/arrow-datafusion/pull/689) ([alamb](https://github.com/alamb)) +- use `Weak` ptr to break catalog list \<\> info schema cyclic reference [\#681](https://github.com/apache/arrow-datafusion/pull/681) ([crepererum](https://github.com/crepererum)) +- honor table name for csv/parquet scan in ballista plan serde [\#629](https://github.com/apache/arrow-datafusion/pull/629) ([houqp](https://github.com/houqp)) +- fix 621, where unnamed window functions shall be differentiated by partition and order by clause [\#622](https://github.com/apache/arrow-datafusion/pull/622) ([Jimexist](https://github.com/Jimexist)) +- RFC: Do not prune out unnecessary columns with unqualified references [\#619](https://github.com/apache/arrow-datafusion/pull/619) ([alamb](https://github.com/alamb)) +- \[fix\] select \* on empty table [\#613](https://github.com/apache/arrow-datafusion/pull/613) ([rdettai](https://github.com/rdettai)) +- fix 592, support alias in window functions [\#607](https://github.com/apache/arrow-datafusion/pull/607) ([Jimexist](https://github.com/Jimexist)) +- RepartitionExec should not error if output has hung up [\#576](https://github.com/apache/arrow-datafusion/pull/576) ([alamb](https://github.com/alamb)) +- Fix pruning on not equal predicate [\#561](https://github.com/apache/arrow-datafusion/pull/561) ([alamb](https://github.com/alamb)) +- hash float arrays using primitive usigned integer type [\#556](https://github.com/apache/arrow-datafusion/pull/556) ([houqp](https://github.com/houqp)) +- Return errors properly from RepartitionExec [\#521](https://github.com/apache/arrow-datafusion/pull/521) ([alamb](https://github.com/alamb)) +- refactor sort exec stream and combine batches [\#515](https://github.com/apache/arrow-datafusion/pull/515) ([Jimexist](https://github.com/Jimexist)) +- Fix display of execution time in datafusion-cli [\#514](https://github.com/apache/arrow-datafusion/pull/514) ([Dandandan](https://github.com/Dandandan)) +- Wrong aggregation arguments error. [\#505](https://github.com/apache/arrow-datafusion/pull/505) ([jgoday](https://github.com/jgoday)) +- fix window aggregation with alias and add integration test case [\#454](https://github.com/apache/arrow-datafusion/pull/454) ([Jimexist](https://github.com/Jimexist)) +- fix: don't duplicate existing filters [\#409](https://github.com/apache/arrow-datafusion/pull/409) ([e-dard](https://github.com/e-dard)) +- Fixed incorrect logical type in GroupByScalar. 
[\#391](https://github.com/apache/arrow-datafusion/pull/391) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- Fix indented display for multi-child nodes [\#358](https://github.com/apache/arrow-datafusion/pull/358) ([alamb](https://github.com/alamb)) +- Fix SQL planner to support multibyte column names [\#357](https://github.com/apache/arrow-datafusion/pull/357) ([agatan](https://github.com/agatan)) +- Fix wrong projection 'optimization' [\#268](https://github.com/apache/arrow-datafusion/pull/268) ([Dandandan](https://github.com/Dandandan)) +- Fix Left join implementation is incorrect for 0 or multiple batches on the right side [\#238](https://github.com/apache/arrow-datafusion/pull/238) ([Dandandan](https://github.com/Dandandan)) +- Count distinct boolean [\#230](https://github.com/apache/arrow-datafusion/pull/230) ([pjmore](https://github.com/pjmore)) +- Fix Filter / where clause without column names is removed in optimization pass [\#225](https://github.com/apache/arrow-datafusion/pull/225) ([Dandandan](https://github.com/Dandandan)) + +**Documentation updates:** + +- No way to get to the examples from docs.rs [\#186](https://github.com/apache/arrow-datafusion/issues/186) +- Update docs to use vendored version of arrow [\#772](https://github.com/apache/arrow-datafusion/pull/772) ([alamb](https://github.com/alamb)) +- Fix typo in DEVELOPERS.md [\#692](https://github.com/apache/arrow-datafusion/pull/692) ([lvheyang](https://github.com/lvheyang)) +- update stale documentations related to window functions [\#598](https://github.com/apache/arrow-datafusion/pull/598) ([Jimexist](https://github.com/Jimexist)) +- update readme to reflect work on window functions [\#471](https://github.com/apache/arrow-datafusion/pull/471) ([Jimexist](https://github.com/Jimexist)) +- Add examples section to datafusion crate doc [\#457](https://github.com/apache/arrow-datafusion/pull/457) ([mluts](https://github.com/mluts)) +- add invariants spec [\#443](https://github.com/apache/arrow-datafusion/pull/443) ([houqp](https://github.com/houqp)) +- add output field name rfc [\#422](https://github.com/apache/arrow-datafusion/pull/422) ([houqp](https://github.com/houqp)) +- Update more docs and also the developer.md doc [\#414](https://github.com/apache/arrow-datafusion/pull/414) ([Jimexist](https://github.com/Jimexist)) +- use prettier to format md files [\#367](https://github.com/apache/arrow-datafusion/pull/367) ([Jimexist](https://github.com/Jimexist)) +- Add new logo svg with white background [\#313](https://github.com/apache/arrow-datafusion/pull/313) ([parthsarthy](https://github.com/parthsarthy)) +- Add projects \(Squirtle and Tensorbase\) to list in readme [\#312](https://github.com/apache/arrow-datafusion/pull/312) ([parthsarthy](https://github.com/parthsarthy)) +- docs - fix the ballista link [\#274](https://github.com/apache/arrow-datafusion/pull/274) ([haoxins](https://github.com/haoxins)) +- misc\(README\): Replace Cube.js with Cube Store [\#248](https://github.com/apache/arrow-datafusion/pull/248) ([ovr](https://github.com/ovr)) +- Initial docs for SQL syntax [\#242](https://github.com/apache/arrow-datafusion/pull/242) ([Dandandan](https://github.com/Dandandan)) +- Deduplicate README.md [\#79](https://github.com/apache/arrow-datafusion/pull/79) ([msathis](https://github.com/msathis)) + +**Performance improvements:** + +- Speed up inlist for strings and primitives [\#813](https://github.com/apache/arrow-datafusion/pull/813) ([Dandandan](https://github.com/Dandandan)) +- perf: improve performance of 
`SortPreservingMergeExec` operator [\#722](https://github.com/apache/arrow-datafusion/pull/722) ([e-dard](https://github.com/e-dard)) +- Optimize min/max queries with table statistics [\#719](https://github.com/apache/arrow-datafusion/pull/719) ([b41sh](https://github.com/b41sh)) +- perf: Improve materialisation performance of SortPreservingMergeExec [\#691](https://github.com/apache/arrow-datafusion/pull/691) ([e-dard](https://github.com/e-dard)) +- Optimize count\(\*\) with table statistics [\#620](https://github.com/apache/arrow-datafusion/pull/620) ([Dandandan](https://github.com/Dandandan)) +- optimize window function's `find_ranges_in_range` [\#595](https://github.com/apache/arrow-datafusion/pull/595) ([Jimexist](https://github.com/Jimexist)) +- Collapse sort into window expr and do sort within logical phase [\#571](https://github.com/apache/arrow-datafusion/pull/571) ([Jimexist](https://github.com/Jimexist)) +- Use repartition in window functions to speed up [\#569](https://github.com/apache/arrow-datafusion/pull/569) ([Jimexist](https://github.com/Jimexist)) +- Constant fold / optimize `to_timestamp` function during planning [\#387](https://github.com/apache/arrow-datafusion/pull/387) ([msathis](https://github.com/msathis)) +- Speed up `create_batch_from_map` [\#339](https://github.com/apache/arrow-datafusion/pull/339) ([Dandandan](https://github.com/Dandandan)) +- Simplify math expression code \(use unary kernel\) [\#309](https://github.com/apache/arrow-datafusion/pull/309) ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Confirm git tagging strategy for releases [\#770](https://github.com/apache/arrow-datafusion/issues/770) +- arrow::util::pretty::pretty\_format\_batches missing [\#769](https://github.com/apache/arrow-datafusion/issues/769) +- move the `assert_batches_eq!` macros to a non part of datafusion [\#745](https://github.com/apache/arrow-datafusion/issues/745) +- fix an issue where aliases are not respected in generating downstream schemas in window expr [\#592](https://github.com/apache/arrow-datafusion/issues/592) +- make the planner to print more succinct and useful information in window function explain clause [\#526](https://github.com/apache/arrow-datafusion/issues/526) +- move window frame module to be in `logical_plan` [\#517](https://github.com/apache/arrow-datafusion/issues/517) +- use a more rust idiomatic way of handling nth\_value [\#448](https://github.com/apache/arrow-datafusion/issues/448) +- create a test with more than one partition for window functions [\#435](https://github.com/apache/arrow-datafusion/issues/435) +- COUNT DISTINCT does not support for `Boolean` [\#202](https://github.com/apache/arrow-datafusion/issues/202) +- Read CSV format text from stdin or memory [\#198](https://github.com/apache/arrow-datafusion/issues/198) +- Fix null handling hash join [\#195](https://github.com/apache/arrow-datafusion/issues/195) +- Allow TableProviders to indicate their type for the information schema [\#191](https://github.com/apache/arrow-datafusion/issues/191) +- Make DataFrame extensible [\#190](https://github.com/apache/arrow-datafusion/issues/190) +- TPC-H Query 19 [\#170](https://github.com/apache/arrow-datafusion/issues/170) +- TPC-H Query 7 [\#161](https://github.com/apache/arrow-datafusion/issues/161) +- Upgrade hashbrown to 0.10 [\#151](https://github.com/apache/arrow-datafusion/issues/151) +- Implement vectorized hashing for hash aggregate [\#149](https://github.com/apache/arrow-datafusion/issues/149) +- More efficient LEFT 
join implementation [\#143](https://github.com/apache/arrow-datafusion/issues/143) +- Implement vectorized hashing [\#142](https://github.com/apache/arrow-datafusion/issues/142) +- RFC Roadmap for 2021 \(DataFusion\) [\#140](https://github.com/apache/arrow-datafusion/issues/140) +- Implement hash partitioning [\#131](https://github.com/apache/arrow-datafusion/issues/131) +- Grouping by column position [\#110](https://github.com/apache/arrow-datafusion/issues/110) +- \[Datafusion\] GROUP BY with a high cardinality doesn't seem to finish [\#107](https://github.com/apache/arrow-datafusion/issues/107) +- \[Rust\] Add support for JSON data sources [\#103](https://github.com/apache/arrow-datafusion/issues/103) +- \[Rust\] Implement metrics framework [\#95](https://github.com/apache/arrow-datafusion/issues/95) +- Publically export Arrow crate from datafusion [\#36](https://github.com/apache/arrow-datafusion/issues/36) +- Implement hash-partitioned hash aggregate [\#27](https://github.com/apache/arrow-datafusion/issues/27) +- Consider using GitHub pages for DataFusion/Ballista documentation [\#18](https://github.com/apache/arrow-datafusion/issues/18) +- Update "repository" in Cargo.toml [\#16](https://github.com/apache/arrow-datafusion/issues/16) + +**Merged pull requests:** + +- Use `RawTable` API in hash join [\#827](https://github.com/apache/arrow-datafusion/pull/827) ([Dandandan](https://github.com/Dandandan)) +- Add test for window functions on dictionary [\#823](https://github.com/apache/arrow-datafusion/pull/823) ([alamb](https://github.com/alamb)) +- Update dependencies: prost to 0.8 and tonic to 0.5 [\#818](https://github.com/apache/arrow-datafusion/pull/818) ([alamb](https://github.com/alamb)) +- Move `hash_array` into hash\_utils.rs [\#807](https://github.com/apache/arrow-datafusion/pull/807) ([alamb](https://github.com/alamb)) +- Remove GroupByScalar and use ScalarValue in preparation for supporting null values in GroupBy [\#786](https://github.com/apache/arrow-datafusion/pull/786) ([alamb](https://github.com/alamb)) +- fix 226, make `concat`, `concat_ws`, and `random` work with `Python` crate [\#761](https://github.com/apache/arrow-datafusion/pull/761) ([Jimexist](https://github.com/Jimexist)) +- Test for parquet pruning disabling [\#754](https://github.com/apache/arrow-datafusion/pull/754) ([alamb](https://github.com/alamb)) +- Add explain verbose with limit push down [\#751](https://github.com/apache/arrow-datafusion/pull/751) ([Jimexist](https://github.com/Jimexist)) +- Move assert\_batches\_eq! 
macros to test\_utils.rs [\#746](https://github.com/apache/arrow-datafusion/pull/746) ([alamb](https://github.com/alamb)) +- Show optimized physical and logical plans in EXPLAIN [\#744](https://github.com/apache/arrow-datafusion/pull/744) ([alamb](https://github.com/alamb)) +- update `python` crate to support latest pyo3 syntax and gil sematics [\#741](https://github.com/apache/arrow-datafusion/pull/741) ([Jimexist](https://github.com/Jimexist)) +- update `python` crate dependencies [\#740](https://github.com/apache/arrow-datafusion/pull/740) ([Jimexist](https://github.com/Jimexist)) +- provide more details on required .parquet file extension error message [\#729](https://github.com/apache/arrow-datafusion/pull/729) ([Jimexist](https://github.com/Jimexist)) +- split up windows functions into a dedicated module with separate files [\#724](https://github.com/apache/arrow-datafusion/pull/724) ([Jimexist](https://github.com/Jimexist)) +- Use pytest in integration test [\#715](https://github.com/apache/arrow-datafusion/pull/715) ([Jimexist](https://github.com/Jimexist)) +- replace once iter chain with array::IntoIter [\#704](https://github.com/apache/arrow-datafusion/pull/704) ([houqp](https://github.com/houqp)) +- avoid iterator materialization in column index lookup [\#703](https://github.com/apache/arrow-datafusion/pull/703) ([houqp](https://github.com/houqp)) +- Fix build with 1.52.1 [\#696](https://github.com/apache/arrow-datafusion/pull/696) ([alamb](https://github.com/alamb)) +- Fix test output due to logical merge conflict [\#694](https://github.com/apache/arrow-datafusion/pull/694) ([alamb](https://github.com/alamb)) +- add more integration tests [\#668](https://github.com/apache/arrow-datafusion/pull/668) ([Jimexist](https://github.com/Jimexist)) +- Bump arrow and parquet versions to 4.4 [\#654](https://github.com/apache/arrow-datafusion/pull/654) ([toddtreece](https://github.com/toddtreece)) +- Add query 15 to TPC-H queries [\#645](https://github.com/apache/arrow-datafusion/pull/645) ([Dandandan](https://github.com/Dandandan)) +- Improve error message and comments [\#641](https://github.com/apache/arrow-datafusion/pull/641) ([alamb](https://github.com/alamb)) +- add integration tests for rank, dense\_rank, fix last\_value evaluation with rank [\#638](https://github.com/apache/arrow-datafusion/pull/638) ([Jimexist](https://github.com/Jimexist)) +- round trip TPCH queries in tests [\#630](https://github.com/apache/arrow-datafusion/pull/630) ([houqp](https://github.com/houqp)) +- use Into\ as argument type wherever applicable [\#615](https://github.com/apache/arrow-datafusion/pull/615) ([houqp](https://github.com/houqp)) +- reuse alias map in aggregate logical planning and refactor position resolution [\#606](https://github.com/apache/arrow-datafusion/pull/606) ([Jimexist](https://github.com/Jimexist)) +- fix clippy warnings [\#581](https://github.com/apache/arrow-datafusion/pull/581) ([Jimexist](https://github.com/Jimexist)) +- Add benchmarks to window function queries [\#564](https://github.com/apache/arrow-datafusion/pull/564) ([Jimexist](https://github.com/Jimexist)) +- reuse code for now function expr creation [\#548](https://github.com/apache/arrow-datafusion/pull/548) ([houqp](https://github.com/houqp)) +- turn on clippy rule for needless borrow [\#545](https://github.com/apache/arrow-datafusion/pull/545) ([Jimexist](https://github.com/Jimexist)) +- Refactor hash aggregates's planner building code [\#539](https://github.com/apache/arrow-datafusion/pull/539) 
([Jimexist](https://github.com/Jimexist)) +- Cleanup Repartition Exec code [\#538](https://github.com/apache/arrow-datafusion/pull/538) ([alamb](https://github.com/alamb)) +- reuse datafusion physical planner in ballista building from protobuf [\#532](https://github.com/apache/arrow-datafusion/pull/532) ([Jimexist](https://github.com/Jimexist)) +- remove redundant `into_iter()` calls [\#527](https://github.com/apache/arrow-datafusion/pull/527) ([Jimexist](https://github.com/Jimexist)) +- Fix 517 - move `window_frames` module to `logical_plan` [\#518](https://github.com/apache/arrow-datafusion/pull/518) ([Jimexist](https://github.com/Jimexist)) +- Refactor window aggregation, simplify batch processing logic [\#516](https://github.com/apache/arrow-datafusion/pull/516) ([Jimexist](https://github.com/Jimexist)) +- Add datafusion::test\_util, resolve test data paths without env vars [\#498](https://github.com/apache/arrow-datafusion/pull/498) ([mluts](https://github.com/mluts)) +- Avoid warnings in tests when compiling without default features [\#489](https://github.com/apache/arrow-datafusion/pull/489) ([alamb](https://github.com/alamb)) +- update cargo.toml in python crate and fix unit test due to hash joins [\#483](https://github.com/apache/arrow-datafusion/pull/483) ([Jimexist](https://github.com/Jimexist)) +- use prettier check in CI [\#453](https://github.com/apache/arrow-datafusion/pull/453) ([Jimexist](https://github.com/Jimexist)) +- Optimize `nth_value`, remove `first_value`, `last_value` structs and use idiomatic rust style [\#452](https://github.com/apache/arrow-datafusion/pull/452) ([Jimexist](https://github.com/Jimexist)) +- Fixed typo / logical merge conflict [\#433](https://github.com/apache/arrow-datafusion/pull/433) ([jorgecarleitao](https://github.com/jorgecarleitao)) +- include test data and add aggregation tests in integration test [\#425](https://github.com/apache/arrow-datafusion/pull/425) ([Jimexist](https://github.com/Jimexist)) +- Add some padding around the logo [\#411](https://github.com/apache/arrow-datafusion/pull/411) ([parthsarthy](https://github.com/parthsarthy)) +- Benchmark subcommand to distinguish between DataFusion and Ballista [\#402](https://github.com/apache/arrow-datafusion/pull/402) ([jgoday](https://github.com/jgoday)) +- refactor datafusion/`scalar_value` to use more macro and avoid dup code [\#392](https://github.com/apache/arrow-datafusion/pull/392) ([Jimexist](https://github.com/Jimexist)) +- Update TPC-H benchmark to show physical plan when debug mode is enabled [\#386](https://github.com/apache/arrow-datafusion/pull/386) ([andygrove](https://github.com/andygrove)) +- Update arrow dependencies again [\#341](https://github.com/apache/arrow-datafusion/pull/341) ([alamb](https://github.com/alamb)) +- Update arrow-rs deps [\#317](https://github.com/apache/arrow-datafusion/pull/317) ([alamb](https://github.com/alamb)) +- Update PR template by commenting out instructions [\#315](https://github.com/apache/arrow-datafusion/pull/315) ([alamb](https://github.com/alamb)) +- fix clippy warning [\#286](https://github.com/apache/arrow-datafusion/pull/286) ([Jimexist](https://github.com/Jimexist)) +- add integration test to compare datafusion-cli against psql [\#281](https://github.com/apache/arrow-datafusion/pull/281) ([Jimexist](https://github.com/Jimexist)) +- Update arrow deps [\#269](https://github.com/apache/arrow-datafusion/pull/269) ([alamb](https://github.com/alamb)) +- Use multi-stage build dockerfile in datafusion-cli and reduce image size from 
2.16GB to 89.9MB [\#266](https://github.com/apache/arrow-datafusion/pull/266) ([Jimexist](https://github.com/Jimexist)) +- Enable redundant\_field\_names clippy lint [\#261](https://github.com/apache/arrow-datafusion/pull/261) ([Dandandan](https://github.com/Dandandan)) +- fix clippy lint [\#259](https://github.com/apache/arrow-datafusion/pull/259) ([alamb](https://github.com/alamb)) +- Move datafusion-cli to new crate [\#231](https://github.com/apache/arrow-datafusion/pull/231) ([Dandandan](https://github.com/Dandandan)) +- Make test join\_with\_hash\_collision deterministic [\#229](https://github.com/apache/arrow-datafusion/pull/229) ([Dandandan](https://github.com/Dandandan)) +- Update arrow-rs deps \(to fix build due to flatbuffers update\) [\#224](https://github.com/apache/arrow-datafusion/pull/224) ([alamb](https://github.com/alamb)) +- Use standard make\_null\_array for CASE [\#223](https://github.com/apache/arrow-datafusion/pull/223) ([alamb](https://github.com/alamb)) +- update arrow-rs deps to latest master [\#216](https://github.com/apache/arrow-datafusion/pull/216) ([alamb](https://github.com/alamb)) +- MINOR: Remove empty rust dir [\#61](https://github.com/apache/arrow-datafusion/pull/61) ([andygrove](https://github.com/andygrove)) + + + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 9b094ac1a828c..2716cc751500d 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion" description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" -version = "4.0.0-SNAPSHOT" +version = "5.0.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" readme = "../README.md" diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index 9e411997b0933..8d6d942d0b4fc 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -76,16 +76,21 @@ cat < ./update_change_log-datafusion.sh + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +${SOURCE_DIR}/update_change_log-datafusion.sh +${SOURCE_DIR}/update_change_log-ballista.sh +${SOURCE_DIR}/update_change_log-python.sh diff --git a/dev/release/update_change_log-ballista.sh b/dev/release/update_change_log-ballista.sh new file mode 100755 index 0000000000000..68193156622a2 --- /dev/null +++ b/dev/release/update_change_log-ballista.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log-ballista.sh + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/ballista/rust/client/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') +${SOURCE_DIR}/update_change_log.sh ballista 4.0.0 "ballista-${CURRENT_VER}" diff --git a/dev/release/update_change_log-datafusion.sh b/dev/release/update_change_log-datafusion.sh new file mode 100755 index 0000000000000..f0f455ad1c9b5 --- /dev/null +++ b/dev/release/update_change_log-datafusion.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log-datafusion.sh + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/datafusion/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') +${SOURCE_DIR}/update_change_log.sh datafusion 4.0.0 "${CURRENT_VER}" diff --git a/dev/release/update_change_log-python.sh b/dev/release/update_change_log-python.sh new file mode 100755 index 0000000000000..a48a5b657c5f3 --- /dev/null +++ b/dev/release/update_change_log-python.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log-python.sh + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +CURRENT_VER=$(grep version "${SOURCE_TOP_DIR}/python/Cargo.toml" | head -n 1 | awk '{print $3}' | tr -d '"') +${SOURCE_DIR}/update_change_log.sh python 4.0.0 "python-${CURRENT_VER}" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 4ee9e2eb1e498..0c9c2332ce704 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -27,13 +27,23 @@ # arrow-datafusion/.github_changelog_generator # # Usage: -# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh set -e SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" +if [[ "$#" -ne 3 ]]; then + echo "USAGE: $0 PROJECT FROM_VER TO_VER" + exit 1 +fi + +PROJECT=$1 +FROM_VER=$2 +TO_VER=$3 +OUTPUT_PATH="${PROJECT}/CHANGELOG.md" + pushd ${SOURCE_TOP_DIR} docker run -it --rm \ -e CHANGELOG_GITHUB_TOKEN=$CHANGELOG_GITHUB_TOKEN \ @@ -41,7 +51,30 @@ docker run -it --rm \ githubchangeloggenerator/github-changelog-generator \ --user apache \ --project arrow-datafusion \ - --since-tag 4.0.0 \ - --future-release 5.0.0 + --since-tag "${FROM_VER}" \ + --include-labels "${PROJECT}" \ + --output "${OUTPUT_PATH}" \ + --future-release "${TO_VER}" + +sed -i "s/\\\n/\n\n/" "${OUTPUT_PATH}" + +echo ' +' | cat - "${OUTPUT_PATH}" > "${OUTPUT_PATH}".tmp +mv "${OUTPUT_PATH}".tmp "${OUTPUT_PATH}" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh new file mode 100755 index 0000000000000..5bbf82d0186e1 --- /dev/null +++ b/dev/release/verify-release-candidate.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +case $# in + 2) VERSION="$1" + RC_NUMBER="$2" + ;; + *) echo "Usage: $0 X.Y.Z RC_NUMBER" + exit 1 + ;; +esac + +set -e +set -x +set -o pipefail + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" +ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' + +download_dist_file() { + curl \ + --silent \ + --show-error \ + --fail \ + --location \ + --remote-name $ARROW_DIST_URL/$1 +} + +download_rc_file() { + download_dist_file apache-arrow-datafusion-${VERSION}-rc${RC_NUMBER}/$1 +} + +import_gpg_keys() { + download_dist_file KEYS + gpg --import KEYS +} + +fetch_archive() { + local dist_name=$1 + download_rc_file ${dist_name}.tar.gz + download_rc_file ${dist_name}.tar.gz.asc + download_rc_file ${dist_name}.tar.gz.sha256 + download_rc_file ${dist_name}.tar.gz.sha512 + gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz + shasum -a 256 -c ${dist_name}.tar.gz.sha256 + shasum -a 512 -c ${dist_name}.tar.gz.sha512 +} + +verify_dir_artifact_signatures() { + # verify the signature and the checksums of each artifact + find $1 -name '*.asc' | while read sigfile; do + artifact=${sigfile/.asc/} + gpg --verify $sigfile $artifact || exit 1 + + # go into the directory because the checksum files contain only the + # basename of the artifact + pushd $(dirname $artifact) + base_artifact=$(basename $artifact) + if [ -f $base_artifact.sha256 ]; then + shasum -a 256 -c $base_artifact.sha256 || exit 1 + fi + shasum -a 512 -c $base_artifact.sha512 || exit 1 + popd + done +} + +setup_tempdir() { + cleanup() { + if [ "${TEST_SUCCESS}" = "yes" ]; then + rm -fr "${ARROW_TMPDIR}" + else + echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." + fi + } + + if [ -z "${ARROW_TMPDIR}" ]; then + # clean up automatically if ARROW_TMPDIR is not defined + ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT + else + # don't clean up automatically + mkdir -p "${ARROW_TMPDIR}" + fi +} + +test_source_distribution() { + # install rust toolchain in a similar fashion like test-miniconda + export RUSTUP_HOME=$PWD/test-rustup + export CARGO_HOME=$PWD/test-rustup + + curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path + + export PATH=$RUSTUP_HOME/bin:$PATH + source $RUSTUP_HOME/env + + # build and test rust + + # raises on any formatting errors + rustup component add rustfmt --toolchain stable + cargo fmt --all -- --check + + # Clone testing repositories if not cloned already + git clone https://github.com/apache/arrow-testing.git arrow-testing-data + git clone https://github.com/apache/parquet-testing.git parquet-testing-data + export ARROW_TEST_DATA=$PWD/arrow-testing-data/data + export PARQUET_TEST_DATA=$PWD/parquet-testing-data/data + + cargo build + cargo test --all +} + +TEST_SUCCESS=no + +setup_tempdir "arrow-${VERSION}" +echo "Working in sandbox ${ARROW_TMPDIR}" +cd ${ARROW_TMPDIR} + +dist_name="apache-arrow-datafusion-${VERSION}" +import_gpg_keys +fetch_archive ${dist_name} +tar xf ${dist_name}.tar.gz +pushd ${dist_name} +test_source_distribution +popd + +TEST_SUCCESS=yes +echo 'Release candidate looks good!' +exit 0 diff --git a/dev/update_arrow_deps.py b/dev/update_arrow_deps.py index 44bdf4235d1c6..69fcdc84ab8fd 100755 --- a/dev/update_arrow_deps.py +++ b/dev/update_arrow_deps.py @@ -17,7 +17,7 @@ # limitations under the License. 
# -# Script that updates the arrow dependencies in datafusion and ballista, locall +# Script that updates the arrow dependencies in datafusion and ballista, locally # # installation: # pip install tomlkit requests diff --git a/dev/update_ballista_versions.py b/dev/update_ballista_versions.py new file mode 100755 index 0000000000000..cb75a9a71c479 --- /dev/null +++ b/dev/update_ballista_versions.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script that updates verions for ballista crates, locally +# +# dependencies: +# pip install tomlkit + +import os +import argparse +from pathlib import Path +import tomlkit + + +def update_cargo_toml(cargo_toml: str, new_version: str): + print(f'updating {cargo_toml}') + with open(cargo_toml) as f: + data = f.read() + + doc = tomlkit.parse(data) + doc.get('package')['version'] = new_version + + with open(cargo_toml, 'w') as f: + f.write(tomlkit.dumps(doc)) + + +def main(): + parser = argparse.ArgumentParser(description='Update ballista crate versions.') + parser.add_argument('new_version', type=str, help='new ballista version') + args = parser.parse_args() + + repo_root = Path(__file__).parent.parent.absolute() + ballista_crates = set([ + os.path.join(repo_root, rel_path, "Cargo.toml") + for rel_path in [ + 'ballista-examples', + 'ballista/rust/core', + 'ballista/rust/scheduler', + 'ballista/rust/executor', + 'ballista/rust/client', + ] + ]) + new_version = args.new_version + + print(f'Updating ballista versions in {repo_root} to {new_version}') + + for cargo_toml in ballista_crates: + update_cargo_toml(cargo_toml, new_version) + + +if __name__ == "__main__": + main() diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md new file mode 100644 index 0000000000000..a4964abdd4bb2 --- /dev/null +++ b/python/CHANGELOG.md @@ -0,0 +1,72 @@ + + +For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) + +# Changelog + +## [python-0.3.0](https://github.com/apache/arrow-datafusion/tree/python-0.3.0) (2021-08-10) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...python-0.3.0) + +**Implemented enhancements:** + +- add more math functions and unit tests to `python` crate [\#748](https://github.com/apache/arrow-datafusion/pull/748) ([Jimexist](https://github.com/Jimexist)) +- Expose ExecutionContext.register\_csv to the python bindings [\#524](https://github.com/apache/arrow-datafusion/pull/524) ([kszucs](https://github.com/kszucs)) +- Implement missing join types for Python dataframe [\#503](https://github.com/apache/arrow-datafusion/pull/503) ([Dandandan](https://github.com/Dandandan)) +- Add missing functions to python [\#388](https://github.com/apache/arrow-datafusion/pull/388) 
([jgoday](https://github.com/jgoday)) + +**Fixed bugs:** + +- fix maturin version in pyproject.toml [\#756](https://github.com/apache/arrow-datafusion/pull/756) ([Jimexist](https://github.com/Jimexist)) +- fix pyarrow type id mapping in `python` crate [\#742](https://github.com/apache/arrow-datafusion/pull/742) ([Jimexist](https://github.com/Jimexist)) + +**Closed issues:** + +- Confirm git tagging strategy for releases [\#770](https://github.com/apache/arrow-datafusion/issues/770) +- arrow::util::pretty::pretty\_format\_batches missing [\#769](https://github.com/apache/arrow-datafusion/issues/769) +- move the `assert_batches_eq!` macros to a non part of datafusion [\#745](https://github.com/apache/arrow-datafusion/issues/745) +- fix an issue where aliases are not respected in generating downstream schemas in window expr [\#592](https://github.com/apache/arrow-datafusion/issues/592) +- make the planner to print more succinct and useful information in window function explain clause [\#526](https://github.com/apache/arrow-datafusion/issues/526) +- move window frame module to be in `logical_plan` [\#517](https://github.com/apache/arrow-datafusion/issues/517) +- use a more rust idiomatic way of handling nth\_value [\#448](https://github.com/apache/arrow-datafusion/issues/448) +- create a test with more than one partition for window functions [\#435](https://github.com/apache/arrow-datafusion/issues/435) +- Implement hash-partitioned hash aggregate [\#27](https://github.com/apache/arrow-datafusion/issues/27) +- Consider using GitHub pages for DataFusion/Ballista documentation [\#18](https://github.com/apache/arrow-datafusion/issues/18) +- Update "repository" in Cargo.toml [\#16](https://github.com/apache/arrow-datafusion/issues/16) + +**Merged pull requests:** + +- fix python binding for `concat`, `concat_ws`, and `random` [\#768](https://github.com/apache/arrow-datafusion/pull/768) ([Jimexist](https://github.com/Jimexist)) +- fix 226, make `concat`, `concat_ws`, and `random` work with `Python` crate [\#761](https://github.com/apache/arrow-datafusion/pull/761) ([Jimexist](https://github.com/Jimexist)) +- fix python crate with the changes to logical plan builder [\#650](https://github.com/apache/arrow-datafusion/pull/650) ([Jimexist](https://github.com/Jimexist)) +- use nightly nightly-2021-05-10 [\#536](https://github.com/apache/arrow-datafusion/pull/536) ([Jimexist](https://github.com/Jimexist)) +- Define the unittests using pytest [\#493](https://github.com/apache/arrow-datafusion/pull/493) ([kszucs](https://github.com/kszucs)) +- use requirements.txt to formalize python deps [\#484](https://github.com/apache/arrow-datafusion/pull/484) ([Jimexist](https://github.com/Jimexist)) +- update cargo.toml in python crate and fix unit test due to hash joins [\#483](https://github.com/apache/arrow-datafusion/pull/483) ([Jimexist](https://github.com/Jimexist)) +- simplify python function definitions [\#477](https://github.com/apache/arrow-datafusion/pull/477) ([Jimexist](https://github.com/Jimexist)) +- Expose DataFrame::sort in the python bindings [\#469](https://github.com/apache/arrow-datafusion/pull/469) ([kszucs](https://github.com/kszucs)) +- Revert "Revert "Add datafusion-python \(\#69\)" \(\#257\)" [\#270](https://github.com/apache/arrow-datafusion/pull/270) ([andygrove](https://github.com/andygrove)) +- Revert "Add datafusion-python \(\#69\)" [\#257](https://github.com/apache/arrow-datafusion/pull/257) ([andygrove](https://github.com/andygrove)) +- update arrow-rs deps to latest master 
[\#216](https://github.com/apache/arrow-datafusion/pull/216) ([alamb](https://github.com/alamb))
+- Add datafusion-python [\#69](https://github.com/apache/arrow-datafusion/pull/69) ([jorgecarleitao](https://github.com/jorgecarleitao))
+
+
+
+\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*
diff --git a/python/Cargo.toml b/python/Cargo.toml
index fe84e5234c333..60cc74dfc89e0 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -17,7 +17,7 @@
[package]
name = "datafusion"
-version = "0.2.1"
+version = "0.3.0"
homepage = "https://github.com/apache/arrow"
repository = "https://github.com/apache/arrow"
authors = ["Apache Arrow "]
From deb929369c9aaba728ae0c2c49dcd05bfecc8bf8 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 10 Aug 2021 17:48:53 -0600
Subject: [PATCH 329/329] Add version to ballista dependencies (#852)
---
ballista/rust/client/Cargo.toml | 8 ++++----
ballista/rust/core/Cargo.toml | 2 +-
ballista/rust/executor/Cargo.toml | 10 ++++------
ballista/rust/scheduler/Cargo.toml | 7 +++----
4 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/ballista/rust/client/Cargo.toml b/ballista/rust/client/Cargo.toml
index 3507a7b22a45d..7b22bae0549d6 100644
--- a/ballista/rust/client/Cargo.toml
+++ b/ballista/rust/client/Cargo.toml
@@ -26,14 +26,14 @@ authors = ["Apache Arrow "]
edition = "2018"
[dependencies]
-ballista-core = { path = "../core" }
-ballista-executor = { path = "../executor", optional = true }
-ballista-scheduler = { path = "../scheduler", optional = true }
+ballista-core = { path = "../core", version = "0.5.0" }
+ballista-executor = { path = "../executor", version = "0.5.0", optional = true }
+ballista-scheduler = { path = "../scheduler", version = "0.5.0", optional = true }
futures = "0.3"
log = "0.4"
tokio = "1.0"
-datafusion = { path = "../../../datafusion" }
+datafusion = { path = "../../../datafusion", version = "5.0.0" }
[features]
default = []
diff --git a/ballista/rust/core/Cargo.toml b/ballista/rust/core/Cargo.toml
index f61f32259f30e..b2fa50c889a57 100644
--- a/ballista/rust/core/Cargo.toml
+++ b/ballista/rust/core/Cargo.toml
@@ -44,7 +44,7 @@ uuid = { version = "0.8", features = ["v4"] }
arrow-flight = { version = "5.0" }
-datafusion = { path = "../../../datafusion" }
+datafusion = { path = "../../../datafusion", version = "5.0.0" }
[dev-dependencies]
tempfile = "3"
diff --git a/ballista/rust/executor/Cargo.toml b/ballista/rust/executor/Cargo.toml
index f30015b884a44..8600409b1de43 100644
--- a/ballista/rust/executor/Cargo.toml
+++ b/ballista/rust/executor/Cargo.toml
@@ -29,10 +29,13 @@ edition = "2018"
snmalloc = ["snmalloc-rs"]
[dependencies]
+arrow = { version = "5.0" }
+arrow-flight = { version = "5.0" }
anyhow = "1"
async-trait = "0.1.36"
-ballista-core = { path = "../core" }
+ballista-core = { path = "../core", version = "0.5.0" }
configure_me = "0.4.0"
+datafusion = { path = "../../../datafusion", version = "5.0.0" }
env_logger = "0.8"
futures = "0.3"
log = "0.4"
@@ -43,11 +46,6 @@ tokio-stream = { version = "0.1", features = ["net"] }
tonic = "0.5"
uuid = { version = "0.8", features = ["v4"] }
-arrow = { version = "5.0" }
-arrow-flight = { version = "5.0" }
-
-datafusion = { path = "../../../datafusion" }
[dev-dependencies]
[build-dependencies]
diff --git a/ballista/rust/scheduler/Cargo.toml b/ballista/rust/scheduler/Cargo.toml
index fb6286669b93e..c772ba9f240d5 100644
--- a/ballista/rust/scheduler/Cargo.toml
+++ b/ballista/rust/scheduler/Cargo.toml
@@ -32,9 +32,10 @@ sled = ["sled_package", "tokio-stream"]
[dependencies]
anyhow = "1"
-ballista-core = { path = "../core" }
+ballista-core = { path = "../core", version = "0.5.0" }
clap = "2"
configure_me = "0.4.0"
+datafusion = { path = "../../../datafusion", version = "5.0.0" }
env_logger = "0.8"
etcd-client = { version = "0.6", optional = true }
futures = "0.3"
@@ -53,10 +54,8 @@ tonic = "0.5"
tower = { version = "0.4" }
warp = "0.3"
-datafusion = { path = "../../../datafusion" }
-
[dev-dependencies]
-ballista-core = { path = "../core" }
+ballista-core = { path = "../core", version = "0.5.0" }
uuid = { version = "0.8", features = ["v4"] }
[build-dependencies]