Skip to content

Commit 4c98b8e

Browse files
jorgecarleitaoandygrovexhochy
authored andcommitted
Add datafusion-python (#69)
* Added Python project. * Update python/Cargo.toml Co-authored-by: Andy Grove <andygrove@users.noreply.github.com> * Update python/Cargo.toml Co-authored-by: Uwe L. Korn <xhochy@users.noreply.github.com> * Added license and black formatting. * License * Fixing build. * TesTestt * Bumped to latest DataFusion. * Bumped nightly. * Bumped pyarrow in tests. * Added some tests back. Co-authored-by: Andy Grove <andygrove@users.noreply.github.com> Co-authored-by: Uwe L. Korn <xhochy@users.noreply.github.com> GitOrigin-RevId: 46bde0bd148aacf1677a575cb9ddbc154b6c4fb3
0 parents  commit 4c98b8e

24 files changed

Lines changed: 2093 additions & 0 deletions

.cargo/config

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
[target.x86_64-apple-darwin]
19+
rustflags = [
20+
"-C", "link-arg=-undefined",
21+
"-C", "link-arg=dynamic_lookup",
22+
]

.dockerignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
target
19+
venv

.gitignore

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
/target
19+
Cargo.lock
20+
venv

Cargo.toml

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
[package]
19+
name = "datafusion"
20+
version = "0.2.1"
21+
homepage = "https://github.com/apache/arrow"
22+
repository = "https://github.com/apache/arrow"
23+
authors = ["Apache Arrow <dev@arrow.apache.org>"]
24+
description = "Build and run queries against data"
25+
readme = "README.md"
26+
license = "Apache-2.0"
27+
edition = "2018"
28+
29+
[dependencies]
30+
tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
31+
rand = "0.7"
32+
pyo3 = { version = "0.12.1", features = ["extension-module"] }
33+
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" }
34+
35+
[lib]
36+
name = "datafusion"
37+
crate-type = ["cdylib"]
38+
39+
[package.metadata.maturin]
40+
requires-dist = ["pyarrow>=1"]
41+
42+
classifier = [
43+
"Development Status :: 2 - Pre-Alpha",
44+
"Intended Audience :: Developers",
45+
"License :: OSI Approved :: Apache Software License",
46+
"License :: OSI Approved",
47+
"Operating System :: MacOS",
48+
"Operating System :: Microsoft :: Windows",
49+
"Operating System :: POSIX :: Linux",
50+
"Programming Language :: Python :: 3",
51+
"Programming Language :: Python :: 3.6",
52+
"Programming Language :: Python :: 3.7",
53+
"Programming Language :: Python :: 3.8",
54+
"Programming Language :: Python :: 3.9",
55+
"Programming Language :: Python",
56+
"Programming Language :: Rust",
57+
]

README.md

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
## DataFusion in Python
21+
22+
This is a Python library that binds to [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion), the in-memory query engine of [Apache Arrow](https://arrow.apache.org/).
23+
24+
Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python.
25+
26+
It also allows you to use UDFs and UDAFs for complex operations.
27+
28+
The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations.
29+
30+
Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which makes strong assumptions about thread safety and lack of memory leaks.
31+
32+
Technically, zero-copy is achieved via the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
33+
34+
## How to use it
35+
36+
Simple usage:
37+
38+
```python
39+
import datafusion
40+
import pyarrow
41+
42+
# an alias
43+
f = datafusion.functions
44+
45+
# create a context
46+
ctx = datafusion.ExecutionContext()
47+
48+
# create a RecordBatch and a new DataFrame from it
49+
batch = pyarrow.RecordBatch.from_arrays(
50+
[pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
51+
names=["a", "b"],
52+
)
53+
df = ctx.create_dataframe([[batch]])
54+
55+
# create a new statement
56+
df = df.select(
57+
f.col("a") + f.col("b"),
58+
f.col("a") - f.col("b"),
59+
)
60+
61+
# execute and collect the first (and only) batch
62+
result = df.collect()[0]
63+
64+
assert result.column(0) == pyarrow.array([5, 7, 9])
65+
assert result.column(1) == pyarrow.array([-3, -3, -3])
66+
```
67+
68+
### UDFs
69+
70+
```python
71+
def is_null(array: pyarrow.Array) -> pyarrow.Array:
72+
return array.is_null()
73+
74+
udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_())
75+
76+
df = df.select(udf(f.col("a")))
77+
```
78+
79+
### UDAF
80+
81+
```python
82+
import pyarrow
83+
import pyarrow.compute
84+
85+
86+
class Accumulator:
87+
"""
88+
Interface of a user-defined accumulation.
89+
"""
90+
def __init__(self):
91+
self._sum = pyarrow.scalar(0.0)
92+
93+
def to_scalars(self) -> [pyarrow.Scalar]:
94+
return [self._sum]
95+
96+
def update(self, values: pyarrow.Array) -> None:
97+
# not nice since pyarrow scalars can't be summed yet. This breaks on `None`
98+
self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py())
99+
100+
def merge(self, states: pyarrow.Array) -> None:
101+
# not nice since pyarrow scalars can't be summed yet. This breaks on `None`
102+
self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py())
103+
104+
def evaluate(self) -> pyarrow.Scalar:
105+
return self._sum
106+
107+
108+
df = ...
109+
110+
udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()])
111+
112+
df = df.aggregate(
113+
[],
114+
[udaf(f.col("a"))]
115+
)
116+
```
117+
118+
## How to install
119+
120+
```bash
121+
pip install datafusion
122+
```
123+
124+
## How to develop
125+
126+
This assumes that you have Rust and Cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin).
127+
128+
Bootstrap:
129+
130+
```bash
131+
# fetch this repo
132+
git clone git@github.com:apache/arrow-datafusion.git
133+
134+
cd arrow-datafusion/python
135+
136+
# prepare development environment (used to build wheel / install in development)
137+
python3 -m venv venv
138+
venv/bin/pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0
139+
```
140+
141+
Whenever Rust code changes (your changes or via git pull):
142+
143+
```bash
144+
venv/bin/maturin develop
145+
venv/bin/python -m unittest discover tests
146+
```

pyproject.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
[build-system]
19+
requires = ["maturin"]
20+
build-backend = "maturin"

rust-toolchain

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nightly-2021-01-06

0 commit comments

Comments
 (0)