kdbrooks
diff --git a/‎.cargo/config‎
Lines changed: 22 additions & 0 deletions b/‎.cargo/config‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎.dockerignore‎
Lines changed: 19 additions & 0 deletions b/‎.dockerignore‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 20 additions & 0 deletions b/‎.gitignore‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 57 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 146 additions & 0 deletions b/‎README.md‎
Lines changed: 146 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 20 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎rust-toolchain‎
Lines changed: 1 addition & 0 deletions b/‎rust-toolchain‎
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[target.x86_64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+target
+venv
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+/target
+Cargo.lock
+venv
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion"
+version = "0.2.1"
+homepage = "https://github.com/apache/arrow"
+repository = "https://github.com/apache/arrow"
+authors = ["Apache Arrow <dev@arrow.apache.org>"]
+description = "Build and run queries against data"
+readme = "README.md"
+license = "Apache-2.0"
+edition = "2018"
+
+[dependencies]
+tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
+rand = "0.7"
+pyo3 = { version = "0.12.1", features = ["extension-module"] }
+datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "2423ff0d" }
+
+[lib]
+name = "datafusion"
+crate-type = ["cdylib"]
+
+[package.metadata.maturin]
+requires-dist = ["pyarrow>=1"]
+
+classifier = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "License :: OSI Approved",
+    "Operating System :: MacOS",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: POSIX :: Linux",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python",
+    "Programming Language :: Rust",
+]
@@ -0,0 +1,146 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+## DataFusion in Python
+
+This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion).
+
+Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python.
+
+It also allows you to use UDFs and UDAFs for complex operations.
+
+The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations.
+
+Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org/), which makes strong assumptions about thread safety and lack of memory leaks.
+
+Technically, zero-copy is achieved via the [c data interface](https://arrow.apache.org/docs/format/CDataInterface.html).
+
+## How to use it
+
+Simple usage:
+
+```python
+import datafusion
+import pyarrow
+
+# an alias
+f = datafusion.functions
+
+# create a context
+ctx = datafusion.ExecutionContext()
+
+# create a RecordBatch and a new DataFrame from it
+batch = pyarrow.RecordBatch.from_arrays(
+    [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
+    names=["a", "b"],
+)
+df = ctx.create_dataframe([[batch]])
+
+# create a new statement
+df = df.select(
+    f.col("a") + f.col("b"),
+    f.col("a") - f.col("b"),
+)
+
+# execute and collect the first (and only) batch
+result = df.collect()[0]
+
+assert result.column(0) == pyarrow.array([5, 7, 9])
+assert result.column(1) == pyarrow.array([-3, -3, -3])
+```
+
+### UDFs
+
+```python
+def is_null(array: pyarrow.Array) -> pyarrow.Array:
+    return array.is_null()
+
+udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_())
+
+df = df.select(udf(f.col("a")))
+```
+
+### UDAF
+
+```python
+import pyarrow
+import pyarrow.compute
+
+
+class Accumulator:
+    """
+    Interface of a user-defined accumulation.
+    """
+    def __init__(self):
+        self._sum = pyarrow.scalar(0.0)
+
+    def to_scalars(self) -> [pyarrow.Scalar]:
+        return [self._sum]
+
+    def update(self, values: pyarrow.Array) -> None:
+        # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
+        self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py())
+
+    def merge(self, states: pyarrow.Array) -> None:
+        # not nice since pyarrow scalars can't be summed yet. This breaks on `None`
+        self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py())
+
+    def evaluate(self) -> pyarrow.Scalar:
+        return self._sum
+
+
+df = ...
+
+udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()])
+
+df = df.aggregate(
+    [],
+    [udaf(f.col("a"))]
+)
+```
+
+## How to install
+
+```bash
+pip install datafusion
+```
+
+## How to develop
+
+This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin).
+
+Bootstrap:
+
+```bash
+# fetch this repo
+git clone git@github.com:apache/arrow-datafusion.git
+
+cd arrow-datafusion/python
+
+# prepare development environment (used to build wheel / install in development)
+python3 -m venv venv
+pip install maturin==0.10.4 toml==0.10.1 pyarrow==1.0.0
+```
+
+Whenever rust code changes (your changes or via git pull):
+
+```bash
+venv/bin/maturin develop
+venv/bin/python -m unittest discover tests
+```
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[build-system]
+requires = ["maturin"]
+build-backend = "maturin"
@@ -0,0 +1 @@
+nightly-2021-01-06