Skip to content

Commit b90330e

Browse files
authored
feat: teach Python about (part of) VortexWriteOptions (#4825)
In particular, this allows Python users to write using the compact strategy. --------- Signed-off-by: Daniel King <dan@spiraldb.com>
1 parent 440239e commit b90330e

4 files changed

Lines changed: 139 additions & 5 deletions

File tree

docs/conf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
nitpicky = True # ensures all :class:, :obj:, etc. links are valid
5353
nitpick_ignore = []
5454

55-
doctest_global_setup = "import pyarrow; import vortex; import vortex as vx"
55+
doctest_global_setup = "import pyarrow; import vortex; import vortex as vx; import random; random.seed(a=0)"
5656
doctest_default_flags = (
5757
doctest.ELLIPSIS | doctest.IGNORE_EXCEPTION_DETAIL | doctest.DONT_ACCEPT_TRUE_FOR_1 | doctest.NORMALIZE_WHITESPACE
5858
)
@@ -196,7 +196,7 @@ def _post_process(app, builder):
196196
os.environ["POLARS_TABLE_WIDTH"] = "80"
197197

198198

199-
def _convert_python_fenced_blocks_from_rust_to_valid_reST_blocks(app, what, name, obj, options, lines):
199+
def _convert_python_fenced_blocks_from_rust_to_valid_reST_blocks(app, what, name, obj, options, lines: list[str]):
200200
"""Remove Markdown-style code fences from Python docs written in Rust.
201201
202202
We would like `cargo test` to Just Work (TM). Unfortunately, by default, it executes any

vortex-python/python/vortex/_lib/io.pyi

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ def read_url(
1313
indices: Array | None = None,
1414
) -> Array: ...
1515
def write(iter: IntoArrayIterator, path: str) -> None: ...
16+
17+
class VortexWriteOptions:
18+
@staticmethod
19+
def default() -> VortexWriteOptions: ...
20+
@staticmethod
21+
def compact() -> VortexWriteOptions: ...
22+
@staticmethod
23+
def write_path(iter: IntoArrayIterator, path: str) -> VortexWriteOptions: ...

vortex-python/python/vortex/io.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
from vortex._lib.io import read_url, write # pyright: ignore[reportMissingModuleSource]
4+
from vortex._lib.io import VortexWriteOptions, read_url, write # pyright: ignore[reportMissingModuleSource]
55

6-
__all__ = ["read_url", "write"]
6+
__all__ = ["read_url", "write", "VortexWriteOptions"]

vortex-python/src/io.rs

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ use pyo3::prelude::*;
88
use pyo3::pyfunction;
99
use tokio::fs::File;
1010
use vortex::arrow::FromArrowArray;
11+
use vortex::compressor::CompactCompressor;
1112
use vortex::dtype::DType;
1213
use vortex::dtype::arrow::FromArrowType;
1314
use vortex::error::{VortexError, VortexResult};
14-
use vortex::file::VortexWriteOptions;
15+
use vortex::file::{VortexWriteOptions, WriteStrategyBuilder};
1516
use vortex::iter::{ArrayIterator, ArrayIteratorAdapter, ArrayIteratorExt};
1617
use vortex::{ArrayRef, Canonical, IntoArray};
1718

@@ -30,6 +31,8 @@ pub(crate) fn init(py: Python, parent: &Bound<PyModule>) -> PyResult<()> {
3031
m.add_function(wrap_pyfunction!(read_url, &m)?)?;
3132
m.add_function(wrap_pyfunction!(write, &m)?)?;
3233

34+
m.add_class::<PyVortexWriteOptions>()?;
35+
3336
Ok(())
3437
}
3538

@@ -148,6 +151,10 @@ pub fn read_url<'py>(
148151
/// >>> vx.io.write(reader, "streamed.vortex") # doctest: +SKIP
149152
/// ```
150153
///
154+
/// See also
155+
/// --------
156+
///
157+
/// :func:`vortex.io.VortexWriteOptions`
151158
#[pyfunction]
152159
#[pyo3(signature = (iter, path))]
153160
pub fn write(iter: PyIntoArrayIterator, path: &str) -> PyResult<()> {
@@ -161,6 +168,125 @@ pub fn write(iter: PyIntoArrayIterator, path: &str) -> PyResult<()> {
161168
Ok(())
162169
}
163170

171+
/// Write Vortex files with custom configuration.
172+
///
173+
/// See also
174+
/// --------
175+
///
176+
/// :func:`vortex.io.write`.
177+
#[pyclass(name = "VortexWriteOptions", module = "io", frozen)]
178+
pub(crate) struct PyVortexWriteOptions {
179+
// TODO(DK): This might need to be an Arc<dyn Compressor> if we actually have multiple
180+
// compressors.
181+
compressor: Option<CompactCompressor>,
182+
}
183+
184+
#[pymethods]
185+
impl PyVortexWriteOptions {
186+
/// Balance size, read-throughput, and read-latency.
187+
#[staticmethod]
188+
pub fn default() -> Self {
189+
Self { compressor: None }
190+
}
191+
192+
/// Prioritize small size over read-throughput and read-latency.
193+
///
194+
/// Let's model some stock ticker data. As you may know, the stock market always (noisly) goes
195+
/// up:
196+
///
197+
/// ```python
198+
/// >>> import os
199+
/// >>> import random
200+
/// >>> sprl = vx.array([random.randint(i, i + 10) for i in range(100_000)])
201+
/// ```
202+
///
203+
/// If we naively wrote 4-bytes for each of these integers to a file we'd have 400,000 bytes!
204+
/// Let's see how small this is when we write with the default Vortex write options (which are
205+
/// also used by :func:`vortex.io.write`):
206+
///
207+
/// ```python
208+
/// >>> vx.io.VortexWriteOptions.default().write_path(sprl, "chonky.vortex")
209+
/// >>> import os
210+
/// >>> os.path.getsize('chonky.vortex')
211+
/// 215196
212+
/// ```
213+
///
214+
/// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny.
215+
///
216+
/// But can we do better?
217+
///
218+
/// We sure can.
219+
///
220+
/// ```python
221+
/// >>> vx.io.VortexWriteOptions.compact().write_path(sprl, "tiny.vortex")
222+
/// >>> os.path.getsize('tiny.vortex')
223+
/// 54200
224+
/// ```
225+
///
226+
/// Random numbers are not (usually) composed of random bytes!
227+
#[staticmethod]
228+
pub fn compact() -> Self {
229+
Self {
230+
compressor: Some(CompactCompressor::default()),
231+
}
232+
}
233+
234+
/// Write an array or iterator of arrays into a local file.
235+
///
236+
///
237+
/// Parameters
238+
/// ----------
239+
/// iter : vortex.Array | vortex.ArrayIterator | pyarrow.Table | pyarrow.RecordBatchReader
240+
/// The data to write. Can be a single array, an array iterator, or a PyArrow object that supports streaming.
241+
/// When using PyArrow objects, data is streamed directly without loading the entire dataset into memory.
242+
///
243+
/// path : str
244+
/// The file path.
245+
///
246+
/// Examples
247+
/// --------
248+
///
249+
/// Write a single Vortex array `a` to the local file `a.vortex` using the default settings:
250+
///
251+
/// ```python
252+
/// >>> import vortex as vx
253+
/// >>> import random
254+
/// >>> a = vx.array([0, 1, 2, 3, None, 4])
255+
/// >>> vx.io.VortexWriteOptions.default().write_path(a, "a.vortex") # doctest: +SKIP
256+
/// ```
257+
///
258+
/// Write the same array while preferring small file sizes over read-throughput and
259+
/// read-latency:
260+
///
261+
/// ```python
262+
/// >>> import vortex as vx
263+
/// >>> vx.io.VortexWriteOptions.compact().write_path(a, "a.vortex") # doctest: +SKIP
264+
/// ```
265+
///
266+
/// See also
267+
/// --------
268+
///
269+
/// :func:`vortex.io.write`
270+
#[pyo3(signature = (iter, path))]
271+
pub fn write_path(&self, iter: PyIntoArrayIterator, path: &str) -> PyResult<()> {
272+
TOKIO_RUNTIME.block_on(async move {
273+
let mut file = File::create(path).await?;
274+
275+
let mut strategy = WriteStrategyBuilder::new();
276+
if let Some(compressor) = self.compressor.as_ref() {
277+
strategy = strategy.with_compressor(compressor.clone())
278+
}
279+
280+
VortexWriteOptions::default()
281+
.with_strategy(strategy.build())
282+
.write(&mut file, iter.into_inner().into_array_stream())
283+
.await
284+
})?;
285+
286+
Ok(())
287+
}
288+
}
289+
164290
/// Conversion type for converting Python objects into a [`vortex::ArrayIterator`].
165291
pub type PyIntoArrayIterator = PyVortex<Box<dyn ArrayIterator + Send>>;
166292

0 commit comments

Comments
 (0)