@@ -8,10 +8,11 @@ use pyo3::prelude::*;
88use pyo3:: pyfunction;
99use tokio:: fs:: File ;
1010use vortex:: arrow:: FromArrowArray ;
11+ use vortex:: compressor:: CompactCompressor ;
1112use vortex:: dtype:: DType ;
1213use vortex:: dtype:: arrow:: FromArrowType ;
1314use vortex:: error:: { VortexError , VortexResult } ;
14- use vortex:: file:: VortexWriteOptions ;
15+ use vortex:: file:: { VortexWriteOptions , WriteStrategyBuilder } ;
1516use vortex:: iter:: { ArrayIterator , ArrayIteratorAdapter , ArrayIteratorExt } ;
1617use vortex:: { ArrayRef , Canonical , IntoArray } ;
1718
@@ -30,6 +31,8 @@ pub(crate) fn init(py: Python, parent: &Bound<PyModule>) -> PyResult<()> {
3031 m. add_function ( wrap_pyfunction ! ( read_url, & m) ?) ?;
3132 m. add_function ( wrap_pyfunction ! ( write, & m) ?) ?;
3233
34+ m. add_class :: < PyVortexWriteOptions > ( ) ?;
35+
3336 Ok ( ( ) )
3437}
3538
@@ -148,6 +151,10 @@ pub fn read_url<'py>(
148151/// >>> vx.io.write(reader, "streamed.vortex") # doctest: +SKIP
149152/// ```
150153///
154+ /// See also
155+ /// --------
156+ ///
157+ /// :class:`vortex.io.VortexWriteOptions`
151158#[ pyfunction]
152159#[ pyo3( signature = ( iter, path) ) ]
153160pub fn write ( iter : PyIntoArrayIterator , path : & str ) -> PyResult < ( ) > {
@@ -161,6 +168,125 @@ pub fn write(iter: PyIntoArrayIterator, path: &str) -> PyResult<()> {
161168 Ok ( ( ) )
162169}
163170
171+ /// Write Vortex files with custom configuration.
172+ ///
173+ /// See also
174+ /// --------
175+ ///
176+ /// :func:`vortex.io.write`.
177+ #[ pyclass( name = "VortexWriteOptions" , module = "io" , frozen) ]
178+ pub ( crate ) struct PyVortexWriteOptions {
179+ // TODO(DK): This might need to be an Arc<dyn Compressor> if we actually have multiple
180+ // compressors.
181+ compressor : Option < CompactCompressor > ,
182+ }
183+
184+ #[ pymethods]
185+ impl PyVortexWriteOptions {
186+ /// Balance size, read-throughput, and read-latency.
187+ #[ staticmethod]
188+ pub fn default ( ) -> Self {
189+ Self { compressor : None }
190+ }
191+
192+ /// Prioritize small size over read-throughput and read-latency.
193+ ///
194+ /// Let's model some stock ticker data. As you may know, the stock market always (noisly) goes
195+ /// up:
196+ ///
197+ /// ```python
198+ /// >>> import os
199+ /// >>> import random
200+ /// >>> sprl = vx.array([random.randint(i, i + 10) for i in range(100_000)])
201+ /// ```
202+ ///
203+ /// If we naively wrote 4-bytes for each of these integers to a file we'd have 400,000 bytes!
204+ /// Let's see how small this is when we write with the default Vortex write options (which are
205+ /// also used by :func:`vortex.io.write`):
206+ ///
207+ /// ```python
208+ /// >>> vx.io.VortexWriteOptions.default().write_path(sprl, "chonky.vortex")
209+ /// >>> import os
210+ /// >>> os.path.getsize('chonky.vortex')
211+ /// 215196
212+ /// ```
213+ ///
214+ /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny.
215+ ///
216+ /// But can we do better?
217+ ///
218+ /// We sure can.
219+ ///
220+ /// ```python
221+ /// >>> vx.io.VortexWriteOptions.compact().write_path(sprl, "tiny.vortex")
222+ /// >>> os.path.getsize('tiny.vortex')
223+ /// 54200
224+ /// ```
225+ ///
226+ /// Random numbers are not (usually) composed of random bytes!
227+ #[ staticmethod]
228+ pub fn compact ( ) -> Self {
229+ Self {
230+ compressor : Some ( CompactCompressor :: default ( ) ) ,
231+ }
232+ }
233+
234+ /// Write an array or iterator of arrays into a local file.
235+ ///
236+ ///
237+ /// Parameters
238+ /// ----------
239+ /// iter : vortex.Array | vortex.ArrayIterator | pyarrow.Table | pyarrow.RecordBatchReader
240+ /// The data to write. Can be a single array, an array iterator, or a PyArrow object that supports streaming.
241+ /// When using PyArrow objects, data is streamed directly without loading the entire dataset into memory.
242+ ///
243+ /// path : str
244+ /// The file path.
245+ ///
246+ /// Examples
247+ /// --------
248+ ///
249+ /// Write a single Vortex array `a` to the local file `a.vortex` using the default settings:
250+ ///
251+ /// ```python
252+ /// >>> import vortex as vx
253+ /// >>> import random
254+ /// >>> a = vx.array([0, 1, 2, 3, None, 4])
255+ /// >>> vx.io.VortexWriteOptions.default().write_path(a, "a.vortex") # doctest: +SKIP
256+ /// ```
257+ ///
258+ /// Write the same array while preferring small file sizes over read-throughput and
259+ /// read-latency:
260+ ///
261+ /// ```python
262+ /// >>> import vortex as vx
263+ /// >>> vx.io.VortexWriteOptions.compact().write_path(a, "a.vortex") # doctest: +SKIP
264+ /// ```
265+ ///
266+ /// See also
267+ /// --------
268+ ///
269+ /// :func:`vortex.io.write`
270+ #[ pyo3( signature = ( iter, path) ) ]
271+ pub fn write_path ( & self , iter : PyIntoArrayIterator , path : & str ) -> PyResult < ( ) > {
272+ TOKIO_RUNTIME . block_on ( async move {
273+ let mut file = File :: create ( path) . await ?;
274+
275+ let mut strategy = WriteStrategyBuilder :: new ( ) ;
276+ if let Some ( compressor) = self . compressor . as_ref ( ) {
277+ strategy = strategy. with_compressor ( compressor. clone ( ) )
278+ }
279+
280+ VortexWriteOptions :: default ( )
281+ . with_strategy ( strategy. build ( ) )
282+ . write ( & mut file, iter. into_inner ( ) . into_array_stream ( ) )
283+ . await
284+ } ) ?;
285+
286+ Ok ( ( ) )
287+ }
288+ }
289+
164290/// Conversion type for converting Python objects into a [`vortex::ArrayIterator`].
165291pub type PyIntoArrayIterator = PyVortex < Box < dyn ArrayIterator + Send > > ;
166292
0 commit comments