Skip to content

Commit 32aa62c

Browse files
delta003danking
andauthored
Add RepeatedScan in vortex-data python package (#4426)
Co-authored-by: Dan King <dan@spiraldb.com>
1 parent 5573557 commit 32aa62c

11 files changed

Lines changed: 363 additions & 28 deletions

File tree

docs/api/python/io.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ HTTP, S3, Google Cloud Storage, and Azure Blob Storage.
99

1010
~vortex.open
1111
~vortex.VortexFile
12+
~vortex.RepeatedScan
1213
~vortex.io.read_url
1314
~vortex.io.write
1415

@@ -21,6 +22,9 @@ HTTP, S3, Google Cloud Storage, and Azure Blob Storage.
2122
.. autoclass:: vortex.VortexFile
2223
:members:
2324

25+
.. autoclass:: vortex.RepeatedScan
26+
:members:
27+
2428
.. automodule:: vortex.io
2529
:members:
2630
:imported-members:

vortex-python/python/vortex/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
from . import _lib, arrays, dataset, expr, file, io, ray
4+
from . import _lib, arrays, dataset, expr, file, io, ray, scan
55
from ._lib.arrays import ( # pyright: ignore[reportMissingModuleSource]
66
AlpArray,
77
AlpRdArray,
@@ -67,6 +67,7 @@
6767
from ._lib.serde import ArrayContext, ArrayParts # pyright: ignore[reportMissingModuleSource]
6868
from .arrays import Array, PyArray, array
6969
from .file import VortexFile, open
70+
from .scan import RepeatedScan
7071

7172
assert _lib, "Ensure we eagerly import the Vortex native library"
7273

@@ -76,6 +77,7 @@
7677
"dataset",
7778
"expr",
7879
"file",
80+
"scan",
7981
"io",
8082
"ray",
8183
# --- Objects and Functions ---
@@ -149,6 +151,8 @@
149151
"open",
150152
# Iterator
151153
"ArrayIterator",
154+
# Scan
155+
"RepeatedScan",
152156
]
153157

154158
#: The default registry for Vortex

vortex-python/python/vortex/_lib/file.pyi

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ from .dataset import VortexDataset
1313
from .dtype import DType
1414
from .expr import Expr
1515
from .iter import ArrayIterator
16+
from .scan import RepeatedScan
1617

1718
@final
1819
class VortexFile:
@@ -27,6 +28,14 @@ class VortexFile:
2728
indices: Array | None = None,
2829
batch_size: int | None = None,
2930
) -> ArrayIterator: ...
31+
def prepare(
32+
self,
33+
projection: IntoProjection = None,
34+
*,
35+
expr: Expr | None = None,
36+
indices: Array | None = None,
37+
batch_size: int | None = None,
38+
) -> RepeatedScan: ...
3039
def to_arrow(
3140
self,
3241
projection: IntoProjection = None,
@@ -37,4 +46,4 @@ class VortexFile:
3746
def to_dataset(self) -> VortexDataset: ...
3847
def to_polars(self) -> pl.LazyFrame: ...
3948

40-
def open(path: str) -> VortexFile: ...
49+
def open(path: str, *, without_segment_cache: bool = False) -> VortexFile: ...
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
from typing import final
5+
6+
from vortex import ArrayIterator
7+
8+
from .scalar import Scalar
9+
10+
@final
11+
class RepeatedScan:
12+
def execute(
13+
self,
14+
*,
15+
start: int | None = None,
16+
stop: int | None = None,
17+
) -> ArrayIterator: ...
18+
def scalar_at(self, index: int) -> Scalar: ...

vortex-python/python/vortex/file.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,23 @@
1414
from ._lib.expr import Expr # pyright: ignore[reportMissingModuleSource]
1515
from ._lib.iter import ArrayIterator # pyright: ignore[reportMissingModuleSource]
1616
from .dataset import VortexDataset
17+
from .scan import RepeatedScan
1718
from .type_aliases import IntoProjection, RecordBatchReader
1819

1920
if TYPE_CHECKING:
2021
import polars
2122

2223

23-
def open(path: str) -> VortexFile:
24+
def open(path: str, *, without_segment_cache: bool = False) -> VortexFile:
2425
"""
2526
Lazily open a Vortex file located at the given path or URL.
2627
2728
Parameters
2829
----------
2930
path : :class:`str`
3031
A local path or URL to the Vortex file.
32+
without_segment_cache : :class:`bool`
33+
If true, disable the segment cache for this file, useful when memory is constrained.
3134
3235
Examples
3336
--------
@@ -40,7 +43,7 @@ def open(path: str) -> VortexFile:
4043
See also: :class:`vortex.dataset.VortexDataset`
4144
"""
4245

43-
return VortexFile(_file.open(path))
46+
return VortexFile(_file.open(path, without_segment_cache=without_segment_cache))
4447

4548

4649
@final
@@ -144,6 +147,29 @@ def scan(
144147
"""
145148
return self._file.scan(projection, expr=expr, indices=indices, batch_size=batch_size)
146149

150+
def to_repeated_scan(
151+
self,
152+
projection: IntoProjection = None,
153+
*,
154+
expr: Expr | None = None,
155+
indices: Array | None = None,
156+
batch_size: int | None = None,
157+
) -> RepeatedScan:
158+
"""Prepare a scan of the Vortex file for repeated reads, returning a :class:`vortex.RepeatedScan`.
159+
160+
Parameters
161+
----------
162+
projection : :class:`vortex.Expr` | list[str] | None
163+
The projection expression to read, or else read all columns.
164+
expr : :class:`vortex.Expr` | None
165+
The predicate used to filter rows. The filter columns do not need to be in the projection.
166+
indices : :class:`vortex.Array` | None
167+
The indices of the rows to read. Must be sorted and non-null.
168+
batch_size : :class:`int` | None
169+
The number of rows to read per chunk.
170+
"""
171+
return RepeatedScan(self._file.prepare(projection, expr=expr, indices=indices, batch_size=batch_size))
172+
147173
def to_arrow(
148174
self,
149175
projection: IntoProjection = None,
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
from __future__ import annotations
5+
6+
from typing import final
7+
8+
from ._lib import scan as _scan # pyright: ignore[reportMissingModuleSource]
9+
from ._lib.iter import ArrayIterator # pyright: ignore[reportMissingModuleSource]
10+
from ._lib.scalar import Scalar # pyright: ignore[reportMissingModuleSource]
11+
12+
13+
@final
14+
class RepeatedScan:
15+
"""
16+
A prepared scan that is optimized for repeated execution.
17+
"""
18+
19+
def __init__(self, scan: _scan.RepeatedScan):
20+
self._scan = scan
21+
22+
def execute(
23+
self,
24+
*,
25+
row_range: tuple[int, int] | None = None,
26+
) -> ArrayIterator:
27+
"""Execute the scan returning a :class:`vortex.ArrayIterator`.
28+
29+
Parameters
30+
----------
31+
row_range : tuple[int, int] | None
32+
Tuple is interpreted as [start, stop).
33+
34+
Examples
35+
--------
36+
37+
Scan a file with a structured column and nulls at multiple levels and in multiple columns.
38+
39+
>>> import vortex as vx
40+
>>> import vortex.expr as ve
41+
>>> a = vx.array([
42+
... {'name': 'Joseph', 'age': 25},
43+
... {'name': None, 'age': 31},
44+
... {'name': 'Angela', 'age': None},
45+
... {'name': 'Mikhail', 'age': 57},
46+
... {'name': None, 'age': None},
47+
... ])
48+
>>> vx.io.write(a, "a.vortex")
49+
>>> scan = vx.open("a.vortex").to_repeated_scan()
50+
>>> scan.execute(row_range=(1, 3)).read_all().to_arrow_array()
51+
<pyarrow.lib.StructArray object at ...>
52+
-- is_valid: all not null
53+
-- child 0 type: int64
54+
[
55+
31,
56+
null
57+
]
58+
-- child 1 type: string_view
59+
[
60+
null,
61+
"Angela"
62+
]
63+
"""
64+
if row_range is None:
65+
start, stop = None, None
66+
else:
67+
start, stop = row_range
68+
return self._scan.execute(start=start, stop=stop)
69+
70+
def scalar_at(self, index: int) -> Scalar:
71+
"""Fetch a scalar from the scan returning a :class:`vortex.Scalar`.
72+
73+
Parameters
74+
----------
75+
index : int
76+
The row index to fetch. Raises an :class:`IndexError` if out of bounds or
77+
if the given row index was not included in the scan.
78+
79+
Examples
80+
--------
81+
82+
Scan a file with a structured column and nulls at multiple levels and in multiple columns.
83+
84+
>>> import vortex as vx
85+
>>> import vortex.expr as ve
86+
>>> a = vx.array([
87+
... {'name': 'Joseph', 'age': 25},
88+
... {'name': None, 'age': 31},
89+
... {'name': 'Angela', 'age': None},
90+
... {'name': 'Mikhail', 'age': 57},
91+
... {'name': None, 'age': None},
92+
... ])
93+
>>> vx.io.write(a, "a.vortex")
94+
>>> scan = vx.open("a.vortex").to_repeated_scan()
95+
>>> scan.scalar_at(1)
96+
<vortex.StructScalar object at ...>
97+
"""
98+
return self._scan.scalar_at(index)

0 commit comments

Comments
 (0)