Skip to content

Commit 8b8c9ee

Browse files
authored
fix[vortex-array]: propagate preferred type conversion to nested fields (#4390)
When arrow_type is None, vortex to arrow conversion should respect conversion to the encoding's preferred arrow type for all nested fields. Previous to this commit, this preference was lost for any fields within structs or lists. For example, a struct with a utf8 dictionary field would lose the dictionary encoding. Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent f5473e5 commit 8b8c9ee

4 files changed

Lines changed: 32 additions & 14 deletions

File tree

docs/guides/python-integrations.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ If you have a struct array, use {meth}`.Array.to_arrow_table` to construct an Ar
9898
>>> struct_arr.to_arrow_table()
9999
pyarrow.Table
100100
age: int64
101-
name: string_view
101+
name: string
102102
----
103103
age: [[25,31,33,57]]
104104
name: [["Joseph","Narendra","Angela","Mikhail"]]

vortex-array/src/arrow/compute/to_arrow/canonical.rs

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,17 @@ pub(super) struct ToArrowCanonical;
3939
impl Kernel for ToArrowCanonical {
4040
#[allow(clippy::cognitive_complexity)]
4141
fn invoke(&self, args: &InvocationArgs) -> VortexResult<Option<Output>> {
42-
let ToArrowArgs { array, arrow_type } = ToArrowArgs::try_from(args)?;
42+
let ToArrowArgs {
43+
array,
44+
arrow_type: arrow_type_opt,
45+
} = ToArrowArgs::try_from(args)?;
4346
if !array.is_canonical() {
4447
// Not handled by this kernel
4548
return Ok(None);
4649
}
4750

4851
// Figure out the target Arrow type, or use the canonical type
49-
let arrow_type = arrow_type
52+
let arrow_type = arrow_type_opt
5053
.cloned()
5154
.map(Ok)
5255
.unwrap_or_else(|| array.dtype().to_arrow_dtype())?;
@@ -136,11 +139,13 @@ impl Kernel for ToArrowCanonical {
136139
to_arrow_decimal256(array)
137140
}
138141
(Canonical::Struct(array), DataType::Struct(fields)) => {
139-
to_arrow_struct(array, fields.as_ref())
142+
to_arrow_struct(array, fields.as_ref(), arrow_type_opt.is_none())
143+
}
144+
(Canonical::List(array), DataType::List(field)) => {
145+
to_arrow_list::<i32>(array, field, arrow_type_opt.is_none())
140146
}
141-
(Canonical::List(array), DataType::List(field)) => to_arrow_list::<i32>(array, field),
142147
(Canonical::List(array), DataType::LargeList(field)) => {
143-
to_arrow_list::<i64>(array, field)
148+
to_arrow_list::<i64>(array, field, arrow_type_opt.is_none())
144149
}
145150
(Canonical::VarBinView(array), DataType::BinaryView) if array.dtype().is_binary() => {
146151
to_arrow_varbinview::<BinaryViewType>(array)
@@ -277,7 +282,11 @@ fn to_arrow_decimal256(array: DecimalArray) -> VortexResult<ArrowArrayRef> {
277282
))
278283
}
279284

280-
fn to_arrow_struct(array: StructArray, fields: &[FieldRef]) -> VortexResult<ArrowArrayRef> {
285+
fn to_arrow_struct(
286+
array: StructArray,
287+
fields: &[FieldRef],
288+
to_preferred: bool,
289+
) -> VortexResult<ArrowArrayRef> {
281290
if array.fields().len() != fields.len() {
282291
vortex_bail!(
283292
"StructArray has {} fields, but target Arrow type has {} fields",
@@ -301,9 +310,12 @@ fn to_arrow_struct(array: StructArray, fields: &[FieldRef]) -> VortexResult<Arro
301310
);
302311
}
303312

304-
arr.clone()
305-
.into_arrow(field.data_type())
306-
.map_err(|err| err.with_context(format!("Failed to canonicalize field {field}")))
313+
let result = if to_preferred {
314+
arr.clone().into_arrow_preferred()
315+
} else {
316+
arr.clone().into_arrow(field.data_type())
317+
};
318+
result.map_err(|err| err.with_context(format!("Failed to canonicalize field {field}")))
307319
})
308320
.collect::<VortexResult<Vec<_>>>()?;
309321

@@ -341,14 +353,19 @@ fn to_arrow_struct(array: StructArray, fields: &[FieldRef]) -> VortexResult<Arro
341353
fn to_arrow_list<O: NativePType + OffsetSizeTrait>(
342354
array: ListArray,
343355
element: &FieldRef,
356+
to_preferred: bool,
344357
) -> VortexResult<ArrowArrayRef> {
345358
// First we cast the offsets into the correct width.
346359
let offsets_dtype = DType::Primitive(O::PTYPE, array.dtype().nullability());
347360
let arrow_offsets = cast(array.offsets(), &offsets_dtype)
348361
.map_err(|err| err.with_context(format!("Failed to cast offsets to {offsets_dtype}")))?
349362
.to_primitive()?;
350363

351-
let values = array.elements().clone().into_arrow(element.data_type())?;
364+
let values = if to_preferred {
365+
array.elements().clone().into_arrow_preferred()?
366+
} else {
367+
array.elements().clone().into_arrow(element.data_type())?
368+
};
352369
let nulls = array.validity_mask()?.to_null_buffer();
353370

354371
Ok(Arc::new(GenericListArray::new(

vortex-layout/src/layouts/dict/reader.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,8 @@ mod tests {
328328
.unwrap()
329329
.into_array();
330330
let actual = actual.into_arrow_preferred().unwrap();
331-
let expected = expected.into_arrow_preferred().unwrap();
331+
let expected_arrow_dtype = expected.dtype().to_arrow_dtype().unwrap();
332+
let expected = expected.into_arrow(&expected_arrow_dtype).unwrap();
332333
assert_eq!(actual.data_type(), expected.data_type());
333334
assert_eq!(&actual, &expected);
334335
}
@@ -340,7 +341,7 @@ mod tests {
340341
vec![true, false, true], // Expected: nulls excluded, all dict values match
341342
)]
342343
#[case::all_false_case(
343-
vec![Some("x"), None, Some("x")], // Dict values: ["x"]
344+
vec![Some("x"), None, Some("x")], // Dict values: ["x"]
344345
"", // Filter for empty string
345346
vec![false, false, false], // Expected: all false, no dict values match
346347
)]

vortex-python/python/vortex/arrays.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _Array_to_arrow_table(self: _arrays.Array) -> pyarrow.Table:
8080
>>> array.to_arrow_table()
8181
pyarrow.Table
8282
age: int64
83-
name: string_view
83+
name: string
8484
----
8585
age: [[25,31,33,57]]
8686
name: [["Joseph","Narendra","Angela","Mikhail"]]

0 commit comments

Comments
 (0)