|
21 | 21 | import pyarrow as pa |
22 | 22 | import pyarrow.dataset as ds |
23 | 23 | import pytest |
| 24 | +from datafusion.object_store import LocalFileSystem |
24 | 25 |
|
25 | | -from datafusion import udf |
| 26 | +from datafusion import udf, col |
26 | 27 |
|
27 | 28 | from . import generic as helpers |
28 | 29 |
|
@@ -374,3 +375,58 @@ def test_simple_select(ctx, tmp_path, arr): |
374 | 375 | result = batches[0].column(0) |
375 | 376 |
|
376 | 377 | np.testing.assert_equal(result, arr) |
| 378 | + |
| 379 | + |
| 380 | +@pytest.mark.parametrize("file_sort_order", (None, [[col("int").sort(True, True)]])) |
| 381 | +@pytest.mark.parametrize("pass_schema", (True, False)) |
| 382 | +def test_register_listing_table(ctx, tmp_path, pass_schema, file_sort_order): |
| 383 | + dir_root = tmp_path / "dataset_parquet_partitioned" |
| 384 | + dir_root.mkdir(exist_ok=False) |
| 385 | + (dir_root / "grp=a/date_id=20201005").mkdir(exist_ok=False, parents=True) |
| 386 | + (dir_root / "grp=a/date_id=20211005").mkdir(exist_ok=False, parents=True) |
| 387 | + (dir_root / "grp=b/date_id=20201005").mkdir(exist_ok=False, parents=True) |
| 388 | + |
| 389 | + table = pa.Table.from_arrays( |
| 390 | + [ |
| 391 | + [1, 2, 3, 4, 5, 6, 7], |
| 392 | + ["a", "b", "c", "d", "e", "f", "g"], |
| 393 | + [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7], |
| 394 | + ], |
| 395 | + names=["int", "str", "float"], |
| 396 | + ) |
| 397 | + pa.parquet.write_table( |
| 398 | + table.slice(0, 3), dir_root / "grp=a/date_id=20201005/file.parquet" |
| 399 | + ) |
| 400 | + pa.parquet.write_table( |
| 401 | + table.slice(3, 2), dir_root / "grp=a/date_id=20211005/file.parquet" |
| 402 | + ) |
| 403 | + pa.parquet.write_table( |
| 404 | + table.slice(5, 10), dir_root / "grp=b/date_id=20201005/file.parquet" |
| 405 | + ) |
| 406 | + |
| 407 | + ctx.register_object_store("file://local", LocalFileSystem(), None) |
| 408 | + ctx.register_listing_table( |
| 409 | + "my_table", |
| 410 | + f"file://{dir_root}/", |
| 411 | + table_partition_cols=[("grp", "string"), ("date_id", "int")], |
| 412 | + file_extension=".parquet", |
| 413 | + schema=table.schema if pass_schema else None, |
| 414 | + file_sort_order=file_sort_order, |
| 415 | + ) |
| 416 | + assert ctx.tables() == {"my_table"} |
| 417 | + |
| 418 | + result = ctx.sql( |
| 419 | + "SELECT grp, COUNT(*) AS count FROM my_table GROUP BY grp" |
| 420 | + ).collect() |
| 421 | + result = pa.Table.from_batches(result) |
| 422 | + |
| 423 | + rd = result.to_pydict() |
| 424 | + assert dict(zip(rd["grp"], rd["count"])) == {"a": 5, "b": 2} |
| 425 | + |
| 426 | + result = ctx.sql( |
| 427 | + "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" |
| 428 | + ).collect() |
| 429 | + result = pa.Table.from_batches(result) |
| 430 | + |
| 431 | + rd = result.to_pydict() |
| 432 | + assert dict(zip(rd["grp"], rd["count"])) == {"a": 3, "b": 2} |
0 commit comments