Add retry and client options for object store (S3 only)

yhjung1 · yhjung1 · commit f5a12db5b73d · 2025-09-01T05:26:00.000Z
Add partitioning functionality for write operations
diff --git a/Cargo.toml b/Cargo.toml
@@ -61,5 +61,5 @@ name = "datafusion_python"
 crate-type = ["cdylib", "rlib"]
 
 [profile.release]
-lto = true
-codegen-units = 1
+lto = "thin"
+#codegen-units = 1
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -26,6 +26,7 @@
     TYPE_CHECKING,
     Any,
     Iterable,
+    List,
     Literal,
     Optional,
     Union,
@@ -56,6 +57,7 @@
 
 from enum import Enum
 
+from datafusion._internal import InsertOp
 
 # excerpt from deltalake
 # https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163
@@ -875,6 +877,13 @@ def except_all(self, other: DataFrame) -> DataFrame:
         """
         return DataFrame(self.df.except_all(other.df))
 
-    def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None:
+    def write_csv(
+        self,
+        path: str | pathlib.Path,
+        with_header: bool = False,
+        insert_operation: InsertOp = InsertOp.Append,
+        single_file_output: bool = False,
+        partition_by: Optional[List[str]] = None,
+    ) -> None:
         """Execute the :py:class:`DataFrame`  and write the results to a CSV file.
 
@@ -883,6 +892,16 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None
             with_header: If true, output the CSV header row.
+            insert_operation: The operation to perform on the CSV file
+                (Append, Overwrite, Replace).
+            single_file_output: If true, write the CSV file as a single file.
+            partition_by: The columns to partition the CSV file by.
         """
-        self.df.write_csv(str(path), with_header)
+        self.df.write_csv(
+            str(path),
+            with_header,
+            insert_operation,
+            single_file_output,
+            partition_by or [],
+        )
 
     @overload
     def write_parquet(
@@ -911,8 +927,11 @@ def write_parquet(
     def write_parquet(
         self,
         path: str | pathlib.Path,
         compression: Union[str, Compression, ParquetWriterOptions] = Compression.ZSTD,
         compression_level: int | None = None,
+        insert_operation: InsertOp = InsertOp.Append,
+        single_file_output: bool = False,
+        partition_by: Optional[List[str]] = None,
     ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a Parquet file.
 
@@ -931,12 +950,22 @@ def write_parquet(
             compression_level: Compression level to use. For ZSTD, the
                 recommended range is 1 to 22, with the default being 4. Higher levels
                 provide better compression but slower speed.
+            insert_operation: The operation to perform on the Parquet file
+                (Append, Overwrite, Replace).
+            single_file_output: If true, write the Parquet file as a single file.
+            partition_by: The columns to partition the Parquet file by.
         """
         if isinstance(compression, ParquetWriterOptions):
             if compression_level is not None:
                 msg = "compression_level should be None when using ParquetWriterOptions"
                 raise ValueError(msg)
-            self.write_parquet_with_options(path, compression)
+            self.write_parquet_with_options(
+                path,
+                compression,
+                insert_operation,
+                single_file_output,
+                partition_by or [],
+            )
             return
 
         if isinstance(compression, str):
@@ -948,10 +977,22 @@ def write_parquet(
         ):
             compression_level = compression.get_default_level()
 
-        self.df.write_parquet(str(path), compression.value, compression_level)
+        self.df.write_parquet(
+            str(path),
+            compression.value,
+            compression_level,
+            insert_operation,
+            single_file_output,
+            partition_by or [],
+        )
 
     def write_parquet_with_options(
-        self, path: str | pathlib.Path, options: ParquetWriterOptions
+        self,
+        path: str | pathlib.Path,
+        options: ParquetWriterOptions,
+        insert_operation: InsertOp = InsertOp.Append,
+        single_file_output: bool = False,
+        partition_by: Optional[List[str]] = None,
     ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a Parquet file.
 
@@ -1000,15 +1027,33 @@ def write_parquet_with_options(
             str(path),
             options_internal,
             column_specific_options_internal,
+            insert_operation,
+            single_file_output,
+            partition_by or [],
         )
 
-    def write_json(self, path: str | pathlib.Path) -> None:
+    def write_json(
+        self,
+        path: str | pathlib.Path,
+        insert_operation: InsertOp = InsertOp.Append,
+        single_file_output: bool = False,
+        partition_by: Optional[List[str]] = None,
+    ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a JSON file.
 
         Args:
             path: Path of the JSON file to write.
+            insert_operation: The operation to perform on the JSON file
+                (Append, Overwrite, Replace).
+            single_file_output: If true, write the JSON file as a single file.
+            partition_by: The columns to partition the JSON file by.
         """
-        self.df.write_json(str(path))
+        self.df.write_json(
+            str(path),
+            insert_operation,
+            single_file_output,
+            partition_by or [],
+        )
 
     def to_arrow_table(self) -> pa.Table:
         """Execute the :py:class:`DataFrame` and convert it into an Arrow Table.
diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py
@@ -24,4 +24,14 @@
 MicrosoftAzure = object_store.MicrosoftAzure
 Http = object_store.Http
+RetryConfig = object_store.RetryConfig
+ClientOptions = object_store.ClientOptions
 
-__all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"]
+__all__ = [
+    "AmazonS3",
+    "ClientOptions",
+    "GoogleCloud",
+    "Http",
+    "LocalFileSystem",
+    "MicrosoftAzure",
+    "RetryConfig",
+]
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -34,6 +34,7 @@ use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::TableProvider;
 use datafusion::error::DataFusionError;
 use datafusion::execution::SendableRecordBatchStream;
+use datafusion::logical_expr::dml::InsertOp;
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
 use datafusion::prelude::*;
 use datafusion_ffi::table_provider::FFI_TableProvider;
@@ -58,6 +59,35 @@ use crate::{
     expr::{sort_expr::PySortExpr, PyExpr},
 };
 
+/// Python-facing counterpart of DataFusion's [`InsertOp`], exported as `InsertOp`.
+///
+/// The chosen variant is converted with [`From`] and handed to
+/// `DataFrameWriteOptions::with_insert_operation` by the `write_*` methods.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[pyclass(name = "InsertOp", module = "datafusion", eq, eq_int)]
+pub enum PyInsertOp {
+    /// Converts to [`InsertOp::Append`].
+    #[pyo3(name = "Append")]
+    Append,
+    /// Converts to [`InsertOp::Overwrite`].
+    #[pyo3(name = "Overwrite")]
+    Overwrite,
+    /// Converts to [`InsertOp::Replace`].
+    #[pyo3(name = "Replace")]
+    Replace,
+}
+
+impl From<PyInsertOp> for InsertOp {
+    /// Maps each Python-facing variant onto the DataFusion variant of the same name.
+    fn from(op: PyInsertOp) -> Self {
+        match op {
+            PyInsertOp::Append => InsertOp::Append,
+            PyInsertOp::Overwrite => InsertOp::Overwrite,
+            PyInsertOp::Replace => InsertOp::Replace,
+        }
+    }
+}
+
 // https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
 // - we have not decided on the table_provider approach yet
 // this is an interim implementation
@@ -743,33 +765,61 @@ impl PyDataFrame {
     }
 
     /// Write a `DataFrame` to a CSV file.
+    ///
+    /// `with_header` sets `CsvOptions::has_header`; `insert_operation`,
+    /// `single_file_output` and `partition_by` are forwarded through
+    /// `DataFrameWriteOptions`. Both the outer result of `wait_for_future` and
+    /// the inner result of the write are propagated with `?`.
+    #[pyo3(signature = (
+        path,
+        with_header=false,
+        insert_operation=PyInsertOp::Append,
+        single_file_output=false,
+        partition_by=vec![],
+    ))]
-    fn write_csv(&self, path: &str, with_header: bool, py: Python) -> PyDataFusionResult<()> {
+    fn write_csv(
+        &self,
+        path: &str,
+        with_header: bool,
+        insert_operation: PyInsertOp,
+        single_file_output: bool,
+        partition_by: Vec<String>,
+        py: Python,
+    ) -> PyDataFusionResult<()> {
         let csv_options = CsvOptions {
             has_header: Some(with_header),
             ..Default::default()
         };
         wait_for_future(
             py,
             self.df.as_ref().clone().write_csv(
                 path,
-                DataFrameWriteOptions::new(),
+                DataFrameWriteOptions::new()
+                    .with_insert_operation(insert_operation.into())
+                    .with_single_file_output(single_file_output)
+                    .with_partition_by(partition_by),
                 Some(csv_options),
             ),
         )??;
         Ok(())
     }
 
     /// Write a `DataFrame` to a Parquet file.
     #[pyo3(signature = (
         path,
         compression="zstd",
-        compression_level=None
+        compression_level=None,
+        insert_operation=PyInsertOp::Append,
+        single_file_output=false,
+        partition_by=vec![],
         ))]
     fn write_parquet(
         &self,
         path: &str,
         compression: &str,
         compression_level: Option<u32>,
+        insert_operation: PyInsertOp,
+        single_file_output: bool,
+        partition_by: Vec<String>,
         py: Python,
     ) -> PyDataFusionResult<()> {
         fn verify_compression_level(cl: Option<u32>) -> Result<u32, PyErr> {
@@ -813,19 +844,33 @@ impl PyDataFrame {
             py,
             self.df.as_ref().clone().write_parquet(
                 path,
-                DataFrameWriteOptions::new(),
+                DataFrameWriteOptions::new()
+                    .with_insert_operation(insert_operation.into())
+                    .with_single_file_output(single_file_output)
+                    .with_partition_by(partition_by),
                 Option::from(options),
             ),
         )??;
         Ok(())
     }
 
     /// Write a `DataFrame` to a Parquet file, using advanced options.
+    #[pyo3(signature = (
+        path,
+        options,
+        column_specific_options,
+        insert_operation=PyInsertOp::Append,
+        single_file_output=false,
+        partition_by=vec![],
+    ))]
     fn write_parquet_with_options(
         &self,
         path: &str,
         options: PyParquetWriterOptions,
         column_specific_options: HashMap<String, PyParquetColumnOptions>,
+        insert_operation: PyInsertOp,
+        single_file_output: bool,
+        partition_by: Vec<String>,
         py: Python,
     ) -> PyDataFusionResult<()> {
         let table_options = TableParquetOptions {
@@ -841,22 +886,47 @@ impl PyDataFrame {
             py,
             self.df.as_ref().clone().write_parquet(
                 path,
-                DataFrameWriteOptions::new(),
+                DataFrameWriteOptions::new()
+                    .with_insert_operation(insert_operation.into())
+                    .with_single_file_output(single_file_output)
+                    .with_partition_by(partition_by),
                 Option::from(table_options),
             ),
         )??;
         Ok(())
     }
 
     /// Executes a query and writes the results to a partitioned JSON file.
+    ///
+    /// `insert_operation`, `single_file_output` and `partition_by` are forwarded
+    /// through `DataFrameWriteOptions`. Both the outer result of
+    /// `wait_for_future` and the inner result of the write are propagated
+    /// with `?`.
+    #[pyo3(signature = (
+        path,
+        insert_operation=PyInsertOp::Append,
+        single_file_output=false,
+        partition_by=vec![],
+    ))]
-    fn write_json(&self, path: &str, py: Python) -> PyDataFusionResult<()> {
+    fn write_json(
+        &self,
+        path: &str,
+        insert_operation: PyInsertOp,
+        single_file_output: bool,
+        partition_by: Vec<String>,
+        py: Python,
+    ) -> PyDataFusionResult<()> {
+        let write_options = DataFrameWriteOptions::new()
+            .with_insert_operation(insert_operation.into())
+            .with_single_file_output(single_file_output)
+            .with_partition_by(partition_by);
         wait_for_future(
             py,
             self.df
                 .as_ref()
                 .clone()
-                .write_json(path, DataFrameWriteOptions::new(), None),
+                .write_json(path, write_options, None),
         )??;
         Ok(())
     }
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -97,6 +97,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<physical_plan::PyExecutionPlan>()?;
     m.add_class::<record_batch::PyRecordBatch>()?;
     m.add_class::<record_batch::PyRecordBatchStream>()?;
+    m.add_class::<dataframe::PyInsertOp>()?;
 
     let catalog = PyModule::new(py, "catalog")?;
     catalog::init_module(&catalog)?;
diff --git a/src/store.rs b/src/store.rs