Adding dtypes option to Workflow (NVIDIA-Merlin#392)

Alberto Alvarez · web-flow · commit 1edf2e078696 · 2020-11-03T09:37:00.000-08:00
diff --git a/nvtabular/workflow.py b/nvtabular/workflow.py
@@ -24,7 +24,7 @@
 from fsspec.core import get_fs_token_paths
 
 from nvtabular.io.dask import _ddf_to_dataset
-from nvtabular.io.dataset import Dataset
+from nvtabular.io.dataset import Dataset, _set_dtypes
 from nvtabular.io.shuffle import Shuffle, _check_shuffle_arg
 from nvtabular.io.writer_factory import writer_factory
 from nvtabular.ops import DFOperator, StatOperator, TransformOperator
@@ -483,7 +483,9 @@ def _run_trans_ops_for_phase(self, gdf, tasks):
                 gdf = op.apply_op(gdf, self.columns_ctx, cols_grp, target_cols=target_cols)
         return gdf
 
-    def apply_ops(self, gdf, start_phase=None, end_phase=None, writer=None, output_path=None):
+    def apply_ops(
+        self, gdf, start_phase=None, end_phase=None, writer=None, output_path=None, dtypes=None
+    ):
         """
         gdf: cudf dataframe
         Controls the application of registered preprocessing phase op
@@ -508,6 +510,8 @@ def apply_ops(self, gdf, start_phase=None, end_phase=None, writer=None, output_p
                     writer.need_cal_col_names = False
 
                 start_write = time.time()
+                # Apply the caller-supplied `dtypes` mapping just before writing
+                gdf = _set_dtypes(gdf, dtypes)
                 writer.add_data(gdf)
                 self.timings["write_df"] += time.time() - start_write
 
@@ -714,6 +718,7 @@ def apply(
         output_format="parquet",
         out_files_per_proc=None,
         num_io_threads=0,
+        dtypes=None,
     ):
         """
         Runs all the preprocessing and feature engineering operators.
@@ -753,6 +758,9 @@ def apply(
         num_io_threads : integer
             Number of IO threads to use for writing the output dataset.
             For `0` (default), no dedicated IO threads will be used.
+        dtypes : dict
+            Dictionary containing desired datatypes for output columns.
+            Keys are column names, values are datatypes.
         """
 
         # Check shuffle argument
@@ -773,6 +781,7 @@ def apply(
                 output_format=output_format,
                 out_files_per_proc=out_files_per_proc,
                 num_io_threads=num_io_threads,
+                dtypes=dtypes,
             )
         else:
             self.iterate_online(
@@ -782,6 +791,7 @@ def apply(
                 output_format=output_format,
                 out_files_per_proc=out_files_per_proc,
                 num_io_threads=num_io_threads,
+                dtypes=dtypes,
             )
 
     def iterate_online(
@@ -794,6 +804,7 @@ def iterate_online(
         out_files_per_proc=None,
         apply_ops=True,
         num_io_threads=0,
+        dtypes=None,
     ):
         """Iterate through dataset and (optionally) apply/shuffle/write."""
         # Check shuffle argument
@@ -813,8 +824,9 @@ def iterate_online(
 
         # Iterate through dataset, apply ops, and write out processed data
         if apply_ops:
-            for gdf in dataset.to_iter(shuffle=(shuffle is not None)):
-                self.apply_ops(gdf, output_path=output_path, writer=writer)
+            columns = self.columns_ctx["all"]["base"]
+            for gdf in dataset.to_iter(shuffle=(shuffle is not None), columns=columns):
+                self.apply_ops(gdf, output_path=output_path, writer=writer, dtypes=dtypes)
 
         # Close writer and write general/specialized metadata
         if writer:
@@ -844,6 +856,7 @@ def build_and_process_graph(
         out_files_per_proc=None,
         apply_ops=True,
         num_io_threads=0,
+        dtypes=None,
     ):
         """Build Dask-task graph for workflow.
 
@@ -873,6 +886,12 @@ def build_and_process_graph(
             for idx, _ in enumerate(self.phases[:end]):
                 self.exec_phase(idx, record_stats=record_stats, update_ddf=(idx == (end - 1)))
             self._base_phase = 0  # Re-Set _base_phase
+
+        if dtypes:
+            ddf = self.get_ddf()
+            _meta = _set_dtypes(ddf._meta, dtypes)
+            self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta))
+
         if output_format:
             output_path = output_path or "./"
             output_path = str(output_path)
@@ -895,6 +914,7 @@ def write_to_dataset(
         iterate=False,
         nfiles=None,
         num_io_threads=0,
+        dtypes=None,
     ):
         """Write data to shuffled parquet dataset.
 
@@ -919,6 +939,7 @@ def write_to_dataset(
                 out_files_per_proc=out_files_per_proc,
                 apply_ops=apply_ops,
                 num_io_threads=num_io_threads,
+                dtypes=dtypes,
             )
         else:
             self.build_and_process_graph(
@@ -930,6 +951,7 @@ def write_to_dataset(
                 out_files_per_proc=out_files_per_proc,
                 apply_ops=apply_ops,
                 num_io_threads=num_io_threads,
+                dtypes=dtypes,
             )
 
     def ddf_to_dataset(
diff --git a/tests/unit/test_workflow.py b/tests/unit/test_workflow.py
@@ -442,3 +442,71 @@ def test_chaining_3():
     assert all(
         x in result.columns for x in ["ad_id_count", "ad_id_clicked_sum_ctr", "ad_id_clicked_sum"]
     )
+
+
+@pytest.mark.parametrize("shuffle", [nvt.io.Shuffle.PER_WORKER, nvt.io.Shuffle.PER_PARTITION, None])
+@pytest.mark.parametrize("use_client", [True, False])
+@pytest.mark.parametrize("apply_offline", [True, False])
+def test_workflow_apply(client, use_client, tmpdir, shuffle, apply_offline):
+    out_files_per_proc = 2
+    out_path = str(tmpdir.mkdir("processed"))
+    path = str(tmpdir.join("simple.parquet"))
+
+    size = 25
+    row_group_size = 5  # 25 rows written in row groups of 5
+
+    cont_columns = ["cont1", "cont2"]
+    cat_columns = ["cat1", "cat2"]
+    label_column = ["label"]
+
+    df = pd.DataFrame(
+        {
+            "cont1": np.arange(size, dtype=np.float64),
+            "cont2": np.arange(size, dtype=np.float64),
+            "cat1": np.arange(size, dtype=np.int32),
+            "cat2": np.arange(size, dtype=np.int32),
+            "label": np.arange(size, dtype=np.float64),
+        }
+    )
+    df.to_parquet(path, row_group_size=row_group_size, engine="pyarrow")
+
+    dataset = nvt.Dataset(path, engine="parquet", row_groups_per_part=1)
+    processor = nvt.Workflow(
+        cat_names=cat_columns,
+        cont_names=cont_columns,
+        label_name=label_column,
+        client=client if use_client else None,
+    )
+    processor.add_cont_feature([ops.FillMissing(), ops.Clip(min_value=0), ops.LogOp()])
+    processor.add_cat_preprocess(ops.Categorify())
+
+    processor.finalize()
+    # Requested output dtypes: float32 for cont and cat columns, int64 for the label
+    dict_dtypes = {}
+    for col in cont_columns:
+        dict_dtypes[col] = np.float32
+    for col in cat_columns:
+        dict_dtypes[col] = np.float32
+    for col in label_column:
+        dict_dtypes[col] = np.int64
+
+    if not apply_offline:  # the online run below uses record_stats=False, so record stats first
+        processor.apply(
+            dataset,
+            output_format=None,
+            record_stats=True,
+        )
+    processor.apply(
+        dataset,
+        apply_offline=apply_offline,
+        record_stats=apply_offline,
+        output_path=out_path,
+        shuffle=shuffle,
+        out_files_per_proc=out_files_per_proc,
+        dtypes=dict_dtypes,
+    )
+
+    # Check dtypes of every written file (NOTE(review): vacuous if glob matches nothing)
+    for filename in glob.glob(os.path.join(out_path, "*.parquet")):
+        gdf = cudf.io.read_parquet(filename)
+        assert dict(gdf.dtypes) == dict_dtypes