Skip to content

Commit 09bb705

Browse files
committed
Allow subset of schema to be passed into load_table_from_dataframe.
The types of any remaining columns will be auto-detected.
1 parent a6126b7 commit 09bb705

File tree

3 files changed

+118
-20
lines changed

3 files changed

+118
-20
lines changed

bigquery/google/cloud/bigquery/_pandas_helpers.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -187,37 +187,50 @@ def bq_to_arrow_array(series, bq_field):
187187
return pyarrow.array(series, type=arrow_type)
188188

189189

190-
def dataframe_to_bq_schema(dataframe):
190+
def dataframe_to_bq_schema(dataframe, bq_schema):
191191
"""Convert a pandas DataFrame schema to a BigQuery schema.
192192
193-
TODO(GH#8140): Add bq_schema argument to allow overriding autodetected
194-
schema for a subset of columns.
195-
196193
Args:
197194
dataframe (pandas.DataFrame):
198-
DataFrame to convert to convert to Parquet file.
195+
DataFrame for which the client determines the BigQuery schema.
196+
bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
197+
A BigQuery schema, or None. Use this argument to override the
198+
autodetected type for some or all of the DataFrame columns.
199199
200200
Returns:
201201
Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
202202
The schema, combining any user-provided fields with the
203203
automatically determined ones. Returns None if the type of any remaining column cannot be determined.
204204
"""
205-
bq_schema = []
205+
if bq_schema:
206+
bq_schema_index = {field.name: field for field in bq_schema}
207+
else:
208+
bq_schema_index = {}
209+
210+
bq_schema_out = []
206211
for column, dtype in zip(dataframe.columns, dataframe.dtypes):
212+
# Use provided type from schema, if present.
213+
bq_field = bq_schema_index.get(column)
214+
if bq_field:
215+
bq_schema_out.append(bq_field)
216+
continue
217+
218+
# Otherwise, try to automatically determine the type based on the
219+
# pandas dtype.
207220
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
208221
if not bq_type:
209222
return None
210223
bq_field = schema.SchemaField(column, bq_type)
211-
bq_schema.append(bq_field)
212-
return tuple(bq_schema)
224+
bq_schema_out.append(bq_field)
225+
return tuple(bq_schema_out)
213226

214227

215228
def dataframe_to_arrow(dataframe, bq_schema):
216229
"""Convert pandas dataframe to Arrow table, using BigQuery schema.
217230
218231
Args:
219232
dataframe (pandas.DataFrame):
220-
DataFrame to convert to convert to Parquet file.
233+
DataFrame to convert to Arrow table.
221234
bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
222235
Desired BigQuery schema. Number of columns must match number of
223236
columns in the DataFrame.
@@ -255,7 +268,7 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
255268
256269
Args:
257270
dataframe (pandas.DataFrame):
258-
DataFrame to convert to convert to Parquet file.
271+
DataFrame to convert to Parquet file.
259272
bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
260273
Desired BigQuery schema. Number of columns must match number of
261274
columns in the DataFrame.

bigquery/google/cloud/bigquery/client.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,14 +1531,15 @@ def load_table_from_dataframe(
15311531
if location is None:
15321532
location = self.location
15331533

1534-
if not job_config.schema:
1535-
autodetected_schema = _pandas_helpers.dataframe_to_bq_schema(dataframe)
1536-
1537-
# Only use an explicit schema if we were able to determine one
1538-
# matching the dataframe. If not, fallback to the pandas to_parquet
1539-
# method.
1540-
if autodetected_schema:
1541-
job_config.schema = autodetected_schema
1534+
autodetected_schema = _pandas_helpers.dataframe_to_bq_schema(
1535+
dataframe, job_config.schema
1536+
)
1537+
1538+
# Only use an explicit schema if we were able to determine one
1539+
# matching the dataframe. If not, fallback to the pandas to_parquet
1540+
# method.
1541+
if autodetected_schema:
1542+
job_config.schema = autodetected_schema
15421543

15431544
tmpfd, tmppath = tempfile.mkstemp(suffix="_job_{}.parquet".format(job_id[:8]))
15441545
os.close(tmpfd)

bigquery/tests/unit/test_client.py

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5393,6 +5393,90 @@ def test_load_table_from_dataframe_w_automatic_schema(self):
53935393
SchemaField("ts_col", "TIMESTAMP"),
53945394
)
53955395

5396+
@unittest.skipIf(pandas is None, "Requires `pandas`")
5397+
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
5398+
def test_load_table_from_dataframe_w_partial_automatic_schema(self):
5399+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
5400+
from google.cloud.bigquery import job
5401+
from google.cloud.bigquery.schema import SchemaField
5402+
5403+
client = self._make_client()
5404+
dt_col = pandas.Series(
5405+
[
5406+
datetime.datetime(2010, 1, 2, 3, 44, 50),
5407+
datetime.datetime(2011, 2, 3, 14, 50, 59),
5408+
datetime.datetime(2012, 3, 14, 15, 16),
5409+
],
5410+
dtype="datetime64[ns]",
5411+
)
5412+
ts_col = pandas.Series(
5413+
[
5414+
datetime.datetime(2010, 1, 2, 3, 44, 50),
5415+
datetime.datetime(2011, 2, 3, 14, 50, 59),
5416+
datetime.datetime(2012, 3, 14, 15, 16),
5417+
],
5418+
dtype="datetime64[ns]",
5419+
).dt.tz_localize(pytz.utc)
5420+
df_data = {
5421+
"int_col": [1, 2, 3],
5422+
"int_as_float_col": [1.0, float("nan"), 3.0],
5423+
"float_col": [1.0, 2.0, 3.0],
5424+
"bool_col": [True, False, True],
5425+
"dt_col": dt_col,
5426+
"ts_col": ts_col,
5427+
"string_col": ["abc", "def", "ghi"],
5428+
}
5429+
dataframe = pandas.DataFrame(
5430+
df_data,
5431+
columns=[
5432+
"int_col",
5433+
"int_as_float_col",
5434+
"float_col",
5435+
"bool_col",
5436+
"dt_col",
5437+
"ts_col",
5438+
"string_col",
5439+
],
5440+
)
5441+
load_patch = mock.patch(
5442+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
5443+
)
5444+
5445+
schema = (
5446+
SchemaField("int_as_float_col", "INTEGER"),
5447+
SchemaField("string_col", "STRING"),
5448+
)
5449+
job_config = job.LoadJobConfig(schema=schema)
5450+
with load_patch as load_table_from_file:
5451+
client.load_table_from_dataframe(
5452+
dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION
5453+
)
5454+
5455+
load_table_from_file.assert_called_once_with(
5456+
client,
5457+
mock.ANY,
5458+
self.TABLE_REF,
5459+
num_retries=_DEFAULT_NUM_RETRIES,
5460+
rewind=True,
5461+
job_id=mock.ANY,
5462+
job_id_prefix=None,
5463+
location=self.LOCATION,
5464+
project=None,
5465+
job_config=mock.ANY,
5466+
)
5467+
5468+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
5469+
assert sent_config.source_format == job.SourceFormat.PARQUET
5470+
assert tuple(sent_config.schema) == (
5471+
SchemaField("int_col", "INTEGER"),
5472+
SchemaField("int_as_float_col", "INTEGER"),
5473+
SchemaField("float_col", "FLOAT"),
5474+
SchemaField("bool_col", "BOOLEAN"),
5475+
SchemaField("dt_col", "DATETIME"),
5476+
SchemaField("ts_col", "TIMESTAMP"),
5477+
SchemaField("string_col", "STRING"),
5478+
)
5479+
53965480
@unittest.skipIf(pandas is None, "Requires `pandas`")
53975481
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
53985482
def test_load_table_from_dataframe_w_schema_wo_pyarrow(self):
@@ -5402,7 +5486,7 @@ def test_load_table_from_dataframe_w_schema_wo_pyarrow(self):
54025486

54035487
client = self._make_client()
54045488
records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}]
5405-
dataframe = pandas.DataFrame(records)
5489+
dataframe = pandas.DataFrame(records, columns=["name", "age"])
54065490
schema = (SchemaField("name", "STRING"), SchemaField("age", "INTEGER"))
54075491
job_config = job.LoadJobConfig(schema=schema)
54085492

@@ -5514,7 +5598,7 @@ def test_load_table_from_dataframe_w_nulls(self):
55145598

55155599
client = self._make_client()
55165600
records = [{"name": None, "age": None}, {"name": None, "age": None}]
5517-
dataframe = pandas.DataFrame(records)
5601+
dataframe = pandas.DataFrame(records, columns=["name", "age"])
55185602
schema = [SchemaField("name", "STRING"), SchemaField("age", "INTEGER")]
55195603
job_config = job.LoadJobConfig(schema=schema)
55205604

0 commit comments

Comments (0)