Skip to content

Commit fe57c5a

Browse files
authored
BigQuery: Adds load_table_from_dataframe() and snippet (#5387)
* Adds load_table_from_dataframe() and snippet
* Adds an index to the DataFrame in the bigquery_load_table_dataframe sample
1 parent 4e98d3b commit fe57c5a

5 files changed

Lines changed: 180 additions & 5 deletions

File tree

bigquery/google/cloud/bigquery/client.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -773,8 +773,8 @@ def load_table_from_file(
773773
job_config=None):
774774
"""Upload the contents of this table from a file-like object.
775775
776-
Like load_table_from_uri, this creates, starts and returns
777-
a ``LoadJob``.
776+
Similar to :meth:`load_table_from_uri`, this method creates, starts and
777+
returns a :class:`~google.cloud.bigquery.job.LoadJob`.
778778
779779
Arguments:
780780
file_obj (file): A file handle opened in binary mode for reading.
@@ -833,6 +833,63 @@ def load_table_from_file(
833833
raise exceptions.from_http_response(exc.response)
834834
return self.job_from_resource(response.json())
835835

836+
def load_table_from_dataframe(self, dataframe, destination,
                              num_retries=_DEFAULT_NUM_RETRIES,
                              job_id=None, job_id_prefix=None,
                              location=None, project=None,
                              job_config=None):
    """Upload the contents of a table from a pandas DataFrame.

    Similar to :meth:`load_table_from_uri`, this method creates, starts and
    returns a :class:`~google.cloud.bigquery.job.LoadJob`.

    Arguments:
        dataframe (pandas.DataFrame):
            A :class:`~pandas.DataFrame` containing the data to load.
        destination (google.cloud.bigquery.table.TableReference):
            The destination table to use for loading the data. If it is an
            existing table, the schema of the :class:`~pandas.DataFrame`
            must match the schema of the destination table. If the table
            does not yet exist, the schema is inferred from the
            :class:`~pandas.DataFrame`.

    Keyword Arguments:
        num_retries (int, optional): Number of upload retries.
        job_id (str, optional): Name of the job.
        job_id_prefix (str, optional):
            The user-provided prefix for a randomly generated job ID.
            This parameter will be ignored if a ``job_id`` is also given.
        location (str):
            Location where to run the job. Must match the location of the
            destination table.
        project (str, optional):
            Project ID of the project of where to run the job. Defaults
            to the client's project.
        job_config (google.cloud.bigquery.job.LoadJobConfig, optional):
            Extra configuration options for the job.

    Returns:
        google.cloud.bigquery.job.LoadJob: A new load job.

    Raises:
        ImportError:
            If a usable parquet engine cannot be found. This method
            requires one of :mod:`pyarrow` or :mod:`fastparquet` to be
            installed.
    """
    # Serialize the frame in memory; ``to_parquet`` is what raises
    # ImportError when neither pyarrow nor fastparquet is available.
    parquet_buffer = six.BytesIO()
    dataframe.to_parquet(parquet_buffer)

    # The payload is always parquet, so force the source format on the
    # (possibly caller-supplied) job configuration.
    if job_config is None:
        job_config = job.LoadJobConfig()
    job_config.source_format = job.SourceFormat.PARQUET

    # ``rewind=True`` seeks the buffer back to position zero so the whole
    # serialized frame is uploaded.
    return self.load_table_from_file(
        parquet_buffer, destination, num_retries=num_retries, rewind=True,
        job_id=job_id, job_id_prefix=job_id_prefix, location=location,
        project=project, job_config=job_config)
892+
836893
def _do_resumable_upload(self, stream, metadata, num_retries):
837894
"""Perform a resumable upload.
838895

bigquery/nox.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def default(session):
4141
if session.interpreter == 'python3.4':
4242
session.install('-e', '.')
4343
else:
44-
session.install('-e', '.[pandas]')
44+
session.install('-e', '.[pandas, pyarrow]')
4545

4646
# IPython does not support Python 2 after version 5.x
4747
if session.interpreter == 'python2.7':
@@ -142,7 +142,7 @@ def snippets(session, py):
142142
os.path.join('..', 'storage'),
143143
os.path.join('..', 'test_utils'),
144144
)
145-
session.install('-e', '.[pandas]')
145+
session.install('-e', '.[pandas, pyarrow]')
146146

147147
# Run py.test against the system tests.
148148
session.run(

bigquery/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
]
3636
extras = {
3737
'pandas': 'pandas>=0.17.1',
38+
'pyarrow': 'pyarrow>=0.4.1',
3839
}
3940

4041

bigquery/tests/unit/test_client.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@
2323
import six
2424
from six.moves import http_client
2525
import pytest
26+
try:
27+
import pandas
28+
except (ImportError, AttributeError): # pragma: NO COVER
29+
pandas = None
30+
try:
31+
import pyarrow
32+
except (ImportError, AttributeError): # pragma: NO COVER
33+
pyarrow = None
2634

2735
from google.cloud.bigquery.dataset import DatasetReference
2836

@@ -3484,6 +3492,68 @@ def test_load_table_from_file_bad_mode(self):
34843492
with pytest.raises(ValueError):
34853493
client.load_table_from_file(file_obj, self.TABLE_REF)
34863494

3495+
@unittest.skipIf(pandas is None, 'Requires `pandas`')
@unittest.skipIf(pyarrow is None, 'Requires `pyarrow`')
def test_load_table_from_dataframe(self):
    """The dataframe is serialized to parquet bytes and forwarded to
    ``load_table_from_file`` with a PARQUET-format job config.
    """
    from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
    from google.cloud.bigquery import job

    client = self._make_client()
    dataframe = pandas.DataFrame([
        {'name': 'Monty', 'age': 100},
        {'name': 'Python', 'age': 60},
    ])

    load_patch = mock.patch(
        'google.cloud.bigquery.client.Client.load_table_from_file',
        autospec=True)
    with load_patch as load_table_from_file:
        client.load_table_from_dataframe(dataframe, self.TABLE_REF)

    # Exactly one hand-off, with all keyword defaults passed through.
    load_table_from_file.assert_called_once_with(
        client, mock.ANY, self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES,
        rewind=True, job_id=None, job_id_prefix=None, location=None,
        project=None, job_config=mock.ANY)

    call_args, call_kwargs = load_table_from_file.call_args

    # The second positional argument (after the autospec'd ``self``) is
    # the in-memory parquet payload; it must be non-empty bytes.
    payload = call_args[1].getvalue()
    assert isinstance(payload, bytes)
    assert len(payload) > 0

    # A config requesting the parquet source format must be generated.
    forwarded_config = call_kwargs['job_config']
    assert forwarded_config.source_format == job.SourceFormat.PARQUET
3526+
3527+
@unittest.skipIf(pandas is None, 'Requires `pandas`')
@unittest.skipIf(pyarrow is None, 'Requires `pyarrow`')
def test_load_table_from_dataframe_w_custom_job_config(self):
    """A caller-supplied LoadJobConfig is reused (not replaced), with its
    source format forced to PARQUET.
    """
    from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
    from google.cloud.bigquery import job

    client = self._make_client()
    dataframe = pandas.DataFrame([
        {'name': 'Monty', 'age': 100},
        {'name': 'Python', 'age': 60},
    ])
    job_config = job.LoadJobConfig()

    load_patch = mock.patch(
        'google.cloud.bigquery.client.Client.load_table_from_file',
        autospec=True)
    with load_patch as load_table_from_file:
        client.load_table_from_dataframe(
            dataframe, self.TABLE_REF, job_config=job_config)

    load_table_from_file.assert_called_once_with(
        client, mock.ANY, self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES,
        rewind=True, job_id=None, job_id_prefix=None, location=None,
        project=None, job_config=mock.ANY)

    # The very object the caller passed must be forwarded, mutated to
    # request the parquet source format.
    forwarded_config = load_table_from_file.call_args[1]['job_config']
    assert forwarded_config is job_config
    assert forwarded_config.source_format == job.SourceFormat.PARQUET
3556+
34873557
# Low-level tests
34883558

34893559
@classmethod

docs/bigquery/snippets.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,12 @@
3131
import six
3232
try:
3333
import pandas
34-
except ImportError:
34+
except (ImportError, AttributeError):
3535
pandas = None
36+
try:
37+
import pyarrow
38+
except (ImportError, AttributeError):
39+
pyarrow = None
3640

3741
from google.cloud import bigquery
3842

@@ -2073,5 +2077,48 @@ def test_list_rows_as_dataframe(client):
20732077
assert len(df) == table.num_rows # verify the number of rows
20742078

20752079

2080+
@pytest.mark.skipif(pandas is None, reason='Requires `pandas`')
@pytest.mark.skipif(pyarrow is None, reason='Requires `pyarrow`')
def test_load_table_from_dataframe(client, to_delete):
    """System test doubling as the doc snippet for loading a table from a
    pandas DataFrame.
    """
    dataset_id = 'load_table_dataframe_dataset_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_load_table_dataframe]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    table_ref = client.dataset(dataset_id).table('monty_python')
    records = [
        {'title': 'The Meaning of Life', 'release_year': 1983},
        {'title': 'Monty Python and the Holy Grail', 'release_year': 1975},
        {'title': 'Life of Brian', 'release_year': 1979},
        {
            'title': 'And Now for Something Completely Different',
            'release_year': 1971
        },
    ]
    # Optionally set explicit indices.
    # If indices are not specified, a column will be created for the default
    # indices created by pandas.
    index = ['Q24980', 'Q25043', 'Q24953', 'Q16403']
    dataframe = pandas.DataFrame(
        records, index=pandas.Index(index, name='wikidata_id'))

    job = client.load_table_from_dataframe(dataframe, table_ref, location='US')

    job.result()  # Waits for table load to complete.

    assert job.state == 'DONE'
    table = client.get_table(table_ref)
    assert table.num_rows == 4
    # [END bigquery_load_table_dataframe]

    # The named index should have surfaced as a regular column.
    column_names = [field.name for field in table.schema]
    assert sorted(column_names) == ['release_year', 'title', 'wikidata_id']
2121+
2122+
20762123
if __name__ == '__main__':
20772124
pytest.main()

0 commit comments

Comments
 (0)