
Commit 6e85c95

BigQuery: Adds Parquet SourceFormat and samples (googleapis#5057)
* adds parquet and samples
* changes StringIO to BytesIO in snippets
1 parent 387f3e3 commit 6e85c95

3 files changed: 131 additions & 6 deletions

bigquery/google/cloud/bigquery/job.py (3 additions & 2 deletions)
@@ -154,13 +154,14 @@ class SourceFormat(_EnumApiResourceProperty):
 
     For CSV files, specify `CSV`. For datastore backups, specify
     `DATASTORE_BACKUP`. For newline-delimited json, specify
-    `NEWLINE_DELIMITED_JSON`. For Avro, specify `AVRO`. The default
-    value is `CSV`.
+    `NEWLINE_DELIMITED_JSON`. For Avro, specify `AVRO`. For Parquet, specify
+    `PARQUET`. The default value is `CSV`.
     """
     CSV = 'CSV'
     DATASTORE_BACKUP = 'DATASTORE_BACKUP'
     NEWLINE_DELIMITED_JSON = 'NEWLINE_DELIMITED_JSON'
     AVRO = 'AVRO'
+    PARQUET = 'PARQUET'
 
 
 class WriteDisposition(_EnumApiResourceProperty):
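
Because Parquet files are self-describing, a load job that uses the new enum value typically needs only the source format, with no explicit schema. As a minimal sketch of exercising the new value outside the committed samples (the local file path and dataset/table names are hypothetical):

    # Hypothetical example: load a local Parquet file into BigQuery.
    from google.cloud import bigquery

    client = bigquery.Client()
    table_ref = client.dataset('my_dataset').table('us_states')

    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.PARQUET  # new in this commit

    # Parquet embeds its schema, so job_config.schema is not required here.
    with open('us_states.parquet', 'rb') as source_file:  # binary mode
        job = client.load_table_from_file(
            source_file, table_ref, job_config=job_config)  # API request
    job.result()  # Waits for the load job to complete.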

docs/bigquery/snippets.py (110 additions & 4 deletions)
@@ -931,6 +931,33 @@ def test_load_table_from_uri_cmek(client, to_delete):
     # [END bigquery_load_table_gcs_json_cmek]
 
 
+def test_load_table_from_uri_parquet(client, to_delete):
+    dataset_id = 'load_table_dataset_{}'.format(_millis())
+    dataset = bigquery.Dataset(client.dataset(dataset_id))
+    client.create_dataset(dataset)
+    to_delete.append(dataset)
+
+    # [START bigquery_load_table_gcs_parquet]
+    # client = bigquery.Client()
+    # dataset_id = 'my_dataset'
+    dataset_ref = client.dataset(dataset_id)
+    job_config = bigquery.LoadJobConfig()
+    job_config.source_format = bigquery.SourceFormat.PARQUET
+
+    load_job = client.load_table_from_uri(
+        'gs://cloud-samples-data/bigquery/us-states/us-states.parquet',
+        dataset_ref.table('us_states'),
+        job_config=job_config)  # API request
+
+    assert load_job.job_type == 'load'
+
+    load_job.result()  # Waits for table load to complete.
+
+    assert load_job.state == 'DONE'
+    assert client.get_table(dataset_ref.table('us_states')).num_rows > 0
+    # [END bigquery_load_table_gcs_parquet]
+
+
 def test_load_table_from_uri_autodetect(client, to_delete):
     dataset_id = 'load_table_dataset_{}'.format(_millis())
     dataset = bigquery.Dataset(client.dataset(dataset_id))
@@ -971,7 +998,7 @@ def test_load_table_from_uri_append(client, to_delete):
         bigquery.SchemaField('post_abbr', 'STRING')
     ]
     table_ref = dataset.table('us_states')
-    body = six.StringIO('Washington,WA')
+    body = six.BytesIO(b'Washington,WA')
     client.load_table_from_file(
         body, table_ref, job_config=job_config).result()
 
@@ -997,6 +1024,45 @@ def test_load_table_from_uri_append(client, to_delete):
     # [END bigquery_load_table_gcs_json_append]
 
 
+def test_load_table_from_uri_parquet_append(client, to_delete):
+    dataset_id = 'load_table_dataset_{}'.format(_millis())
+    dataset = bigquery.Dataset(client.dataset(dataset_id))
+    client.create_dataset(dataset)
+    to_delete.append(dataset)
+
+    job_config = bigquery.LoadJobConfig()
+    job_config.schema = [
+        bigquery.SchemaField('name', 'STRING'),
+        bigquery.SchemaField('post_abbr', 'STRING')
+    ]
+    table_ref = dataset.table('us_states')
+    body = six.BytesIO(b'Washington,WA')
+    client.load_table_from_file(
+        body, table_ref, job_config=job_config).result()
+
+    # [START bigquery_load_table_gcs_parquet_append]
+    # client = bigquery.Client()
+    # table_ref = client.dataset('my_dataset').table('existing_table')
+    previous_rows = client.get_table(table_ref).num_rows
+    job_config = bigquery.LoadJobConfig()
+    job_config.source_format = bigquery.SourceFormat.PARQUET
+    # In an append, the Parquet file's schema must match the table schema.
+    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
+
+    load_job = client.load_table_from_uri(
+        'gs://cloud-samples-data/bigquery/us-states/us-states.parquet',
+        table_ref,
+        job_config=job_config)  # API request
+
+    assert load_job.job_type == 'load'
+
+    load_job.result()  # Waits for table load to complete.
+
+    assert load_job.state == 'DONE'
+    assert client.get_table(table_ref).num_rows == previous_rows + 50
+    # [END bigquery_load_table_gcs_parquet_append]
+
+
 def test_load_table_from_uri_truncate(client, to_delete):
     dataset_id = 'load_table_dataset_{}'.format(_millis())
     dataset = bigquery.Dataset(client.dataset(dataset_id))
@@ -1009,7 +1075,7 @@ def test_load_table_from_uri_truncate(client, to_delete):
         bigquery.SchemaField('post_abbr', 'STRING')
     ]
     table_ref = dataset.table('us_states')
-    body = six.StringIO('Washington,WA')
+    body = six.BytesIO(b'Washington,WA')
     client.load_table_from_file(
         body, table_ref, job_config=job_config).result()
 
@@ -1037,6 +1103,46 @@ def test_load_table_from_uri_truncate(client, to_delete):
     # [END bigquery_load_table_gcs_json_truncate]
 
 
+def test_load_table_from_uri_parquet_truncate(client, to_delete):
+    dataset_id = 'load_table_dataset_{}'.format(_millis())
+    dataset = bigquery.Dataset(client.dataset(dataset_id))
+    client.create_dataset(dataset)
+    to_delete.append(dataset)
+
+    job_config = bigquery.LoadJobConfig()
+    job_config.schema = [
+        bigquery.SchemaField('name', 'STRING'),
+        bigquery.SchemaField('post_abbr', 'STRING')
+    ]
+    table_ref = dataset.table('us_states')
+    body = six.BytesIO(b'Washington,WA')
+    client.load_table_from_file(
+        body, table_ref, job_config=job_config).result()
+
+    # [START bigquery_load_table_gcs_parquet_truncate]
+    # client = bigquery.Client()
+    # table_ref = client.dataset('my_dataset').table('existing_table')
+    previous_rows = client.get_table(table_ref).num_rows
+    assert previous_rows > 0
+
+    job_config = bigquery.LoadJobConfig()
+    job_config.source_format = bigquery.SourceFormat.PARQUET
+    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
+
+    load_job = client.load_table_from_uri(
+        'gs://cloud-samples-data/bigquery/us-states/us-states.parquet',
+        table_ref,
+        job_config=job_config)  # API request
+
+    assert load_job.job_type == 'load'
+
+    load_job.result()  # Waits for table load to complete.
+
+    assert load_job.state == 'DONE'
+    assert client.get_table(table_ref).num_rows == 50
+    # [END bigquery_load_table_gcs_parquet_truncate]
+
+
 def _write_csv_to_storage(bucket_name, blob_name, header_row, data_rows):
     import csv
     from google.cloud._testing import _NamedTemporaryFile
@@ -1103,14 +1209,14 @@ def test_copy_table_multiple_source(client, to_delete):
         bigquery.SchemaField('post_abbr', 'STRING')
     ]
 
-    table_data = {'table1': 'Washington,WA', 'table2': 'California,CA'}
+    table_data = {'table1': b'Washington,WA', 'table2': b'California,CA'}
     for table_id, data in table_data.items():
         table_ref = source_dataset.table(table_id)
         table = bigquery.Table(table_ref, schema=schema)
         to_delete.insert(0, table)
         job_config = bigquery.LoadJobConfig()
         job_config.schema = schema
-        body = six.StringIO(data)
+        body = six.BytesIO(data)
         client.load_table_from_file(
             body, table_ref, job_config=job_config).result()  # API request
 
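
The StringIO-to-BytesIO edits above fix the pre-existing samples rather than the new ones: `load_table_from_file` streams the upload as raw bytes, so on Python 3 a `six.StringIO` (which yields `str`) breaks, while a byte stream works under both major versions. A minimal sketch of the distinction, using the stdlib `io` module (`six.BytesIO` is an alias for `io.BytesIO`):

    import io

    body = io.BytesIO(b'Washington,WA')  # byte stream: valid upload source
    text = io.StringIO('Washington,WA')  # text stream: yields str on Python 3,
                                         # which a binary upload cannot accept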
docs/bigquery/usage.rst (18 additions & 0 deletions)
@@ -223,6 +223,12 @@ Load a JSON file from Cloud Storage:
    :start-after: [START bigquery_load_table_gcs_json]
    :end-before: [END bigquery_load_table_gcs_json]
 
+Load a Parquet file from Cloud Storage:
+
+.. literalinclude:: snippets.py
+   :start-after: [START bigquery_load_table_gcs_parquet]
+   :end-before: [END bigquery_load_table_gcs_parquet]
+
 Load a JSON file from Cloud Storage, using an autodetected schema:
 
 .. literalinclude:: snippets.py
@@ -235,12 +241,24 @@ Append a JSON file from Cloud Storage to an existing table:
    :start-after: [START bigquery_load_table_gcs_json_append]
    :end-before: [END bigquery_load_table_gcs_json_append]
 
+Append a Parquet file from Cloud Storage to an existing table:
+
+.. literalinclude:: snippets.py
+   :start-after: [START bigquery_load_table_gcs_parquet_append]
+   :end-before: [END bigquery_load_table_gcs_parquet_append]
+
 Overwrite / replace an existing table with a JSON file from Cloud Storage:
 
 .. literalinclude:: snippets.py
    :start-after: [START bigquery_load_table_gcs_json_truncate]
    :end-before: [END bigquery_load_table_gcs_json_truncate]
 
+Overwrite / replace an existing table with a Parquet file from Cloud Storage:
+
+.. literalinclude:: snippets.py
+   :start-after: [START bigquery_load_table_gcs_parquet_truncate]
+   :end-before: [END bigquery_load_table_gcs_parquet_truncate]
+
 Customer Managed Encryption Keys
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

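To try the new Parquet sections against your own data, you need a Parquet file with the samples' two-column us-states schema. A minimal sketch of producing one with pyarrow (an assumption of this sketch, not a dependency introduced by the commit):

    # Hypothetical helper: write a tiny Parquet file with the samples' schema.
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.Table.from_arrays(
        [pa.array(['Washington', 'California']),
         pa.array(['WA', 'CA'])],
        names=['name', 'post_abbr'])
    pq.write_table(table, 'us_states.parquet')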