diff --git a/.coveragerc b/.coveragerc index b178b094..dd39c854 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,3 +1,19 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Generated by synthtool. DO NOT EDIT! [run] branch = True diff --git a/.flake8 b/.flake8 index 0268ecc9..20fe9bda 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,19 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Generated by synthtool. DO NOT EDIT! [flake8] ignore = E203, E266, E501, W503 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 9fe23c11..8892e94b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -11,8 +11,7 @@ Thanks for stopping by to let us know something could be better! 
Please run down the following list and make sure you've tried the usual "quick fixes": - Search the issues already opened: https://github.com/googleapis/python-bigquery-storage/issues - - Search the issues on our "catch-all" repository: https://github.com/googleapis/google-cloud-python - - Search StackOverflow: http://stackoverflow.com/questions/tagged/google-cloud-platform+python + - Search StackOverflow: https://stackoverflow.com/questions/tagged/google-cloud-platform+python If you are still having issues, please be sure to include as much information as possible: diff --git a/.repo-metadata.json b/.repo-metadata.json index d918148b..e91cf373 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -4,7 +4,7 @@ "product_documentation": "https://cloud.google.com/bigquery/docs/reference/storage/", "client_documentation": "https://googleapis.dev/python/bigquerystorage/latest", "issue_tracker": "https://issuetracker.google.com/savedsearches/559654", - "release_level": "beta", + "release_level": "ga", "language": "python", "repo": "googleapis/python-bigquery-storage", "distribution_name": "google-cloud-bigquery-storage", diff --git a/CHANGELOG.md b/CHANGELOG.md index e12ace0b..b415487b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ [1]: https://pypi.org/project/google-cloud-bigquery-storage/#history +## [1.0.0](https://www.github.com/googleapis/python-bigquery-storage/compare/v0.8.0...v1.0.0) (2020-06-04) + + +### Bug Fixes + +* handle consuming streams with no data ([#29](https://www.github.com/googleapis/python-bigquery-storage/issues/29)) ([56d1b1f](https://www.github.com/googleapis/python-bigquery-storage/commit/56d1b1fd75965669f5a4d10e5b00671c276eda88)) +* update pyarrow references that are warning ([#31](https://www.github.com/googleapis/python-bigquery-storage/issues/31)) ([5302481](https://www.github.com/googleapis/python-bigquery-storage/commit/5302481d9f0ee07630ae62ed7268e510bcaa5d84)) + ## [0.8.0](https://www.github.com/googleapis/python-bigquery-storage/compare/v0.7.0...v0.8.0) (2020-03-03) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index f2f999ba..c232b2c2 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: 2.7, - 3.5, 3.6, and 3.7 on both UNIX and Windows. + 3.5, 3.6, 3.7, and 3.8 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -214,26 +214,18 @@ We support: - `Python 3.5`_ - `Python 3.6`_ - `Python 3.7`_ +- `Python 3.8`_ .. _Python 3.5: https://docs.python.org/3.5/ .. _Python 3.6: https://docs.python.org/3.6/ .. _Python 3.7: https://docs.python.org/3.7/ +.. _Python 3.8: https://docs.python.org/3.8/ Supported versions can be found in our ``noxfile.py`` `config`_. .. _config: https://github.com/googleapis/python-bigquery-storage/blob/master/noxfile.py -We explicitly decided not to support `Python 2.5`_ due to `decreased usage`_ -and lack of continuous integration `support`_. - -.. _Python 2.5: https://docs.python.org/2.5/ -.. _decreased usage: https://caremad.io/2013/10/a-look-at-pypi-downloads/ -.. _support: https://blog.travis-ci.com/2013-11-18-upcoming-build-environment-updates/ - -We have `dropped 2.6`_ as a supported version as well since Python 2.6 is no -longer supported by the core development team. - Python 2.7 support is deprecated. All code changes should maintain Python 2.7 compatibility until January 1, 2020.
We also explicitly decided to support Python 3 beginning with version @@ -247,7 +239,6 @@ We also explicitly decided to support Python 3 beginning with version .. _prominent: https://docs.djangoproject.com/en/1.9/faq/install/#what-python-version-can-i-use-with-django .. _projects: http://flask.pocoo.org/docs/0.10/python3/ .. _Unicode literal support: https://www.python.org/dev/peps/pep-0414/ -.. _dropped 2.6: https://github.com/googleapis/google-cloud-python/issues/995 ********** Versioning diff --git a/MANIFEST.in b/MANIFEST.in index cd011be2..68855abc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,19 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Generated by synthtool. DO NOT EDIT! include README.rst LICENSE recursive-include google *.json *.proto diff --git a/README.rst b/README.rst index 6b600ca0..f8623e38 100644 --- a/README.rst +++ b/README.rst @@ -1,14 +1,19 @@ -Python Client for BigQuery Storage API (`Beta`_) -================================================= - +Python Client for BigQuery Storage API +====================================== +|ga| |pypi| |versions| `BigQuery Storage API`_: - `Client Library Documentation`_ - `Product Documentation`_ -.. _Beta: https://github.com/googleapis/google-cloud-python/blob/master/README.rst +.. |ga| image:: https://img.shields.io/badge/support-GA-gold.svg + :target: https://github.com/googleapis/google-cloud-python/blob/master/README.rst#general-availability +.. |pypi| image:: https://img.shields.io/pypi/v/google-cloud-bigquery-storage.svg + :target: https://pypi.org/project/google-cloud-bigquery-storage/ +.. |versions| image:: https://img.shields.io/pypi/pyversions/google-cloud-bigquery-storage.svg + :target: https://pypi.org/project/google-cloud-bigquery-storage/ .. _BigQuery Storage API: https://cloud.google.com/bigquery/docs/reference/storage/ .. _Client Library Documentation: https://googleapis.dev/python/bigquerystorage/latest .. _Product Documentation: https://cloud.google.com/bigquery/docs/reference/storage/ diff --git a/google/cloud/bigquery_storage_v1/proto/arrow.proto b/google/cloud/bigquery_storage_v1/proto/arrow.proto index 90add978..1c54eeab 100644 --- a/google/cloud/bigquery_storage_v1/proto/arrow.proto +++ b/google/cloud/bigquery_storage_v1/proto/arrow.proto @@ -27,7 +27,7 @@ option php_namespace = "Google\\Cloud\\BigQuery\\Storage\\V1"; // Arrow schema as specified in // https://arrow.apache.org/docs/python/api/datatypes.html // and serialized to bytes using IPC: -// https://arrow.apache.org/docs/ipc.html. +// https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc // // See code samples on how this message can be deserialized. 
message ArrowSchema { diff --git a/google/cloud/bigquery_storage_v1/proto/arrow_pb2.py b/google/cloud/bigquery_storage_v1/proto/arrow_pb2.py index d3570dea..155008ed 100644 --- a/google/cloud/bigquery_storage_v1/proto/arrow_pb2.py +++ b/google/cloud/bigquery_storage_v1/proto/arrow_pb2.py @@ -135,11 +135,10 @@ __module__="google.cloud.bigquery.storage_v1.proto.arrow_pb2", __doc__="""Arrow schema as specified in https://arrow.apache.org/docs/python/api/datatypes.html and serialized - to bytes using IPC: https://arrow.apache.org/docs/ipc.html. - - See code samples on how this message can be deserialized. - - + to bytes using IPC: + https://arrow.apache.org/docs/format/Columnar.html#serialization-and- + interprocess-communication-ipc See code samples on how this message + can be deserialized. Attributes: serialized_schema: IPC serialized Arrow schema. @@ -156,8 +155,6 @@ DESCRIPTOR=_ARROWRECORDBATCH, __module__="google.cloud.bigquery.storage_v1.proto.arrow_pb2", __doc__="""Arrow RecordBatch. - - Attributes: serialized_record_batch: IPC-serialized Arrow RecordBatch. diff --git a/google/cloud/bigquery_storage_v1/proto/avro_pb2.py b/google/cloud/bigquery_storage_v1/proto/avro_pb2.py index c824282c..1b5cf205 100644 --- a/google/cloud/bigquery_storage_v1/proto/avro_pb2.py +++ b/google/cloud/bigquery_storage_v1/proto/avro_pb2.py @@ -134,8 +134,6 @@ DESCRIPTOR=_AVROSCHEMA, __module__="google.cloud.bigquery.storage_v1.proto.avro_pb2", __doc__="""Avro schema. - - Attributes: schema: Json serialized schema, as described at @@ -153,8 +151,6 @@ DESCRIPTOR=_AVROROWS, __module__="google.cloud.bigquery.storage_v1.proto.avro_pb2", __doc__="""Avro rows. - - Attributes: serialized_binary_rows: Binary serialized rows in a block. diff --git a/google/cloud/bigquery_storage_v1/proto/storage_pb2.py b/google/cloud/bigquery_storage_v1/proto/storage_pb2.py index 3821f077..6ceb3aa8 100644 --- a/google/cloud/bigquery_storage_v1/proto/storage_pb2.py +++ b/google/cloud/bigquery_storage_v1/proto/storage_pb2.py @@ -613,8 +613,6 @@ DESCRIPTOR=_CREATEREADSESSIONREQUEST, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", __doc__="""Request message for ``CreateReadSession``. - - Attributes: parent: Required. The request project that owns the session, in the @@ -643,8 +641,6 @@ DESCRIPTOR=_READROWSREQUEST, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", __doc__="""Request message for ``ReadRows``. - - Attributes: read_stream: Required. Stream to read rows from. @@ -664,10 +660,7 @@ dict( DESCRIPTOR=_THROTTLESTATE, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", - __doc__="""Information on if the current connection is being - throttled. - - + __doc__="""Information on if the current connection is being throttled. Attributes: throttle_percent: How much this connection is being throttled. Zero means no @@ -713,8 +706,6 @@ DESCRIPTOR=_STREAMSTATS, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", __doc__="""Estimated stream statistics for a given Stream. - - Attributes: progress: Represents the progress of the current stream. @@ -731,10 +722,8 @@ dict( DESCRIPTOR=_READROWSRESPONSE, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", - __doc__="""Response from calling ``ReadRows`` may include row data, - progress and throttling information. - - + __doc__="""Response from calling ``ReadRows`` may include row data, progress and + throttling information. 
Attributes: rows: Row data is returned in format specified during session @@ -763,8 +752,6 @@ DESCRIPTOR=_SPLITREADSTREAMREQUEST, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", __doc__="""Request message for ``SplitReadStream``. - - Attributes: name: Required. Name of the stream to split. @@ -790,8 +777,6 @@ DESCRIPTOR=_SPLITREADSTREAMRESPONSE, __module__="google.cloud.bigquery.storage_v1.proto.storage_pb2", __doc__="""Response message for ``SplitReadStream``. - - Attributes: primary_stream: Primary stream, which contains the beginning portion of diff --git a/google/cloud/bigquery_storage_v1/proto/stream.proto b/google/cloud/bigquery_storage_v1/proto/stream.proto index 3ef32c13..febad036 100644 --- a/google/cloud/bigquery_storage_v1/proto/stream.proto +++ b/google/cloud/bigquery_storage_v1/proto/stream.proto @@ -65,8 +65,7 @@ message ReadSession { repeated string selected_fields = 1; // SQL text filtering statement, similar to a WHERE clause in a query. - // Currently, only a single predicate that is a comparison between a column - // and a constant value is supported. Aggregates are not supported. + // Aggregates are not supported. // // Examples: "int_field > 5" // "date_field = CAST('2014-9-27' as DATE)" @@ -100,7 +99,7 @@ message ReadSession { } // Immutable. Table that this ReadSession is reading from, in the form - // `projects/{project_id}/datasets/{dataset_id}/tables/{table_id} + // `projects/{project_id}/datasets/{dataset_id}/tables/{table_id}` string table = 6 [ (google.api.field_behavior) = IMMUTABLE, (google.api.resource_reference) = { diff --git a/google/cloud/bigquery_storage_v1/proto/stream_pb2.py b/google/cloud/bigquery_storage_v1/proto/stream_pb2.py index 333b7506..3860f350 100644 --- a/google/cloud/bigquery_storage_v1/proto/stream_pb2.py +++ b/google/cloud/bigquery_storage_v1/proto/stream_pb2.py @@ -460,8 +460,6 @@ DESCRIPTOR=_READSESSION_TABLEMODIFIERS, __module__="google.cloud.bigquery.storage_v1.proto.stream_pb2", __doc__="""Additional attributes when reading a table. - - Attributes: snapshot_time: The snapshot time of the table. If not set, interpreted as @@ -477,8 +475,6 @@ DESCRIPTOR=_READSESSION_TABLEREADOPTIONS, __module__="google.cloud.bigquery.storage_v1.proto.stream_pb2", __doc__="""Options dictating how we read a table. - - Attributes: selected_fields: Names of the fields in the table that should be read. If @@ -488,12 +484,10 @@ fields in selected\_fields. row_restriction: SQL text filtering statement, similar to a WHERE clause in a - query. Currently, only a single predicate that is a comparison - between a column and a constant value is supported. Aggregates - are not supported. Examples: "int\_field > 5" "date\_field = - CAST('2014-9-27' as DATE)" "nullable\_field is not NULL" - "st\_equals(geo\_field, st\_geofromtext("POINT(2, 2)"))" - "numeric\_field BETWEEN 1.0 AND 5.0" + query. Aggregates are not supported. Examples: "int\_field > + 5" "date\_field = CAST('2014-9-27' as DATE)" "nullable\_field + is not NULL" "st\_equals(geo\_field, st\_geofromtext("POINT(2, + 2)"))" "numeric\_field BETWEEN 1.0 AND 5.0" """, # @@protoc_insertion_point(class_scope:google.cloud.bigquery.storage.v1.ReadSession.TableReadOptions) ), @@ -501,8 +495,6 @@ DESCRIPTOR=_READSESSION, __module__="google.cloud.bigquery.storage_v1.proto.stream_pb2", __doc__="""Information about the ReadSession. - - Attributes: name: Output only. Unique identifier for the session, in the form `` @@ -525,8 +517,8 @@ Output only. Arrow schema. table: Immutable. 
Table that this ReadSession is reading from, in the - form \`projects/{project\_id}/datasets/{dataset\_id}/tables/{t - able\_id} + form ``projects/{project_id}/datasets/{dataset_id}/tables/{tab + le_id}`` table_modifiers: Optional. Any modifiers which are applied when reading from the specified table. @@ -554,11 +546,9 @@ dict( DESCRIPTOR=_READSTREAM, __module__="google.cloud.bigquery.storage_v1.proto.stream_pb2", - __doc__="""Information about a single stream that gets data out of - the storage system. Most of the information about ``ReadStream`` - instances is aggregated, making ``ReadStream`` lightweight. - - + __doc__="""Information about a single stream that gets data out of the storage + system. Most of the information about ``ReadStream`` instances is + aggregated, making ``ReadStream`` lightweight. Attributes: name: Output only. Name of the stream, in the form ``projects/{proje diff --git a/google/cloud/bigquery_storage_v1/reader.py b/google/cloud/bigquery_storage_v1/reader.py index 9b6e3e6b..3587ff9e 100644 --- a/google/cloud/bigquery_storage_v1/reader.py +++ b/google/cloud/bigquery_storage_v1/reader.py @@ -287,7 +287,13 @@ def to_arrow(self): record_batches = [] for page in self.pages: record_batches.append(page.to_arrow()) - return pyarrow.Table.from_batches(record_batches) + + if record_batches: + return pyarrow.Table.from_batches(record_batches) + + # No data, return an empty Table. + self._stream_parser._parse_arrow_schema() + return pyarrow.Table.from_batches([], schema=self._stream_parser._schema) def to_dataframe(self, dtypes=None): """Create a :class:`pandas.DataFrame` of all rows in the stream. @@ -323,6 +329,7 @@ def to_dataframe(self, dtypes=None): # rarely no-copy, whereas pyarrow.Table.from_batches + to_pandas is # usually no-copy. schema_type = self._read_session.WhichOneof("schema") + if schema_type == "arrow_schema": record_batch = self.to_arrow() df = record_batch.to_pandas() @@ -330,10 +337,58 @@ def to_dataframe(self, dtypes=None): df[column] = pandas.Series(df[column], dtype=dtypes[column]) return df - frames = [] - for page in self.pages: - frames.append(page.to_dataframe(dtypes=dtypes)) - return pandas.concat(frames) + frames = [page.to_dataframe(dtypes=dtypes) for page in self.pages] + + if frames: + return pandas.concat(frames) + + # No data, construct an empty dataframe with columns matching the schema. + # The result should be consistent with what an empty ARROW stream would produce. + self._stream_parser._parse_avro_schema() + schema = self._stream_parser._avro_schema_json + + column_dtypes = self._dtypes_from_avro(schema["fields"]) + column_dtypes.update(dtypes) + + df = pandas.DataFrame(columns=column_dtypes.keys()) + for column in df: + df[column] = pandas.Series([], dtype=column_dtypes[column]) + + return df + + def _dtypes_from_avro(self, avro_fields): + """Determine Pandas dtypes for columns in Avro schema. + + Args: + avro_fields (Iterable[Mapping[str, Any]]): + Avro fields' metadata. + + Returns: + collections.OrderedDict[str, str]: + Column names with their corresponding Pandas dtypes. + """ + result = collections.OrderedDict() + + type_map = {"long": "int64", "double": "float64", "boolean": "bool"} + + for field_info in avro_fields: + # If a type is a union of multiple types, pick the first type + # that is not "null".
+ if isinstance(field_info["type"], list): + type_info = next(item for item in field_info["type"] if item != "null") + + if isinstance(type_info, six.string_types): + field_dtype = type_map.get(type_info, "object") + else: + logical_type = type_info.get("logicalType") + if logical_type == "timestamp-micros": + field_dtype = "datetime64[ns, UTC]" + else: + field_dtype = "object" + + result[field_info["name"]] = field_dtype + + return result class ReadRowsPage(object): @@ -594,7 +649,7 @@ def to_dataframe(self, message, dtypes=None): def _parse_arrow_message(self, message): self._parse_arrow_schema() - return pyarrow.read_record_batch( + return pyarrow.ipc.read_record_batch( pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch), self._schema, ) @@ -603,7 +658,7 @@ def _parse_arrow_schema(self): if self._schema: return - self._schema = pyarrow.read_schema( + self._schema = pyarrow.ipc.read_schema( pyarrow.py_buffer(self._read_session.arrow_schema.serialized_schema) ) self._column_names = [field.name for field in self._schema] diff --git a/google/cloud/bigquery_storage_v1beta1/proto/arrow_pb2.py b/google/cloud/bigquery_storage_v1beta1/proto/arrow_pb2.py index 4d7d90af..585c8415 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/arrow_pb2.py +++ b/google/cloud/bigquery_storage_v1beta1/proto/arrow_pb2.py @@ -134,8 +134,6 @@ DESCRIPTOR=_ARROWSCHEMA, __module__="google.cloud.bigquery.storage_v1beta1.proto.arrow_pb2", __doc__="""Arrow schema. - - Attributes: serialized_schema: IPC serialized Arrow schema. @@ -152,8 +150,6 @@ DESCRIPTOR=_ARROWRECORDBATCH, __module__="google.cloud.bigquery.storage_v1beta1.proto.arrow_pb2", __doc__="""Arrow RecordBatch. - - Attributes: serialized_record_batch: IPC serialized Arrow RecordBatch. diff --git a/google/cloud/bigquery_storage_v1beta1/proto/avro_pb2.py b/google/cloud/bigquery_storage_v1beta1/proto/avro_pb2.py index 1f5ee11d..74551969 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/avro_pb2.py +++ b/google/cloud/bigquery_storage_v1beta1/proto/avro_pb2.py @@ -134,8 +134,6 @@ DESCRIPTOR=_AVROSCHEMA, __module__="google.cloud.bigquery.storage_v1beta1.proto.avro_pb2", __doc__="""Avro schema. - - Attributes: schema: Json serialized schema, as described at @@ -153,8 +151,6 @@ DESCRIPTOR=_AVROROWS, __module__="google.cloud.bigquery.storage_v1beta1.proto.avro_pb2", __doc__="""Avro rows. - - Attributes: serialized_binary_rows: Binary serialized rows in a block. diff --git a/google/cloud/bigquery_storage_v1beta1/proto/read_options.proto b/google/cloud/bigquery_storage_v1beta1/proto/read_options.proto index 9591deba..8ed9b73f 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/read_options.proto +++ b/google/cloud/bigquery_storage_v1beta1/proto/read_options.proto @@ -29,8 +29,7 @@ message TableReadOptions { repeated string selected_fields = 1; // Optional. SQL text filtering statement, similar to a WHERE clause in - // a query. Currently, only a single predicate that is a comparison between - // a column and a constant value is supported. Aggregates are not supported. + // a query. Aggregates are not supported. 
// // Examples: "int_field > 5" // "date_field = CAST('2014-9-27' as DATE)" diff --git a/google/cloud/bigquery_storage_v1beta1/proto/read_options_pb2.py b/google/cloud/bigquery_storage_v1beta1/proto/read_options_pb2.py index 7291232d..0d4911af 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/read_options_pb2.py +++ b/google/cloud/bigquery_storage_v1beta1/proto/read_options_pb2.py @@ -94,8 +94,6 @@ DESCRIPTOR=_TABLEREADOPTIONS, __module__="google.cloud.bigquery.storage_v1beta1.proto.read_options_pb2", __doc__="""Options dictating how we read a table. - - Attributes: selected_fields: Optional. Names of the fields in the table that should be @@ -105,9 +103,7 @@ of fields in selected\_fields. row_restriction: Optional. SQL text filtering statement, similar to a WHERE - clause in a query. Currently, only a single predicate that is - a comparison between a column and a constant value is - supported. Aggregates are not supported. Examples: + clause in a query. Aggregates are not supported. Examples: "int\_field > 5" "date\_field = CAST('2014-9-27' as DATE)" "nullable\_field is not NULL" "st\_equals(geo\_field, st\_geofromtext("POINT(2, 2)"))" "numeric\_field BETWEEN 1.0 diff --git a/google/cloud/bigquery_storage_v1beta1/proto/storage_pb2.py b/google/cloud/bigquery_storage_v1beta1/proto/storage_pb2.py index c7d84619..d78931b5 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/storage_pb2.py +++ b/google/cloud/bigquery_storage_v1beta1/proto/storage_pb2.py @@ -1258,10 +1258,7 @@ dict( DESCRIPTOR=_STREAM, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Information about a single data stream within a read - session. - - + __doc__="""Information about a single data stream within a read session. Attributes: name: Name of the stream, in the form ``projects/{project_id}/locati @@ -1278,10 +1275,7 @@ dict( DESCRIPTOR=_STREAMPOSITION, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Expresses a point within a given stream using an offset - position. - - + __doc__="""Expresses a point within a given stream using an offset position. Attributes: stream: Identifier for a given Stream. @@ -1300,8 +1294,6 @@ DESCRIPTOR=_READSESSION, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", __doc__="""Information returned from a ``CreateReadSession`` request. - - Attributes: name: Unique identifier for the session, in the form ``projects/{pro @@ -1338,11 +1330,8 @@ dict( DESCRIPTOR=_CREATEREADSESSIONREQUEST, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Creates a new read session, which may include additional - options such as requested parallelism, projection filters and - constraints. - - + __doc__="""Creates a new read session, which may include additional options such + as requested parallelism, projection filters and constraints. Attributes: table_reference: Required. Reference to the table to read. @@ -1380,10 +1369,8 @@ dict( DESCRIPTOR=_READROWSREQUEST, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Requesting row data via ``ReadRows`` must provide Stream - position information. - - + __doc__="""Requesting row data via ``ReadRows`` must provide Stream position + information. Attributes: read_position: Required. Identifier of the position in the stream to start @@ -1403,8 +1390,6 @@ DESCRIPTOR=_STREAMSTATUS, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", __doc__="""Progress information for a given Stream. 
- - Attributes: estimated_row_count: Number of estimated rows in the current stream. May change @@ -1471,10 +1456,7 @@ dict( DESCRIPTOR=_THROTTLESTATUS, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Information on if the current connection is being - throttled. - - + __doc__="""Information on if the current connection is being throttled. Attributes: throttle_percent: How much this connection is being throttled. 0 is no @@ -1491,10 +1473,8 @@ dict( DESCRIPTOR=_READROWSRESPONSE, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Response from calling ``ReadRows`` may include row data, - progress and throttling information. - - + __doc__="""Response from calling ``ReadRows`` may include row data, progress and + throttling information. Attributes: rows: Row data is returned in format specified during session @@ -1526,10 +1506,8 @@ dict( DESCRIPTOR=_BATCHCREATEREADSESSIONSTREAMSREQUEST, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""Information needed to request additional streams for an - established read session. - - + __doc__="""Information needed to request additional streams for an established + read session. Attributes: session: Required. Must be a non-expired session obtained from a call @@ -1550,10 +1528,8 @@ dict( DESCRIPTOR=_BATCHCREATEREADSESSIONSTREAMSRESPONSE, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", - __doc__="""The response from ``BatchCreateReadSessionStreams`` - returns the stream identifiers for the newly created streams. - - + __doc__="""The response from ``BatchCreateReadSessionStreams`` returns the stream + identifiers for the newly created streams. Attributes: streams: Newly added streams. @@ -1570,8 +1546,6 @@ DESCRIPTOR=_FINALIZESTREAMREQUEST, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", __doc__="""Request information for invoking ``FinalizeStream``. - - Attributes: stream: Stream to finalize. @@ -1588,8 +1562,6 @@ DESCRIPTOR=_SPLITREADSTREAMREQUEST, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", __doc__="""Request information for ``SplitReadStream``. - - Attributes: original_stream: Stream to split. @@ -1616,8 +1588,6 @@ DESCRIPTOR=_SPLITREADSTREAMRESPONSE, __module__="google.cloud.bigquery.storage_v1beta1.proto.storage_pb2", __doc__="""Response from ``SplitReadStream``. - - Attributes: primary_stream: Primary stream, which contains the beginning portion of diff --git a/google/cloud/bigquery_storage_v1beta1/proto/table_reference_pb2.py b/google/cloud/bigquery_storage_v1beta1/proto/table_reference_pb2.py index fd15501a..5100e9f2 100644 --- a/google/cloud/bigquery_storage_v1beta1/proto/table_reference_pb2.py +++ b/google/cloud/bigquery_storage_v1beta1/proto/table_reference_pb2.py @@ -162,10 +162,8 @@ dict( DESCRIPTOR=_TABLEREFERENCE, __module__="google.cloud.bigquery.storage_v1beta1.proto.table_reference_pb2", - __doc__="""Table reference that includes just the 3 strings needed to - identify a table. - - + __doc__="""Table reference that includes just the 3 strings needed to identify a + table. Attributes: project_id: The assigned project ID of the project. @@ -186,8 +184,6 @@ DESCRIPTOR=_TABLEMODIFIERS, __module__="google.cloud.bigquery.storage_v1beta1.proto.table_reference_pb2", __doc__="""All fields in this message optional. - - Attributes: snapshot_time: The snapshot time of the table. 
If not set, interpreted as diff --git a/google/cloud/bigquery_storage_v1beta1/reader.py b/google/cloud/bigquery_storage_v1beta1/reader.py index 7e07392d..60942f29 100644 --- a/google/cloud/bigquery_storage_v1beta1/reader.py +++ b/google/cloud/bigquery_storage_v1beta1/reader.py @@ -607,7 +607,7 @@ def to_dataframe(self, message, dtypes=None): def _parse_arrow_message(self, message): self._parse_arrow_schema() - return pyarrow.read_record_batch( + return pyarrow.ipc.read_record_batch( pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch), self._schema, ) @@ -616,7 +616,7 @@ def _parse_arrow_schema(self): if self._schema: return - self._schema = pyarrow.read_schema( + self._schema = pyarrow.ipc.read_schema( pyarrow.py_buffer(self._read_session.arrow_schema.serialized_schema) ) self._column_names = [field.name for field in self._schema] diff --git a/google/cloud/bigquery_storage_v1beta2/proto/arrow.proto b/google/cloud/bigquery_storage_v1beta2/proto/arrow.proto index ad178dce..74733db9 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/arrow.proto +++ b/google/cloud/bigquery_storage_v1beta2/proto/arrow.proto @@ -25,7 +25,7 @@ option java_package = "com.google.cloud.bigquery.storage.v1beta2"; // Arrow schema as specified in // https://arrow.apache.org/docs/python/api/datatypes.html // and serialized to bytes using IPC: -// https://arrow.apache.org/docs/ipc.html. +// https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc // // See code samples on how this message can be deserialized. message ArrowSchema { diff --git a/google/cloud/bigquery_storage_v1beta2/proto/arrow_pb2.py b/google/cloud/bigquery_storage_v1beta2/proto/arrow_pb2.py index ecf172c3..edc0d585 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/arrow_pb2.py +++ b/google/cloud/bigquery_storage_v1beta2/proto/arrow_pb2.py @@ -191,11 +191,10 @@ __module__="google.cloud.bigquery.storage_v1beta2.proto.arrow_pb2", __doc__="""Arrow schema as specified in https://arrow.apache.org/docs/python/api/datatypes.html and serialized - to bytes using IPC: https://arrow.apache.org/docs/ipc.html. - - See code samples on how this message can be deserialized. - - + to bytes using IPC: + https://arrow.apache.org/docs/format/Columnar.html#serialization-and- + interprocess-communication-ipc See code samples on how this message + can be deserialized. Attributes: serialized_schema: IPC serialized Arrow schema. @@ -212,8 +211,6 @@ DESCRIPTOR=_ARROWRECORDBATCH, __module__="google.cloud.bigquery.storage_v1beta2.proto.arrow_pb2", __doc__="""Arrow RecordBatch. - - Attributes: serialized_record_batch: IPC-serialized Arrow RecordBatch. @@ -230,8 +227,6 @@ DESCRIPTOR=_ARROWSERIALIZATIONOPTIONS, __module__="google.cloud.bigquery.storage_v1beta2.proto.arrow_pb2", __doc__="""Contains options specific to Arrow Serialization. - - Attributes: format: The Arrow IPC format to use. diff --git a/google/cloud/bigquery_storage_v1beta2/proto/avro_pb2.py b/google/cloud/bigquery_storage_v1beta2/proto/avro_pb2.py index f3df85cf..fb059ff1 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/avro_pb2.py +++ b/google/cloud/bigquery_storage_v1beta2/proto/avro_pb2.py @@ -116,8 +116,6 @@ DESCRIPTOR=_AVROSCHEMA, __module__="google.cloud.bigquery.storage_v1beta2.proto.avro_pb2", __doc__="""Avro schema. - - Attributes: schema: Json serialized schema, as described at @@ -135,8 +133,6 @@ DESCRIPTOR=_AVROROWS, __module__="google.cloud.bigquery.storage_v1beta2.proto.avro_pb2", __doc__="""Avro rows. 
- - Attributes: serialized_binary_rows: Binary serialized rows in a block. diff --git a/google/cloud/bigquery_storage_v1beta2/proto/storage_pb2.py b/google/cloud/bigquery_storage_v1beta2/proto/storage_pb2.py index d93e2e0b..e711ad52 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/storage_pb2.py +++ b/google/cloud/bigquery_storage_v1beta2/proto/storage_pb2.py @@ -613,8 +613,6 @@ DESCRIPTOR=_CREATEREADSESSIONREQUEST, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", __doc__="""Request message for ``CreateReadSession``. - - Attributes: parent: Required. The request project that owns the session, in the @@ -643,8 +641,6 @@ DESCRIPTOR=_READROWSREQUEST, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", __doc__="""Request message for ``ReadRows``. - - Attributes: read_stream: Required. Stream to read rows from. @@ -664,10 +660,7 @@ dict( DESCRIPTOR=_THROTTLESTATE, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", - __doc__="""Information on if the current connection is being - throttled. - - + __doc__="""Information on if the current connection is being throttled. Attributes: throttle_percent: How much this connection is being throttled. Zero means no @@ -713,8 +706,6 @@ DESCRIPTOR=_STREAMSTATS, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", __doc__="""Estimated stream statistics for a given Stream. - - Attributes: progress: Represents the progress of the current stream. @@ -731,10 +722,8 @@ dict( DESCRIPTOR=_READROWSRESPONSE, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", - __doc__="""Response from calling ``ReadRows`` may include row data, - progress and throttling information. - - + __doc__="""Response from calling ``ReadRows`` may include row data, progress and + throttling information. Attributes: rows: Row data is returned in format specified during session @@ -763,8 +752,6 @@ DESCRIPTOR=_SPLITREADSTREAMREQUEST, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", __doc__="""Request message for ``SplitReadStream``. - - Attributes: name: Required. Name of the stream to split. @@ -790,8 +777,6 @@ DESCRIPTOR=_SPLITREADSTREAMRESPONSE, __module__="google.cloud.bigquery.storage_v1beta2.proto.storage_pb2", __doc__="""Response message for ``SplitReadStream``. - - Attributes: primary_stream: Primary stream, which contains the beginning portion of diff --git a/google/cloud/bigquery_storage_v1beta2/proto/stream.proto b/google/cloud/bigquery_storage_v1beta2/proto/stream.proto index cad7f95a..38d5de47 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/stream.proto +++ b/google/cloud/bigquery_storage_v1beta2/proto/stream.proto @@ -67,8 +67,7 @@ message ReadSession { repeated string selected_fields = 1; // SQL text filtering statement, similar to a WHERE clause in a query. - // Currently, only a single predicate that is a comparison between a column - // and a constant value is supported. Aggregates are not supported. + // Aggregates are not supported. 
// // Examples: "int_field > 5" // "date_field = CAST('2014-9-27' as DATE)" diff --git a/google/cloud/bigquery_storage_v1beta2/proto/stream_pb2.py b/google/cloud/bigquery_storage_v1beta2/proto/stream_pb2.py index 08e60bb3..f532985c 100644 --- a/google/cloud/bigquery_storage_v1beta2/proto/stream_pb2.py +++ b/google/cloud/bigquery_storage_v1beta2/proto/stream_pb2.py @@ -483,8 +483,6 @@ DESCRIPTOR=_READSESSION_TABLEMODIFIERS, __module__="google.cloud.bigquery.storage_v1beta2.proto.stream_pb2", __doc__="""Additional attributes when reading a table. - - Attributes: snapshot_time: The snapshot time of the table. If not set, interpreted as @@ -500,8 +498,6 @@ DESCRIPTOR=_READSESSION_TABLEREADOPTIONS, __module__="google.cloud.bigquery.storage_v1beta2.proto.stream_pb2", __doc__="""Options dictating how we read a table. - - Attributes: selected_fields: Names of the fields in the table that should be read. If @@ -511,12 +507,10 @@ fields in selected\_fields. row_restriction: SQL text filtering statement, similar to a WHERE clause in a - query. Currently, only a single predicate that is a comparison - between a column and a constant value is supported. Aggregates - are not supported. Examples: "int\_field > 5" "date\_field = - CAST('2014-9-27' as DATE)" "nullable\_field is not NULL" - "st\_equals(geo\_field, st\_geofromtext("POINT(2, 2)"))" - "numeric\_field BETWEEN 1.0 AND 5.0" + query. Aggregates are not supported. Examples: "int\_field > + 5" "date\_field = CAST('2014-9-27' as DATE)" "nullable\_field + is not NULL" "st\_equals(geo\_field, st\_geofromtext("POINT(2, + 2)"))" "numeric\_field BETWEEN 1.0 AND 5.0" arrow_serialization_options: Optional. Options specific to the Apache Arrow output format. """, @@ -526,8 +520,6 @@ DESCRIPTOR=_READSESSION, __module__="google.cloud.bigquery.storage_v1beta2.proto.stream_pb2", __doc__="""Information about the ReadSession. - - Attributes: name: Output only. Unique identifier for the session, in the form `` @@ -579,11 +571,9 @@ dict( DESCRIPTOR=_READSTREAM, __module__="google.cloud.bigquery.storage_v1beta2.proto.stream_pb2", - __doc__="""Information about a single stream that gets data out of - the storage system. Most of the information about ``ReadStream`` - instances is aggregated, making ``ReadStream`` lightweight. - - + __doc__="""Information about a single stream that gets data out of the storage + system. Most of the information about ``ReadStream`` instances is + aggregated, making ``ReadStream`` lightweight. Attributes: name: Output only. Name of the stream, in the form ``projects/{proje diff --git a/noxfile.py b/noxfile.py index 037397df..eb939299 100644 --- a/noxfile.py +++ b/noxfile.py @@ -113,8 +113,7 @@ def system(session): # Install all test dependencies, then install this package into the # virtualenv's dist-packages. 
- session.install("mock", "pytest") - + session.install("mock", "pytest", "google-cloud-testutils") session.install("-e", ".[fastavro,pandas,pyarrow]") session.install("-e", ".") @@ -160,7 +159,7 @@ def docs(session): """Build the docs for this library.""" session.install("-e", ".") - session.install("sphinx", "alabaster", "recommonmark") + session.install("sphinx<3.0.0", "alabaster", "recommonmark") shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) session.run( diff --git a/samples/quickstart.py b/samples/quickstart.py index 19203984..8358fdc1 100644 --- a/samples/quickstart.py +++ b/samples/quickstart.py @@ -17,7 +17,7 @@ def main(project_id="your-project-id", snapshot_millis=0): # [START bigquerystorage_quickstart] - from google.cloud import bigquery_storage_v1beta1 + from google.cloud import bigquery_storage_v1 # TODO(developer): Set the project_id variable. # project_id = 'your-project-id' @@ -25,54 +25,47 @@ def main(project_id="your-project-id", snapshot_millis=0): # The read session is created in this project. This project can be # different from that which contains the table. - client = bigquery_storage_v1beta1.BigQueryStorageClient() + client = bigquery_storage_v1.BigQueryReadClient() # This example reads baby name data from the public datasets. - table_ref = bigquery_storage_v1beta1.types.TableReference() - table_ref.project_id = "bigquery-public-data" - table_ref.dataset_id = "usa_names" - table_ref.table_id = "usa_1910_current" + table = "projects/{}/datasets/{}/tables/{}".format( + "bigquery-public-data", "usa_names", "usa_1910_current" + ) + + requested_session = bigquery_storage_v1.types.ReadSession() + requested_session.table = table + # This API can also deliver data serialized in Apache Arrow format. + # This example leverages Apache Avro. + requested_session.data_format = bigquery_storage_v1.enums.DataFormat.AVRO # We limit the output columns to a subset of those allowed in the table, # and set a simple filter to only report names from the state of # Washington (WA). - read_options = bigquery_storage_v1beta1.types.TableReadOptions() - read_options.selected_fields.append("name") - read_options.selected_fields.append("number") - read_options.selected_fields.append("state") - read_options.row_restriction = 'state = "WA"' + requested_session.read_options.selected_fields.append("name") + requested_session.read_options.selected_fields.append("number") + requested_session.read_options.selected_fields.append("state") + requested_session.read_options.row_restriction = 'state = "WA"' # Set a snapshot time if it's been specified. modifiers = None if snapshot_millis > 0: - modifiers = bigquery_storage_v1beta1.types.TableModifiers() - modifiers.snapshot_time.FromMilliseconds(snapshot_millis) + requested_session.table_modifiers.snapshot_time.FromMilliseconds( + snapshot_millis + ) parent = "projects/{}".format(project_id) session = client.create_read_session( - table_ref, parent, - table_modifiers=modifiers, - read_options=read_options, - # This API can also deliver data serialized in Apache Arrow format. - # This example leverages Apache Avro. - format_=bigquery_storage_v1beta1.enums.DataFormat.AVRO, - # We use a LIQUID strategy in this example because we only read from a - # single stream. Consider BALANCED if you're consuming multiple streams - # concurrently and want more consistent stream sizes. - sharding_strategy=(bigquery_storage_v1beta1.enums.ShardingStrategy.LIQUID), - ) # API request. - - # We'll use only a single stream for reading data from the table. 
Because - # of dynamic sharding, this will yield all the rows in the table. However, - # if you wanted to fan out multiple readers you could do so by having a - # reader process each individual stream. - reader = client.read_rows( - bigquery_storage_v1beta1.types.StreamPosition(stream=session.streams[0]) + requested_session, + # We'll use only a single stream for reading data from the table. However, + # if you wanted to fan out multiple readers you could do so by having a + # reader process each individual stream. + max_stream_count=1, ) + reader = client.read_rows(session.streams[0].name) # The read stream contains blocks of Avro-encoded bytes. The rows() method - # uses the fastavro library to parse these blocks as an interable of Python + # uses the fastavro library to parse these blocks as an iterable of Python # dictionaries. Install fastavro with the following command: # # pip install google-cloud-bigquery-storage[fastavro] diff --git a/setup.cfg b/setup.cfg index 3bd55550..c3a2b39f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,19 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Generated by synthtool. DO NOT EDIT! [bdist_wheel] universal = 1 diff --git a/setup.py b/setup.py index 09a47675..08d7d873 100644 --- a/setup.py +++ b/setup.py @@ -21,8 +21,8 @@ name = "google-cloud-bigquery-storage" description = "BigQuery Storage API API client library" -version = "0.8.0" -release_status = "Development Status :: 4 - Beta" +version = "1.0.0" +release_status = "Development Status :: 5 - Production/Stable" dependencies = [ "google-api-core[grpc] >= 1.14.0, < 2.0.0dev", 'enum34; python_version < "3.4"', @@ -30,7 +30,7 @@ extras = { "pandas": "pandas>=0.17.1", "fastavro": "fastavro>=0.21.2", - "pyarrow": "pyarrow>=0.13.0, != 0.14.0", + "pyarrow": "pyarrow>=0.15.0", } package_root = os.path.abspath(os.path.dirname(__file__)) diff --git a/synth.metadata b/synth.metadata index bef95ffe..6f343a57 100644 --- a/synth.metadata +++ b/synth.metadata @@ -1,27 +1,32 @@ { - "updateTime": "2020-02-20T23:57:11.494163Z", "sources": [ { "generator": { "name": "artman", - "version": "0.45.0", - "dockerImage": "googleapis/artman@sha256:6aec9c34db0e4be221cdaf6faba27bdc07cfea846808b3d3b964dfce3a9a0f9b" + "version": "2.0.0", + "dockerImage": "googleapis/artman@sha256:b3b47805231a305d0f40c4bf069df20f6a2635574e6d4259fac651d3f9f6e098" + } + }, + { + "git": { + "name": ".", + "remote": "https://github.com/googleapis/python-bigquery-storage.git", + "sha": "9812244202f131142286fc48126ecb633515a6bc" } }, { "git": { "name": "googleapis", "remote": "https://github.com/googleapis/googleapis.git", - "sha": "3eaaaf8626ce5b0c0bc7eee05e143beffa373b01", - "internalRef": "296274723", - "log": "3eaaaf8626ce5b0c0bc7eee05e143beffa373b01\nAdd BUILD.bazel for v1 secretmanager.googleapis.com\n\nPiperOrigin-RevId: 296274723\n\ne76149c3d992337f85eeb45643106aacae7ede82\nMove securitycenter v1 to use generate from annotations.\n\nPiperOrigin-RevId: 
296266862\n\n203740c78ac69ee07c3bf6be7408048751f618f8\nAdd StackdriverLoggingConfig field to Cloud Tasks v2 API.\n\nPiperOrigin-RevId: 296256388\n\n" + "sha": "28e76243c23cc282efbb288cb558c174e3e5e9ee", + "internalRef": "308294748" } }, { - "template": { - "name": "python_split_library", - "origin": "synthtool.gcp", - "version": "2020.2.4" + "git": { + "name": "synthtool", + "remote": "https://github.com/googleapis/synthtool.git", + "sha": "01b6f23d24b27878b48667ce597876d66b59780e" } } ], diff --git a/synth.py b/synth.py index 336384b9..043dee12 100644 --- a/synth.py +++ b/synth.py @@ -17,17 +17,15 @@ import synthtool as s from synthtool import gcp -gapic = gcp.GAPICGenerator() +gapic = gcp.GAPICBazel() common = gcp.CommonTemplates() versions = ["v1beta1", "v1beta2", "v1"] for version in versions: - library = gapic.py_library( - "bigquery_storage", - version, - config_path="/google/cloud/bigquery/storage/" f"artman_bigquerystorage_{version}.yaml", - artman_output_name=f"bigquerystorage-{version}", + service="bigquery_storage", + version=version, + bazel_target=f"//google/cloud/bigquery/storage/{version}:bigquery-storage-{version}-py", include_protos=True, ) diff --git a/tests/system/v1/test_reader_dataframe_v1.py b/tests/system/v1/test_reader_dataframe_v1.py index 2162ba48..ec3e983c 100644 --- a/tests/system/v1/test_reader_dataframe_v1.py +++ b/tests/system/v1/test_reader_dataframe_v1.py @@ -46,12 +46,12 @@ def test_read_v1(client, project_id): assert tbl.num_columns == 4 schema = tbl.schema - # Use field_by_name because the order doesn't currently match that of - # selected_fields. - assert pyarrow.types.is_int64(schema.field_by_name("station_id").type) - assert pyarrow.types.is_float64(schema.field_by_name("latitude").type) - assert pyarrow.types.is_float64(schema.field_by_name("longitude").type) - assert pyarrow.types.is_string(schema.field_by_name("name").type) + # Use field with a name specifier as there may be ordering differences + # when selected_fields is used + assert pyarrow.types.is_int64(schema.field("station_id").type) + assert pyarrow.types.is_float64(schema.field("latitude").type) + assert pyarrow.types.is_float64(schema.field("longitude").type) + assert pyarrow.types.is_string(schema.field("name").type) @pytest.mark.parametrize( diff --git a/tests/system/v1beta1/test_reader_dataframe_v1beta1.py b/tests/system/v1beta1/test_reader_dataframe_v1beta1.py index 07dcab38..20143f0f 100644 --- a/tests/system/v1beta1/test_reader_dataframe_v1beta1.py +++ b/tests/system/v1beta1/test_reader_dataframe_v1beta1.py @@ -48,12 +48,12 @@ def test_read_rows_to_arrow(client, project_id): assert tbl.num_columns == 4 schema = tbl.schema - # Use field_by_name because the order doesn't currently match that of - # selected_fields. 
- assert pyarrow.types.is_int64(schema.field_by_name("station_id").type) - assert pyarrow.types.is_float64(schema.field_by_name("latitude").type) - assert pyarrow.types.is_float64(schema.field_by_name("longitude").type) - assert pyarrow.types.is_string(schema.field_by_name("name").type) + # Use field with a name specifier as there may be ordering differences + # when selected_fields is used + assert pyarrow.types.is_int64(schema.field("station_id").type) + assert pyarrow.types.is_float64(schema.field("latitude").type) + assert pyarrow.types.is_float64(schema.field("longitude").type) + assert pyarrow.types.is_string(schema.field("name").type) @pytest.mark.parametrize( diff --git a/tests/unit/test_reader_v1.py b/tests/unit/test_reader_v1.py index 65d80d3a..febc872d 100644 --- a/tests/unit/test_reader_v1.py +++ b/tests/unit/test_reader_v1.py @@ -233,12 +233,14 @@ def _bq_to_avro_schema(bq_columns): def _bq_to_arrow_schema(bq_columns): def bq_col_as_field(column): - doc = column.get("description") + metadata = None + if column.get("description") is not None: + metadata = {"description": column.get("description")} name = column["name"] type_ = BQ_TO_ARROW_TYPES[column["type"]] mode = column.get("mode", "nullable").lower() - return pyarrow.field(name, type_, mode == "nullable", {"description": doc}) + return pyarrow.field(name, type_, mode == "nullable", metadata) return pyarrow.schema(bq_col_as_field(c) for c in bq_columns) @@ -643,6 +645,92 @@ def test_to_dataframe_w_dtypes_arrow(class_under_test): ) +def test_to_dataframe_empty_w_scalars_avro(class_under_test): + avro_schema = _bq_to_avro_schema(SCALAR_COLUMNS) + read_session = _generate_avro_read_session(avro_schema) + avro_blocks = _bq_to_avro_blocks([], avro_schema) + reader = class_under_test(avro_blocks, mock_client, "", 0, {}) + + got = reader.to_dataframe(read_session) + + expected = pandas.DataFrame(columns=SCALAR_COLUMN_NAMES) + expected["int_col"] = expected["int_col"].astype("int64") + expected["float_col"] = expected["float_col"].astype("float64") + expected["bool_col"] = expected["bool_col"].astype("bool") + expected["ts_col"] = expected["ts_col"].astype("datetime64[ns, UTC]") + + pandas.testing.assert_frame_equal( + got.reset_index(drop=True), # reset_index to ignore row labels + expected.reset_index(drop=True), + ) + + +def test_to_dataframe_empty_w_scalars_arrow(class_under_test): + arrow_schema = _bq_to_arrow_schema(SCALAR_COLUMNS) + read_session = _generate_arrow_read_session(arrow_schema) + arrow_batches = _bq_to_arrow_batches([], arrow_schema) + reader = class_under_test(arrow_batches, mock_client, "", 0, {}) + + got = reader.to_dataframe(read_session) + + expected = pandas.DataFrame([], columns=SCALAR_COLUMN_NAMES) + expected["int_col"] = expected["int_col"].astype("int64") + expected["float_col"] = expected["float_col"].astype("float64") + expected["bool_col"] = expected["bool_col"].astype("bool") + expected["ts_col"] = expected["ts_col"].astype("datetime64[ns, UTC]") + + pandas.testing.assert_frame_equal( + got.reset_index(drop=True), # reset_index to ignore row labels + expected.reset_index(drop=True), + ) + + +def test_to_dataframe_empty_w_dtypes_avro(class_under_test, mock_client): + avro_schema = _bq_to_avro_schema( + [ + {"name": "bigfloat", "type": "float64"}, + {"name": "lilfloat", "type": "float64"}, + ] + ) + read_session = _generate_avro_read_session(avro_schema) + avro_blocks = _bq_to_avro_blocks([], avro_schema) + reader = class_under_test(avro_blocks, mock_client, "", 0, {}) + + got = 
reader.to_dataframe(read_session, dtypes={"lilfloat": "float16"}) + + expected = pandas.DataFrame([], columns=["bigfloat", "lilfloat"]) + expected["bigfloat"] = expected["bigfloat"].astype("float64") + expected["lilfloat"] = expected["lilfloat"].astype("float16") + + pandas.testing.assert_frame_equal( + got.reset_index(drop=True), # reset_index to ignore row labels + expected.reset_index(drop=True), + ) + + +def test_to_dataframe_empty_w_dtypes_arrow(class_under_test, mock_client): + arrow_schema = _bq_to_arrow_schema( + [ + {"name": "bigfloat", "type": "float64"}, + {"name": "lilfloat", "type": "float64"}, + ] + ) + read_session = _generate_arrow_read_session(arrow_schema) + arrow_batches = _bq_to_arrow_batches([], arrow_schema) + reader = class_under_test(arrow_batches, mock_client, "", 0, {}) + + got = reader.to_dataframe(read_session, dtypes={"lilfloat": "float16"}) + + expected = pandas.DataFrame([], columns=["bigfloat", "lilfloat"]) + expected["bigfloat"] = expected["bigfloat"].astype("float64") + expected["lilfloat"] = expected["lilfloat"].astype("float16") + + pandas.testing.assert_frame_equal( + got.reset_index(drop=True), # reset_index to ignore row labels + expected.reset_index(drop=True), + ) + + def test_to_dataframe_by_page(class_under_test, mock_client): bq_columns = [ {"name": "int_col", "type": "int64"}, diff --git a/tests/unit/test_reader_v1beta1.py b/tests/unit/test_reader_v1beta1.py index 3d512752..f30241fa 100644 --- a/tests/unit/test_reader_v1beta1.py +++ b/tests/unit/test_reader_v1beta1.py @@ -234,12 +234,14 @@ def _bq_to_avro_schema(bq_columns): def _bq_to_arrow_schema(bq_columns): def bq_col_as_field(column): - doc = column.get("description") + metadata = None + if column.get("description") is not None: + metadata = {"description": column.get("description")} name = column["name"] type_ = BQ_TO_ARROW_TYPES[column["type"]] mode = column.get("mode", "nullable").lower() - return pyarrow.field(name, type_, mode == "nullable", {"description": doc}) + return pyarrow.field(name, type_, mode == "nullable", metadata) return pyarrow.schema(bq_col_as_field(c) for c in bq_columns)
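The new unit tests above exercise the empty-stream `to_dataframe` path. An analogous check for the `to_arrow` path is not part of this diff, but under the same test helpers a sketch could look like:

    def test_to_arrow_empty(class_under_test, mock_client):
        arrow_schema = _bq_to_arrow_schema(SCALAR_COLUMNS)
        read_session = _generate_arrow_read_session(arrow_schema)
        arrow_batches = _bq_to_arrow_batches([], arrow_schema)
        reader = class_under_test(arrow_batches, mock_client, "", 0, {})

        # An empty stream should still yield a Table carrying the full schema.
        tbl = reader.to_arrow(read_session)
        assert tbl.num_rows == 0
        assert [field.name for field in tbl.schema] == list(SCALAR_COLUMN_NAMES)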