diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bbde01f4..034f4f324 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [3.10.0](https://github.com/googleapis/python-bigquery/compare/v3.9.0...v3.10.0) (2023-04-18) + + +### Features + +* Add date, datetime, time, timestamp dtype to to_dataframe ([#1547](https://github.com/googleapis/python-bigquery/issues/1547)) ([64e913d](https://github.com/googleapis/python-bigquery/commit/64e913d73832f6363466cbea5ace2337c86fa58b)) + ## [3.9.0](https://github.com/googleapis/python-bigquery/compare/v3.8.0...v3.9.0) (2023-03-28) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 601aa13df..a14dbec9b 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -290,6 +290,10 @@ def default_types_mapper( int_dtype: Union[Any, None] = None, float_dtype: Union[Any, None] = None, string_dtype: Union[Any, None] = None, + date_dtype: Union[Any, None] = None, + datetime_dtype: Union[Any, None] = None, + time_dtype: Union[Any, None] = None, + timestamp_dtype: Union[Any, None] = None, ): """Create a mapping from pyarrow types to pandas types. @@ -321,13 +325,28 @@ def types_mapper(arrow_data_type): elif ( # If date_as_object is True, we know some DATE columns are # out-of-bounds of what is supported by pandas. 
- not date_as_object + date_dtype is not None + and not date_as_object and pyarrow.types.is_date(arrow_data_type) ): - return db_dtypes.DateDtype() + return date_dtype - elif pyarrow.types.is_time(arrow_data_type): - return db_dtypes.TimeDtype() + elif ( + datetime_dtype is not None + and pyarrow.types.is_timestamp(arrow_data_type) + and arrow_data_type.tz is None + ): + return datetime_dtype + + elif ( + timestamp_dtype is not None + and pyarrow.types.is_timestamp(arrow_data_type) + and arrow_data_type.tz is not None + ): + return timestamp_dtype + + elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type): + return time_dtype return types_mapper diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index e4e3d22fc..553853630 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -90,6 +90,12 @@ class DefaultPandasDTypes(enum.Enum): INT_DTYPE = object() """Specifies default integer dtype""" + DATE_DTYPE = object() + """Specifies default date dtype""" + + TIME_DTYPE = object() + """Specifies default time dtype""" + class DestinationFormat(object): """The exported file format. The default value is :attr:`CSV`. diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index e4807cc63..315d8201c 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -58,6 +58,11 @@ except ImportError: # pragma: NO COVER pandas = None +try: + import db_dtypes # type: ignore +except ImportError: # pragma: NO COVER + db_dtypes = None + if typing.TYPE_CHECKING: # pragma: NO COVER # Assumption: type checks are only used by library developers and CI environments # that have all optional dependencies installed, thus no conditional imports. 
@@ -764,7 +769,6 @@ def __init__(self, job_id, query, client, job_config=None): _helpers._set_sub_prop( self._properties, ["configuration", "query", "query"], query ) - self._query_results = None self._done_timeout = None self._transport_timeout = None @@ -1332,6 +1336,15 @@ def _reload_query_results( # the timeout from the futures API is respected. See: # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4135 timeout_ms = None + + # Python_API_core, as part of a major rewrite of the deadline, timeout, + # retry process sets the timeout value as a Python object(). + # Our system does not natively handle that and instead expects + # either none or a numeric value. If passed a Python object, convert to + # None. + if type(self._done_timeout) == object: # pragma: NO COVER + self._done_timeout = None + if self._done_timeout is not None: # Subtract a buffer for context switching, network latency, etc. api_timeout = self._done_timeout - _TIMEOUT_BUFFER_SECS @@ -1629,6 +1642,10 @@ def to_dataframe( int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, float_dtype: Union[Any, None] = None, string_dtype: Union[Any, None] = None, + date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE, + datetime_dtype: Union[Any, None] = None, + time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, + timestamp_dtype: Union[Any, None] = None, ) -> "pandas.DataFrame": """Return a pandas DataFrame from a QueryJob @@ -1689,7 +1706,7 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 int_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) @@ -1699,7 +1716,7 @@ def to_dataframe( Integer types can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types - .. versionadded:: 3.7.1 + .. 
versionadded:: 3.8.0 float_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) @@ -1709,7 +1726,7 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 string_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to @@ -1719,7 +1736,50 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 + + date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date + type, instead of relying on the default ``db_dtypes.DateDtype()``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery + Date type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type + + .. versionadded:: 3.10.0 + + datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime + type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery + Datetime type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type + + .. versionadded:: 3.10.0 + + time_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. 
+ ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time + type, instead of relying on the default ``db_dtypes.TimeDtype()``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("object")``. BigQuery Time type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type + + .. versionadded:: 3.10.0 + + timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp + type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery + Timestamp type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type + + .. versionadded:: 3.10.0 Returns: pandas.DataFrame: @@ -1747,6 +1807,10 @@ def to_dataframe( int_dtype=int_dtype, float_dtype=float_dtype, string_dtype=string_dtype, + date_dtype=date_dtype, + datetime_dtype=datetime_dtype, + time_dtype=time_dtype, + timestamp_dtype=timestamp_dtype, ) # If changing the signature of this method, make sure to apply the same diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 93b0da67f..a34e5dc25 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1935,6 +1935,10 @@ def to_dataframe( int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE, float_dtype: Union[Any, None] = None, string_dtype: Union[Any, None] = None, + date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE, + datetime_dtype: Union[Any, None] = None, + time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE, + timestamp_dtype: Union[Any, None] = None, ) -> "pandas.DataFrame": """Create a pandas DataFrame by loading all pages of a query. 
@@ -1999,7 +2003,7 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 int_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``) @@ -2009,7 +2013,7 @@ def to_dataframe( Integer types can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 float_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``) @@ -2019,7 +2023,7 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 string_dtype (Optional[pandas.Series.dtype, None]): If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to @@ -2029,7 +2033,50 @@ def to_dataframe( type can be found at: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type - .. versionadded:: 3.7.1 + .. versionadded:: 3.8.0 + + date_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date + type, instead of relying on the default ``db_dtypes.DateDtype()``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery + Date type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type + + .. versionadded:: 3.10.0 + + datetime_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime + type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``. 
+ If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery + Datetime type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type + + .. versionadded:: 3.10.0 + + time_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time + type, instead of relying on the default ``db_dtypes.TimeDtype()``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("object")``. BigQuery Time type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type + + .. versionadded:: 3.10.0 + + timestamp_dtype (Optional[pandas.Series.dtype, None]): + If set, indicate a pandas ExtensionDtype (e.g. + ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp + type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``. + If you explicitly set the value to ``None``, then the data type will be + ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery + Timestamp type can be found at: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type + + ..
versionadded:: 3.10.0 Returns: pandas.DataFrame: @@ -2059,6 +2106,9 @@ def to_dataframe( if int_dtype is DefaultPandasDTypes.INT_DTYPE: int_dtype = pandas.Int64Dtype() + if time_dtype is DefaultPandasDTypes.TIME_DTYPE: + time_dtype = db_dtypes.TimeDtype() + if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"): raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE) @@ -2071,6 +2121,24 @@ def to_dataframe( if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"): raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE) + if ( + date_dtype is not None + and date_dtype is not DefaultPandasDTypes.DATE_DTYPE + and not hasattr(date_dtype, "__from_arrow__") + ): + raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE) + + if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"): + raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE) + + if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"): + raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE) + + if timestamp_dtype is not None and not hasattr( + timestamp_dtype, "__from_arrow__" + ): + raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE) + if dtypes is None: dtypes = {} @@ -2086,25 +2154,29 @@ def to_dataframe( create_bqstorage_client=create_bqstorage_client, ) - # When converting date or timestamp values to nanosecond precision, the result - # can be out of pyarrow bounds. To avoid the error when converting to - # Pandas, we set the date_as_object or timestamp_as_object parameter to True, - # if necessary. - date_as_object = not all( - self.__can_cast_timestamp_ns(col) - for col in record_batch - # Type can be date32 or date64 (plus units). - # See: https://arrow.apache.org/docs/python/api/datatypes.html - if pyarrow.types.is_date(col.type) - ) + # Default date dtype is `db_dtypes.DateDtype()` that could cause out of bounds error, + # when pyarrow converts date values to nanosecond precision. 
To avoid the error, we + # set the date_as_object parameter to True, if necessary. + date_as_object = False + if date_dtype is DefaultPandasDTypes.DATE_DTYPE: + date_dtype = db_dtypes.DateDtype() + date_as_object = not all( + self.__can_cast_timestamp_ns(col) + for col in record_batch + # Type can be date32 or date64 (plus units). + # See: https://arrow.apache.org/docs/python/api/datatypes.html + if pyarrow.types.is_date(col.type) + ) - timestamp_as_object = not all( - self.__can_cast_timestamp_ns(col) - for col in record_batch - # Type can be datetime and timestamp (plus units and time zone). - # See: https://arrow.apache.org/docs/python/api/datatypes.html - if pyarrow.types.is_timestamp(col.type) - ) + timestamp_as_object = False + if datetime_dtype is None and timestamp_dtype is None: + timestamp_as_object = not all( + self.__can_cast_timestamp_ns(col) + for col in record_batch + # Type can be datetime and timestamp (plus units and time zone). + # See: https://arrow.apache.org/docs/python/api/datatypes.html + if pyarrow.types.is_timestamp(col.type) + ) if len(record_batch) > 0: df = record_batch.to_pandas( @@ -2117,6 +2189,10 @@ def to_dataframe( int_dtype=int_dtype, float_dtype=float_dtype, string_dtype=string_dtype, + date_dtype=date_dtype, + datetime_dtype=datetime_dtype, + time_dtype=time_dtype, + timestamp_dtype=timestamp_dtype, ), ) else: @@ -2317,6 +2393,10 @@ def to_dataframe( int_dtype=None, float_dtype=None, string_dtype=None, + date_dtype=None, + datetime_dtype=None, + time_dtype=None, + timestamp_dtype=None, ) -> "pandas.DataFrame": """Create an empty dataframe. @@ -2330,6 +2410,10 @@ def to_dataframe( int_dtype (Any): Ignored. Added for compatibility with RowIterator. float_dtype (Any): Ignored. Added for compatibility with RowIterator. string_dtype (Any): Ignored. Added for compatibility with RowIterator. + date_dtype (Any): Ignored. Added for compatibility with RowIterator. + datetime_dtype (Any): Ignored. Added for compatibility with RowIterator. 
+ time_dtype (Any): Ignored. Added for compatibility with RowIterator. + timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index 0bc275357..b674396b2 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "3.9.0" +__version__ = "3.10.0" diff --git a/samples/geography/requirements-test.txt b/samples/geography/requirements-test.txt index e0ec46254..3c3afdcb1 100644 --- a/samples/geography/requirements-test.txt +++ b/samples/geography/requirements-test.txt @@ -1,2 +1,2 @@ -pytest==7.2.2 -mock==5.0.1 +pytest==7.3.1 +mock==5.0.2 diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index 75964dbe1..49dd1c156 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,4 +1,4 @@ -attrs==22.2.0 +attrs==23.1.0 certifi==2022.12.7 cffi==1.15.1 charset-normalizer==3.1.0 @@ -6,27 +6,27 @@ click==8.1.3 click-plugins==1.1.1 cligj==0.7.2 dataclasses==0.8; python_version < '3.7' -db-dtypes==1.0.5 -Fiona==1.9.1 +db-dtypes==1.1.1 +Fiona==1.9.3 geojson==3.0.1 geopandas===0.10.2; python_version == '3.7' geopandas==0.12.2; python_version >= '3.8' google-api-core==2.11.0 -google-auth==2.16.2 -google-cloud-bigquery==3.6.0 -google-cloud-bigquery-storage==2.19.0 +google-auth==2.17.3 +google-cloud-bigquery==3.9.0 +google-cloud-bigquery-storage==2.19.1 google-cloud-core==2.3.2 google-crc32c==1.5.0 google-resumable-media==2.4.1 -googleapis-common-protos==1.58.0 -grpcio==1.51.3 +googleapis-common-protos==1.59.0 +grpcio==1.54.0 idna==3.4 libcst==0.4.9 munch==2.5.0 mypy-extensions==1.0.0 -packaging==23.0 +packaging==23.1 pandas===1.3.5; python_version == '3.7' -pandas==1.5.3; python_version >= '3.8' 
+pandas==2.0.0; python_version >= '3.8' proto-plus==1.22.2 pyarrow==11.0.0 pyasn1==0.4.8 @@ -34,7 +34,7 @@ pyasn1-modules==0.2.8 pycparser==2.21 pyparsing==3.0.9 python-dateutil==2.8.2 -pytz==2022.7.1 +pytz==2023.3 PyYAML==6.0 requests==2.28.2 rsa==4.9 @@ -42,4 +42,4 @@ Shapely==2.0.1 six==1.16.0 typing-extensions==4.5.0 typing-inspect==0.8.0 -urllib3==1.26.14 +urllib3==1.26.15 diff --git a/samples/magics/requirements-test.txt b/samples/magics/requirements-test.txt index 3ed7558d5..9fa68a930 100644 --- a/samples/magics/requirements-test.txt +++ b/samples/magics/requirements-test.txt @@ -1,3 +1,3 @@ google-cloud-testutils==1.3.3 -pytest==7.2.2 -mock==5.0.1 +pytest==7.3.1 +mock==5.0.2 diff --git a/samples/magics/requirements.txt b/samples/magics/requirements.txt index 55b828f1b..956b03dda 100644 --- a/samples/magics/requirements.txt +++ b/samples/magics/requirements.txt @@ -1,15 +1,15 @@ -db-dtypes==1.0.5 -google-cloud-bigquery-storage==2.19.0 +db-dtypes==1.1.1 +google-cloud-bigquery-storage==2.19.1 google-auth-oauthlib==1.0.0 -grpcio==1.51.3 -ipywidgets==8.0.4 +grpcio==1.54.0 +ipywidgets==8.0.6 ipython===7.31.1; python_version == '3.7' ipython===8.0.1; python_version == '3.8' -ipython==8.11.0; python_version >= '3.9' +ipython==8.12.0; python_version >= '3.9' matplotlib===3.5.3; python_version == '3.7' matplotlib==3.7.1; python_version >= '3.8' pandas===1.3.5; python_version == '3.7' -pandas==1.5.3; python_version >= '3.8' +pandas==2.0.0; python_version >= '3.8' pyarrow==11.0.0 -pytz==2022.7.1 +pytz==2023.3 typing-extensions==4.5.0 diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 3ed7558d5..9fa68a930 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ google-cloud-testutils==1.3.3 -pytest==7.2.2 -mock==5.0.1 +pytest==7.3.1 +mock==5.0.2 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 6c6b17ea8..034d9d00d 100644 --- 
a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,16 +1,16 @@ -db-dtypes==1.0.5 -google-cloud-bigquery==3.6.0 -google-cloud-bigquery-storage==2.19.0 +db-dtypes==1.1.1 +google-cloud-bigquery==3.9.0 +google-cloud-bigquery-storage==2.19.1 google-auth-oauthlib==1.0.0 -grpcio==1.51.3 -ipywidgets==8.0.4 +grpcio==1.54.0 +ipywidgets==8.0.6 ipython===7.31.1; python_version == '3.7' ipython===8.0.1; python_version == '3.8' -ipython==8.11.0; python_version >= '3.9' +ipython==8.12.0; python_version >= '3.9' matplotlib===3.5.3; python_version == '3.7' matplotlib==3.7.1; python_version >= '3.8' pandas===1.3.5; python_version == '3.7' -pandas==1.5.3; python_version >= '3.8' +pandas==2.0.0; python_version >= '3.8' pyarrow==11.0.0 -pytz==2022.7.1 +pytz==2023.3 typing-extensions==4.5.0 diff --git a/setup.py b/setup.py index 51cb6dc75..08106f694 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ # Keep the no-op bqstorage extra for backward compatibility. # See: https://github.com/googleapis/python-bigquery/issues/757 "bqstorage": [ - "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", + "google-cloud-bigquery-storage >= 2.6.0, <3.0.0dev", # Due to an issue in pip's dependency resolver, the `grpc` extra is not # installed, even though `google-cloud-bigquery-storage` specifies it # as `google-api-core[grpc]`. We thus need to explicitly specify it here. 
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index c94d80abf..2ea482e8b 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -8,7 +8,7 @@ db-dtypes==0.3.0 geopandas==0.9.0 google-api-core==1.31.5 -google-cloud-bigquery-storage==2.0.0 +google-cloud-bigquery-storage==2.6.0 google-cloud-core==1.6.0 google-resumable-media==0.6.0 grpcio==1.47.0 diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 91305b450..ea8cc6d63 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -34,6 +34,7 @@ pandas = pytest.importorskip("pandas", minversion="0.23.0") +pyarrow = pytest.importorskip("pyarrow") numpy = pytest.importorskip("numpy") bigquery_storage = pytest.importorskip( @@ -1109,6 +1110,103 @@ def test_list_rows_nullable_scalars_extreme_dtypes( assert df.dtypes["string_col"].name == "object" +@pytest.mark.parametrize( + ("max_results",), + ( + (None,), + (10,), + ), # Use BQ Storage API. # Use REST API. +) +def test_list_rows_nullable_scalars_extreme_dtypes_w_custom_dtype( + bigquery_client, scalars_extreme_table, max_results +): + # TODO(GH#836): Avoid INTERVAL columns until they are supported by the + # BigQuery Storage API and pyarrow. 
+ schema = [ + bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), + bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), + bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), + bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), + bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), + bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), + bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), + bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), + bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), + bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), + bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), + bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), + ] + + df = bigquery_client.list_rows( + scalars_extreme_table, + max_results=max_results, + selected_fields=schema, + ).to_dataframe( + bool_dtype=pandas.BooleanDtype(), + int_dtype=pandas.Int64Dtype(), + float_dtype=( + pandas.Float64Dtype() + if hasattr(pandas, "Float64Dtype") + else pandas.StringDtype() + ), + string_dtype=pandas.StringDtype(), + date_dtype=( + pandas.ArrowDtype(pyarrow.date32()) + if hasattr(pandas, "ArrowDtype") + else None + ), + datetime_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + time_dtype=( + pandas.ArrowDtype(pyarrow.time64("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + timestamp_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC")) + if hasattr(pandas, "ArrowDtype") + else None + ), + ) + + # These pandas dtypes are handled by the custom dtypes. 
+ assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["float64_col"].name == "Float64" + assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["string_col"].name == "string" + + assert ( + df.dtypes["date_col"].name == "date32[day][pyarrow]" + if hasattr(pandas, "ArrowDtype") + else "datetime64[ns]" + ) + assert ( + df.dtypes["datetime_col"].name == "timestamp[us][pyarrow]" + if hasattr(pandas, "ArrowDtype") + else "object" + ) + assert ( + df.dtypes["timestamp_col"].name == "timestamp[us, tz=UTC][pyarrow]" + if hasattr(pandas, "ArrowDtype") + else "object" + ) + assert ( + df.dtypes["time_col"].name == "time64[us][pyarrow]" + if hasattr(pandas, "ArrowDtype") + else "object" + ) + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["numeric_col"].name == "object" + assert df.dtypes["bignumeric_col"].name == "object" + + # pandas uses Python bytes objects. + assert df.dtypes["bytes_col"].name == "object" + + def test_upload_time_and_datetime_56(bigquery_client, dataset_id): df = pandas.DataFrame( dict( diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index a2444efdd..01b60ceb3 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -59,12 +59,6 @@ @pytest.fixture def table_read_options_kwarg(): - # Create a BigQuery Storage table read options object with pyarrow compression - # enabled if a recent-enough version of google-cloud-bigquery-storage dependency is - # installed to support the compression. 
- if not hasattr(bigquery_storage, "ArrowSerializationOptions"): - return {} - read_options = bigquery_storage.ReadSession.TableReadOptions( arrow_serialization_options=bigquery_storage.ArrowSerializationOptions( buffer_compression=bigquery_storage.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 22c7c048d..53db635fa 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -46,6 +46,7 @@ PYARROW_VERSION = pkg_resources.parse_version("0.0.1") if pyarrow: + import pyarrow import pyarrow.types PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) @@ -3471,11 +3472,45 @@ def test_to_dataframe_w_dtypes_mapper(self): SchemaField("age", "INTEGER"), SchemaField("seconds", "INT64"), SchemaField("miles", "FLOAT64"), + SchemaField("date", "DATE"), + SchemaField("datetime", "DATETIME"), + SchemaField("time", "TIME"), + SchemaField("timestamp", "TIMESTAMP"), ] row_data = [ - ["Phred Phlyntstone", "true", "32", "23000", "1.77"], - ["Bharney Rhubble", "false", "33", "454000", "6.66"], - ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + [ + "Phred Phlyntstone", + "true", + "32", + "23000", + "1.77", + "1999-12-01", + "1999-12-31T00:00:00.000000", + "00:00:00.000000", + "1433836800000000", + ], + [ + "Bharney Rhubble", + "false", + "33", + "454000", + "6.66", + "4567-06-14", + "4567-12-31T00:00:00.000000", + "12:00:00.232413", + "81953424000000000", + ], + [ + "Wylma Phlyntstone", + "true", + "29", + "341000", + "2.0", + "9999-12-31", + "9999-12-31T23:59:59.999999", + "23:59:59.999999", + "253402261199999999", + ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -3486,17 +3521,142 @@ def test_to_dataframe_w_dtypes_mapper(self): create_bqstorage_client=False, bool_dtype=pandas.BooleanDtype(), int_dtype=pandas.Int32Dtype(), - float_dtype=pandas.StringDtype(), + float_dtype=( + pandas.Float64Dtype() + if hasattr(pandas, "Float64Dtype") + 
else pandas.StringDtype() + ), string_dtype=pandas.StringDtype(), + date_dtype=( + pandas.ArrowDtype(pyarrow.date32()) + if hasattr(pandas, "ArrowDtype") + else None + ), + datetime_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + time_dtype=( + pandas.ArrowDtype(pyarrow.time64("us")) + if hasattr(pandas, "ArrowDtype") + else None + ), + timestamp_dtype=( + pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC")) + if hasattr(pandas, "ArrowDtype") + else None + ), ) self.assertIsInstance(df, pandas.DataFrame) + + self.assertEqual(list(df.complete), [True, False, True]) self.assertEqual(df.complete.dtype.name, "boolean") + + self.assertEqual(list(df.age), [32, 33, 29]) self.assertEqual(df.age.dtype.name, "Int32") + + self.assertEqual(list(df.seconds), [23000, 454000, 341000]) self.assertEqual(df.seconds.dtype.name, "Int32") - self.assertEqual(df.miles.dtype.name, "string") + + self.assertEqual( + list(df.name), ["Phred Phlyntstone", "Bharney Rhubble", "Wylma Phlyntstone"] + ) self.assertEqual(df.name.dtype.name, "string") + if hasattr(pandas, "Float64Dtype"): + self.assertEqual(list(df.miles), [1.77, 6.66, 2.0]) + self.assertEqual(df.miles.dtype.name, "Float64") + else: + self.assertEqual(list(df.miles), ["1.77", "6.66", "2.0"]) + self.assertEqual(df.miles.dtype.name, "string") + + if hasattr(pandas, "ArrowDtype"): + self.assertEqual( + list(df.date), + [ + datetime.date(1999, 12, 1), + datetime.date(4567, 6, 14), + datetime.date(9999, 12, 31), + ], + ) + self.assertEqual(df.date.dtype.name, "date32[day][pyarrow]") + + self.assertEqual( + list(df.datetime), + [ + datetime.datetime(1999, 12, 31, 0, 0), + datetime.datetime(4567, 12, 31, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ) + self.assertEqual(df.datetime.dtype.name, "timestamp[us][pyarrow]") + + self.assertEqual( + list(df.time), + [ + datetime.time(0, 0), + datetime.time(12, 0, 0, 232413), + datetime.time(23, 59, 59, 999999), + ], 
+ ) + self.assertEqual(df.time.dtype.name, "time64[us][pyarrow]") + + self.assertEqual( + list(df.timestamp), + [ + datetime.datetime(2015, 6, 9, 8, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(4567, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, 12, 31, 12, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ) + self.assertEqual(df.timestamp.dtype.name, "timestamp[us, tz=UTC][pyarrow]") + else: + self.assertEqual( + list(df.date), + [ + pandas.Timestamp("1999-12-01 00:00:00"), + pandas.Timestamp("2229-03-27 01:41:45.161793536"), + pandas.Timestamp("1816-03-29 05:56:08.066277376"), + ], + ) + self.assertEqual(df.date.dtype.name, "datetime64[ns]") + + self.assertEqual( + list(df.datetime), + [ + datetime.datetime(1999, 12, 31, 0, 0), + datetime.datetime(4567, 12, 31, 0, 0), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ) + self.assertEqual(df.datetime.dtype.name, "object") + + self.assertEqual( + list(df.time), + [ + datetime.time(0, 0), + datetime.time(12, 0, 0, 232413), + datetime.time(23, 59, 59, 999999), + ], + ) + self.assertEqual(df.time.dtype.name, "object") + + self.assertEqual( + list(df.timestamp), + [ + datetime.datetime(2015, 6, 9, 8, 0, tzinfo=datetime.timezone.utc), + datetime.datetime(4567, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, 12, 31, 12, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ) + self.assertEqual(df.timestamp.dtype.name, "object") + @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_none_dtypes_mapper(self): from google.cloud.bigquery.schema import SchemaField @@ -3507,11 +3667,23 @@ def test_to_dataframe_w_none_dtypes_mapper(self): SchemaField("age", "INTEGER"), SchemaField("seconds", "INT64"), SchemaField("miles", "FLOAT64"), + SchemaField("date", "DATE"), + SchemaField("datetime", "DATETIME"), + SchemaField("time", "TIME"), + SchemaField("timestamp", "TIMESTAMP"), ] row_data = [ - ["Phred Phlyntstone", "true", 
"32", "23000", "1.77"], - ["Bharney Rhubble", "false", "33", "454000", "6.66"], - ["Wylma Phlyntstone", "true", "29", "341000", "2.0"], + [ + "Phred Phlyntstone", + "true", + "32", + "23000", + "1.77", + "1999-12-01", + "1999-12-31T00:00:00.000000", + "23:59:59.999999", + "1433836800000000", + ], ] rows = [{"f": [{"v": field} for field in row]} for row in row_data] path = "/foo" @@ -3524,6 +3696,10 @@ def test_to_dataframe_w_none_dtypes_mapper(self): int_dtype=None, float_dtype=None, string_dtype=None, + date_dtype=None, + datetime_dtype=None, + time_dtype=None, + timestamp_dtype=None, ) self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(df.complete.dtype.name, "bool") @@ -3531,6 +3707,10 @@ def test_to_dataframe_w_none_dtypes_mapper(self): self.assertEqual(df.seconds.dtype.name, "int64") self.assertEqual(df.miles.dtype.name, "float64") self.assertEqual(df.name.dtype.name, "object") + self.assertEqual(df.date.dtype.name, "datetime64[ns]") + self.assertEqual(df.datetime.dtype.name, "datetime64[ns]") + self.assertEqual(df.time.dtype.name, "object") + self.assertEqual(df.timestamp.dtype.name, "datetime64[ns, UTC]") @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_unsupported_dtypes_mapper(self): @@ -3568,6 +3748,26 @@ def test_to_dataframe_w_unsupported_dtypes_mapper(self): create_bqstorage_client=False, string_dtype=numpy.dtype("object"), ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + date_dtype=numpy.dtype("object"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + datetime_dtype=numpy.dtype("datetime64[us]"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + time_dtype=numpy.dtype("datetime64[us]"), + ) + with self.assertRaises(ValueError): + row_iterator.to_dataframe( + create_bqstorage_client=False, + timestamp_dtype=numpy.dtype("datetime64[us]"), + ) 
@unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_column_dtypes(self):