Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit dcd78c7

Browse files
authored
fix!: use nullable Int64 and boolean dtypes in to_dataframe (#786)
To override this behavior, specify the types for the desired columns with the `dtype` argument. BREAKING CHANGE: uses Int64 type by default to avoid loss-of-precision in results with large integer values Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes https://issuetracker.google.com/144712110 🦕 Fixes #793
1 parent 66014c3 commit dcd78c7

11 files changed

Lines changed: 340 additions & 39 deletions

File tree

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@
110110
# directories to ignore when looking for source files.
111111
exclude_patterns = [
112112
"_build",
113+
"**/.nox/**/*",
113114
"samples/AUTHORING_GUIDE.md",
114115
"samples/CONTRIBUTING.md",
115116
"samples/snippets/README.rst",

docs/usage/pandas.rst

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@ First, ensure that the :mod:`pandas` library is installed by running:
1414
1515
pip install --upgrade pandas
1616
17-
Alternatively, you can install the BigQuery python client library with
17+
Alternatively, you can install the BigQuery Python client library with
1818
:mod:`pandas` by running:
1919

2020
.. code-block:: bash
2121
22-
pip install --upgrade google-cloud-bigquery[pandas]
22+
pip install --upgrade 'google-cloud-bigquery[pandas]'
2323
2424
To retrieve query results as a :class:`pandas.DataFrame`:
2525

@@ -37,6 +37,27 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
3737
:start-after: [START bigquery_list_rows_dataframe]
3838
:end-before: [END bigquery_list_rows_dataframe]
3939

40+
The following data types are used when creating a pandas DataFrame.
41+
42+
.. list-table:: Pandas Data Type Mapping
43+
:header-rows: 1
44+
45+
* - BigQuery
46+
- pandas
47+
- Notes
48+
* - BOOL
49+
- boolean
50+
-
51+
* - DATETIME
52+
- datetime64[ns], object
53+
- object is used when there are values not representable in pandas
54+
* - FLOAT64
55+
- float64
56+
-
57+
* - INT64
58+
- Int64
59+
-
60+
4061
Load a Pandas DataFrame to a BigQuery Table
4162
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4263

@@ -45,7 +66,7 @@ As of version 1.3.0, you can use the
4566
to load data from a :class:`pandas.DataFrame` to a
4667
:class:`~google.cloud.bigquery.table.Table`. To use this function, in addition
4768
to :mod:`pandas`, you will need to install the :mod:`pyarrow` library. You can
48-
install the BigQuery python client library with :mod:`pandas` and
69+
install the BigQuery Python client library with :mod:`pandas` and
4970
:mod:`pyarrow` by running:
5071

5172
.. code-block:: bash

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import functools
1919
import logging
2020
import queue
21+
from typing import Dict, Sequence
2122
import warnings
2223

2324
try:
@@ -42,15 +43,19 @@
4243

4344
_LOGGER = logging.getLogger(__name__)
4445

45-
_NO_BQSTORAGE_ERROR = (
46-
"The google-cloud-bigquery-storage library is not installed, "
47-
"please install google-cloud-bigquery-storage to use bqstorage features."
48-
)
49-
5046
_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.
5147

5248
_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads
5349

50+
# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
51+
_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
52+
"BOOL": "boolean",
53+
"BOOLEAN": "boolean",
54+
"FLOAT": "float64",
55+
"FLOAT64": "float64",
56+
"INT64": "Int64",
57+
"INTEGER": "Int64",
58+
}
5459
_PANDAS_DTYPE_TO_BQ = {
5560
"bool": "BOOLEAN",
5661
"datetime64[ns, UTC]": "TIMESTAMP",
@@ -217,6 +222,28 @@ def bq_to_arrow_schema(bq_schema):
217222
return pyarrow.schema(arrow_fields)
218223

219224

225+
def bq_schema_to_nullsafe_pandas_dtypes(
226+
bq_schema: Sequence[schema.SchemaField],
227+
) -> Dict[str, str]:
228+
"""Return the default dtypes to use for columns in a BigQuery schema.
229+
230+
Only returns default dtypes which are safe to have NULL values. This
231+
includes Int64, which has pandas.NA values and does not result in
232+
loss-of-precision.
233+
234+
Returns:
235+
A mapping from column names to pandas dtypes.
236+
"""
237+
dtypes = {}
238+
for bq_field in bq_schema:
239+
if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
240+
continue
241+
field_type = bq_field.field_type.upper()
242+
if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
243+
dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
244+
return dtypes
245+
246+
220247
def bq_to_arrow_array(series, bq_field):
221248
arrow_type = bq_to_arrow_data_type(bq_field)
222249

google/cloud/bigquery/table.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1933,6 +1933,13 @@ def to_dataframe(
19331933
bqstorage_client=bqstorage_client,
19341934
create_bqstorage_client=create_bqstorage_client,
19351935
)
1936+
default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
1937+
self.schema
1938+
)
1939+
1940+
# Let the user-defined dtypes override the default ones.
1941+
# https://stackoverflow.com/a/26853961/101923
1942+
dtypes = {**default_dtypes, **dtypes}
19361943

19371944
# When converting timestamp values to nanosecond precision, the result
19381945
# can be out of pyarrow bounds. To avoid the error when converting to
@@ -1954,7 +1961,9 @@ def to_dataframe(
19541961

19551962
extra_kwargs = {"timestamp_as_object": timestamp_as_object}
19561963

1957-
df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
1964+
df = record_batch.to_pandas(
1965+
date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
1966+
)
19581967

19591968
for column in dtypes:
19601969
df[column] = pandas.Series(df[column], dtype=dtypes[column])

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
# Keep the no-op bqstorage extra for backward compatibility.
5151
# See: https://github.com/googleapis/python-bigquery/issues/757
5252
"bqstorage": [],
53-
"pandas": ["pandas>=0.23.0"],
53+
"pandas": ["pandas>=1.0.0"],
5454
"tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
5555
"opentelemetry": [
5656
"opentelemetry-api >= 0.11b0",

testing/constraints-3.6.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ grpcio==1.38.1
1313
opentelemetry-api==0.11b0
1414
opentelemetry-instrumentation==0.11b0
1515
opentelemetry-sdk==0.11b0
16-
pandas==0.23.0
16+
pandas==1.0.0
1717
proto-plus==1.10.0
1818
protobuf==3.12.0
1919
pyarrow==3.0.0

tests/system/test_arrow.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,9 @@
1414

1515
"""System tests for Arrow connector."""
1616

17+
import pyarrow
1718
import pytest
1819

19-
pyarrow = pytest.importorskip(
20-
"pyarrow", minversion="3.0.0"
21-
) # Needs decimal256 for BIGNUMERIC columns.
22-
2320

2421
@pytest.mark.parametrize(
2522
("max_results", "scalars_table_name"),

tests/system/test_pandas.py

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client):
567567
for _, row in df.iterrows():
568568
for col in column_names:
569569
# all the schema fields are nullable, so None is acceptable
570-
if not row[col] is None:
570+
if not pandas.isna(row[col]):
571571
assert isinstance(row[col], exp_datatypes[col])
572572

573573

@@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client):
597597
for index, row in df.iterrows():
598598
for col in column_names:
599599
# all the schema fields are nullable, so None is acceptable
600-
if not row[col] is None:
600+
if not pandas.isna(row[col]):
601601
assert isinstance(row[col], exp_datatypes[col])
602602

603603

@@ -795,3 +795,71 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client):
795795
dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client)
796796

797797
assert len(dataframe.index) == 100
798+
799+
800+
@pytest.mark.parametrize(
801+
("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API.
802+
)
803+
def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results):
804+
df = bigquery_client.list_rows(
805+
scalars_table, max_results=max_results,
806+
).to_dataframe()
807+
808+
assert df.dtypes["bool_col"].name == "boolean"
809+
assert df.dtypes["datetime_col"].name == "datetime64[ns]"
810+
assert df.dtypes["float64_col"].name == "float64"
811+
assert df.dtypes["int64_col"].name == "Int64"
812+
assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]"
813+
814+
# object is used by default, but we can use "datetime64[ns]" automatically
815+
# when data is within the supported range.
816+
# https://github.com/googleapis/python-bigquery/issues/861
817+
assert df.dtypes["date_col"].name == "object"
818+
819+
# object is used by default, but we can use "timedelta64[ns]" automatically
820+
# https://github.com/googleapis/python-bigquery/issues/862
821+
assert df.dtypes["time_col"].name == "object"
822+
823+
# decimal.Decimal is used to avoid loss of precision.
824+
assert df.dtypes["bignumeric_col"].name == "object"
825+
assert df.dtypes["numeric_col"].name == "object"
826+
827+
# pandas uses Python string and bytes objects.
828+
assert df.dtypes["bytes_col"].name == "object"
829+
assert df.dtypes["string_col"].name == "object"
830+
831+
832+
@pytest.mark.parametrize(
833+
("max_results",), ((None,), (10,),) # Use BQ Storage API. # Use REST API.
834+
)
835+
def test_list_rows_nullable_scalars_extreme_dtypes(
836+
bigquery_client, scalars_extreme_table, max_results
837+
):
838+
df = bigquery_client.list_rows(
839+
scalars_extreme_table, max_results=max_results
840+
).to_dataframe()
841+
842+
# Extreme values are out-of-bounds for pandas datetime64 values, which use
843+
# nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must
844+
# be represented with object.
845+
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
846+
assert df.dtypes["date_col"].name == "object"
847+
assert df.dtypes["datetime_col"].name == "object"
848+
assert df.dtypes["timestamp_col"].name == "object"
849+
850+
# These pandas dtypes can handle the same ranges as BigQuery.
851+
assert df.dtypes["bool_col"].name == "boolean"
852+
assert df.dtypes["float64_col"].name == "float64"
853+
assert df.dtypes["int64_col"].name == "Int64"
854+
855+
# object is used by default, but we can use "timedelta64[ns]" automatically
856+
# https://github.com/googleapis/python-bigquery/issues/862
857+
assert df.dtypes["time_col"].name == "object"
858+
859+
# decimal.Decimal is used to avoid loss of precision.
860+
assert df.dtypes["numeric_col"].name == "object"
861+
assert df.dtypes["bignumeric_col"].name == "object"
862+
863+
# pandas uses Python string and bytes objects.
864+
assert df.dtypes["bytes_col"].name == "object"
865+
assert df.dtypes["string_col"].name == "object"

tests/unit/job/test_query_pandas.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,6 @@
2020
import pyarrow
2121
import pytest
2222

23-
try:
24-
import pandas
25-
except (ImportError, AttributeError): # pragma: NO COVER
26-
pandas = None
27-
2823
from google.cloud import bigquery_storage
2924

3025
try:
@@ -36,6 +31,8 @@
3631
from .helpers import _make_connection
3732
from .helpers import _make_job_resource
3833

34+
pandas = pytest.importorskip("pandas")
35+
3936

4037
@pytest.fixture
4138
def table_read_options_kwarg():
@@ -78,7 +75,6 @@ def test__contains_order_by(query, expected):
7875
assert not mut._contains_order_by(query)
7976

8077

81-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
8278
@pytest.mark.parametrize(
8379
"query",
8480
(
@@ -413,7 +409,6 @@ def test_to_arrow_w_tqdm_wo_query_plan():
413409
result_patch_tqdm.assert_called()
414410

415411

416-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
417412
def test_to_dataframe():
418413
from google.cloud.bigquery.job import QueryJob as target_class
419414

@@ -452,7 +447,6 @@ def test_to_dataframe():
452447
assert list(df) == ["name", "age"] # verify the column names
453448

454449

455-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
456450
def test_to_dataframe_ddl_query():
457451
from google.cloud.bigquery.job import QueryJob as target_class
458452

@@ -472,7 +466,6 @@ def test_to_dataframe_ddl_query():
472466
assert len(df) == 0
473467

474468

475-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
476469
def test_to_dataframe_bqstorage(table_read_options_kwarg):
477470
from google.cloud.bigquery.job import QueryJob as target_class
478471

@@ -522,7 +515,6 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg):
522515
)
523516

524517

525-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
526518
def test_to_dataframe_bqstorage_no_pyarrow_compression():
527519
from google.cloud.bigquery.job import QueryJob as target_class
528520

@@ -565,7 +557,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression():
565557
)
566558

567559

568-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
569560
def test_to_dataframe_column_dtypes():
570561
from google.cloud.bigquery.job import QueryJob as target_class
571562

@@ -617,15 +608,14 @@ def test_to_dataframe_column_dtypes():
617608
assert list(df) == exp_columns # verify the column names
618609

619610
assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]"
620-
assert df.seconds.dtype.name == "int64"
611+
assert df.seconds.dtype.name == "Int64"
621612
assert df.miles.dtype.name == "float64"
622613
assert df.km.dtype.name == "float16"
623614
assert df.payment_type.dtype.name == "object"
624-
assert df.complete.dtype.name == "bool"
615+
assert df.complete.dtype.name == "boolean"
625616
assert df.date.dtype.name == "object"
626617

627618

628-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
629619
def test_to_dataframe_column_date_dtypes():
630620
from google.cloud.bigquery.job import QueryJob as target_class
631621

@@ -657,7 +647,6 @@ def test_to_dataframe_column_date_dtypes():
657647
assert df.date.dtype.name == "datetime64[ns]"
658648

659649

660-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
661650
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
662651
@mock.patch("tqdm.tqdm")
663652
def test_to_dataframe_with_progress_bar(tqdm_mock):
@@ -685,7 +674,6 @@ def test_to_dataframe_with_progress_bar(tqdm_mock):
685674
tqdm_mock.assert_called()
686675

687676

688-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
689677
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
690678
def test_to_dataframe_w_tqdm_pending():
691679
from google.cloud.bigquery import table
@@ -741,7 +729,6 @@ def test_to_dataframe_w_tqdm_pending():
741729
)
742730

743731

744-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
745732
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
746733
def test_to_dataframe_w_tqdm():
747734
from google.cloud.bigquery import table
@@ -801,7 +788,6 @@ def test_to_dataframe_w_tqdm():
801788
)
802789

803790

804-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
805791
@pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`")
806792
def test_to_dataframe_w_tqdm_max_results():
807793
from google.cloud.bigquery import table

0 commit comments

Comments (0)