Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 485fbc2

Browse files
committed
fix: converting to dataframe with out-of-bounds timestamps
1 parent 8360487 commit 485fbc2

File tree

3 files changed

+62
-2
lines changed

3 files changed

+62
-2
lines changed

google/cloud/bigquery/table.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import functools
2222
import logging
2323
import operator
24+
import pytz
2425
import warnings
2526

2627
import six
@@ -1726,7 +1727,32 @@ def to_dataframe(
17261727
bqstorage_client=bqstorage_client,
17271728
create_bqstorage_client=create_bqstorage_client,
17281729
)
1729-
df = record_batch.to_pandas(date_as_object=date_as_object)
1730+
1731+
# When converting timestamp values to nanosecond precision, the result
1732+
# can be out of pyarrow bounds. To avoid the error when converting to
1733+
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
1734+
#
1735+
# NOTE: Python 3+ only, as timestamp_as_object parameter is only supported
1736+
# in pyarrow>=1.0, but the latter is not compatible with Python 2.
1737+
if six.PY2:
1738+
extra_kwargs = {}
1739+
else:
1740+
type_to_check = pyarrow.timestamp("us", tz=pytz.UTC)
1741+
1742+
for column in record_batch:
1743+
if column.type == type_to_check:
1744+
try:
1745+
column.cast("timestamp[ns]")
1746+
except pyarrow.lib.ArrowInvalid:
1747+
timestamp_as_object = True
1748+
break
1749+
else:
1750+
timestamp_as_object = False
1751+
1752+
extra_kwargs = {"timestamp_as_object": timestamp_as_object}
1753+
1754+
df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
1755+
17301756
for column in dtypes:
17311757
df[column] = pandas.Series(df[column], dtype=dtypes[column])
17321758
return df

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,11 @@
4848
"pandas": ["pandas>=0.17.1"],
4949
# Exclude PyArrow dependency from Windows Python 2.7.
5050
'pyarrow: platform_system != "Windows" or python_version >= "3.4"': [
51+
# Pyarrow >= 1.0 is not compatible with Python 2 anymore.
52+
"pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
5153
# Bad Linux release for 0.14.0.
5254
# https://issues.apache.org/jira/browse/ARROW-5868
53-
"pyarrow>=0.4.1, != 0.14.0"
55+
"pyarrow>=0.4.1, != 0.14.0; python_version<'3.0'",
5456
],
5557
"tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
5658
"fastparquet": [

tests/unit/test_table.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import datetime as dt
1516
import itertools
1617
import logging
1718
import time
@@ -2271,6 +2272,37 @@ def test_to_dataframe(self):
22712272
self.assertEqual(df.name.dtype.name, "object")
22722273
self.assertEqual(df.age.dtype.name, "int64")
22732274

2275+
@pytest.mark.xfail(
    six.PY2,
    reason=(
        # Fixed typo: ">-1.0" -> ">=1.0".
        "Requires pyarrow>=1.0 to work, but the latter is not compatible "
        "with Python 2 anymore."
    ),
)
@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
    """Verify to_dataframe() survives TIMESTAMP values beyond pandas'
    nanosecond-precision range.

    Such values cannot be represented as ``datetime64[ns]``, so the
    conversion is expected to fall back to plain ``datetime`` objects
    instead of raising an out-of-bounds error.
    """
    from google.cloud.bigquery.schema import SchemaField

    schema = [SchemaField("some_timestamp", "TIMESTAMP")]
    # Timestamps are REST-API float strings (seconds since the epoch);
    # both values are far outside the datetime64[ns] range (~1677-2262).
    rows = [
        {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
        {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)  # verify the number of rows
    self.assertEqual(list(df.columns), ["some_timestamp"])
    # Values come back as Python datetimes (object dtype), not Timestamps.
    self.assertEqual(
        list(df["some_timestamp"]),
        [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
    )
2305+
22742306
@unittest.skipIf(pandas is None, "Requires `pandas`")
22752307
def test_to_dataframe_warning_wo_pyarrow(self):
22762308
from google.cloud.bigquery.client import PyarrowMissingWarning

0 commit comments

Comments
 (0)