Skip to content

Commit 800a6bb

Browse files
JohnPaton authored and tswast committed
Add tqdm progress bar for to_dataframe downloads (googleapis#7552)
Add progress_bar_type argument to to_dataframe. Install tqdm to use this feature. If there are any tqdm errors during progress bar construction, a warning is displayed and no progress bar is displayed.
1 parent 70be116 commit 800a6bb

7 files changed

Lines changed: 219 additions & 20 deletions

File tree

bigquery/docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,9 @@
326326

327327
# Example configuration for intersphinx: refer to the Python standard library.
328328
intersphinx_mapping = {
329-
"python": ("http://python.readthedocs.org/en/latest/", None),
330329
"gax": ("https://gax-python.readthedocs.org/en/latest/", None),
331330
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
331+
"python": ("http://python.readthedocs.org/en/latest/", None),
332332
}
333333

334334
# Napoleon settings

bigquery/google/cloud/bigquery/job.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2810,7 +2810,7 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
28102810
dest_table = Table(dest_table_ref, schema=schema)
28112811
return self._client.list_rows(dest_table, retry=retry)
28122812

2813-
def to_dataframe(self, bqstorage_client=None, dtypes=None):
2813+
def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=None):
28142814
"""Return a pandas DataFrame from a QueryJob
28152815
28162816
Args:
@@ -2837,6 +2837,16 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None):
28372837
provided ``dtype`` is used when constructing the series for
28382838
the column specified. Otherwise, the default pandas behavior
28392839
is used.
2840+
progress_bar_type (Optional[str]):
2841+
If set, use the `tqdm <https://tqdm.github.io/>`_ library to
2842+
display a progress bar while the data downloads. Install the
2843+
``tqdm`` package to use this feature.
2844+
2845+
See
2846+
:func:`~google.cloud.bigquery.table.RowIterator.to_dataframe`
2847+
for details.
2848+
2849+
.. versionadded:: 1.11.0
28402850
28412851
Returns:
28422852
A :class:`~pandas.DataFrame` populated with row data and column

bigquery/google/cloud/bigquery/table.py

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@
3030
except ImportError: # pragma: NO COVER
3131
pandas = None
3232

33+
try:
34+
import tqdm
35+
except ImportError: # pragma: NO COVER
36+
tqdm = None
37+
3338
from google.api_core.page_iterator import HTTPIterator
3439

3540
import google.cloud._helpers
@@ -44,6 +49,10 @@
4449
"The pandas library is not installed, please install "
4550
"pandas to use the to_dataframe() function."
4651
)
52+
_NO_TQDM_ERROR = (
53+
"A progress bar was requested, but there was an error loading the tqdm "
54+
"library. Please install tqdm to use the progress bar functionality."
55+
)
4756
_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
4857
_MARKER = object()
4958

@@ -1330,12 +1339,22 @@ def _to_dataframe_dtypes(self, page, column_names, dtypes):
13301339
columns[column] = pandas.Series(columns[column], dtype=dtypes[column])
13311340
return pandas.DataFrame(columns, columns=column_names)
13321341

1333-
def _to_dataframe_tabledata_list(self, dtypes):
1342+
def _to_dataframe_tabledata_list(self, dtypes, progress_bar=None):
13341343
"""Use (slower, but free) tabledata.list to construct a DataFrame."""
13351344
column_names = [field.name for field in self.schema]
13361345
frames = []
1346+
13371347
for page in iter(self.pages):
1338-
frames.append(self._to_dataframe_dtypes(page, column_names, dtypes))
1348+
current_frame = self._to_dataframe_dtypes(page, column_names, dtypes)
1349+
frames.append(current_frame)
1350+
1351+
if progress_bar is not None:
1352+
# In some cases, the number of total rows is not populated
1353+
# until the first page of rows is fetched. Update the
1354+
# progress bar's total to keep an accurate count.
1355+
progress_bar.total = progress_bar.total or self.total_rows
1356+
progress_bar.update(len(current_frame))
1357+
13391358
return pandas.concat(frames)
13401359

13411360
def _to_dataframe_bqstorage(self, bqstorage_client, dtypes):
@@ -1385,10 +1404,37 @@ def get_dataframe(stream):
13851404
# the end using manually-parsed schema.
13861405
return pandas.concat(frames)[columns]
13871406

1388-
def to_dataframe(self, bqstorage_client=None, dtypes=None):
1407+
def _get_progress_bar(self, progress_bar_type):
1408+
"""Construct a tqdm progress bar object, if tqdm is installed."""
1409+
if tqdm is None:
1410+
if progress_bar_type is not None:
1411+
warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3)
1412+
return None
1413+
1414+
description = "Downloading"
1415+
unit = "rows"
1416+
1417+
try:
1418+
if progress_bar_type == "tqdm":
1419+
return tqdm.tqdm(desc=description, total=self.total_rows, unit=unit)
1420+
elif progress_bar_type == "tqdm_notebook":
1421+
return tqdm.tqdm_notebook(
1422+
desc=description, total=self.total_rows, unit=unit
1423+
)
1424+
elif progress_bar_type == "tqdm_gui":
1425+
return tqdm.tqdm_gui(
1426+
desc=description, total=self.total_rows, unit=unit
1427+
)
1428+
except (KeyError, TypeError):
1429+
# Protect ourselves from any tqdm errors. In case of
1430+
# unexpected tqdm behavior, just fall back to showing
1431+
# no progress bar.
1432+
warnings.warn(_NO_TQDM_ERROR, UserWarning, stacklevel=3)
1433+
return None
1434+
1435+
def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=None):
13891436
"""Create a pandas DataFrame by loading all pages of a query.
13901437
1391-
13921438
Args:
13931439
bqstorage_client ( \
13941440
google.cloud.bigquery_storage_v1beta1.BigQueryStorageClient \
@@ -1413,6 +1459,26 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None):
14131459
provided ``dtype`` is used when constructing the series for
14141460
the column specified. Otherwise, the default pandas behavior
14151461
is used.
1462+
progress_bar_type (Optional[str]):
1463+
If set, use the `tqdm <https://tqdm.github.io/>`_ library to
1464+
display a progress bar while the data downloads. Install the
1465+
``tqdm`` package to use this feature.
1466+
1467+
Possible values of ``progress_bar_type`` include:
1468+
1469+
``None``
1470+
No progress bar.
1471+
``'tqdm'``
1472+
Use the :func:`tqdm.tqdm` function to print a progress bar
1473+
to :data:`sys.stderr`.
1474+
``'tqdm_notebook'``
1475+
Use the :func:`tqdm.tqdm_notebook` function to display a
1476+
progress bar as a Jupyter notebook widget.
1477+
``'tqdm_gui'``
1478+
Use the :func:`tqdm.tqdm_gui` function to display a
1479+
progress bar as a graphical dialog box.
1480+
1481+
.. versionadded:: 1.11.0
14161482
14171483
Returns:
14181484
pandas.DataFrame:
@@ -1429,10 +1495,12 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None):
14291495
if dtypes is None:
14301496
dtypes = {}
14311497

1498+
progress_bar = self._get_progress_bar(progress_bar_type)
1499+
14321500
if bqstorage_client is not None:
14331501
return self._to_dataframe_bqstorage(bqstorage_client, dtypes)
14341502
else:
1435-
return self._to_dataframe_tabledata_list(dtypes)
1503+
return self._to_dataframe_tabledata_list(dtypes, progress_bar=progress_bar)
14361504

14371505

14381506
class _EmptyRowIterator(object):

bigquery/noxfile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ def default(session):
4444

4545
# Pyarrow does not support Python 3.7
4646
if session.python == '3.7':
47-
dev_install = '.[pandas]'
47+
dev_install = '.[pandas, tqdm]'
4848
else:
49-
dev_install = '.[pandas, pyarrow]'
49+
dev_install = '.[pandas, pyarrow, tqdm]'
5050
session.install('-e', dev_install)
5151

5252
# IPython does not support Python 2 after version 5.x

bigquery/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
# Exclude PyArrow dependency from Windows Python 2.7.
4040
'pyarrow: platform_system != "Windows" or python_version >= "3.4"':
4141
'pyarrow>=0.4.1',
42+
'tqdm': 'tqdm >= 4.0.0, <5.0.0dev',
4243
'fastparquet': ['fastparquet', 'python-snappy'],
4344
}
4445

bigquery/tests/unit/test_table.py

Lines changed: 129 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import itertools
1616
import json
1717
import unittest
18+
import warnings
1819

1920
import mock
2021
import pytest
@@ -29,6 +30,11 @@
2930
except (ImportError, AttributeError): # pragma: NO COVER
3031
pandas = None
3132

33+
try:
34+
from tqdm import tqdm
35+
except (ImportError, AttributeError): # pragma: NO COVER
36+
tqdm = None
37+
3238
from google.cloud.bigquery.dataset import DatasetReference
3339

3440

@@ -901,7 +907,6 @@ def test_time_partitioning_setter_none(self):
901907
self.assertIsNone(table.time_partitioning)
902908

903909
def test_partitioning_type_setter(self):
904-
import warnings
905910
from google.cloud.bigquery.table import TimePartitioningType
906911

907912
dataset = DatasetReference(self.PROJECT, self.DS_ID)
@@ -920,7 +925,6 @@ def test_partitioning_type_setter(self):
920925
self.assertIs(warning.category, PendingDeprecationWarning)
921926

922927
def test_partitioning_type_setter_w_time_partitioning_set(self):
923-
import warnings
924928
from google.cloud.bigquery.table import TimePartitioning
925929

926930
dataset = DatasetReference(self.PROJECT, self.DS_ID)
@@ -938,7 +942,6 @@ def test_partitioning_type_setter_w_time_partitioning_set(self):
938942
self.assertIs(warning.category, PendingDeprecationWarning)
939943

940944
def test_partitioning_expiration_setter_w_time_partitioning_set(self):
941-
import warnings
942945
from google.cloud.bigquery.table import TimePartitioning
943946

944947
dataset = DatasetReference(self.PROJECT, self.DS_ID)
@@ -956,8 +959,6 @@ def test_partitioning_expiration_setter_w_time_partitioning_set(self):
956959
self.assertIs(warning.category, PendingDeprecationWarning)
957960

958961
def test_partition_expiration_setter(self):
959-
import warnings
960-
961962
dataset = DatasetReference(self.PROJECT, self.DS_ID)
962963
table_ref = dataset.table(self.TABLE_NAME)
963964
table = self._make_one(table_ref)
@@ -1112,8 +1113,6 @@ def _make_one(self, *args, **kw):
11121113
return self._get_target_class()(*args, **kw)
11131114

11141115
def test_ctor(self):
1115-
import warnings
1116-
11171116
project = "test-project"
11181117
dataset_id = "test_dataset"
11191118
table_id = "coffee_table"
@@ -1191,8 +1190,6 @@ def test_ctor_view(self):
11911190
self.assertTrue(table.view_use_legacy_sql)
11921191

11931192
def test_ctor_missing_properties(self):
1194-
import warnings
1195-
11961193
resource = {
11971194
"tableReference": {
11981195
"projectId": "testproject",
@@ -1413,6 +1410,129 @@ def test_to_dataframe(self):
14131410
self.assertEqual(df.name.dtype.name, "object")
14141411
self.assertEqual(df.age.dtype.name, "int64")
14151412

1413+
@unittest.skipIf(pandas is None, "Requires `pandas`")
1414+
@unittest.skipIf(tqdm is None, "Requires `tqdm`")
1415+
@mock.patch("tqdm.tqdm_gui")
1416+
@mock.patch("tqdm.tqdm_notebook")
1417+
@mock.patch("tqdm.tqdm")
1418+
def test_to_dataframe_progress_bar(
1419+
self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
1420+
):
1421+
from google.cloud.bigquery.table import RowIterator
1422+
from google.cloud.bigquery.table import SchemaField
1423+
1424+
schema = [
1425+
SchemaField("name", "STRING", mode="REQUIRED"),
1426+
SchemaField("age", "INTEGER", mode="REQUIRED"),
1427+
]
1428+
rows = [
1429+
{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
1430+
{"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
1431+
{"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
1432+
{"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
1433+
]
1434+
path = "/foo"
1435+
api_request = mock.Mock(return_value={"rows": rows})
1436+
1437+
progress_bars = (
1438+
("tqdm", tqdm_mock),
1439+
("tqdm_notebook", tqdm_notebook_mock),
1440+
("tqdm_gui", tqdm_gui_mock),
1441+
)
1442+
1443+
for progress_bar_type, progress_bar_mock in progress_bars:
1444+
row_iterator = RowIterator(_mock_client(), api_request, path, schema)
1445+
df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type)
1446+
1447+
progress_bar_mock.assert_called()
1448+
progress_bar_mock().update.assert_called()
1449+
self.assertEqual(len(df), 4)
1450+
1451+
@unittest.skipIf(pandas is None, "Requires `pandas`")
1452+
@mock.patch("google.cloud.bigquery.table.tqdm", new=None)
1453+
def test_to_dataframe_no_tqdm_no_progress_bar(self):
1454+
from google.cloud.bigquery.table import RowIterator
1455+
from google.cloud.bigquery.table import SchemaField
1456+
1457+
schema = [
1458+
SchemaField("name", "STRING", mode="REQUIRED"),
1459+
SchemaField("age", "INTEGER", mode="REQUIRED"),
1460+
]
1461+
rows = [
1462+
{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
1463+
{"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
1464+
{"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
1465+
{"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
1466+
]
1467+
path = "/foo"
1468+
api_request = mock.Mock(return_value={"rows": rows})
1469+
row_iterator = RowIterator(_mock_client(), api_request, path, schema)
1470+
1471+
with warnings.catch_warnings(record=True) as warned:
1472+
df = row_iterator.to_dataframe()
1473+
1474+
self.assertEqual(len(warned), 0)
1475+
self.assertEqual(len(df), 4)
1476+
1477+
@unittest.skipIf(pandas is None, "Requires `pandas`")
1478+
@mock.patch("google.cloud.bigquery.table.tqdm", new=None)
1479+
def test_to_dataframe_no_tqdm(self):
1480+
from google.cloud.bigquery.table import RowIterator
1481+
from google.cloud.bigquery.table import SchemaField
1482+
1483+
schema = [
1484+
SchemaField("name", "STRING", mode="REQUIRED"),
1485+
SchemaField("age", "INTEGER", mode="REQUIRED"),
1486+
]
1487+
rows = [
1488+
{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
1489+
{"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
1490+
{"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
1491+
{"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
1492+
]
1493+
path = "/foo"
1494+
api_request = mock.Mock(return_value={"rows": rows})
1495+
row_iterator = RowIterator(_mock_client(), api_request, path, schema)
1496+
1497+
with warnings.catch_warnings(record=True) as warned:
1498+
df = row_iterator.to_dataframe(progress_bar_type="tqdm")
1499+
1500+
self.assertEqual(len(warned), 1)
1501+
for warning in warned:
1502+
self.assertIs(warning.category, UserWarning)
1503+
1504+
# Even though the progress bar won't show, downloading the dataframe
1505+
# should still work.
1506+
self.assertEqual(len(df), 4)
1507+
1508+
@unittest.skipIf(pandas is None, "Requires `pandas`")
1509+
@unittest.skipIf(tqdm is None, "Requires `tqdm`")
1510+
@mock.patch("tqdm.tqdm_gui", new=None) # will raise TypeError on call
1511+
@mock.patch("tqdm.tqdm_notebook", new=None) # will raise TypeError on call
1512+
@mock.patch("tqdm.tqdm", new=None) # will raise TypeError on call
1513+
def test_to_dataframe_tqdm_error(self):
1514+
from google.cloud.bigquery.table import RowIterator
1515+
from google.cloud.bigquery.table import SchemaField
1516+
1517+
schema = [
1518+
SchemaField("name", "STRING", mode="REQUIRED"),
1519+
SchemaField("age", "INTEGER", mode="REQUIRED"),
1520+
]
1521+
rows = [
1522+
{"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
1523+
{"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
1524+
{"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
1525+
{"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
1526+
]
1527+
path = "/foo"
1528+
1529+
for progress_bar_type in ("tqdm", "tqdm_notebook", "tqdm_gui"):
1530+
api_request = mock.Mock(return_value={"rows": rows})
1531+
row_iterator = RowIterator(_mock_client(), api_request, path, schema)
1532+
df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type)
1533+
1534+
self.assertEqual(len(df), 4) # all should be well
1535+
14161536
@unittest.skipIf(pandas is None, "Requires `pandas`")
14171537
def test_to_dataframe_w_empty_results(self):
14181538
from google.cloud.bigquery.table import RowIterator

0 commit comments

Comments
 (0)