diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index cb06536da..9ee60f7e4 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -1,3 +1,3 @@ docker: image: gcr.io/repo-automation-bots/owlbot-python:latest - digest: sha256:5ff7446edeaede81c3ed58b23a4e76a5403fba1350ce28478045657303b6479d + digest: sha256:aea14a583128771ae8aefa364e1652f3c56070168ef31beb203534222d842b8b diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index b18fb9c29..2697f214c 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -3,7 +3,7 @@ branchProtectionRules: # Identifies the protection rule pattern. Name of the branch to be protected. # Defaults to `master` -- pattern: master +- pattern: '{master,v3}' requiredStatusCheckContexts: - 'Kokoro' - 'Kokoro snippets-3.8' diff --git a/.kokoro/samples/python3.6/periodic-head.cfg b/.kokoro/samples/python3.6/periodic-head.cfg index f9cfcd33e..5aa01bab5 100644 --- a/.kokoro/samples/python3.6/periodic-head.cfg +++ b/.kokoro/samples/python3.6/periodic-head.cfg @@ -7,5 +7,5 @@ env_vars: { env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-pubsub/.kokoro/test-samples-against-head.sh" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" } diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.7/periodic-head.cfg index f9cfcd33e..5aa01bab5 100644 --- a/.kokoro/samples/python3.7/periodic-head.cfg +++ b/.kokoro/samples/python3.7/periodic-head.cfg @@ -7,5 +7,5 @@ env_vars: { env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-pubsub/.kokoro/test-samples-against-head.sh" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" } diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.8/periodic-head.cfg index f9cfcd33e..5aa01bab5 100644 --- a/.kokoro/samples/python3.8/periodic-head.cfg +++ b/.kokoro/samples/python3.8/periodic-head.cfg @@ -7,5 +7,5 @@ env_vars: { env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-pubsub/.kokoro/test-samples-against-head.sh" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" } diff --git a/.kokoro/samples/python3.9/periodic-head.cfg b/.kokoro/samples/python3.9/periodic-head.cfg index f9cfcd33e..5aa01bab5 100644 --- a/.kokoro/samples/python3.9/periodic-head.cfg +++ b/.kokoro/samples/python3.9/periodic-head.cfg @@ -7,5 +7,5 @@ env_vars: { env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-pubsub/.kokoro/test-samples-against-head.sh" + value: "github/python-bigquery/.kokoro/test-samples-against-head.sh" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 2439d64b0..966a8744a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,38 @@ [1]: https://pypi.org/project/google-cloud-bigquery/#history +## [2.23.0](https://www.github.com/googleapis/python-bigquery/compare/v2.22.1...v2.23.0) (2021-07-27) + + +### Features + +* Update proto definitions for bigquery/v2 to support new proto fields for BQML. ([#817](https://www.github.com/googleapis/python-bigquery/issues/817)) ([fe7a902](https://www.github.com/googleapis/python-bigquery/commit/fe7a902e8b3e723ace335c9b499aea6d180a025b)) + + +### Bug Fixes + +* no longer raise a warning in `to_dataframe` if `max_results` set ([#815](https://www.github.com/googleapis/python-bigquery/issues/815)) ([3c1be14](https://www.github.com/googleapis/python-bigquery/commit/3c1be149e76b1d1d8879fdcf0924ddb1c1839e94)) +* retry ChunkedEncodingError by default ([#802](https://www.github.com/googleapis/python-bigquery/issues/802)) ([419d36d](https://www.github.com/googleapis/python-bigquery/commit/419d36d6b1887041e5795dbc8fc808890e91ab11)) + + +### Documentation + +* correct docs for `LoadJobConfig.destination_table_description` ([#810](https://www.github.com/googleapis/python-bigquery/issues/810)) ([da87fd9](https://www.github.com/googleapis/python-bigquery/commit/da87fd921cc8067b187d7985c978aac8eb58d107)) + +### [2.22.1](https://www.github.com/googleapis/python-bigquery/compare/v2.22.0...v2.22.1) (2021-07-22) + + +### Bug Fixes + +* issue a warning if buggy pyarrow is detected ([#787](https://www.github.com/googleapis/python-bigquery/issues/787)) ([e403721](https://www.github.com/googleapis/python-bigquery/commit/e403721af1373eb1f1a1c7be5b2182e3819ed1f9)) +* use a larger chunk size when loading data ([#799](https://www.github.com/googleapis/python-bigquery/issues/799)) ([b804373](https://www.github.com/googleapis/python-bigquery/commit/b804373277c1c1baa3370ebfb4783503b7ff360f)) + + +### Documentation + +* add Samples section to CONTRIBUTING.rst ([#785](https://www.github.com/googleapis/python-bigquery/issues/785)) ([e587029](https://www.github.com/googleapis/python-bigquery/commit/e58702967d572e83b4c774278818302594a511b7)) +* add sample to delete job metadata ([#798](https://www.github.com/googleapis/python-bigquery/issues/798)) ([be9b242](https://www.github.com/googleapis/python-bigquery/commit/be9b242f2180f5b795dfb3a168a97af1682999fd)) + ## [2.22.0](https://www.github.com/googleapis/python-bigquery/compare/v2.21.0...v2.22.0) (2021-07-19) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 102355b3a..2faf5aed3 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -177,6 +177,30 @@ Build the docs via: $ nox -s docs +************************* +Samples and code snippets +************************* + +Code samples and snippets live in the `samples/` catalogue. Feel free to +provide more examples, but make sure to write tests for those examples. +Each folder containing example code requires its own `noxfile.py` script +which automates testing. If you decide to create a new folder, you can +base it on the `samples/snippets` folder (providing `noxfile.py` and +the requirements files). + +The tests will run against a real Google Cloud Project, so you should +configure them just like the System Tests. + +- To run sample tests, you can execute:: + + # Run all tests in a folder + $ cd samples/snippets + $ nox -s py-3.8 + + # Run a single sample test + $ cd samples/snippets + $ nox -s py-3.8 -- -k + ******************************************** Note About ``README`` as it pertains to PyPI ******************************************** diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 2ff96da4d..b381fa5f7 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -93,6 +93,8 @@ def pyarrow_numeric(): def pyarrow_bignumeric(): + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types return pyarrow.decimal256(76, 38) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 8572ba911..742ecac2e 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -27,6 +27,7 @@ import json import math import os +import packaging.version import tempfile from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union import uuid @@ -34,6 +35,8 @@ try: import pyarrow + + _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__) except ImportError: # pragma: NO COVER pyarrow = None @@ -95,7 +98,7 @@ from google.cloud.bigquery.table import RowIterator -_DEFAULT_CHUNKSIZE = 1048576 # 1024 * 1024 B = 1 MB +_DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB _MAX_MULTIPART_SIZE = 5 * 1024 * 1024 _DEFAULT_NUM_RETRIES = 6 _BASE_UPLOAD_TEMPLATE = "{host}/upload/bigquery/v2/projects/{project}/jobs?uploadType=" @@ -118,6 +121,9 @@ # https://github.com/googleapis/python-bigquery/issues/438 _MIN_GET_QUERY_RESULTS_TIMEOUT = 120 +# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414 +_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")]) + class Project(object): """Wrapper for resource describing a BigQuery project. @@ -2609,6 +2615,15 @@ def load_table_from_dataframe( try: if job_config.source_format == job.SourceFormat.PARQUET: + if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS: + msg = ( + "Loading dataframe data in PARQUET format with pyarrow " + f"{_PYARROW_VERSION} can result in data corruption. It is " + "therefore *strongly* advised to use a different pyarrow " + "version or a different source format. " + "See: https://github.com/googleapis/python-bigquery/issues/781" + ) + warnings.warn(msg, category=RuntimeWarning) if job_config.schema: if parquet_compression == "snappy": # adjust the default value diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py index f1b045412..aee055c1c 100644 --- a/google/cloud/bigquery/job/load.py +++ b/google/cloud/bigquery/job/load.py @@ -170,7 +170,7 @@ def destination_encryption_configuration(self, value): @property def destination_table_description(self): - """Optional[str]: Name given to destination table. + """Optional[str]: Description of the destination table. See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#DestinationTableProperties.FIELDS.description diff --git a/google/cloud/bigquery/retry.py b/google/cloud/bigquery/retry.py index 5e9075fe1..2df4de08b 100644 --- a/google/cloud/bigquery/retry.py +++ b/google/cloud/bigquery/retry.py @@ -27,6 +27,7 @@ exceptions.TooManyRequests, exceptions.InternalServerError, exceptions.BadGateway, + requests.exceptions.ChunkedEncodingError, requests.exceptions.ConnectionError, auth_exceptions.TransportError, ) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 18d969a3f..daade1ac6 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1552,11 +1552,6 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): return False if self.max_results is not None: - warnings.warn( - "Cannot use bqstorage_client if max_results is set, " - "reverting to fetching data with the REST endpoint.", - stacklevel=2, - ) return False try: @@ -1604,6 +1599,25 @@ def total_rows(self): """int: The total number of rows in the table.""" return self._total_rows + def _maybe_warn_max_results( + self, bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"], + ): + """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set. + + This helper method should be used directly in the relevant top-level public + methods, so that the warning is issued for the correct line in user code. + + Args: + bqstorage_client: + The BigQuery Storage client intended to use for downloading result rows. + """ + if bqstorage_client is not None and self.max_results is not None: + warnings.warn( + "Cannot use bqstorage_client if max_results is set, " + "reverting to fetching data with the REST endpoint.", + stacklevel=3, + ) + def _to_page_iterable( self, bqstorage_download, tabledata_list_download, bqstorage_client=None ): @@ -1700,6 +1714,8 @@ def to_arrow( if pyarrow is None: raise ValueError(_NO_PYARROW_ERROR) + self._maybe_warn_max_results(bqstorage_client) + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None @@ -1790,6 +1806,8 @@ def to_dataframe_iterable( if dtypes is None: dtypes = {} + self._maybe_warn_max_results(bqstorage_client) + column_names = [field.name for field in self._schema] bqstorage_download = functools.partial( _pandas_helpers.download_dataframe_bqstorage, @@ -1896,6 +1914,8 @@ def to_dataframe( if dtypes is None: dtypes = {} + self._maybe_warn_max_results(bqstorage_client) + if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): create_bqstorage_client = False bqstorage_client = None diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py index 2db0ca518..416bf20ed 100644 --- a/google/cloud/bigquery/version.py +++ b/google/cloud/bigquery/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.22.0" +__version__ = "2.23.0" diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py index 17e101d25..706418401 100644 --- a/google/cloud/bigquery_v2/types/model.py +++ b/google/cloud/bigquery_v2/types/model.py @@ -96,6 +96,8 @@ class Model(proto.Message): Output only. Label columns that were used to train this model. The output of the model will have a `predicted_` prefix to these columns. + best_trial_id (int): + The best trial_id across all training runs. """ class ModelType(proto.Enum): @@ -113,6 +115,7 @@ class ModelType(proto.Enum): ARIMA = 11 AUTOML_REGRESSOR = 12 AUTOML_CLASSIFIER = 13 + ARIMA_PLUS = 19 class LossType(proto.Enum): r"""Loss metric to evaluate model training performance.""" @@ -151,6 +154,7 @@ class DataFrequency(proto.Enum): WEEKLY = 5 DAILY = 6 HOURLY = 7 + PER_MINUTE = 8 class HolidayRegion(proto.Enum): r"""Type of supported holiday regions for time series forecasting @@ -285,7 +289,7 @@ class RegressionMetrics(proto.Message): median_absolute_error (google.protobuf.wrappers_pb2.DoubleValue): Median absolute error. r_squared (google.protobuf.wrappers_pb2.DoubleValue): - R^2 score. + R^2 score. This corresponds to r2_score in ML.EVALUATE. """ mean_absolute_error = proto.Field( @@ -528,7 +532,7 @@ class ClusteringMetrics(proto.Message): Mean of squared distances between each sample to its cluster centroid. clusters (Sequence[google.cloud.bigquery_v2.types.Model.ClusteringMetrics.Cluster]): - [Beta] Information for all clusters. + Information for all clusters. """ class Cluster(proto.Message): @@ -697,10 +701,29 @@ class ArimaSingleModelForecastingMetrics(proto.Message): Is arima model fitted with drift or not. It is always false when d is not 1. time_series_id (str): - The id to indicate different time series. + The time_series_id value for this time series. It will be + one of the unique values from the time_series_id_column + specified during ARIMA model training. Only present when + time_series_id_column training option was used. + time_series_ids (Sequence[str]): + The tuple of time_series_ids identifying this time series. + It will be one of the unique tuples of values present in the + time_series_id_columns specified during ARIMA model + training. Only present when time_series_id_columns training + option was used and the order of values here are same as the + order of time_series_id_columns. seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): Seasonal periods. Repeated because multiple periods are supported for one time series. + has_holiday_effect (google.protobuf.wrappers_pb2.BoolValue): + If true, holiday_effect is a part of time series + decomposition result. + has_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, spikes_and_dips is a part of time series + decomposition result. + has_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, step_changes is a part of time series decomposition + result. """ non_seasonal_order = proto.Field( @@ -711,9 +734,19 @@ class ArimaSingleModelForecastingMetrics(proto.Message): ) has_drift = proto.Field(proto.BOOL, number=3,) time_series_id = proto.Field(proto.STRING, number=4,) + time_series_ids = proto.RepeatedField(proto.STRING, number=9,) seasonal_periods = proto.RepeatedField( proto.ENUM, number=5, enum="Model.SeasonalPeriod.SeasonalPeriodType", ) + has_holiday_effect = proto.Field( + proto.MESSAGE, number=6, message=wrappers_pb2.BoolValue, + ) + has_spikes_and_dips = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.BoolValue, + ) + has_step_changes = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.BoolValue, + ) non_seasonal_order = proto.RepeatedField( proto.MESSAGE, number=1, message="Model.ArimaOrder", @@ -901,7 +934,7 @@ class TrainingRun(proto.Message): """ class TrainingOptions(proto.Message): - r""" + r"""Options used in model training. Attributes: max_iterations (int): The maximum number of iterations in training. @@ -972,8 +1005,9 @@ class TrainingOptions(proto.Message): num_clusters (int): Number of clusters for clustering models. model_uri (str): - [Beta] Google Cloud Storage URI from which the model was - imported. Only applicable for imported models. + Google Cloud Storage URI from which the model + was imported. Only applicable for imported + models. optimization_strategy (google.cloud.bigquery_v2.types.Model.OptimizationStrategy): Optimization strategy for training linear regression models. @@ -1030,8 +1064,11 @@ class TrainingOptions(proto.Message): If a valid value is specified, then holiday effects modeling is enabled. time_series_id_column (str): - The id column that will be used to indicate - different time series to forecast in parallel. + The time series id column that was used + during ARIMA model training. + time_series_id_columns (Sequence[str]): + The time series id columns that were used + during ARIMA model training. horizon (int): The number of periods ahead that need to be forecasted. @@ -1042,6 +1079,15 @@ class TrainingOptions(proto.Message): output feature name is A.b. auto_arima_max_order (int): The max value of non-seasonal p and q. + decompose_time_series (google.protobuf.wrappers_pb2.BoolValue): + If true, perform decompose time series and + save the results. + clean_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, clean spikes and dips in the input + time series. + adjust_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, detect step changes and make data + adjustment in the input time series. """ max_iterations = proto.Field(proto.INT64, number=1,) @@ -1120,9 +1166,19 @@ class TrainingOptions(proto.Message): proto.ENUM, number=42, enum="Model.HolidayRegion", ) time_series_id_column = proto.Field(proto.STRING, number=43,) + time_series_id_columns = proto.RepeatedField(proto.STRING, number=51,) horizon = proto.Field(proto.INT64, number=44,) preserve_input_structs = proto.Field(proto.BOOL, number=45,) auto_arima_max_order = proto.Field(proto.INT64, number=46,) + decompose_time_series = proto.Field( + proto.MESSAGE, number=50, message=wrappers_pb2.BoolValue, + ) + clean_spikes_and_dips = proto.Field( + proto.MESSAGE, number=52, message=wrappers_pb2.BoolValue, + ) + adjust_step_changes = proto.Field( + proto.MESSAGE, number=53, message=wrappers_pb2.BoolValue, + ) class IterationResult(proto.Message): r"""Information about a single iteration of the training run. @@ -1218,10 +1274,29 @@ class ArimaModelInfo(proto.Message): Whether Arima model fitted with drift or not. It is always false when d is not 1. time_series_id (str): - The id to indicate different time series. + The time_series_id value for this time series. It will be + one of the unique values from the time_series_id_column + specified during ARIMA model training. Only present when + time_series_id_column training option was used. + time_series_ids (Sequence[str]): + The tuple of time_series_ids identifying this time series. + It will be one of the unique tuples of values present in the + time_series_id_columns specified during ARIMA model + training. Only present when time_series_id_columns training + option was used and the order of values here are same as the + order of time_series_id_columns. seasonal_periods (Sequence[google.cloud.bigquery_v2.types.Model.SeasonalPeriod.SeasonalPeriodType]): Seasonal periods. Repeated because multiple periods are supported for one time series. + has_holiday_effect (google.protobuf.wrappers_pb2.BoolValue): + If true, holiday_effect is a part of time series + decomposition result. + has_spikes_and_dips (google.protobuf.wrappers_pb2.BoolValue): + If true, spikes_and_dips is a part of time series + decomposition result. + has_step_changes (google.protobuf.wrappers_pb2.BoolValue): + If true, step_changes is a part of time series decomposition + result. """ non_seasonal_order = proto.Field( @@ -1237,11 +1312,21 @@ class ArimaModelInfo(proto.Message): ) has_drift = proto.Field(proto.BOOL, number=4,) time_series_id = proto.Field(proto.STRING, number=5,) + time_series_ids = proto.RepeatedField(proto.STRING, number=10,) seasonal_periods = proto.RepeatedField( proto.ENUM, number=6, enum="Model.SeasonalPeriod.SeasonalPeriodType", ) + has_holiday_effect = proto.Field( + proto.MESSAGE, number=7, message=wrappers_pb2.BoolValue, + ) + has_spikes_and_dips = proto.Field( + proto.MESSAGE, number=8, message=wrappers_pb2.BoolValue, + ) + has_step_changes = proto.Field( + proto.MESSAGE, number=9, message=wrappers_pb2.BoolValue, + ) arima_model_info = proto.RepeatedField( proto.MESSAGE, @@ -1319,6 +1404,7 @@ class ArimaModelInfo(proto.Message): label_columns = proto.RepeatedField( proto.MESSAGE, number=11, message=standard_sql.StandardSqlField, ) + best_trial_id = proto.Field(proto.INT64, number=19,) class GetModelRequest(proto.Message): diff --git a/google/cloud/bigquery_v2/types/table_reference.py b/google/cloud/bigquery_v2/types/table_reference.py index a0a8ee4c9..d56e5b09f 100644 --- a/google/cloud/bigquery_v2/types/table_reference.py +++ b/google/cloud/bigquery_v2/types/table_reference.py @@ -36,11 +36,23 @@ class TableReference(proto.Message): maximum length is 1,024 characters. Certain operations allow suffixing of the table ID with a partition decorator, such as ``sample_table$20190123``. + project_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. + dataset_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. + table_id_alternative (Sequence[str]): + The alternative field that will be used when ESF is not able + to translate the received data to the project_id field. """ project_id = proto.Field(proto.STRING, number=1,) dataset_id = proto.Field(proto.STRING, number=2,) table_id = proto.Field(proto.STRING, number=3,) + project_id_alternative = proto.RepeatedField(proto.STRING, number=4,) + dataset_id_alternative = proto.RepeatedField(proto.STRING, number=5,) + table_id_alternative = proto.RepeatedField(proto.STRING, number=6,) __all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/samples/geography/noxfile.py b/samples/geography/noxfile.py index 160fe7286..9fc7f1782 100644 --- a/samples/geography/noxfile.py +++ b/samples/geography/noxfile.py @@ -28,8 +28,9 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -# Copy `noxfile_config.py` to your directory and modify it instead. +BLACK_VERSION = "black==19.10b0" +# Copy `noxfile_config.py` to your directory and modify it instead. # `TEST_CONFIG` dict is a configuration hook that allows users to # modify the test configurations. The values here should be in sync @@ -159,7 +160,7 @@ def lint(session: nox.sessions.Session) -> None: @nox.session def blacken(session: nox.sessions.Session) -> None: - session.install("black") + session.install(BLACK_VERSION) python_files = [path for path in os.listdir(".") if path.endswith(".py")] session.run("black", *python_files) diff --git a/samples/geography/requirements.txt b/samples/geography/requirements.txt index c7aa209ad..3a83eda64 100644 --- a/samples/geography/requirements.txt +++ b/samples/geography/requirements.txt @@ -1,4 +1,4 @@ geojson==2.5.0 -google-cloud-bigquery==2.21.0 +google-cloud-bigquery==2.22.1 google-cloud-bigquery-storage==2.6.0 Shapely==1.7.1 diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 000e5f85c..74984f902 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -50,6 +50,31 @@ def dataset_id(bigquery_client: bigquery.Client, project_id: str): bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) +@pytest.fixture(scope="session") +def dataset_id_us_east1(bigquery_client: bigquery.Client, project_id: str): + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + dataset.location = "us-east1" + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + +@pytest.fixture(scope="session") +def table_id_us_east1( + bigquery_client: bigquery.Client, project_id: str, dataset_id_us_east1: str +): + table_id = prefixer.create_prefix() + full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}" + table = bigquery.Table( + full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")] + ) + bigquery_client.create_table(table) + yield full_table_id + bigquery_client.delete_table(table, not_found_ok=True) + + @pytest.fixture def random_table_id(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): """Create a new table ID each time, so random_table_id can be used as diff --git a/samples/snippets/delete_job.py b/samples/snippets/delete_job.py new file mode 100644 index 000000000..abed0c90d --- /dev/null +++ b/samples/snippets/delete_job.py @@ -0,0 +1,44 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def delete_job_metadata(job_id: str, location: str): + orig_job_id = job_id + orig_location = location + # [START bigquery_delete_job] + from google.cloud import bigquery + from google.api_core import exceptions + + # TODO(developer): Set the job ID to the ID of the job whose metadata you + # wish to delete. + job_id = "abcd-efgh-ijkl-mnop" + + # TODO(developer): Set the location to the region or multi-region + # containing the job. + location = "us-east1" + + # [END bigquery_delete_job] + job_id = orig_job_id + location = orig_location + + # [START bigquery_delete_job] + client = bigquery.Client() + + client.delete_job_metadata(job_id, location=location) + + try: + client.get_job(job_id, location=location) + except exceptions.NotFound: + print(f"Job metadata for job {location}:{job_id} was deleted.") + # [END bigquery_delete_job] diff --git a/samples/snippets/delete_job_test.py b/samples/snippets/delete_job_test.py new file mode 100644 index 000000000..c9baa817d --- /dev/null +++ b/samples/snippets/delete_job_test.py @@ -0,0 +1,33 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.cloud import bigquery + +import delete_job + + +def test_delete_job_metadata( + capsys, bigquery_client: bigquery.Client, table_id_us_east1: str +): + query_job: bigquery.QueryJob = bigquery_client.query( + f"SELECT COUNT(*) FROM `{table_id_us_east1}`", location="us-east1", + ) + query_job.result() + assert query_job.job_id is not None + + delete_job.delete_job_metadata(query_job.job_id, "us-east1") + + out, _ = capsys.readouterr() + assert "deleted" in out + assert f"us-east1:{query_job.job_id}" in out diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 160fe7286..9fc7f1782 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -28,8 +28,9 @@ # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING -# Copy `noxfile_config.py` to your directory and modify it instead. +BLACK_VERSION = "black==19.10b0" +# Copy `noxfile_config.py` to your directory and modify it instead. # `TEST_CONFIG` dict is a configuration hook that allows users to # modify the test configurations. The values here should be in sync @@ -159,7 +160,7 @@ def lint(session: nox.sessions.Session) -> None: @nox.session def blacken(session: nox.sessions.Session) -> None: - session.install("black") + session.install(BLACK_VERSION) python_files = [path for path in os.listdir(".") if path.endswith(".py")] session.run("black", *python_files) diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index b62c84c33..ffa689a9e 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,7 +1,7 @@ -google-cloud-bigquery==2.21.0 +google-cloud-bigquery==2.22.1 google-cloud-bigquery-storage==2.6.0 google-auth-oauthlib==0.4.4 -grpcio==1.38.1 +grpcio==1.39.0 ipython==7.16.1; python_version < '3.7' ipython==7.17.0; python_version >= '3.7' matplotlib==3.3.4; python_version < '3.7' diff --git a/setup.py b/setup.py index 71958ccf9..0ca19b576 100644 --- a/setup.py +++ b/setup.py @@ -30,9 +30,15 @@ release_status = "Development Status :: 5 - Production/Stable" dependencies = [ "grpcio >= 1.38.1, < 2.0dev", # https://github.com/googleapis/python-bigquery/issues/695 - "google-api-core[grpc] >= 1.29.0, < 3.0.0dev", + # NOTE: Maintainers, please do not require google-api-core>=2.x.x + # Until this issue is closed + # https://github.com/googleapis/google-cloud-python/issues/10566 + "google-api-core[grpc] >= 1.29.0, <3.0.0dev", "proto-plus >= 1.10.0", - "google-cloud-core >= 1.4.1, < 3.0dev", + # NOTE: Maintainers, please do not require google-cloud-core>=2.x.x + # Until this issue is closed + # https://github.com/googleapis/google-cloud-python/issues/10566 + "google-cloud-core >= 1.4.1, <3.0.0dev", "google-resumable-media >= 0.6.0, < 3.0dev", "packaging >= 14.3", "protobuf >= 3.12.0", diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl new file mode 100644 index 000000000..4419a6e9a --- /dev/null +++ b/tests/data/scalars.jsonl @@ -0,0 +1,2 @@ +{"bool_col": true, "bytes_col": "abcd", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "numeric_col": "1.23456789", "bignumeric_col": "10.111213141516171819", "float64_col": "1.25", "string_col": "Hello, World", "time_col": "11:41:43.07616", "timestamp_col": "2021-07-21T17:43:43.945289Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_extreme.jsonl b/tests/data/scalars_extreme.jsonl new file mode 100644 index 000000000..ceccd8dbc --- /dev/null +++ b/tests/data/scalars_extreme.jsonl @@ -0,0 +1,5 @@ +{"bool_col": true, "bytes_col": "DQo=\n", "date_col": "9999-12-31", "datetime_col": "9999-12-31 23:59:59.999999", "geography_col": "POINT(-135.0000 90.0000)", "int64_col": "9223372036854775807", "numeric_col": "9.9999999999999999999999999999999999999E+28", "bignumeric_col": "9.999999999999999999999999999999999999999999999999999999999999999999999999999E+37", "float64_col": "+inf", "string_col": "Hello, World", "time_col": "23:59:59.99999", "timestamp_col": "9999-12-31T23:59:59.999999Z"} +{"bool_col": false, "bytes_col": "8J+Zgw==\n", "date_col": "0001-01-01", "datetime_col": "0001-01-01 00:00:00", "geography_col": "POINT(45.0000 -90.0000)", "int64_col": "-9223372036854775808", "numeric_col": "-9.9999999999999999999999999999999999999E+28", "bignumeric_col": "-9.999999999999999999999999999999999999999999999999999999999999999999999999999E+37", "float64_col": "-inf", "string_col": "Hello, World", "time_col": "00:00:00", "timestamp_col": "0001-01-01T00:00:00.000000Z"} +{"bool_col": true, "bytes_col": "AA==\n", "date_col": "1900-01-01", "datetime_col": "1900-01-01 00:00:00", "geography_col": "POINT(-180.0000 0.0000)", "int64_col": "-1", "numeric_col": "0.000000001", "bignumeric_col": "-0.00000000000000000000000000000000000001", "float64_col": "nan", "string_col": "こんにちは", "time_col": "00:00:00.000001", "timestamp_col": "1900-01-01T00:00:00.000000Z"} +{"bool_col": false, "bytes_col": "", "date_col": "1970-01-01", "datetime_col": "1970-01-01 00:00:00", "geography_col": "POINT(0 0)", "int64_col": "0", "numeric_col": "0.0", "bignumeric_col": "0.0", "float64_col": 0.0, "string_col": "", "time_col": "12:00:00", "timestamp_col": "1970-01-01T00:00:00.000000Z"} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "numeric_col": null, "bignumeric_col": null, "float64_col": null, "string_col": null, "time_col": null, "timestamp_col": null} diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json new file mode 100644 index 000000000..00bd150fd --- /dev/null +++ b/tests/data/scalars_schema.json @@ -0,0 +1,62 @@ +[ + { + "mode": "NULLABLE", + "name": "timestamp_col", + "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "time_col", + "type": "TIME" + }, + { + "mode": "NULLABLE", + "name": "float64_col", + "type": "FLOAT" + }, + { + "mode": "NULLABLE", + "name": "datetime_col", + "type": "DATETIME" + }, + { + "mode": "NULLABLE", + "name": "bignumeric_col", + "type": "BIGNUMERIC" + }, + { + "mode": "NULLABLE", + "name": "numeric_col", + "type": "NUMERIC" + }, + { + "mode": "NULLABLE", + "name": "geography_col", + "type": "GEOGRAPHY" + }, + { + "mode": "NULLABLE", + "name": "date_col", + "type": "DATE" + }, + { + "mode": "NULLABLE", + "name": "string_col", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "bool_col", + "type": "BOOLEAN" + }, + { + "mode": "NULLABLE", + "name": "bytes_col", + "type": "BYTES" + }, + { + "mode": "NULLABLE", + "name": "int64_col", + "type": "INTEGER" + } +] diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4eef60e92..cc2c2a4dc 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -12,18 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pathlib + import pytest +import test_utils.prefixer +from google.cloud import bigquery +from google.cloud.bigquery import enums from . import helpers +prefixer = test_utils.prefixer.Prefixer("python-bigquery", "tests/system") + +DATA_DIR = pathlib.Path(__file__).parent.parent / "data" + + +@pytest.fixture(scope="session", autouse=True) +def cleanup_datasets(bigquery_client: bigquery.Client): + for dataset in bigquery_client.list_datasets(): + if prefixer.should_cleanup(dataset.dataset_id): + bigquery_client.delete_dataset( + dataset, delete_contents=True, not_found_ok=True + ) + + @pytest.fixture(scope="session") def bigquery_client(): - from google.cloud import bigquery - return bigquery.Client() +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client): + return bigquery_client.project + + @pytest.fixture(scope="session") def bqstorage_client(bigquery_client): from google.cloud import bigquery_storage @@ -33,12 +55,46 @@ def bqstorage_client(bigquery_client): @pytest.fixture(scope="session") def dataset_id(bigquery_client): - dataset_id = f"bqsystem_{helpers.temp_suffix()}" + dataset_id = prefixer.create_prefix() bigquery_client.create_dataset(dataset_id) yield dataset_id - bigquery_client.delete_dataset(dataset_id, delete_contents=True) + bigquery_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True) @pytest.fixture def table_id(dataset_id): return f"{dataset_id}.table_{helpers.temp_suffix()}" + + +@pytest.fixture(scope="session") +def scalars_table(bigquery_client: bigquery.Client, project_id: str, dataset_id: str): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars" + with open(DATA_DIR / "scalars.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) + + +@pytest.fixture(scope="session") +def scalars_extreme_table( + bigquery_client: bigquery.Client, project_id: str, dataset_id: str +): + schema = bigquery_client.schema_from_json(DATA_DIR / "scalars_schema.json") + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.source_format = enums.SourceFormat.NEWLINE_DELIMITED_JSON + full_table_id = f"{project_id}.{dataset_id}.scalars_extreme" + with open(DATA_DIR / "scalars_extreme.jsonl", "rb") as data_file: + job = bigquery_client.load_table_from_file( + data_file, full_table_id, job_config=job_config + ) + job.result() + yield full_table_id + bigquery_client.delete_table(full_table_id) diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py new file mode 100644 index 000000000..f97488e39 --- /dev/null +++ b/tests/system/test_arrow.py @@ -0,0 +1,88 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Arrow connector.""" + +import pytest + +pyarrow = pytest.importorskip( + "pyarrow", minversion="3.0.0" +) # Needs decimal256 for BIGNUMERIC columns. + + +@pytest.mark.parametrize( + ("max_results", "scalars_table_name"), + ( + (None, "scalars_table"), # Use BQ Storage API. + (10, "scalars_table"), # Use REST API. + (None, "scalars_extreme_table"), # Use BQ Storage API. + (10, "scalars_extreme_table"), # Use REST API. + ), +) +def test_list_rows_nullable_scalars_dtypes( + bigquery_client, + scalars_table, + scalars_extreme_table, + max_results, + scalars_table_name, +): + table_id = scalars_table + if scalars_table_name == "scalars_extreme_table": + table_id = scalars_extreme_table + arrow_table = bigquery_client.list_rows( + table_id, max_results=max_results, + ).to_arrow() + + schema = arrow_table.schema + bignumeric_type = schema.field("bignumeric_col").type + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + assert bignumeric_type.precision in {76, 77} + assert bignumeric_type.scale == 38 + + bool_type = schema.field("bool_col").type + assert bool_type.equals(pyarrow.bool_()) + + bytes_type = schema.field("bytes_col").type + assert bytes_type.equals(pyarrow.binary()) + + date_type = schema.field("date_col").type + assert date_type.equals(pyarrow.date32()) + + datetime_type = schema.field("datetime_col").type + assert datetime_type.unit == "us" + assert datetime_type.tz is None + + float64_type = schema.field("float64_col").type + assert float64_type.equals(pyarrow.float64()) + + geography_type = schema.field("geography_col").type + assert geography_type.equals(pyarrow.string()) + + int64_type = schema.field("int64_col").type + assert int64_type.equals(pyarrow.int64()) + + numeric_type = schema.field("numeric_col").type + assert numeric_type.precision == 38 + assert numeric_type.scale == 9 + + string_type = schema.field("string_col").type + assert string_type.equals(pyarrow.string()) + + time_type = schema.field("time_col").type + assert time_type.equals(pyarrow.time64("us")) + + timestamp_type = schema.field("timestamp_col").type + assert timestamp_type.unit == "us" + assert timestamp_type.tz is not None diff --git a/tests/system/test_client.py b/tests/system/test_client.py index ceb62b8cd..baa2b6ad8 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -63,7 +63,6 @@ from google.cloud import bigquery_v2 from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.schema import SchemaField from google.cloud.bigquery.table import Table from google.cloud._helpers import UTC from google.cloud.bigquery import dbapi, enums @@ -154,7 +153,6 @@ class Config(object): CLIENT: Optional[bigquery.Client] = None CURSOR = None - DATASET = None def setUpModule(): @@ -164,9 +162,7 @@ def setUpModule(): class TestBigQuery(unittest.TestCase): def setUp(self): - Config.DATASET = _make_dataset_id("bq_system_tests") - dataset = Config.CLIENT.create_dataset(Config.DATASET) - self.to_delete = [dataset] + self.to_delete = [] def tearDown(self): policy_tag_client = PolicyTagManagerClient() @@ -506,22 +502,6 @@ def test_delete_dataset_delete_contents_false(self): with self.assertRaises(exceptions.BadRequest): Config.CLIENT.delete_dataset(dataset) - def test_delete_job_metadata(self): - dataset_id = _make_dataset_id("us_east1") - self.temp_dataset(dataset_id, location="us-east1") - full_table_id = f"{Config.CLIENT.project}.{dataset_id}.test_delete_job_metadata" - table = Table(full_table_id, schema=[SchemaField("col", "STRING")]) - Config.CLIENT.create_table(table) - query_job: bigquery.QueryJob = Config.CLIENT.query( - f"SELECT COUNT(*) FROM `{full_table_id}`", location="us-east1", - ) - query_job.result() - self.assertIsNotNone(Config.CLIENT.get_job(query_job)) - - Config.CLIENT.delete_job_metadata(query_job) - with self.assertRaises(NotFound): - Config.CLIENT.get_job(query_job) - def test_get_table_w_public_dataset(self): public = "bigquery-public-data" dataset_id = "samples" @@ -1622,20 +1602,6 @@ def test_dbapi_fetchall_from_script(self): row_tuples = [r.values() for r in rows] self.assertEqual(row_tuples, [(5, "foo"), (6, "bar"), (7, "baz")]) - def test_dbapi_create_view(self): - - query = """ - CREATE VIEW {}.dbapi_create_view - AS SELECT name, SUM(number) AS total - FROM `bigquery-public-data.usa_names.usa_1910_2013` - GROUP BY name; - """.format( - Config.DATASET - ) - - Config.CURSOR.execute(query) - self.assertEqual(Config.CURSOR.rowcount, 0, "expected 0 rows") - @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @@ -2476,104 +2442,6 @@ def test_list_rows_page_size(self): page = next(pages) self.assertEqual(page.num_items, num_last_page) - def test_parameterized_types_round_trip(self): - client = Config.CLIENT - table_id = f"{Config.DATASET}.test_parameterized_types_round_trip" - fields = ( - ("n", "NUMERIC"), - ("n9", "NUMERIC(9)"), - ("n92", "NUMERIC(9, 2)"), - ("bn", "BIGNUMERIC"), - ("bn9", "BIGNUMERIC(38)"), - ("bn92", "BIGNUMERIC(38, 22)"), - ("s", "STRING"), - ("s9", "STRING(9)"), - ("b", "BYTES"), - ("b9", "BYTES(9)"), - ) - self.to_delete.insert(0, Table(f"{client.project}.{table_id}")) - client.query( - "create table {} ({})".format( - table_id, ", ".join(" ".join(f) for f in fields) - ) - ).result() - table = client.get_table(table_id) - table_id2 = table_id + "2" - self.to_delete.insert(0, Table(f"{client.project}.{table_id2}")) - client.create_table(Table(f"{client.project}.{table_id2}", table.schema)) - table2 = client.get_table(table_id2) - - self.assertEqual(tuple(s._key()[:2] for s in table2.schema), fields) - - def test_table_snapshots(self): - from google.cloud.bigquery import CopyJobConfig - from google.cloud.bigquery import OperationType - - client = Config.CLIENT - - source_table_path = f"{client.project}.{Config.DATASET}.test_table" - snapshot_table_path = f"{source_table_path}_snapshot" - - # Create the table before loading so that the column order is predictable. - schema = [ - bigquery.SchemaField("foo", "INTEGER"), - bigquery.SchemaField("bar", "STRING"), - ] - source_table = helpers.retry_403(Config.CLIENT.create_table)( - Table(source_table_path, schema=schema) - ) - self.to_delete.insert(0, source_table) - - # Populate the table with initial data. - rows = [{"foo": 1, "bar": "one"}, {"foo": 2, "bar": "two"}] - load_job = Config.CLIENT.load_table_from_json(rows, source_table) - load_job.result() - - # Now create a snapshot before modifying the original table data. - copy_config = CopyJobConfig() - copy_config.operation_type = OperationType.SNAPSHOT - - copy_job = client.copy_table( - sources=source_table_path, - destination=snapshot_table_path, - job_config=copy_config, - ) - copy_job.result() - - snapshot_table = client.get_table(snapshot_table_path) - self.to_delete.insert(0, snapshot_table) - - # Modify data in original table. - sql = f'INSERT INTO `{source_table_path}`(foo, bar) VALUES (3, "three")' - query_job = client.query(sql) - query_job.result() - - # List rows from the source table and compare them to rows from the snapshot. - rows_iter = client.list_rows(source_table_path) - rows = sorted(row.values() for row in rows_iter) - assert rows == [(1, "one"), (2, "two"), (3, "three")] - - rows_iter = client.list_rows(snapshot_table_path) - rows = sorted(row.values() for row in rows_iter) - assert rows == [(1, "one"), (2, "two")] - - # Now restore the table from the snapshot and it should again contain the old - # set of rows. - copy_config = CopyJobConfig() - copy_config.operation_type = OperationType.RESTORE - copy_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE - - copy_job = client.copy_table( - sources=snapshot_table_path, - destination=source_table_path, - job_config=copy_config, - ) - copy_job.result() - - rows_iter = client.list_rows(source_table_path) - rows = sorted(row.values() for row in rows_iter) - assert rows == [(1, "one"), (2, "two")] - def temp_dataset(self, dataset_id, location=None): project = Config.CLIENT.project dataset_ref = bigquery.DatasetReference(project, dataset_id) @@ -2604,3 +2472,108 @@ def _table_exists(t): return True except NotFound: return False + + +def test_dbapi_create_view(dataset_id): + + query = f""" + CREATE VIEW {dataset_id}.dbapi_create_view + AS SELECT name, SUM(number) AS total + FROM `bigquery-public-data.usa_names.usa_1910_2013` + GROUP BY name; + """ + + Config.CURSOR.execute(query) + assert Config.CURSOR.rowcount == 0, "expected 0 rows" + + +def test_parameterized_types_round_trip(dataset_id): + client = Config.CLIENT + table_id = f"{dataset_id}.test_parameterized_types_round_trip" + fields = ( + ("n", "NUMERIC"), + ("n9", "NUMERIC(9)"), + ("n92", "NUMERIC(9, 2)"), + ("bn", "BIGNUMERIC"), + ("bn9", "BIGNUMERIC(38)"), + ("bn92", "BIGNUMERIC(38, 22)"), + ("s", "STRING"), + ("s9", "STRING(9)"), + ("b", "BYTES"), + ("b9", "BYTES(9)"), + ) + client.query( + "create table {} ({})".format(table_id, ", ".join(" ".join(f) for f in fields)) + ).result() + table = client.get_table(table_id) + table_id2 = table_id + "2" + client.create_table(Table(f"{client.project}.{table_id2}", table.schema)) + table2 = client.get_table(table_id2) + + assert tuple(s._key()[:2] for s in table2.schema) == fields + + +def test_table_snapshots(dataset_id): + from google.cloud.bigquery import CopyJobConfig + from google.cloud.bigquery import OperationType + + client = Config.CLIENT + + source_table_path = f"{client.project}.{dataset_id}.test_table" + snapshot_table_path = f"{source_table_path}_snapshot" + + # Create the table before loading so that the column order is predictable. + schema = [ + bigquery.SchemaField("foo", "INTEGER"), + bigquery.SchemaField("bar", "STRING"), + ] + source_table = helpers.retry_403(Config.CLIENT.create_table)( + Table(source_table_path, schema=schema) + ) + + # Populate the table with initial data. + rows = [{"foo": 1, "bar": "one"}, {"foo": 2, "bar": "two"}] + load_job = Config.CLIENT.load_table_from_json(rows, source_table) + load_job.result() + + # Now create a snapshot before modifying the original table data. + copy_config = CopyJobConfig() + copy_config.operation_type = OperationType.SNAPSHOT + + copy_job = client.copy_table( + sources=source_table_path, + destination=snapshot_table_path, + job_config=copy_config, + ) + copy_job.result() + + # Modify data in original table. + sql = f'INSERT INTO `{source_table_path}`(foo, bar) VALUES (3, "three")' + query_job = client.query(sql) + query_job.result() + + # List rows from the source table and compare them to rows from the snapshot. + rows_iter = client.list_rows(source_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two"), (3, "three")] + + rows_iter = client.list_rows(snapshot_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two")] + + # Now restore the table from the snapshot and it should again contain the old + # set of rows. + copy_config = CopyJobConfig() + copy_config.operation_type = OperationType.RESTORE + copy_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE + + copy_job = client.copy_table( + sources=snapshot_table_path, + destination=source_table_path, + job_config=copy_config, + ) + copy_job.result() + + rows_iter = client.list_rows(source_table_path) + rows = sorted(row.values() for row in rows_iter) + assert rows == [(1, "one"), (2, "two")] diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 6b62eb85b..535685511 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -27,6 +27,7 @@ import warnings import mock +import packaging import requests import pytest import pytz @@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self): parquet_compression="gzip", ) + def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): + pytest.importorskip("pandas", reason="Requires `pandas`") + pytest.importorskip("pyarrow", reason="Requires `pyarrow`") + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + pyarrow_version_patch = mock.patch( + "google.cloud.bigquery.client._PYARROW_VERSION", + packaging.version.parse("2.0.0"), # A known bad version of pyarrow. + ) + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with load_patch, get_table_patch, pyarrow_version_patch: + with warnings.catch_warnings(record=True) as warned: + client.load_table_from_dataframe( + dataframe, self.TABLE_REF, location=self.LOCATION, + ) + + expected_warnings = [ + warning for warning in warned if "pyarrow" in str(warning).lower() + ] + assert len(expected_warnings) == 1 + assert issubclass(expected_warnings[0].category, RuntimeWarning) + msg = str(expected_warnings[0].message) + assert "pyarrow 2.0.0" in msg + assert "data corruption" in msg + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nulls(self): @@ -8039,3 +8076,23 @@ def test_schema_to_json_with_file_object(self): client.schema_to_json(schema_list, fake_file) assert file_content == json.loads(fake_file.getvalue()) + + +def test_upload_chunksize(client): + with mock.patch("google.cloud.bigquery.client.ResumableUpload") as RU: + upload = RU.return_value + + upload.finished = False + + def transmit_next_chunk(transport): + upload.finished = True + result = mock.MagicMock() + result.json.return_value = {} + return result + + upload.transmit_next_chunk = transmit_next_chunk + f = io.BytesIO() + client.load_table_from_file(f, "foo.bar") + + chunk_size = RU.call_args_list[0][0][1] + assert chunk_size == 100 * (1 << 20) diff --git a/tests/unit/test_retry.py b/tests/unit/test_retry.py index 0bef1e5e1..6fb7f93fd 100644 --- a/tests/unit/test_retry.py +++ b/tests/unit/test_retry.py @@ -51,6 +51,10 @@ def test_w_unstructured_requests_connectionerror(self): exc = requests.exceptions.ConnectionError() self.assertTrue(self._call_fut(exc)) + def test_w_unstructured_requests_chunked_encoding_error(self): + exc = requests.exceptions.ChunkedEncodingError() + self.assertTrue(self._call_fut(exc)) + def test_w_auth_transporterror(self): from google.auth.exceptions import TransportError diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 37650cd27..4b1fd833b 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -15,6 +15,7 @@ import datetime import logging import time +import types import unittest import warnings @@ -1862,6 +1863,15 @@ def test__validate_bqstorage_returns_false_when_completely_cached(self): ) ) + def test__validate_bqstorage_returns_false_if_max_results_set(self): + iterator = self._make_one( + max_results=10, first_page_response=None # not cached + ) + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + self.assertFalse(result) + def test__validate_bqstorage_returns_false_if_missing_dependency(self): iterator = self._make_one(first_page_response=None) # not cached @@ -2105,7 +2115,7 @@ def test_to_arrow_w_empty_table(self): @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - def test_to_arrow_max_results_w_create_bqstorage_warning(self): + def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField schema = [ @@ -2119,6 +2129,7 @@ def test_to_arrow_max_results_w_create_bqstorage_warning(self): path = "/foo" api_request = mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() + mock_bqstorage_client = mock.sentinel.bq_storage_client row_iterator = self._make_one( client=mock_client, @@ -2129,7 +2140,7 @@ def test_to_arrow_max_results_w_create_bqstorage_warning(self): ) with warnings.catch_warnings(record=True) as warned: - row_iterator.to_arrow(create_bqstorage_client=True) + row_iterator.to_arrow(bqstorage_client=mock_bqstorage_client) matches = [ warning @@ -2139,6 +2150,49 @@ def test_to_arrow_max_results_w_create_bqstorage_warning(self): and "REST" in str(warning) ] self.assertEqual(len(matches), 1, msg="User warning was not emitted.") + self.assertIn( + __file__, str(matches[0]), msg="Warning emitted with incorrect stacklevel" + ) + mock_client._ensure_bqstorage_client.assert_not_called() + + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + mock_client = _mock_client() + + row_iterator = self._make_one( + client=mock_client, + api_request=api_request, + path=path, + schema=schema, + max_results=42, + ) + + with warnings.catch_warnings(record=True) as warned: + row_iterator.to_arrow(create_bqstorage_client=True) + + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + self.assertFalse(matches) mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @@ -2372,7 +2426,6 @@ def test_to_arrow_w_pyarrow_none(self): @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_iterable(self): from google.cloud.bigquery.schema import SchemaField - import types schema = [ SchemaField("name", "STRING", mode="REQUIRED"), @@ -2415,7 +2468,6 @@ def test_to_dataframe_iterable(self): @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_iterable_with_dtypes(self): from google.cloud.bigquery.schema import SchemaField - import types schema = [ SchemaField("name", "STRING", mode="REQUIRED"), @@ -2527,6 +2579,61 @@ def test_to_dataframe_iterable_w_bqstorage(self): # Don't close the client if it was passed in. bqstorage_client._transport.grpc_channel.close.assert_not_called() + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_to_dataframe_iterable_w_bqstorage_max_results_warning(self): + from google.cloud.bigquery import schema + from google.cloud.bigquery import table as mut + + bqstorage_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) + + iterator_schema = [ + schema.SchemaField("name", "STRING", mode="REQUIRED"), + schema.SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + path = "/foo" + api_request = mock.Mock( + side_effect=[ + { + "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}], + "pageToken": "NEXTPAGE", + }, + {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]}, + ] + ) + row_iterator = mut.RowIterator( + _mock_client(), + api_request, + path, + iterator_schema, + table=mut.TableReference.from_string("proj.dset.tbl"), + selected_fields=iterator_schema, + max_results=25, + ) + + with warnings.catch_warnings(record=True) as warned: + dfs = row_iterator.to_dataframe_iterable(bqstorage_client=bqstorage_client) + + # Was a warning emitted? + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + assert len(matches) == 1, "User warning was not emitted." + assert __file__ in str(matches[0]), "Warning emitted with incorrect stacklevel" + + # Basic check of what we got as a result. + dataframes = list(dfs) + assert len(dataframes) == 2 + assert isinstance(dataframes[0], pandas.DataFrame) + assert isinstance(dataframes[1], pandas.DataFrame) + @mock.patch("google.cloud.bigquery.table.pandas", new=None) def test_to_dataframe_iterable_error_if_pandas_is_none(self): from google.cloud.bigquery.schema import SchemaField @@ -2926,7 +3033,7 @@ def test_to_dataframe_max_results_w_bqstorage_warning(self): self.assertEqual(len(matches), 1, msg="User warning was not emitted.") @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_max_results_w_create_bqstorage_warning(self): + def test_to_dataframe_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField schema = [ @@ -2940,6 +3047,7 @@ def test_to_dataframe_max_results_w_create_bqstorage_warning(self): path = "/foo" api_request = mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() + mock_bqstorage_client = mock.sentinel.bq_storage_client row_iterator = self._make_one( client=mock_client, @@ -2950,7 +3058,7 @@ def test_to_dataframe_max_results_w_create_bqstorage_warning(self): ) with warnings.catch_warnings(record=True) as warned: - row_iterator.to_dataframe(create_bqstorage_client=True) + row_iterator.to_dataframe(bqstorage_client=mock_bqstorage_client) matches = [ warning @@ -2960,6 +3068,46 @@ def test_to_dataframe_max_results_w_create_bqstorage_warning(self): and "REST" in str(warning) ] self.assertEqual(len(matches), 1, msg="User warning was not emitted.") + self.assertIn( + __file__, str(matches[0]), msg="Warning emitted with incorrect stacklevel" + ) + mock_client._ensure_bqstorage_client.assert_not_called() + + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_max_results_w_create_bqstorage_client_no_warning(self): + from google.cloud.bigquery.schema import SchemaField + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + rows = [ + {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, + {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, + ] + path = "/foo" + api_request = mock.Mock(return_value={"rows": rows}) + mock_client = _mock_client() + + row_iterator = self._make_one( + client=mock_client, + api_request=api_request, + path=path, + schema=schema, + max_results=42, + ) + + with warnings.catch_warnings(record=True) as warned: + row_iterator.to_dataframe(create_bqstorage_client=True) + + matches = [ + warning + for warning in warned + if warning.category is UserWarning + and "cannot use bqstorage_client" in str(warning).lower() + and "REST" in str(warning) + ] + self.assertFalse(matches) mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`")