Skip to content
Next Next commit
Towards lazy-by-default for dataset loading
  • Loading branch information
PGijsbers committed Sep 18, 2024
commit 888eb9d6ca9ab91a56e89422a1346a3d50c37a1a
40 changes: 9 additions & 31 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,8 @@ def _name_to_id(

def get_datasets(
dataset_ids: list[str | int],
download_data: bool = True, # noqa: FBT001, FBT002
download_qualities: bool = True, # noqa: FBT001, FBT002
download_data: bool = False, # noqa: FBT001, FBT002
download_qualities: bool = False, # noqa: FBT001, FBT002
) -> list[OpenMLDataset]:
"""Download datasets.

Expand Down Expand Up @@ -450,14 +450,14 @@ def get_datasets(


@openml.utils.thread_safe_if_oslo_installed
def get_dataset( # noqa: C901, PLR0912, PLR0915
def get_dataset( # noqa: C901, PLR0912
dataset_id: int | str,
download_data: bool | None = None, # Optional for deprecation warning; later again only bool
download_data: bool = False, # noqa: FBT002, FBT001
version: int | None = None,
error_if_multiple: bool = False, # noqa: FBT002, FBT001
cache_format: Literal["pickle", "feather"] = "pickle",
download_qualities: bool | None = None, # Same as above
download_features_meta_data: bool | None = None, # Same as above
download_qualities: bool = False, # noqa: FBT002, FBT001
download_features_meta_data: bool = False, # noqa: FBT002, FBT001
download_all_files: bool = False, # noqa: FBT002, FBT001
force_refresh_cache: bool = False, # noqa: FBT001, FBT002
) -> OpenMLDataset:
Expand Down Expand Up @@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
----------
dataset_id : int or str
Dataset ID of the dataset to download
download_data : bool (default=True)
download_data : bool (default=False)
If True, also download the data file. Beware that some datasets are large and it might
make the operation noticeably slower. Metadata is also still retrieved.
If False, create the OpenMLDataset and only populate it with the metadata.
Expand All @@ -499,12 +499,12 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
Format for caching the dataset - may be feather or pickle
Note that the default 'pickle' option may load slower than feather when
no.of.rows is very high.
download_qualities : bool (default=True)
download_qualities : bool (default=False)
Option to download 'qualities' meta-data in addition to the minimal dataset description.
If True, download and cache the qualities file.
If False, create the OpenMLDataset without qualities metadata. The data may later be added
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
download_features_meta_data : bool (default=True)
download_features_meta_data : bool (default=False)
Option to download 'features' meta-data in addition to the minimal dataset description.
If True, download and cache the features file.
If False, create the OpenMLDataset without features metadata. The data may later be added
Expand All @@ -523,28 +523,6 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
dataset : :class:`openml.OpenMLDataset`
The downloaded dataset.
"""
# TODO(0.15): Remove the deprecation warning and make the default False; adjust types above
# and documentation. Also remove None-to-True-cases below
if any(
download_flag is None
for download_flag in [download_data, download_qualities, download_features_meta_data]
):
warnings.warn(
"Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu"
"res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy "
"loading. To disable this message until version 0.15 explicitly set `download_data`, "
"`download_qualities`, and `download_features_meta_data` to a bool while calling "
"`get_dataset`.",
FutureWarning,
stacklevel=2,
)

download_data = True if download_data is None else download_data
download_qualities = True if download_qualities is None else download_qualities
download_features_meta_data = (
True if download_features_meta_data is None else download_features_meta_data
)

if download_all_files:
warnings.warn(
"``download_all_files`` is experimental and is likely to break with new releases.",
Expand Down
64 changes: 34 additions & 30 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,24 @@ def test_illegal_length_tag(self):
except openml.exceptions.OpenMLServerException as e:
assert e.code == 477

def _dataset_file_is_downloaded(self, did: int, file: str):
    """Return True if *file* exists in the local cache directory of dataset *did*."""
    dataset_cache = Path(openml.config.get_cache_directory(), "datasets", str(did))
    cached_file = dataset_cache / file
    return cached_file.exists()

def _dataset_description_is_downloaded(self, did: int):
    """Return True if the description XML of dataset *did* is cached locally."""
    description_file = "description.xml"
    return self._dataset_file_is_downloaded(did, description_file)

def _dataset_qualities_is_downloaded(self, did: int):
    """Return True if the qualities XML of dataset *did* is cached locally."""
    qualities_file = "qualities.xml"
    return self._dataset_file_is_downloaded(did, qualities_file)

def _dataset_features_is_downloaded(self, did: int):
    """Return True if the features XML of dataset *did* is cached locally."""
    features_file = "features.xml"
    return self._dataset_file_is_downloaded(did, features_file)

def _dataset_data_file_is_downloaded(self, did: int):
    """Return True if a data file (parquet or ARFF) of dataset *did* is cached locally."""
    # The server may provide the data as parquet or as ARFF; either counts.
    return any(
        self._dataset_file_is_downloaded(did, data_file)
        for data_file in ("dataset.pq", "dataset.arff")
    )

def _datasets_retrieved_successfully(self, dids, metadata_only=True):
Comment thread
LennartPurucker marked this conversation as resolved.
"""Checks that all files for the given dids have been downloaded.

Expand All @@ -179,33 +197,14 @@ def _datasets_retrieved_successfully(self, dids, metadata_only=True):
- absence of data arff if metadata_only, else it must be present too.
"""
for did in dids:
assert os.path.exists(
os.path.join(
openml.config.get_cache_directory(), "datasets", str(did), "description.xml"
)
)
assert os.path.exists(
os.path.join(
openml.config.get_cache_directory(), "datasets", str(did), "qualities.xml"
)
)
assert os.path.exists(
os.path.join(
openml.config.get_cache_directory(), "datasets", str(did), "features.xml"
)
)
assert self._dataset_description_is_downloaded(did)
assert self._dataset_qualities_is_downloaded(did)
assert self._dataset_features_is_downloaded(did)

data_assert = self.assertFalse if metadata_only else self.assertTrue
data_assert(
os.path.exists(
os.path.join(
openml.config.get_cache_directory(),
"datasets",
str(did),
"dataset.arff",
),
),
)
if metadata_only:
assert not self._dataset_data_file_is_downloaded(did)
else:
assert self._dataset_data_file_is_downloaded(did)

@pytest.mark.production()
def test__name_to_id_with_deactivated(self):
Expand Down Expand Up @@ -588,9 +587,8 @@ def test_deletion_of_cache_dir(self):
)
assert not os.path.exists(did_cache_dir)

# Use _get_dataset_arff to load the description, trigger an exception in the
# test target and have a slightly higher coverage
@mock.patch("openml.datasets.functions._get_dataset_arff")
# get_dataset_description is the only data guaranteed to be downloaded
@mock.patch("openml.datasets.functions._get_dataset_description")
def test_deletion_of_cache_dir_faulty_download(self, patch):
    # `patch` is the mocked `_get_dataset_description` (see decorator above):
    # simulate a failure during the one download `get_dataset` always performs.
    patch.side_effect = Exception("Boom!")
    # The exception must propagate unchanged out of `get_dataset`.
    self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
Expand Down Expand Up @@ -1498,7 +1496,7 @@ def test_data_edit_critical_field(self):
os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)),
)

def test_data_edit_errors(self):
def test_data_edit_requires_field(self):
Comment thread
LennartPurucker marked this conversation as resolved.
# Check server exception when no field to edit is provided
self.assertRaisesRegex(
OpenMLServerException,
Expand All @@ -1509,6 +1507,9 @@ def test_data_edit_errors(self):
edit_dataset,
data_id=64, # blood-transfusion-service-center
)


def test_data_edit_requires_valid_dataset(self):
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
OpenMLServerException,
Expand All @@ -1518,6 +1519,8 @@ def test_data_edit_errors(self):
description="xor operation dataset",
)


def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
# Need to own a dataset to be able to edit meta-data
# Will be creating a forked version of an existing dataset to allow the unit test user
# to edit meta-data of a dataset
Expand All @@ -1543,6 +1546,7 @@ def test_data_edit_errors(self):
default_target_attribute="y",
)

def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self):
# Check server exception when a non-owner or non-admin tries to edit critical fields
self.assertRaisesRegex(
OpenMLServerException,
Expand Down