Skip to content
Merged
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changelog
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
* FIX #964: Validate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.
* DOC #973 : Change the task used in the welcome page example so it no longer fails using numerical dataset.
* ADD #1009: Add the option to skip downloading the dataset qualities. When ``download_qualities`` is ``False``, the qualities file is neither downloaded nor loaded from the cache.
0.11.0
~~~~~~
* ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.
Expand Down
25 changes: 19 additions & 6 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ def _name_to_id(
error_if_multiple : bool (default=False)
If `False`, if multiple datasets match, return the least recent active dataset.
If `True`, if multiple datasets match, raise an error.
download_qualities : bool, optional (default=True)
    If `True`, also download the qualities.xml file. If `False`, the qualities file is neither downloaded nor loaded from the cache.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please update this and the other references where it says the cached version is used if false is passed, as this is not the behavior anymore.


Returns
-------
Expand All @@ -310,7 +312,7 @@ def _name_to_id(


def get_datasets(
    dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
) -> List[OpenMLDataset]:
    """Download datasets.

    This function iterates :meth:`openml.datasets.get_dataset`.

    Parameters
    ----------
    dataset_ids : iterable
        Integers or strings representing dataset ids or dataset names.
        If dataset names are specified, the least recent still active dataset version is returned.
    download_data : bool, optional (default=True)
        If True, also download the data file. Beware that some datasets are large and it might
        make the operation noticeably slower. Metadata is also still retrieved.
        If False, create the OpenMLDataset and only populate it with the metadata.
        The data may later be retrieved through the `OpenMLDataset.get_data` method.
    download_qualities : bool, optional (default=True)
        If True, also download the qualities.xml file. If False, the qualities file is
        neither downloaded nor loaded from the cache.

    Returns
    -------
    datasets : list of datasets
        A list of dataset objects.
    """
    datasets = []
    for dataset_id in dataset_ids:
        # Forward download_qualities as a keyword so the intent is explicit at the call site.
        datasets.append(
            get_dataset(dataset_id, download_data, download_qualities=download_qualities)
        )
    return datasets


Expand All @@ -345,6 +351,7 @@ def get_dataset(
version: int = None,
error_if_multiple: bool = False,
cache_format: str = "pickle",
download_qualities: bool = True,
) -> OpenMLDataset:
""" Download the OpenML dataset representation, optionally also download actual data file.

Expand Down Expand Up @@ -405,7 +412,9 @@ def get_dataset(
features_file = _get_dataset_features_file(did_cache_dir, dataset_id)

try:
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
qualities_file = _get_dataset_qualities_file(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes more sense to use download_qualities here and not call the _get_dataset_qualities_file if we don't want to process the qualities.

did_cache_dir, dataset_id, download_qualities
)
except OpenMLServerException as e:
if e.code == 362 and str(e) == "No qualities found - None":
logger.warning("No qualities found for dataset {}".format(dataset_id))
Expand Down Expand Up @@ -981,7 +990,7 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:
return features_file


def _get_dataset_qualities_file(did_cache_dir, dataset_id):
def _get_dataset_qualities_file(did_cache_dir, dataset_id, download_qualities=True):
"""API call to load dataset qualities. Loads from cache or downloads them.

Features are metafeatures (number of features, number of classes, ...)
Expand All @@ -996,11 +1005,17 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
dataset_id : int
Dataset ID

download_qualities : bool
wheather to download/use cahsed version or not.
Returns
-------
str
Path of the cached qualities file
"""
# return empty path to avoied used cahched version, this will make the output consistent
# regardless the cache state.
if not download_qualities:
return ""
# Dataset qualities are subject to change and must be fetched every time
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
try:
Expand All @@ -1009,10 +1024,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
except (OSError, IOError):
url_extension = "data/qualities/{}".format(dataset_id)
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")

with io.open(qualities_file, "w", encoding="utf8") as fh:
fh.write(qualities_xml)

return qualities_file


Expand Down
5 changes: 5 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,11 @@ def test__get_dataset_qualities(self):
qualities_xml_path = os.path.join(self.workdir, "qualities.xml")
self.assertTrue(os.path.exists(qualities_xml_path))

def test__get_dataset_qualities_skip_download(self):
    """When download_qualities is False, an empty path is returned and no file is fetched."""
    # Pass download_qualities as a keyword argument so the intent is explicit
    # (requested in review).
    qualities = _get_dataset_qualities_file(self.workdir, 2, download_qualities=False)
    self.assertIsInstance(qualities, str)
    self.assertEqual(qualities, "")

def test_deletion_of_cache_dir(self):
# Simple removal
did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,)
Expand Down