Skip to content

Commit 80ae046

Browse files
a-moadelMohamed AdelPGijsbersmadel0093
authored
Feature/give possibility to not download the dataset qualities (#1017)
* update getdatasets function to give possibility to not download the dataset qualities * make download qualities defaulted to True * Using cahced version if exist * Updated the comments for get_dataset and get_datasets to include new parameter * Update openml/datasets/functions.py Co-authored-by: PGijsbers <p.gijsbers@tue.nl> * Update openml/datasets/functions.py Co-authored-by: PGijsbers <p.gijsbers@tue.nl> * update get_dataset_qualities to have consistent output regardless the cache status , adding unit test for get_dataset_qualities * run pre-commit * fix parameter passing * Updated the comments for get_dataset and get_datasets to include new parameter, remove unnecessarily call for download qualities Co-authored-by: Mohamed Adel <mohamed.adel3@booking.com> Co-authored-by: PGijsbers <p.gijsbers@tue.nl> Co-authored-by: mohamed adel <m.adel0093@gmail.com>
1 parent 47cda65 commit 80ae046

File tree

3 files changed

+20
-5
lines changed

3 files changed

+20
-5
lines changed

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Changelog
1313
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
1414
* FIX #964 : Validate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.
1515
* DOC #973 : Change the task used in the welcome page example so it no longer fails using numerical dataset.
16+
* ADD #1009 : Add the possibility to not download the dataset qualities. The cached version is used even if the download attribute is false.
1617
0.11.0
1718
~~~~~~
1819
* ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.

openml/datasets/functions.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ def _name_to_id(
290290
error_if_multiple : bool (default=False)
291291
If `False`, if multiple datasets match, return the least recent active dataset.
292292
If `True`, if multiple datasets match, raise an error.
293+
download_qualities : bool, optional (default=True)
294+
If `True`, also download the qualities.xml file. If `False`, skip the qualities.xml file.
293295
294296
Returns
295297
-------
@@ -310,7 +312,7 @@ def _name_to_id(
310312

311313

312314
def get_datasets(
313-
dataset_ids: List[Union[str, int]], download_data: bool = True,
315+
dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
314316
) -> List[OpenMLDataset]:
315317
"""Download datasets.
316318
@@ -326,6 +328,8 @@ def get_datasets(
326328
make the operation noticeably slower. Metadata is also still retrieved.
327329
If False, create the OpenMLDataset and only populate it with the metadata.
328330
The data may later be retrieved through the `OpenMLDataset.get_data` method.
331+
download_qualities : bool, optional (default=True)
332+
If True, also download the qualities.xml file. If False, skip the qualities.xml file.
329333
330334
Returns
331335
-------
@@ -334,7 +338,9 @@ def get_datasets(
334338
"""
335339
datasets = []
336340
for dataset_id in dataset_ids:
337-
datasets.append(get_dataset(dataset_id, download_data))
341+
datasets.append(
342+
get_dataset(dataset_id, download_data, download_qualities=download_qualities)
343+
)
338344
return datasets
339345

340346

@@ -345,6 +351,7 @@ def get_dataset(
345351
version: int = None,
346352
error_if_multiple: bool = False,
347353
cache_format: str = "pickle",
354+
download_qualities: bool = True,
348355
) -> OpenMLDataset:
349356
""" Download the OpenML dataset representation, optionally also download actual data file.
350357
@@ -405,7 +412,10 @@ def get_dataset(
405412
features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
406413

407414
try:
408-
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
415+
if download_qualities:
416+
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
417+
else:
418+
qualities_file = ""
409419
except OpenMLServerException as e:
410420
if e.code == 362 and str(e) == "No qualities found - None":
411421
logger.warning("No qualities found for dataset {}".format(dataset_id))
@@ -996,6 +1006,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
9961006
dataset_id : int
9971007
Dataset ID
9981008
1009+
download_qualities : bool
1010+
Whether to download/use the cached version or not.
9991011
Returns
10001012
-------
10011013
str
@@ -1009,10 +1021,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
10091021
except (OSError, IOError):
10101022
url_extension = "data/qualities/{}".format(dataset_id)
10111023
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
1012-
10131024
with io.open(qualities_file, "w", encoding="utf8") as fh:
10141025
fh.write(qualities_xml)
1015-
10161026
return qualities_file
10171027

10181028

tests/test_datasets/test_dataset_functions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,10 @@ def test__get_dataset_qualities(self):
433433
qualities_xml_path = os.path.join(self.workdir, "qualities.xml")
434434
self.assertTrue(os.path.exists(qualities_xml_path))
435435

436+
def test__get_dataset_skip_download(self):
437+
qualities = openml.datasets.get_dataset(2, download_qualities=False).qualities
438+
self.assertIsNone(qualities)
439+
436440
def test_deletion_of_cache_dir(self):
437441
# Simple removal
438442
did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,)

0 commit comments

Comments
 (0)