Feature/give possibility to not download the dataset qualities (#1017)

a-moadel · Mohamed Adel · PGijsbers · web-flow · commit 80ae0464d2ce · 2021-02-11T11:07:47.000+01:00
* update getdatasets function to give possibility to not download the dataset qualities

* make download qualities defaulted to True

* Use the cached version if it exists

* Updated the comments for get_dataset and get_datasets to include new parameter

* Update openml/datasets/functions.py

Co-authored-by: PGijsbers <p.gijsbers@tue.nl>

* Update openml/datasets/functions.py

Co-authored-by: PGijsbers <p.gijsbers@tue.nl>

* update get_dataset_qualities to have consistent output regardless of the cache status, adding unit test for get_dataset_qualities

* run pre-commit

* fix parameter passing

* Updated the comments for get_dataset and get_datasets to include new parameter, remove unnecessary call for download qualities

Co-authored-by: Mohamed Adel <mohamed.adel3@booking.com>
Co-authored-by: PGijsbers <p.gijsbers@tue.nl>
Co-authored-by: mohamed adel <m.adel0093@gmail.com>
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -13,6 +13,7 @@ Changelog
 * MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
 * FIX #964 : AValidate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.
 * DOC #973 : Change the task used in the welcome page example so it no longer fails using numerical dataset.
+* ADD #1009 : Give possibility to not download the dataset qualities. The cached version is used even if the download attribute is false.
 0.11.0
 ~~~~~~
 * ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -290,6 +290,8 @@ def _name_to_id(
     error_if_multiple : bool (default=False)
         If `False`, if multiple datasets match, return the least recent active dataset.
         If `True`, if multiple datasets match, raise an error.
+    download_qualities : bool, optional (default=True)
+        If `True`, also download the qualities.xml file. If `False`, skip the qualities.xml file.
 
     Returns
     -------
@@ -310,7 +312,7 @@ def _name_to_id(
 
 
 def get_datasets(
-    dataset_ids: List[Union[str, int]], download_data: bool = True,
+    dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
 ) -> List[OpenMLDataset]:
     """Download datasets.
 
@@ -326,6 +328,8 @@ def get_datasets(
         make the operation noticeably slower. Metadata is also still retrieved.
         If False, create the OpenMLDataset and only populate it with the metadata.
         The data may later be retrieved through the `OpenMLDataset.get_data` method.
+    download_qualities : bool, optional (default=True)
+        If True, also download the qualities.xml file. If False, skip the qualities.xml file.
 
     Returns
     -------
@@ -334,7 +338,9 @@ def get_datasets(
     """
     datasets = []
     for dataset_id in dataset_ids:
-        datasets.append(get_dataset(dataset_id, download_data))
+        datasets.append(
+            get_dataset(dataset_id, download_data, download_qualities=download_qualities)
+        )
     return datasets
 
 
@@ -345,6 +351,7 @@ def get_dataset(
     version: int = None,
     error_if_multiple: bool = False,
     cache_format: str = "pickle",
+    download_qualities: bool = True,
 ) -> OpenMLDataset:
     """ Download the OpenML dataset representation, optionally also download actual data file.
 
@@ -405,7 +412,10 @@ def get_dataset(
         features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
 
         try:
-            qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
+            if download_qualities:
+                qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
+            else:
+                qualities_file = ""
         except OpenMLServerException as e:
             if e.code == 362 and str(e) == "No qualities found - None":
                 logger.warning("No qualities found for dataset {}".format(dataset_id))
@@ -996,6 +1006,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
     dataset_id : int
         Dataset ID
 
+    download_qualities : bool
+        Whether to download/use the cached version or not.
     Returns
     -------
     str
@@ -1009,10 +1021,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
     except (OSError, IOError):
         url_extension = "data/qualities/{}".format(dataset_id)
         qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
-
         with io.open(qualities_file, "w", encoding="utf8") as fh:
             fh.write(qualities_xml)
-
     return qualities_file
 
 
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -433,6 +433,10 @@ def test__get_dataset_qualities(self):
         qualities_xml_path = os.path.join(self.workdir, "qualities.xml")
         self.assertTrue(os.path.exists(qualities_xml_path))
 
+    def test__get_dataset_skip_download(self):
+        qualities = openml.datasets.get_dataset(2, download_qualities=False).qualities
+        self.assertIsNone(qualities)
+
     def test_deletion_of_cache_dir(self):
         # Simple removal
         did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,)