Skip to content
Merged
1 change: 1 addition & 0 deletions doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Changelog
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
* FIX #964: Validate `ignore_attribute`, `default_target_attribute`, `row_id_attribute` are set to attributes that exist on the dataset when calling ``create_dataset``.
* DOC #973 : Change the task used in the welcome page example so it no longer fails using numerical dataset.
* ADD #1009: Add the option to skip downloading the dataset qualities. When ``download_qualities`` is ``False``, the qualities file is neither downloaded nor loaded from the cache.
0.11.0
~~~~~~
* ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.
Expand Down
25 changes: 19 additions & 6 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@ def _name_to_id(
error_if_multiple : bool (default=False)
If `False`, if multiple datasets match, return the least recent active dataset.
If `True`, if multiple datasets match, raise an error.
download_qualities : bool, optional (default=True)
    If `True`, also download the qualities.xml file. If `False`, the qualities file is neither downloaded nor loaded from the cache.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please update this and the other references where it says the cached version is used if false is passed, as this is not the behavior anymore.


Returns
-------
Expand All @@ -310,7 +312,7 @@ def _name_to_id(


def get_datasets(
    dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
) -> List[OpenMLDataset]:
    """Download datasets.

    This function iterates :meth:`openml.datasets.get_dataset`.

    Parameters
    ----------
    dataset_ids : iterable
        Integers or strings representing dataset ids or dataset names.
        If dataset names are specified, the least recent still active dataset version is returned.
    download_data : bool, optional (default=True)
        If True, also download the data file. Beware that some datasets are large and it might
        make the operation noticeably slower. Metadata is also still retrieved.
        If False, create the OpenMLDataset and only populate it with the metadata.
        The data may later be retrieved through the `OpenMLDataset.get_data` method.
    download_qualities : bool, optional (default=True)
        If True, also download the qualities.xml file. If False, the qualities file is
        neither downloaded nor loaded from the cache.

    Returns
    -------
    datasets : list of datasets
        A list of dataset objects.
    """
    datasets = []
    for dataset_id in dataset_ids:
        # Forward download_qualities as a keyword so the intent is explicit at the call site.
        datasets.append(
            get_dataset(dataset_id, download_data, download_qualities=download_qualities)
        )
    return datasets


Expand All @@ -345,6 +351,7 @@ def get_dataset(
version: int = None,
error_if_multiple: bool = False,
cache_format: str = "pickle",
download_qualities: bool = True,
) -> OpenMLDataset:
""" Download the OpenML dataset representation, optionally also download actual data file.

Expand Down Expand Up @@ -405,7 +412,9 @@ def get_dataset(
features_file = _get_dataset_features_file(did_cache_dir, dataset_id)

try:
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
qualities_file = _get_dataset_qualities_file(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes more sense to use download_qualities here and not call the _get_dataset_qualities_file if we don't want to process the qualities.

did_cache_dir, dataset_id, download_qualities
)
except OpenMLServerException as e:
if e.code == 362 and str(e) == "No qualities found - None":
logger.warning("No qualities found for dataset {}".format(dataset_id))
Expand Down Expand Up @@ -981,7 +990,7 @@ def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:
return features_file


def _get_dataset_qualities_file(did_cache_dir, dataset_id):
def _get_dataset_qualities_file(did_cache_dir, dataset_id, download_qualities=True):
"""API call to load dataset qualities. Loads from cache or downloads them.

Features are metafeatures (number of features, number of classes, ...)
Expand All @@ -996,11 +1005,17 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
dataset_id : int
Dataset ID

download_qualities : bool
wheather to download/use cahsed version or not.
Returns
-------
str
Path of the cached qualities file
"""
# return empty path to avoied used cahched version, this will make the output consistent
# regardless the cache state.
if not download_qualities:
return ""
# Dataset qualities are subject to change and must be fetched every time
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
try:
Expand All @@ -1009,10 +1024,8 @@ def _get_dataset_qualities_file(did_cache_dir, dataset_id):
except (OSError, IOError):
url_extension = "data/qualities/{}".format(dataset_id)
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")

with io.open(qualities_file, "w", encoding="utf8") as fh:
fh.write(qualities_xml)

return qualities_file


Expand Down
5 changes: 5 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,11 @@ def test__get_dataset_qualities(self):
qualities_xml_path = os.path.join(self.workdir, "qualities.xml")
self.assertTrue(os.path.exists(qualities_xml_path))

def test__get_dataset_qualities_skip_download(self):
    """When download_qualities is False, an empty path is returned and no file is fetched."""
    # Pass download_qualities as a keyword argument so the intent is explicit
    # (requested in review).
    qualities = _get_dataset_qualities_file(self.workdir, 2, download_qualities=False)
    self.assertIsInstance(qualities, str)
    self.assertEqual(qualities, "")

def test_deletion_of_cache_dir(self):
# Simple removal
did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, 1,)
Expand Down