openml · PGijsbers · Aug 17, 2023 · Aug 10, 2023 · Aug 11, 2023 · Aug 11, 2023
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -96,7 +96,7 @@ class OpenMLDataset(OpenMLBase):
         which maps a quality name to a quality value.
     dataset: string, optional
         Serialized arff dataset string.
-    minio_url: string, optional
+    parquet_url: string, optional
-        URL to the MinIO bucket with dataset files
+        URL to the dataset's parquet file in the MinIO bucket
     parquet_file: string, optional
         Path to the local parquet file.
@@ -132,7 +132,7 @@ def __init__(
         features_file: Optional[str] = None,
         qualities_file: Optional[str] = None,
         dataset=None,
-        minio_url: Optional[str] = None,
+        parquet_url: Optional[str] = None,
         parquet_file: Optional[str] = None,
     ):
         def find_invalid_characters(string, pattern):
@@ -210,7 +210,7 @@ def find_invalid_characters(string, pattern):
         self.data_file = data_file
         self.parquet_file = parquet_file
         self._dataset = dataset
-        self._minio_url = minio_url
+        self._parquet_url = parquet_url
 
         self._features = None  # type: Optional[Dict[int, OpenMLDataFeature]]
         self._qualities = None  # type: Optional[Dict[str, float]]
@@ -329,7 +329,7 @@ def _download_data(self) -> None:
         from .functions import _get_dataset_arff, _get_dataset_parquet
 
         self.data_file = _get_dataset_arff(self)
-        if self._minio_url is not None:
+        if self._parquet_url is not None:
             self.parquet_file = _get_dataset_parquet(self)
 
     def _get_arff(self, format: str) -> Dict:

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -495,7 +495,7 @@ def get_dataset(
             qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
 
         arff_file = _get_dataset_arff(description) if download_data else None
-        if "oml:minio_url" in description and download_data:
+        if "oml:parquet_url" in description and download_data:
             try:
                 parquet_file = _get_dataset_parquet(
                     description, download_all_files=download_all_files
@@ -1062,18 +1062,18 @@ def _get_dataset_parquet(
 
     download_all_files: bool, optional (default=False)
         If `True`, download all data found in the bucket to which the description's
-        ``minio_url`` points, only download the parquet file otherwise.
+        ``parquet_url`` points, only download the parquet file otherwise.
 
     Returns
     -------
     output_filename : string, optional
         Location of the Parquet file if successfully downloaded, None otherwise.
     """
     if isinstance(description, dict):
-        url = cast(str, description.get("oml:minio_url"))
+        url = cast(str, description.get("oml:parquet_url"))
         did = description.get("oml:id")
     elif isinstance(description, OpenMLDataset):
-        url = cast(str, description._minio_url)
+        url = cast(str, description._parquet_url)
         did = description.dataset_id
     else:
         raise TypeError("`description` should be either OpenMLDataset or Dict.")
@@ -1316,7 +1316,7 @@ def _create_dataset_from_description(
         cache_format=cache_format,
         features_file=features_file,
         qualities_file=qualities_file,
-        minio_url=description.get("oml:minio_url"),
+        parquet_url=description.get("oml:parquet_url"),
         parquet_file=parquet_file,
     )
 

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -439,7 +439,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):
 
     def test__get_dataset_parquet_not_cached(self):
         description = {
-            "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
+            "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
             "oml:id": "20",
         }
         path = _get_dataset_parquet(description, cache_directory=self.workdir)
@@ -450,10 +450,10 @@ def test__get_dataset_parquet_not_cached(self):
     def test__get_dataset_parquet_is_cached(self, patch):
         openml.config.set_root_cache_directory(self.static_cache_dir)
         patch.side_effect = RuntimeError(
-            "_download_minio_file should not be called when loading from cache"
+            "_download_parquet_url should not be called when loading from cache"
         )
         description = {
-            "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
+            "oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
             "oml:id": "30",
         }
         path = _get_dataset_parquet(description, cache_directory=None)
@@ -462,7 +462,7 @@ def test__get_dataset_parquet_is_cached(self, patch):
 
     def test__get_dataset_parquet_file_does_not_exist(self):
         description = {
-            "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
+            "oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
             "oml:id": "20",
         }
         path = _get_dataset_parquet(description, cache_directory=self.workdir)
@@ -1416,7 +1416,7 @@ def test_get_dataset_cache_format_feather(self):
         # The parquet file on minio with ID 128 is not the iris dataset from the test server.
         dataset = openml.datasets.get_dataset(128, cache_format="feather")
         # Workaround
-        dataset._minio_url = None
+        dataset._parquet_url = None
         dataset.parquet_file = None
         dataset.get_data()
 
@@ -1561,7 +1561,7 @@ def test_get_dataset_parquet(self):
         # There is no parquet-copy of the test server yet.
         openml.config.server = self.production_server
         dataset = openml.datasets.get_dataset(61)
-        self.assertIsNotNone(dataset._minio_url)
+        self.assertIsNotNone(dataset._parquet_url)
         self.assertIsNotNone(dataset.parquet_file)
         self.assertTrue(os.path.isfile(dataset.parquet_file))