Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class OpenMLDataset(OpenMLBase):
which maps a quality name to a quality value.
dataset: string, optional
Serialized arff dataset string.
minio_url: string, optional
parquet_url: string, optional
URL to the MinIO bucket with dataset files
Comment thread
LennartPurucker marked this conversation as resolved.
Outdated
parquet_file: string, optional
Path to the local parquet file.
Expand Down Expand Up @@ -132,7 +132,7 @@ def __init__(
features_file: Optional[str] = None,
qualities_file: Optional[str] = None,
dataset=None,
minio_url: Optional[str] = None,
parquet_url: Optional[str] = None,
parquet_file: Optional[str] = None,
):
def find_invalid_characters(string, pattern):
Expand Down Expand Up @@ -210,7 +210,7 @@ def find_invalid_characters(string, pattern):
self.data_file = data_file
self.parquet_file = parquet_file
self._dataset = dataset
self._minio_url = minio_url
self._parquet_url = parquet_url

self._features = None # type: Optional[Dict[int, OpenMLDataFeature]]
self._qualities = None # type: Optional[Dict[str, float]]
Expand Down Expand Up @@ -329,7 +329,7 @@ def _download_data(self) -> None:
from .functions import _get_dataset_arff, _get_dataset_parquet

self.data_file = _get_dataset_arff(self)
if self._minio_url is not None:
if self._parquet_url is not None:
self.parquet_file = _get_dataset_parquet(self)

def _get_arff(self, format: str) -> Dict:
Expand Down
10 changes: 5 additions & 5 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def get_dataset(
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)

arff_file = _get_dataset_arff(description) if download_data else None
if "oml:minio_url" in description and download_data:
if "oml:parquet_url" in description and download_data:
try:
parquet_file = _get_dataset_parquet(
description, download_all_files=download_all_files
Expand Down Expand Up @@ -1062,18 +1062,18 @@ def _get_dataset_parquet(

download_all_files: bool, optional (default=False)
If `True`, download all data found in the bucket to which the description's
``minio_url`` points, only download the parquet file otherwise.
``parquet_url`` points, only download the parquet file otherwise.

Returns
-------
output_filename : string, optional
Location of the Parquet file if successfully downloaded, None otherwise.
"""
if isinstance(description, dict):
url = cast(str, description.get("oml:minio_url"))
url = cast(str, description.get("oml:parquet_url"))
did = description.get("oml:id")
elif isinstance(description, OpenMLDataset):
url = cast(str, description._minio_url)
url = cast(str, description._parquet_url)
did = description.dataset_id
else:
raise TypeError("`description` should be either OpenMLDataset or Dict.")
Expand Down Expand Up @@ -1316,7 +1316,7 @@ def _create_dataset_from_description(
cache_format=cache_format,
features_file=features_file,
qualities_file=qualities_file,
minio_url=description.get("oml:minio_url"),
parquet_url=description.get("oml:parquet_url"),
parquet_file=parquet_file,
)

Expand Down
12 changes: 6 additions & 6 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self):

def test__get_dataset_parquet_not_cached(self):
description = {
"oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
"oml:parquet_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
"oml:id": "20",
}
path = _get_dataset_parquet(description, cache_directory=self.workdir)
Expand All @@ -450,10 +450,10 @@ def test__get_dataset_parquet_not_cached(self):
def test__get_dataset_parquet_is_cached(self, patch):
openml.config.set_root_cache_directory(self.static_cache_dir)
patch.side_effect = RuntimeError(
"_download_minio_file should not be called when loading from cache"
"_download_parquet_url should not be called when loading from cache"
)
description = {
"oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
"oml:parquet_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
"oml:id": "30",
}
path = _get_dataset_parquet(description, cache_directory=None)
Expand All @@ -462,7 +462,7 @@ def test__get_dataset_parquet_is_cached(self, patch):

def test__get_dataset_parquet_file_does_not_exist(self):
description = {
"oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
"oml:parquet_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
"oml:id": "20",
}
path = _get_dataset_parquet(description, cache_directory=self.workdir)
Expand Down Expand Up @@ -1416,7 +1416,7 @@ def test_get_dataset_cache_format_feather(self):
# The parquet file on minio with ID 128 is not the iris dataset from the test server.
dataset = openml.datasets.get_dataset(128, cache_format="feather")
# Workaround
dataset._minio_url = None
dataset._parquet_url = None
dataset.parquet_file = None
dataset.get_data()

Expand Down Expand Up @@ -1561,7 +1561,7 @@ def test_get_dataset_parquet(self):
# There is no parquet-copy of the test server yet.
openml.config.server = self.production_server
dataset = openml.datasets.get_dataset(61)
self.assertIsNotNone(dataset._minio_url)
self.assertIsNotNone(dataset._parquet_url)
self.assertIsNotNone(dataset.parquet_file)
self.assertTrue(os.path.isfile(dataset.parquet_file))

Expand Down