Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Improving the performance of check_datasets_active, modifying unit test
  • Loading branch information
ArlindKadra committed Oct 28, 2020
commit 7dbb295bf75f564f9a80a0097fae1960dbbb7a2b
19 changes: 16 additions & 3 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,27 +333,40 @@ def _load_features_from_file(features_file: str) -> Dict:
return xml_dict["oml:data_features"]


def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
def check_datasets_active(
dataset_ids: List[int],
raise_error_if_not_exist: bool = True,
) -> Dict[int, bool]:
"""
Check if the dataset ids provided are active.

Raises an error if a dataset_id in the given list
of dataset_ids does not exist on the server.

Parameters
----------
dataset_ids : List[int]
A list of integers representing dataset ids.
raise_error_if_not_exist : bool, optional (default=True)
Comment thread
PGijsbers marked this conversation as resolved.
Outdated
Flag that if activated can raise an error, if one or more of the
given dataset ids do not exist on the server.

Returns
-------
dict
A dictionary with items {did: bool}
"""
dataset_list = list_datasets(status="all")
dataset_list = list_datasets(
dataset_ids=dataset_ids,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
dataset_ids=dataset_ids,
data_id=dataset_ids,

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually this is a rather important one that hadn't caught my eye first time around 😅

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strange, I got a suggestion for that and the function had no argument like that at all :S. Nice catch.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

like because of the **kwargs (used to allow all filters)

status="all",
)
active = {}

for did in dataset_ids:
dataset = dataset_list.get(did, None)
if dataset is None:
raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did))
if raise_error_if_not_exist:
raise ValueError(f'Could not find dataset {did} in OpenML dataset list.')
else:
active[did] = dataset["status"] == "active"

Expand Down
6 changes: 5 additions & 1 deletion tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,13 @@ def test_list_datasets_empty(self):
def test_check_datasets_active(self):
# Have to test on live because there is no deactivated dataset on the test server.
openml.config.server = self.production_server
active = openml.datasets.check_datasets_active([2, 17])
active = openml.datasets.check_datasets_active(
[2, 17, 79],
raise_error_if_not_exist=False,
)
self.assertTrue(active[2])
self.assertFalse(active[17])
self.assertIsNone(active.get(79))
self.assertRaisesRegex(
ValueError,
"Could not find dataset 79 in OpenML dataset list.",
Expand Down