Skip to content

Commit f2af798

Browse files
Improving the performance of check_datasets_active (#980)
* Improving the performance of check_datasets_active, modifying unit test
* Adding changes to doc/progress
* Addressing Pieter's comments

Co-authored-by: PGijsbers <p.gijsbers@tue.nl>
1 parent 4923e5b commit f2af798

3 files changed

Lines changed: 18 additions & 3 deletions

File tree

doc/progress.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Changelog
88

99
0.11.1
1010
~~~~~~
11+
* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test.
1112

1213
0.11.0
1314
~~~~~~

openml/datasets/functions.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,14 +333,23 @@ def _load_features_from_file(features_file: str) -> Dict:
333333
return xml_dict["oml:data_features"]
334334

335335

336-
def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
336+
def check_datasets_active(
337+
dataset_ids: List[int],
338+
raise_error_if_not_exist: bool = True,
339+
) -> Dict[int, bool]:
337340
"""
338341
Check if the dataset ids provided are active.
339342
343+
Raises an error if a dataset_id in the given list
344+
of dataset_ids does not exist on the server, unless ``raise_error_if_not_exist`` is False.
345+
340346
Parameters
341347
----------
342348
dataset_ids : List[int]
343349
A list of integers representing dataset ids.
350+
raise_error_if_not_exist : bool (default=True)
351+
If True, raise an error when one or more of the
352+
given dataset ids do not exist on the server.
344353
345354
Returns
346355
-------
@@ -353,7 +362,8 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
353362
for did in dataset_ids:
354363
dataset = dataset_list.get(did, None)
355364
if dataset is None:
356-
raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did))
365+
if raise_error_if_not_exist:
366+
raise ValueError(f'Could not find dataset {did} in OpenML dataset list.')
357367
else:
358368
active[did] = dataset["status"] == "active"
359369

tests/test_datasets/test_dataset_functions.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,13 @@ def test_list_datasets_empty(self):
227227
def test_check_datasets_active(self):
228228
# Have to test on live because there is no deactivated dataset on the test server.
229229
openml.config.server = self.production_server
230-
active = openml.datasets.check_datasets_active([2, 17])
230+
active = openml.datasets.check_datasets_active(
231+
[2, 17, 79],
232+
raise_error_if_not_exist=False,
233+
)
231234
self.assertTrue(active[2])
232235
self.assertFalse(active[17])
236+
self.assertIsNone(active.get(79))
233237
self.assertRaisesRegex(
234238
ValueError,
235239
"Could not find dataset 79 in OpenML dataset list.",

0 commit comments

Comments
 (0)