Skip to content

Commit 891f4a6

Browse files
authored
fix(datasets): Add code 111 for dataset description not found error (#1356)
* fix(datasets): Add code `111` for dataset description not found error * test(dataset): Test the error raised * test: Make the error test stricter
1 parent 7acfb6a commit 891f4a6

2 files changed

Lines changed: 44 additions & 104 deletions

File tree

openml/_api_calls.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -473,18 +473,17 @@ def __parse_server_exception(
473473
code = int(server_error["oml:code"])
474474
message = server_error["oml:message"]
475475
additional_information = server_error.get("oml:additional_information")
476-
if code in [372, 512, 500, 482, 542, 674]:
476+
if code in [111, 372, 512, 500, 482, 542, 674]:
477477
if additional_information:
478478
full_message = f"{message} - {additional_information}"
479479
else:
480480
full_message = message
481481

482482
# 512 for runs, 372 for datasets, 500 for flows
483483
# 482 for tasks, 542 for evaluations, 674 for setups
484-
return OpenMLServerNoResult(
485-
code=code,
486-
message=full_message,
487-
)
484+
# 111 for dataset descriptions
485+
return OpenMLServerNoResult(code=code, message=full_message, url=url)
486+
488487
# 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow)
489488
if code in [163] and file_elements is not None and "description" in file_elements:
490489
# file_elements['description'] is the XML file description of the flow

tests/test_datasets/test_dataset_functions.py

Lines changed: 40 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
OpenMLNotAuthorizedError,
4444
OpenMLPrivateDatasetError,
4545
OpenMLServerException,
46+
OpenMLServerNoResult,
4647
)
4748
from openml.tasks import TaskType, create_task
4849
from openml.testing import TestBase, create_request_response
@@ -274,9 +275,7 @@ def test_get_dataset_cannot_access_private_data(self):
274275
@pytest.mark.skip("Need to find dataset name of private dataset")
275276
def test_dataset_by_name_cannot_access_private_data(self):
276277
openml.config.server = self.production_server
277-
self.assertRaises(
278-
OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
279-
)
278+
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")
280279

281280
def test_get_dataset_lazy_all_functions(self):
282281
"""Test that all expected functionality is available without downloading the dataset."""
@@ -285,9 +284,7 @@ def test_get_dataset_lazy_all_functions(self):
285284

286285
def ensure_absence_of_real_data():
287286
assert not os.path.exists(
288-
os.path.join(
289-
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
290-
)
287+
os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
291288
)
292289

293290
tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
@@ -509,12 +506,8 @@ def test_deletion_of_cache_dir(self):
509506
@mock.patch("openml.datasets.functions._get_dataset_description")
510507
def test_deletion_of_cache_dir_faulty_download(self, patch):
511508
patch.side_effect = Exception("Boom!")
512-
self.assertRaisesRegex(
513-
Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
514-
)
515-
datasets_cache_dir = os.path.join(
516-
self.workdir, "org", "openml", "test", "datasets"
517-
)
509+
self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
510+
datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets")
518511
assert len(os.listdir(datasets_cache_dir)) == 0
519512

520513
def test_publish_dataset(self):
@@ -555,9 +548,7 @@ def test__retrieve_class_labels(self):
555548
# Test workaround for string-typed class labels
556549
custom_ds = openml.datasets.get_dataset(2)
557550
custom_ds.features[31].data_type = "string"
558-
labels = custom_ds.retrieve_class_labels(
559-
target_name=custom_ds.features[31].name
560-
)
551+
labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
561552
assert labels == ["COIL", "SHEET"]
562553

563554
def test_upload_dataset_with_url(self):
@@ -600,9 +591,7 @@ def test_data_status(self):
600591
)
601592
dataset.publish()
602593
TestBase._mark_entity_for_removal("data", dataset.id)
603-
TestBase.logger.info(
604-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
605-
)
594+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
606595
did = dataset.id
607596

608597
# admin key for test server (only adminds can activate datasets.
@@ -678,8 +667,7 @@ def test_attributes_arff_from_df_unknown_dtype(self):
678667
for arr, dt in zip(data, dtype):
679668
df = pd.DataFrame(arr)
680669
err_msg = (
681-
f"The dtype '{dt}' of the column '0' is not currently "
682-
"supported by liac-arff"
670+
f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
683671
)
684672
with pytest.raises(ValueError, match=err_msg):
685673
attributes_arff_from_df(df)
@@ -710,16 +698,12 @@ def test_create_dataset_numpy(self):
710698

711699
dataset.publish()
712700
TestBase._mark_entity_for_removal("data", dataset.id)
713-
TestBase.logger.info(
714-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
715-
)
701+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
716702

717703
assert (
718704
_get_online_dataset_arff(dataset.id) == dataset._dataset
719705
), "Uploaded arff does not match original one"
720-
assert (
721-
_get_online_dataset_format(dataset.id) == "arff"
722-
), "Wrong format for dataset"
706+
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
723707

724708
def test_create_dataset_list(self):
725709
data = [
@@ -769,15 +753,11 @@ def test_create_dataset_list(self):
769753

770754
dataset.publish()
771755
TestBase._mark_entity_for_removal("data", dataset.id)
772-
TestBase.logger.info(
773-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
774-
)
756+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
775757
assert (
776758
_get_online_dataset_arff(dataset.id) == dataset._dataset
777759
), "Uploaded ARFF does not match original one"
778-
assert (
779-
_get_online_dataset_format(dataset.id) == "arff"
780-
), "Wrong format for dataset"
760+
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"
781761

782762
def test_create_dataset_sparse(self):
783763
# test the scipy.sparse.coo_matrix
@@ -974,9 +954,7 @@ def test_create_dataset_pandas(self):
974954
)
975955
dataset.publish()
976956
TestBase._mark_entity_for_removal("data", dataset.id)
977-
TestBase.logger.info(
978-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
979-
)
957+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
980958
assert (
981959
_get_online_dataset_arff(dataset.id) == dataset._dataset
982960
), "Uploaded ARFF does not match original one"
@@ -991,9 +969,7 @@ def test_create_dataset_pandas(self):
991969
column_names = ["input1", "input2", "y"]
992970
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
993971
# meta-information
994-
description = (
995-
"Synthetic dataset created from a Pandas DataFrame with Sparse columns"
996-
)
972+
description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
997973
dataset = openml.datasets.functions.create_dataset(
998974
name=name,
999975
description=description,
@@ -1014,15 +990,11 @@ def test_create_dataset_pandas(self):
1014990
)
1015991
dataset.publish()
1016992
TestBase._mark_entity_for_removal("data", dataset.id)
1017-
TestBase.logger.info(
1018-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
1019-
)
993+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
1020994
assert (
1021995
_get_online_dataset_arff(dataset.id) == dataset._dataset
1022996
), "Uploaded ARFF does not match original one"
1023-
assert (
1024-
_get_online_dataset_format(dataset.id) == "sparse_arff"
1025-
), "Wrong format for dataset"
997+
assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"
1026998

1027999
# Check that we can overwrite the attributes
10281000
data = [["a"], ["b"], ["c"], ["d"], ["e"]]
@@ -1050,13 +1022,9 @@ def test_create_dataset_pandas(self):
10501022
)
10511023
dataset.publish()
10521024
TestBase._mark_entity_for_removal("data", dataset.id)
1053-
TestBase.logger.info(
1054-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
1055-
)
1025+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
10561026
downloaded_data = _get_online_dataset_arff(dataset.id)
1057-
assert (
1058-
downloaded_data == dataset._dataset
1059-
), "Uploaded ARFF does not match original one"
1027+
assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
10601028
assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data
10611029

10621030
def test_ignore_attributes_dataset(self):
@@ -1217,9 +1185,7 @@ def test_publish_fetch_ignore_attribute(self):
12171185
# publish dataset
12181186
dataset.publish()
12191187
TestBase._mark_entity_for_removal("data", dataset.id)
1220-
TestBase.logger.info(
1221-
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
1222-
)
1188+
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
12231189
# test if publish was successful
12241190
assert isinstance(dataset.id, int)
12251191

@@ -1403,9 +1369,7 @@ def test_get_dataset_cache_format_feather(self):
14031369
cache_dir = openml.config.get_cache_directory()
14041370
cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
14051371
feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
1406-
pickle_file = os.path.join(
1407-
cache_dir_for_id, "dataset.feather.attributes.pkl.py3"
1408-
)
1372+
pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
14091373
data = pd.read_feather(feather_file)
14101374
assert os.path.isfile(feather_file), "Feather file is missing"
14111375
assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
@@ -1450,19 +1414,15 @@ def test_data_edit_critical_field(self):
14501414
# for this, we need to first clone a dataset to do changes
14511415
did = fork_dataset(1)
14521416
self._wait_for_dataset_being_processed(did)
1453-
result = edit_dataset(
1454-
did, default_target_attribute="shape", ignore_attribute="oil"
1455-
)
1417+
result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
14561418
assert did == result
14571419

14581420
n_tries = 10
14591421
# we need to wait for the edit to be reflected on the server
14601422
for i in range(n_tries):
14611423
edited_dataset = openml.datasets.get_dataset(did)
14621424
try:
1463-
assert (
1464-
edited_dataset.default_target_attribute == "shape"
1465-
), edited_dataset
1425+
assert edited_dataset.default_target_attribute == "shape", edited_dataset
14661426
assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
14671427
break
14681428
except AssertionError as e:
@@ -1471,9 +1431,7 @@ def test_data_edit_critical_field(self):
14711431
time.sleep(10)
14721432
# Delete the cache dir to get the newer version of the dataset
14731433
shutil.rmtree(
1474-
os.path.join(
1475-
self.workdir, "org", "openml", "test", "datasets", str(did)
1476-
),
1434+
os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)),
14771435
)
14781436

14791437
def test_data_edit_requires_field(self):
@@ -1564,9 +1522,7 @@ def test_list_datasets_with_high_size_parameter(self):
15641522
openml.config.server = self.production_server
15651523

15661524
datasets_a = openml.datasets.list_datasets(output_format="dataframe")
1567-
datasets_b = openml.datasets.list_datasets(
1568-
output_format="dataframe", size=np.inf
1569-
)
1525+
datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf)
15701526

15711527
# Reverting to test server
15721528
openml.config.server = self.test_server
@@ -1646,9 +1602,7 @@ def test_invalid_attribute_validations(
16461602
(None, None, ["outlook", "windy"]),
16471603
],
16481604
)
1649-
def test_valid_attribute_validations(
1650-
default_target_attribute, row_id_attribute, ignore_attribute
1651-
):
1605+
def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
16521606
data = [
16531607
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
16541608
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
@@ -1749,10 +1703,7 @@ def test_delete_dataset(self):
17491703
def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
17501704
openml.config.start_using_configuration_for_example()
17511705
content_file = (
1752-
test_files_directory
1753-
/ "mock_responses"
1754-
/ "datasets"
1755-
/ "data_delete_not_owned.xml"
1706+
test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
17561707
)
17571708
mock_delete.return_value = create_request_response(
17581709
status_code=412,
@@ -1774,10 +1725,7 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke
17741725
def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
17751726
openml.config.start_using_configuration_for_example()
17761727
content_file = (
1777-
test_files_directory
1778-
/ "mock_responses"
1779-
/ "datasets"
1780-
/ "data_delete_has_tasks.xml"
1728+
test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
17811729
)
17821730
mock_delete.return_value = create_request_response(
17831731
status_code=412,
@@ -1799,10 +1747,7 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key
17991747
def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
18001748
openml.config.start_using_configuration_for_example()
18011749
content_file = (
1802-
test_files_directory
1803-
/ "mock_responses"
1804-
/ "datasets"
1805-
/ "data_delete_successful.xml"
1750+
test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
18061751
)
18071752
mock_delete.return_value = create_request_response(
18081753
status_code=200,
@@ -1821,10 +1766,7 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key)
18211766
def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key):
18221767
openml.config.start_using_configuration_for_example()
18231768
content_file = (
1824-
test_files_directory
1825-
/ "mock_responses"
1826-
/ "datasets"
1827-
/ "data_delete_not_exist.xml"
1769+
test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
18281770
)
18291771
mock_delete.return_value = create_request_response(
18301772
status_code=412,
@@ -1861,9 +1803,7 @@ def test_list_datasets(all_datasets: pd.DataFrame):
18611803

18621804

18631805
def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
1864-
tag_datasets = openml.datasets.list_datasets(
1865-
tag="study_14", output_format="dataframe"
1866-
)
1806+
tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe")
18671807
assert 0 < len(tag_datasets) < len(all_datasets)
18681808
_assert_datasets_have_id_and_valid_status(tag_datasets)
18691809

@@ -2001,15 +1941,16 @@ def test_get_dataset_lazy_behavior(
20011941
with_features=with_features,
20021942
with_data=with_data,
20031943
)
2004-
assert (
2005-
dataset.features
2006-
), "Features should be downloaded on-demand if not during get_dataset"
2007-
assert (
2008-
dataset.qualities
2009-
), "Qualities should be downloaded on-demand if not during get_dataset"
2010-
assert (
2011-
dataset.get_data()
2012-
), "Data should be downloaded on-demand if not during get_dataset"
1944+
assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
1945+
assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
1946+
assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
20131947
_assert_datasets_retrieved_successfully(
20141948
[1], with_qualities=True, with_features=True, with_data=True
20151949
)
1950+
1951+
1952+
def test_get_dataset_with_invalid_id() -> None:
1953+
INVALID_ID = 123819023109238 # Well, at some point this will probably be valid...
1954+
with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
1955+
openml.datasets.get_dataset(INVALID_ID)
1956+
assert e.value.code == 111

0 commit comments

Comments (0)