Skip to content

Commit 3132dac

Browse files
a-moadel and madel0093
authored
add validation for ignore_attributes and default_target_attribute at … (openml#978)
* add validation for ignore_attributes and default_target_attribute at create_dataset * update naming conventions and add type hints; use pytest parametrize with attribute validation * format long lines and update type hints for return values * update test_attribute_validations to use pytest.mark.parametrize * add more tests for different input types for attribute validation * update formatting Co-authored-by: adel <m.adel0093@gmail.com>
1 parent 756e747 commit 3132dac

2 files changed

Lines changed: 147 additions & 4 deletions

File tree

openml/datasets/functions.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,29 @@ def _load_features_from_file(features_file: str) -> Dict:
333333
return xml_dict["oml:data_features"]
334334

335335

336+
def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]:
337+
expanded_parameter = []
338+
if isinstance(parameter, str):
339+
expanded_parameter = [x.strip() for x in parameter.split(",")]
340+
elif isinstance(parameter, list):
341+
expanded_parameter = parameter
342+
return expanded_parameter
343+
344+
345+
def _validated_data_attributes(
346+
attributes: List[str], data_attributes: List[str], parameter_name: str
347+
) -> None:
348+
for attribute_ in attributes:
349+
is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes])
350+
if not is_attribute_a_data_attribute:
351+
raise ValueError(
352+
"all attribute of '{}' should be one of the data attribute. "
353+
" Got '{}' while candidates are {}.".format(
354+
parameter_name, attribute_, [attr[0] for attr in data_attributes]
355+
)
356+
)
357+
358+
336359
def check_datasets_active(
337360
dataset_ids: List[int],
338361
raise_error_if_not_exist: bool = True,
@@ -646,6 +669,7 @@ def create_dataset(
646669
ignore_attribute : str | list
647670
Attributes that should be excluded in modelling,
648671
such as identifiers and indexes.
672+
Can have multiple values, comma separated.
649673
citation : str
650674
Reference(s) that should be cited when building on this data.
651675
version_label : str, optional
@@ -697,6 +721,11 @@ def create_dataset(
697721
attributes_[attr_idx] = (attr_name, attributes[attr_name])
698722
else:
699723
attributes_ = attributes
724+
ignore_attributes = _expand_parameter(ignore_attribute)
725+
_validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute")
726+
727+
default_target_attributes = _expand_parameter(default_target_attribute)
728+
_validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute")
700729

701730
if row_id_attribute is not None:
702731
is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_])

tests/test_datasets/test_dataset_functions.py

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,6 @@ def test_create_dataset_pandas(self):
901901
collection_date = "01-01-2018"
902902
language = "English"
903903
licence = "MIT"
904-
default_target_attribute = "play"
905904
citation = "None"
906905
original_data_url = "http://openml.github.io/openml-python"
907906
paper_url = "http://openml.github.io/openml-python"
@@ -913,7 +912,7 @@ def test_create_dataset_pandas(self):
913912
collection_date=collection_date,
914913
language=language,
915914
licence=licence,
916-
default_target_attribute=default_target_attribute,
915+
default_target_attribute="play",
917916
row_id_attribute=None,
918917
ignore_attribute=None,
919918
citation=citation,
@@ -948,7 +947,7 @@ def test_create_dataset_pandas(self):
948947
collection_date=collection_date,
949948
language=language,
950949
licence=licence,
951-
default_target_attribute=default_target_attribute,
950+
default_target_attribute="y",
952951
row_id_attribute=None,
953952
ignore_attribute=None,
954953
citation=citation,
@@ -984,7 +983,7 @@ def test_create_dataset_pandas(self):
984983
collection_date=collection_date,
985984
language=language,
986985
licence=licence,
987-
default_target_attribute=default_target_attribute,
986+
default_target_attribute="rnd_str",
988987
row_id_attribute=None,
989988
ignore_attribute=None,
990989
citation=citation,
@@ -1420,3 +1419,118 @@ def test_data_fork(self):
14201419
self.assertRaisesRegex(
14211420
OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999,
14221421
)
1422+
1423+
1424+
@pytest.mark.parametrize(
1425+
"default_target_attribute,row_id_attribute,ignore_attribute",
1426+
[
1427+
("wrong", None, None),
1428+
(None, "wrong", None),
1429+
(None, None, "wrong"),
1430+
("wrong,sunny", None, None),
1431+
(None, None, "wrong,sunny"),
1432+
(["wrong", "sunny"], None, None),
1433+
(None, None, ["wrong", "sunny"]),
1434+
],
1435+
)
1436+
def test_invalid_attribute_validations(
1437+
default_target_attribute, row_id_attribute, ignore_attribute
1438+
):
1439+
data = [
1440+
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
1441+
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
1442+
["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
1443+
["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
1444+
["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
1445+
]
1446+
column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
1447+
df = pd.DataFrame(data, columns=column_names)
1448+
# enforce the type of each column
1449+
df["outlook"] = df["outlook"].astype("category")
1450+
df["windy"] = df["windy"].astype("bool")
1451+
df["play"] = df["play"].astype("category")
1452+
# meta-information
1453+
name = "pandas_testing_dataset"
1454+
description = "Synthetic dataset created from a Pandas DataFrame"
1455+
creator = "OpenML tester"
1456+
collection_date = "01-01-2018"
1457+
language = "English"
1458+
licence = "MIT"
1459+
citation = "None"
1460+
original_data_url = "http://openml.github.io/openml-python"
1461+
paper_url = "http://openml.github.io/openml-python"
1462+
with pytest.raises(ValueError, match="should be one of the data attribute"):
1463+
_ = openml.datasets.functions.create_dataset(
1464+
name=name,
1465+
description=description,
1466+
creator=creator,
1467+
contributor=None,
1468+
collection_date=collection_date,
1469+
language=language,
1470+
licence=licence,
1471+
default_target_attribute=default_target_attribute,
1472+
row_id_attribute=row_id_attribute,
1473+
ignore_attribute=ignore_attribute,
1474+
citation=citation,
1475+
attributes="auto",
1476+
data=df,
1477+
version_label="test",
1478+
original_data_url=original_data_url,
1479+
paper_url=paper_url,
1480+
)
1481+
1482+
1483+
@pytest.mark.parametrize(
1484+
"default_target_attribute,row_id_attribute,ignore_attribute",
1485+
[
1486+
("outlook", None, None),
1487+
(None, "outlook", None),
1488+
(None, None, "outlook"),
1489+
("outlook,windy", None, None),
1490+
(None, None, "outlook,windy"),
1491+
(["outlook", "windy"], None, None),
1492+
(None, None, ["outlook", "windy"]),
1493+
],
1494+
)
1495+
def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
1496+
data = [
1497+
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
1498+
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
1499+
["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
1500+
["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
1501+
["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
1502+
]
1503+
column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
1504+
df = pd.DataFrame(data, columns=column_names)
1505+
# enforce the type of each column
1506+
df["outlook"] = df["outlook"].astype("category")
1507+
df["windy"] = df["windy"].astype("bool")
1508+
df["play"] = df["play"].astype("category")
1509+
# meta-information
1510+
name = "pandas_testing_dataset"
1511+
description = "Synthetic dataset created from a Pandas DataFrame"
1512+
creator = "OpenML tester"
1513+
collection_date = "01-01-2018"
1514+
language = "English"
1515+
licence = "MIT"
1516+
citation = "None"
1517+
original_data_url = "http://openml.github.io/openml-python"
1518+
paper_url = "http://openml.github.io/openml-python"
1519+
_ = openml.datasets.functions.create_dataset(
1520+
name=name,
1521+
description=description,
1522+
creator=creator,
1523+
contributor=None,
1524+
collection_date=collection_date,
1525+
language=language,
1526+
licence=licence,
1527+
default_target_attribute=default_target_attribute,
1528+
row_id_attribute=row_id_attribute,
1529+
ignore_attribute=ignore_attribute,
1530+
citation=citation,
1531+
attributes="auto",
1532+
data=df,
1533+
version_label="test",
1534+
original_data_url=original_data_url,
1535+
paper_url=paper_url,
1536+
)

0 commit comments

Comments
 (0)