From 1c6c5de6ea65606f8a537eb431c048656b0dc631 Mon Sep 17 00:00:00 2001 From: adel Date: Wed, 28 Oct 2020 14:53:57 +0100 Subject: [PATCH 1/6] add validation for ignore_attributes and default_target_attribute at create_dataset --- openml/datasets/functions.py | 28 ++++++ tests/test_datasets/test_dataset_functions.py | 95 ++++++++++++++++++- 2 files changed, 119 insertions(+), 4 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 84943b244..816ca1a53 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -333,6 +333,28 @@ def _load_features_from_file(features_file: str) -> Dict: return xml_dict["oml:data_features"] +def _expand_parameter(parameter): + expanded_parameter = [] + if isinstance(parameter, str): + expanded_parameter = [x.strip() for x in parameter.split(",")] + elif isinstance(parameter, list): + expanded_parameter = parameter + return expanded_parameter + + +def _validated_data_attributes(attributes, data_attributes, parameter_name): + if attributes is not None: + for attribute_ in attributes: + is_row_id_an_attribute = any([attr[0] == attribute_ for attr in data_attributes]) + if not is_row_id_an_attribute: + raise ValueError( + "all attribute of '{}' should be one of the data attribute. " + " Got '{}' while candidates are {}.".format( + parameter_name, attribute_, [attr[0] for attr in data_attributes] + ) + ) + + def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: """ Check if the dataset ids provided are active. @@ -636,6 +658,7 @@ def create_dataset( ignore_attribute : str | list Attributes that should be excluded in modelling, such as identifiers and indexes. + Can have multiple values, comma separated. citation : str Reference(s) that should be cited when building on this data. 
version_label : str, optional @@ -687,6 +710,11 @@ def create_dataset( attributes_[attr_idx] = (attr_name, attributes[attr_name]) else: attributes_ = attributes + ignore_attributes = _expand_parameter(ignore_attribute) + _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute") + + default_target_attributes = _expand_parameter(default_target_attribute) + _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") if row_id_attribute is not None: is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_]) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c6e6f78f8..ca72773a5 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -876,6 +876,94 @@ def test_get_online_dataset_format(self): "The format of the ARFF files is different", ) + def test_create_dataset_default_target_attribute_validation(self): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "%s-pandas_testing_dataset" % self._get_sentinel() + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + with self.assertRaises(ValueError): + _ = 
openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute="wrong", + row_id_attribute=None, + ignore_attribute=None, + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) + + def test_create_dataset_ignore_attribute_validation(self): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "%s-pandas_testing_dataset" % self._get_sentinel() + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + with self.assertRaises(ValueError): + _ = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute="play", + row_id_attribute=None, + ignore_attribute=["rnd_str", "wrong"], + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) + def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", 
"no"], @@ -897,7 +985,6 @@ def test_create_dataset_pandas(self): collection_date = "01-01-2018" language = "English" licence = "MIT" - default_target_attribute = "play" citation = "None" original_data_url = "http://openml.github.io/openml-python" paper_url = "http://openml.github.io/openml-python" @@ -909,7 +996,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="play", row_id_attribute=None, ignore_attribute=None, citation=citation, @@ -944,7 +1031,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="y", row_id_attribute=None, ignore_attribute=None, citation=citation, @@ -980,7 +1067,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="rnd_str", row_id_attribute=None, ignore_attribute=None, citation=citation, From 07c85cc58101e602bbb3180532d5694c2fd2b98e Mon Sep 17 00:00:00 2001 From: adel Date: Wed, 28 Oct 2020 17:33:51 +0100 Subject: [PATCH 2/6] update naming convetions and adding type hints. 
using pytest parametrize with attribute validation --- openml/datasets/functions.py | 21 ++++---- tests/test_datasets/test_dataset_functions.py | 49 ++----------------- 2 files changed, 13 insertions(+), 57 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 816ca1a53..a0cec2d66 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -333,7 +333,7 @@ def _load_features_from_file(features_file: str) -> Dict: return xml_dict["oml:data_features"] -def _expand_parameter(parameter): +def _expand_parameter(parameter: Union[str, list]): expanded_parameter = [] if isinstance(parameter, str): expanded_parameter = [x.strip() for x in parameter.split(",")] @@ -342,17 +342,16 @@ def _expand_parameter(parameter): return expanded_parameter -def _validated_data_attributes(attributes, data_attributes, parameter_name): - if attributes is not None: - for attribute_ in attributes: - is_row_id_an_attribute = any([attr[0] == attribute_ for attr in data_attributes]) - if not is_row_id_an_attribute: - raise ValueError( - "all attribute of '{}' should be one of the data attribute. " - " Got '{}' while candidates are {}.".format( - parameter_name, attribute_, [attr[0] for attr in data_attributes] - ) +def _validated_data_attributes(attributes: list, data_attributes: list, parameter_name: str): + for attribute_ in attributes: + is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes]) + if not is_attribute_a_data_attribute: + raise ValueError( + "all attribute of '{}' should be one of the data attribute. 
" + " Got '{}' while candidates are {}.".format( + parameter_name, attribute_, [attr[0] for attr in data_attributes] ) + ) def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index ca72773a5..5841288af 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -875,8 +875,9 @@ def test_get_online_dataset_format(self): _get_online_dataset_format(dataset_id), "The format of the ARFF files is different", ) - - def test_create_dataset_default_target_attribute_validation(self): + @pytest.mark.parametrize("default_target_attribute,row_id_attribute,ignore_attribute", + [("wrong", None,None), (None,"wrong",None), (None,None,"wrong")]) + def test_attribute_validations(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -920,50 +921,6 @@ def test_create_dataset_default_target_attribute_validation(self): paper_url=paper_url, ) - def test_create_dataset_ignore_attribute_validation(self): - data = [ - ["a", "sunny", 85.0, 85.0, "FALSE", "no"], - ["b", "sunny", 80.0, 90.0, "TRUE", "no"], - ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], - ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], - ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], - ] - column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] - df = pd.DataFrame(data, columns=column_names) - # enforce the type of each column - df["outlook"] = df["outlook"].astype("category") - df["windy"] = df["windy"].astype("bool") - df["play"] = df["play"].astype("category") - # meta-information - name = "%s-pandas_testing_dataset" % self._get_sentinel() - description = "Synthetic dataset created from a Pandas DataFrame" - creator = "OpenML tester" - collection_date = "01-01-2018" - language = "English" - licence = "MIT" - citation = "None" - original_data_url = 
"http://openml.github.io/openml-python" - paper_url = "http://openml.github.io/openml-python" - with self.assertRaises(ValueError): - _ = openml.datasets.functions.create_dataset( - name=name, - description=description, - creator=creator, - contributor=None, - collection_date=collection_date, - language=language, - licence=licence, - default_target_attribute="play", - row_id_attribute=None, - ignore_attribute=["rnd_str", "wrong"], - citation=citation, - attributes="auto", - data=df, - version_label="test", - original_data_url=original_data_url, - paper_url=paper_url, - ) - def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], From ff3d27aea03f8ef3cdf36198144678fe28d82c8c Mon Sep 17 00:00:00 2001 From: adel Date: Wed, 28 Oct 2020 18:16:44 +0100 Subject: [PATCH 3/6] formating long lines and update types hint for return values --- openml/datasets/functions.py | 6 ++++-- tests/test_datasets/test_dataset_functions.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index a0cec2d66..e8044aefb 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -333,7 +333,7 @@ def _load_features_from_file(features_file: str) -> Dict: return xml_dict["oml:data_features"] -def _expand_parameter(parameter: Union[str, list]): +def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: expanded_parameter = [] if isinstance(parameter, str): expanded_parameter = [x.strip() for x in parameter.split(",")] @@ -342,7 +342,9 @@ def _expand_parameter(parameter: Union[str, list]): return expanded_parameter -def _validated_data_attributes(attributes: list, data_attributes: list, parameter_name: str): +def _validated_data_attributes( + attributes: List[str], data_attributes: List[str], parameter_name: str +) -> None: for attribute_ in attributes: is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes]) if not 
is_attribute_a_data_attribute: diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 5841288af..adfcec5b9 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -875,8 +875,11 @@ def test_get_online_dataset_format(self): _get_online_dataset_format(dataset_id), "The format of the ARFF files is different", ) - @pytest.mark.parametrize("default_target_attribute,row_id_attribute,ignore_attribute", - [("wrong", None,None), (None,"wrong",None), (None,None,"wrong")]) + + @pytest.mark.parametrize( + "default_target_attribute,row_id_attribute,ignore_attribute", + [("wrong", None, None), (None, "wrong", None), (None, None, "wrong")], + ) def test_attribute_validations(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], From ace60aa606ac7c4d8b1c811d02bf89fd192789a8 Mon Sep 17 00:00:00 2001 From: adel Date: Thu, 29 Oct 2020 12:08:02 +0100 Subject: [PATCH 4/6] update test_attribute_validations to use pytest.mark.parametrize --- tests/test_datasets/test_dataset_functions.py | 97 ++++++++++--------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index adfcec5b9..0a237a84a 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -876,54 +876,6 @@ def test_get_online_dataset_format(self): "The format of the ARFF files is different", ) - @pytest.mark.parametrize( - "default_target_attribute,row_id_attribute,ignore_attribute", - [("wrong", None, None), (None, "wrong", None), (None, None, "wrong")], - ) - def test_attribute_validations(self): - data = [ - ["a", "sunny", 85.0, 85.0, "FALSE", "no"], - ["b", "sunny", 80.0, 90.0, "TRUE", "no"], - ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], - ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], - ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], - ] - column_names 
= ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] - df = pd.DataFrame(data, columns=column_names) - # enforce the type of each column - df["outlook"] = df["outlook"].astype("category") - df["windy"] = df["windy"].astype("bool") - df["play"] = df["play"].astype("category") - # meta-information - name = "%s-pandas_testing_dataset" % self._get_sentinel() - description = "Synthetic dataset created from a Pandas DataFrame" - creator = "OpenML tester" - collection_date = "01-01-2018" - language = "English" - licence = "MIT" - citation = "None" - original_data_url = "http://openml.github.io/openml-python" - paper_url = "http://openml.github.io/openml-python" - with self.assertRaises(ValueError): - _ = openml.datasets.functions.create_dataset( - name=name, - description=description, - creator=creator, - contributor=None, - collection_date=collection_date, - language=language, - licence=licence, - default_target_attribute="wrong", - row_id_attribute=None, - ignore_attribute=None, - citation=citation, - attributes="auto", - data=df, - version_label="test", - original_data_url=original_data_url, - paper_url=paper_url, - ) - def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1463,3 +1415,52 @@ def test_data_fork(self): self.assertRaisesRegex( OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999, ) + + +@pytest.mark.parametrize( + "default_target_attribute,row_id_attribute,ignore_attribute", + [("wrong", None, None), (None, "wrong", None), (None, None, "wrong")], +) +def test_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = 
pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "pandas_testing_dataset" + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + with pytest.raises(ValueError, match="should be one of the data attribute"): + _ = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) From afdd949186a17a925b1ef3bf11b48546876cc429 Mon Sep 17 00:00:00 2001 From: adel Date: Thu, 29 Oct 2020 13:56:29 +0100 Subject: [PATCH 5/6] add more tests for different input types for attribute validation --- tests/test_datasets/test_dataset_functions.py | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 0a237a84a..54b916753 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1419,9 +1419,11 @@ def test_data_fork(self): @pytest.mark.parametrize( "default_target_attribute,row_id_attribute,ignore_attribute", - [("wrong", None, None), (None, "wrong", None), (None, None, "wrong")], + [("wrong", None, None), (None, "wrong", None), (None, None, "wrong"), + 
("wrong,sunny", None, None),(None, None, "wrong,sunny"), + (["wrong","sunny"], None, None), (None, None, ["wrong","sunny"])], ) -def test_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): +def test_invalid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -1464,3 +1466,52 @@ def test_attribute_validations(default_target_attribute, row_id_attribute, ignor original_data_url=original_data_url, paper_url=paper_url, ) + +@pytest.mark.parametrize( + "default_target_attribute,row_id_attribute,ignore_attribute", + [("outlook", None, None), (None, "outlook", None), (None, None, "outlook"), + ("outlook,windy", None, None), (None, None, "outlook,windy"), + (["outlook","windy"], None, None), (None, None, ["outlook","windy"])], +) +def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "pandas_testing_dataset" + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + _ = openml.datasets.functions.create_dataset( + name=name, + description=description, + 
creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) From 4cf640f8db6c4af2f1c49f8b8c129b11622375e7 Mon Sep 17 00:00:00 2001 From: adel Date: Thu, 29 Oct 2020 13:57:11 +0100 Subject: [PATCH 6/6] update formatting --- tests/test_datasets/test_dataset_functions.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 54b916753..50f0e43b7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1419,11 +1419,19 @@ def test_data_fork(self): @pytest.mark.parametrize( "default_target_attribute,row_id_attribute,ignore_attribute", - [("wrong", None, None), (None, "wrong", None), (None, None, "wrong"), - ("wrong,sunny", None, None),(None, None, "wrong,sunny"), - (["wrong","sunny"], None, None), (None, None, ["wrong","sunny"])], + [ + ("wrong", None, None), + (None, "wrong", None), + (None, None, "wrong"), + ("wrong,sunny", None, None), + (None, None, "wrong,sunny"), + (["wrong", "sunny"], None, None), + (None, None, ["wrong", "sunny"]), + ], ) -def test_invalid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): +def test_invalid_attribute_validations( + default_target_attribute, row_id_attribute, ignore_attribute +): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], ["b", "sunny", 80.0, 90.0, "TRUE", "no"], @@ -1467,11 +1475,18 @@ def test_invalid_attribute_validations(default_target_attribute, row_id_attribut paper_url=paper_url, ) + @pytest.mark.parametrize( "default_target_attribute,row_id_attribute,ignore_attribute", - 
[("outlook", None, None), (None, "outlook", None), (None, None, "outlook"), - ("outlook,windy", None, None), (None, None, "outlook,windy"), - (["outlook","windy"], None, None), (None, None, ["outlook","windy"])], + [ + ("outlook", None, None), + (None, "outlook", None), + (None, None, "outlook"), + ("outlook,windy", None, None), + (None, None, "outlook,windy"), + (["outlook", "windy"], None, None), + (None, None, ["outlook", "windy"]), + ], ) def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [