openml · PGijsbers · Oct 29, 2020 · Oct 28, 2020 · Oct 28, 2020 · Oct 28, 2020
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -333,6 +333,27 @@ def _load_features_from_file(features_file: str) -> Dict:
         return xml_dict["oml:data_features"]
 
 
+def _expand_parameter(parameter: Union[str, list]):
+    expanded_parameter = []
+    if isinstance(parameter, str):
+        expanded_parameter = [x.strip() for x in parameter.split(",")]
+    elif isinstance(parameter, list):
+        expanded_parameter = parameter
+    return expanded_parameter
+
+
+def _validated_data_attributes(attributes: list, data_attributes: list, parameter_name: str):
+    for attribute_ in attributes:
+        is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes])
+        if not is_attribute_a_data_attribute:
+            raise ValueError(
+                "all attribute of '{}' should be one of the data attribute. "
+                " Got '{}' while candidates are {}.".format(
+                    parameter_name, attribute_, [attr[0] for attr in data_attributes]
+                )
+            )
+
+
 def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
     """
     Check if the dataset ids provided are active.
@@ -636,6 +657,7 @@ def create_dataset(
     ignore_attribute : str | list
         Attributes that should be excluded in modelling,
         such as identifiers and indexes.
+        Can have multiple values, comma separated.
     citation : str
         Reference(s) that should be cited when building on this data.
     version_label : str, optional
@@ -687,6 +709,11 @@ def create_dataset(
                     attributes_[attr_idx] = (attr_name, attributes[attr_name])
     else:
         attributes_ = attributes
+    ignore_attributes = _expand_parameter(ignore_attribute)
+    _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute")
+
+    default_target_attributes = _expand_parameter(default_target_attribute)
+    _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute")
 
     if row_id_attribute is not None:
         is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_])

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -875,6 +875,51 @@ def test_get_online_dataset_format(self):
             _get_online_dataset_format(dataset_id),
             "The format of the ARFF files is different",
         )
+    @pytest.mark.parametrize("default_target_attribute,row_id_attribute,ignore_attribute", 
+    [("wrong", None,None), (None,"wrong",None), (None,None,"wrong")])
+    def test_attribute_validations(self):
+        data = [
+            ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+            ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+            ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+            ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+            ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+        ]
+        column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
+        df = pd.DataFrame(data, columns=column_names)
+        # enforce the type of each column
+        df["outlook"] = df["outlook"].astype("category")
+        df["windy"] = df["windy"].astype("bool")
+        df["play"] = df["play"].astype("category")
+        # meta-information
+        name = "%s-pandas_testing_dataset" % self._get_sentinel()
+        description = "Synthetic dataset created from a Pandas DataFrame"
+        creator = "OpenML tester"
+        collection_date = "01-01-2018"
+        language = "English"
+        licence = "MIT"
+        citation = "None"
+        original_data_url = "http://openml.github.io/openml-python"
+        paper_url = "http://openml.github.io/openml-python"
+        with self.assertRaises(ValueError):
+            _ = openml.datasets.functions.create_dataset(
+                name=name,
+                description=description,
+                creator=creator,
+                contributor=None,
+                collection_date=collection_date,
+                language=language,
+                licence=licence,
+                default_target_attribute="wrong",
+                row_id_attribute=None,
+                ignore_attribute=None,
+                citation=citation,
+                attributes="auto",
+                data=df,
+                version_label="test",
+                original_data_url=original_data_url,
+                paper_url=paper_url,
+            )
 
     def test_create_dataset_pandas(self):
         data = [
@@ -897,7 +942,6 @@ def test_create_dataset_pandas(self):
         collection_date = "01-01-2018"
         language = "English"
         licence = "MIT"
-        default_target_attribute = "play"
         citation = "None"
         original_data_url = "http://openml.github.io/openml-python"
         paper_url = "http://openml.github.io/openml-python"
@@ -909,7 +953,7 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="play",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,
@@ -944,7 +988,7 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="y",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,
@@ -980,7 +1024,7 @@ def test_create_dataset_pandas(self):
             collection_date=collection_date,
             language=language,
             licence=licence,
-            default_target_attribute=default_target_attribute,
+            default_target_attribute="rnd_str",
             row_id_attribute=None,
             ignore_attribute=None,
             citation=citation,