openml · mfeurer · Aug 31, 2020 · Aug 19, 2020 · Aug 27, 2020 · Aug 28, 2020
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -806,8 +806,6 @@ def edit_dataset(
     contributor=None,
     collection_date=None,
     language=None,
-    attributes=None,
-    data=None,
     default_target_attribute=None,
     ignore_attribute=None,
     citation=None,
@@ -839,17 +837,6 @@ def edit_dataset(
       language : str
           Language in which the data is represented.
           Starts with 1 upper case letter, rest lower case, e.g. 'English'.
-      attributes : list, dict, or 'auto'
-          A list of tuples. Each tuple consists of the attribute name and type.
-          If passing a pandas DataFrame, the attributes can be automatically
-          inferred by passing ``'auto'``. Specific attributes can be manually
-          specified by a passing a dictionary where the key is the name of the
-          attribute and the value is the data type of the attribute.
-      data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
-          An array that contains both the attributes and the targets. When
-          providing a dataframe, the attribute names and type can be inferred by
-          passing ``attributes='auto'``.
-          The target feature is indicated as meta-data of the dataset.
       default_target_attribute : str
           The default target attribute, if it exists.
           Can have multiple values, comma separated.
@@ -879,54 +866,6 @@ def edit_dataset(
     if not isinstance(data_id, int):
         raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
 
-    # case 1, changing these fields creates a new version of the dataset with changed field
-    if any(
-        field is not None
-        for field in [
-            data,
-            attributes,
-            default_target_attribute,
-            row_id_attribute,
-            ignore_attribute,
-        ]
-    ):
-        logger.warning("Creating a new version of dataset, cannot edit existing version")
-
-        # Get old dataset and features
-        dataset = get_dataset(data_id)
-        df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe")
-        attributes_old = attributes_arff_from_df(df)
-
-        # Sparse data needs to be provided in a different format from dense data
-        if dataset.format == "sparse_arff":
-            df, y, categorical, attribute_names = dataset.get_data(dataset_format="array")
-            data_old = coo_matrix(df)
-        else:
-            data_old = df
-        data_new = data if data is not None else data_old
-        dataset_new = create_dataset(
-            name=dataset.name,
-            description=description or dataset.description,
-            creator=creator or dataset.creator,
-            contributor=contributor or dataset.contributor,
-            collection_date=collection_date or dataset.collection_date,
-            language=language or dataset.language,
-            licence=dataset.licence,
-            attributes=attributes or attributes_old,
-            data=data_new,
-            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
-            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
-            citation=citation or dataset.citation,
-            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
-            original_data_url=original_data_url or dataset.original_data_url,
-            paper_url=paper_url or dataset.paper_url,
-            update_comment=dataset.update_comment,
-            version_label=dataset.version_label,
-        )
-        dataset_new.publish()
-        return dataset_new.dataset_id
-
-    # case 2, changing any of these fields will update existing dataset
     # compose data edit parameters as xml
     form_data = {"data_id": data_id}
     xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
@@ -937,6 +876,9 @@ def edit_dataset(
     xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
     xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
     xml["oml:data_edit_parameters"]["oml:language"] = language
+    xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
+    xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
+    xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
     xml["oml:data_edit_parameters"]["oml:citation"] = citation
     xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
     xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -1341,47 +1341,34 @@ def test_get_dataset_cache_format_feather(self):
         self.assertEqual(len(attribute_names), X.shape[1])
 
     def test_data_edit(self):
-
-        # admin key for test server (only admins or owners can edit datasets).
-        # all users can edit their own datasets)
-        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
-
-        # case 1, editing description, creator, contributor, collection_date, original_data_url,
-        # paper_url, citation, language edits existing dataset.
+        # Case 1
+        # All users can edit non-critical fields of datasets
+        desc = "xor dataset representing XOR operation"
         did = 564
         result = edit_dataset(
             did,
-            description="xor dataset represents XOR operation",
-            contributor="",
+            description=desc,
+            contributor="xxx",
             collection_date="2019-10-29 17:06:18",
             original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
             paper_url="",
             citation="kaggle",
             language="English",
         )
-        self.assertEqual(result, did)
-
-        # case 2, editing data, attributes, default_target_attribute, row_id_attribute,
-        # ignore_attribute generates a new dataset
+        self.assertEqual(did, result)
+        edited_dataset = openml.datasets.get_dataset(did)
+        self.assertEqual(edited_dataset.description, desc)
 
-        column_names = [
-            ("input1", "REAL"),
-            ("input2", "REAL"),
-            ("y", "REAL"),
-        ]
+        # Case 2
+        # Only admins or owners can edit the critical fields of a dataset
+        # (target, row-id and ignore attributes), so switch to the test server's admin key
+        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
         desc = "xor dataset represents XOR operation"
-        result = edit_dataset(
-            564,
-            description=desc,
-            contributor="",
-            collection_date="2019-10-29 17:06:18",
-            attributes=column_names,
-            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
-            paper_url="",
-            citation="kaggle",
-            language="English",
-        )
-        self.assertNotEqual(did, result)
+        did = 565
+        result = edit_dataset(did, default_target_attribute="y", ignore_attribute="input1")
+        self.assertEqual(did, result)
+        edited_dataset = openml.datasets.get_dataset(did)
+        self.assertEqual(edited_dataset.ignore_attribute, ["input1"])
 
     def test_data_edit_errors(self):
 
@@ -1390,8 +1377,10 @@ def test_data_edit_errors(self):
         # Check server exception when no field to edit is provided
         self.assertRaisesRegex(
             OpenMLServerException,
-            "Please provide atleast one field among description, creator, contributor, "
-            "collection_date, language, citation, original_data_url or paper_url to edit.",
+            "Please provide atleast one field among description, creator, "
+            "contributor, collection_date, language, citation, "
+            "original_data_url, default_target_attribute, row_id_attribute, "
+            "ignore_attribute or paper_url to edit.",
             edit_dataset,
             data_id=564,
         )
@@ -1403,12 +1392,22 @@ def test_data_edit_errors(self):
             data_id=100000,
             description="xor operation dataset",
         )
-        # Check server exception when a non-owner or non-admin tries to edit existing dataset
+        # Check server exception when owner/admin edits critical features of dataset with tasks
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can only be edited for datasets without any tasks.",
+            edit_dataset,
+            data_id=1,
+            default_target_attribute="y",
+        )
+        # Check server exception when a non-owner or non-admin tries to edit critical features
         openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
         self.assertRaisesRegex(
             OpenMLServerException,
-            "Dataset is not owned by you",
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can be edited only by the owner. Fork the dataset if changes are required.",
             edit_dataset,
             data_id=564,
-            description="xor data",
+            default_target_attribute="y",
         )