Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b3b7867
Create first section: Creating Custom Flow
PGijsbers Jul 7, 2020
19d79d7
Add Section: Using the Flow
PGijsbers Jul 7, 2020
208f6cd
Allow run description text to be custom
PGijsbers Jul 10, 2020
2247bbc
Draft for Custom Flow tutorial
PGijsbers Jul 10, 2020
326510c
Add minimal docstring to OpenMLRun
PGijsbers Jul 10, 2020
872bd75
Process code review feedback
PGijsbers Jul 10, 2020
c3a5326
Use the format utility function in automatic runs
PGijsbers Jul 10, 2020
a7cb290
Process @mfeurer feedback
PGijsbers Jul 13, 2020
e5dcaf0
Rename arguments of list_evaluations (#933)
Bilgecelik Jul 14, 2020
1670050
adding config file to user guide (#931)
marcoslbueno Jul 14, 2020
9c93f5b
Edit api (#935)
sahithyaravi Jul 23, 2020
666ca68
Adding support for scikit-learn > 0.22 (#936)
Neeratyoy Aug 3, 2020
5d9c69c
Add flake8-print in pre-commit (#939)
22quinn Aug 3, 2020
7d51a76
Fix edit api (#940)
sahithyaravi Aug 7, 2020
75a5440
Update subflow paragraph
PGijsbers Aug 12, 2020
23a08ab
Check the ClassificationTask has class label set
PGijsbers Aug 14, 2020
95d1fcb
Test task is of supported type
PGijsbers Aug 17, 2020
41aa789
Add tests for format_prediction
PGijsbers Aug 17, 2020
5d2e0ce
Adding Python 3.8 support (#916)
Neeratyoy Aug 17, 2020
5ef24ab
Process feedback Neeratyoy
PGijsbers Aug 25, 2020
1ce5a12
Test Exception with Regex
PGijsbers Aug 28, 2020
f70c720
change edit_api to reflect server (#941)
sahithyaravi Aug 31, 2020
f8839de
Create first section: Creating Custom Flow
PGijsbers Jul 7, 2020
2a6903b
Add Section: Using the Flow
PGijsbers Jul 7, 2020
4802497
Allow run description text to be custom
PGijsbers Jul 10, 2020
7fb64b4
Draft for Custom Flow tutorial
PGijsbers Jul 10, 2020
a6f0a38
Add minimal docstring to OpenMLRun
PGijsbers Jul 10, 2020
3748ae0
Process code review feedback
PGijsbers Jul 10, 2020
5479d7b
Use the format utility function in automatic runs
PGijsbers Jul 10, 2020
4b71c30
Process @mfeurer feedback
PGijsbers Jul 13, 2020
942d66e
Update subflow paragraph
PGijsbers Aug 12, 2020
9ba363e
Check the ClassificationTask has class label set
PGijsbers Aug 14, 2020
a72053d
Test task is of supported type
PGijsbers Aug 17, 2020
de31490
Add tests for format_prediction
PGijsbers Aug 17, 2020
832f437
Process feedback Neeratyoy
PGijsbers Aug 25, 2020
3cc74de
Test Exception with Regex
PGijsbers Aug 28, 2020
03e1e8b
Merge branch 'feature_#753' of https://github.com/openml/openml-pytho…
PGijsbers Sep 1, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Edit api (#935)
* version1

* minor fixes

* tests

* reformat code

* check new version

* remove get data

* code format

* review comments

* fix duplicate

* type annotate

* example

* tests for exceptions

* fix pep8

* black format
  • Loading branch information
sahithyaravi authored Jul 23, 2020
commit 9c93f5b06a9802ae283ccba9d36a5e426378494a
2 changes: 1 addition & 1 deletion doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Changelog

0.11.0
~~~~~~

* ADD #929: Add data edit API
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
switching the server.
* FIX #885: Logger no longer registered by default. Added utility functions to easily register
Expand Down
43 changes: 40 additions & 3 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

How to list and download datasets.
"""
############################################################################
""

# License: BSD 3-Clauses

import openml
import pandas as pd
from openml.datasets.functions import edit_dataset, get_dataset

############################################################################
# Exercise 0
Expand Down Expand Up @@ -42,9 +43,9 @@
# * Find a dataset called 'eeg_eye_state'.
# * Find all datasets with more than 50 classes.
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
############################################################################
""
datalist.query('name == "eeg-eye-state"')
############################################################################
""
datalist.query("NumberOfClasses > 50")

############################################################################
Expand Down Expand Up @@ -108,3 +109,39 @@
alpha=0.8,
cmap="plasma",
)


############################################################################
# Edit a created dataset
# =================================================
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
# Edits to these fields update the existing dataset version in place;
# only the dataset owner is allowed to make them.
data_id = edit_dataset(
    564,
    description="xor dataset represents XOR operation",
    contributor="",
    collection_date="2019-10-29 17:06:18",
    original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
    paper_url="",
    citation="kaggle",
    language="English",
)
edited_dataset = get_dataset(data_id)
print(f"Edited dataset ID: {data_id}")


############################################################################
# Edits to attributes, default_target_attribute, row_id_attribute or
# ignore_attribute instead generate a new, edited version of the dataset;
# anyone is allowed to make them.
new_attributes = [
    ("x0", "REAL"),
    ("x1", "REAL"),
    ("y", "REAL"),
]
data_id = edit_dataset(564, attributes=new_attributes)
print(f"Edited dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
148 changes: 148 additions & 0 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,154 @@ def status_update(data_id, status):
raise ValueError("Data id/status does not collide")


def edit_dataset(
    data_id: int,
    description=None,
    creator=None,
    contributor=None,
    collection_date=None,
    language=None,
    attributes=None,
    data=None,
    default_target_attribute=None,
    ignore_attribute=None,
    citation=None,
    row_id_attribute=None,
    original_data_url=None,
    paper_url=None,
) -> int:
    """
    Edits an OpenMLDataset.

    Specify at least one field to edit, apart from data_id:

    - For certain fields, a new dataset version is created: attributes, data,
      default_target_attribute, ignore_attribute, row_id_attribute.

    - For other fields, the uploader can edit the existing version.
      No one except the uploader can edit the existing version.

    Parameters
    ----------
    data_id : int
        ID of the dataset.
    description : str
        Description of the dataset.
    creator : str
        The person who created the dataset.
    contributor : str
        People who contributed to the current version of the dataset.
    collection_date : str
        The date the data was originally collected, given by the uploader.
    language : str
        Language in which the data is represented.
        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    attributes : list, dict, or 'auto'
        A list of tuples. Each tuple consists of the attribute name and type.
        If passing a pandas DataFrame, the attributes can be automatically
        inferred by passing ``'auto'``. Specific attributes can be manually
        specified by a passing a dictionary where the key is the name of the
        attribute and the value is the data type of the attribute.
    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
        An array that contains both the attributes and the targets. When
        providing a dataframe, the attribute names and type can be inferred by
        passing ``attributes='auto'``.
        The target feature is indicated as meta-data of the dataset.
    default_target_attribute : str
        The default target attribute, if it exists.
        Can have multiple values, comma separated.
    ignore_attribute : str | list
        Attributes that should be excluded in modelling,
        such as identifiers and indexes.
    citation : str
        Reference(s) that should be cited when building on this data.
    row_id_attribute : str, optional
        The attribute that represents the row-id column, if present in the
        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
        specified, the index of the dataframe will be used as the
        ``row_id_attribute``. If the name of the index is ``None``, it will
        be discarded.

        .. versionadded: 0.8
            Inference of ``row_id_attribute`` from a dataframe.
    original_data_url : str, optional
        For derived data, the url to the original dataset.
    paper_url : str, optional
        Link to a paper describing the dataset.


    Returns
    -------
    int
        data_id of the existing edited version or the new version created and
        published.
    """
    if not isinstance(data_id, int):
        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

    # Case 1: changing any of these fields cannot be done in place — a new
    # version of the dataset (with the changed fields) is created and published.
    if any(
        field is not None
        for field in [
            data,
            attributes,
            default_target_attribute,
            row_id_attribute,
            ignore_attribute,
        ]
    ):
        logger.warning("Creating a new version of dataset, cannot edit existing version")
        dataset = get_dataset(data_id)

        # Reuse the current ARFF payload for everything the caller did not
        # override; fields passed as None (or "") fall back to the old values.
        decoded_arff = dataset._get_arff(format="arff")
        data_old = decoded_arff["data"]
        data_new = data if data is not None else data_old
        dataset_new = create_dataset(
            name=dataset.name,
            description=description or dataset.description,
            creator=creator or dataset.creator,
            contributor=contributor or dataset.contributor,
            collection_date=collection_date or dataset.collection_date,
            language=language or dataset.language,
            licence=dataset.licence,
            attributes=attributes or decoded_arff["attributes"],
            data=data_new,
            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
            citation=citation or dataset.citation,
            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
            original_data_url=original_data_url or dataset.original_data_url,
            paper_url=paper_url or dataset.paper_url,
            update_comment=dataset.update_comment,
            version_label=dataset.version_label,
        )
        dataset_new.publish()
        return dataset_new.dataset_id

    # Case 2: the remaining fields are edited on the existing dataset version.
    # Compose the data-edit parameters as XML for the server call.
    form_data = {"data_id": data_id}
    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
    xml["oml:data_edit_parameters"] = OrderedDict()
    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
    xml["oml:data_edit_parameters"]["oml:description"] = description
    xml["oml:data_edit_parameters"]["oml:creator"] = creator
    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
    xml["oml:data_edit_parameters"]["oml:language"] = language
    xml["oml:data_edit_parameters"]["oml:citation"] = citation
    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

    # Drop parameters the caller did not provide. NOTE(review): this is a
    # falsy check, so empty strings are dropped too — a field cannot be
    # cleared by passing "".
    for k in list(xml["oml:data_edit_parameters"]):
        if not xml["oml:data_edit_parameters"][k]:
            del xml["oml:data_edit_parameters"][k]

    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
    result_xml = openml._api_calls._perform_api_call(
        "data/edit", "post", data=form_data, file_elements=file_elements
    )
    result = xmltodict.parse(result_xml)
    data_id = result["oml:data_edit"]["oml:id"]
    return int(data_id)


def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.

Expand Down
81 changes: 80 additions & 1 deletion tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@

import openml
from openml import OpenMLDataset
from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError
from openml.exceptions import (
OpenMLCacheException,
OpenMLHashException,
OpenMLPrivateDatasetError,
OpenMLServerException,
)
from openml.testing import TestBase
from openml.utils import _tag_entity, _create_cache_directory_for_id
from openml.datasets.functions import (
create_dataset,
edit_dataset,
attributes_arff_from_df,
_get_cached_dataset,
_get_cached_dataset_features,
Expand Down Expand Up @@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(X.shape, (150, 5))
self.assertEqual(len(categorical), X.shape[1])
self.assertEqual(len(attribute_names), X.shape[1])

def test_data_edit(self):
    # Admin key for the test server (only admins or owners can edit
    # datasets; all users can edit their own datasets).
    openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

    did = 564
    shared_kwargs = dict(
        description="xor dataset represents XOR operation",
        contributor="",
        collection_date="2019-10-29 17:06:18",
        original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
        paper_url="",
        citation="kaggle",
        language="English",
    )

    # Case 1: editing description, creator, contributor, collection_date,
    # original_data_url, paper_url, citation or language edits the existing
    # dataset in place, so the same id is returned.
    result = edit_dataset(did, **shared_kwargs)
    self.assertEqual(result, did)

    # Case 2: editing data, attributes, default_target_attribute,
    # row_id_attribute or ignore_attribute generates a new dataset,
    # so a different id is returned.
    column_names = [
        ("input1", "REAL"),
        ("input2", "REAL"),
        ("y", "REAL"),
    ]
    result = edit_dataset(did, attributes=column_names, **shared_kwargs)
    self.assertNotEqual(did, result)

def test_data_edit_errors(self):
    # Admin key for the test server (only admins or owners can edit datasets).
    openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

    # The server rejects an edit request that names no field to edit.
    with self.assertRaisesRegex(
        OpenMLServerException,
        "Please provide atleast one field among description, creator, contributor, "
        "collection_date, language, citation, original_data_url or paper_url to edit.",
    ):
        edit_dataset(data_id=564)

    # The server rejects edits to a dataset id that does not exist.
    with self.assertRaisesRegex(OpenMLServerException, "Unknown dataset"):
        edit_dataset(data_id=100000, description="xor operation dataset")

    # A non-owner, non-admin key may not edit an existing dataset in place.
    openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
    with self.assertRaisesRegex(OpenMLServerException, "Dataset is not owned by you"):
        edit_dataset(data_id=564, description="xor data")