Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 3 additions & 61 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,8 +806,6 @@ def edit_dataset(
contributor=None,
collection_date=None,
language=None,
attributes=None,
data=None,
default_target_attribute=None,
ignore_attribute=None,
citation=None,
Expand Down Expand Up @@ -839,17 +837,6 @@ def edit_dataset(
language : str
Language in which the data is represented.
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
attributes : list, dict, or 'auto'
A list of tuples. Each tuple consists of the attribute name and type.
If passing a pandas DataFrame, the attributes can be automatically
inferred by passing ``'auto'``. Specific attributes can be manually
specified by passing a dictionary where the key is the name of the
attribute and the value is the data type of the attribute.
data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
An array that contains both the attributes and the targets. When
providing a dataframe, the attribute names and type can be inferred by
passing ``attributes='auto'``.
The target feature is indicated as meta-data of the dataset.
default_target_attribute : str
The default target attribute, if it exists.
Can have multiple values, comma separated.
Expand Down Expand Up @@ -879,54 +866,6 @@ def edit_dataset(
if not isinstance(data_id, int):
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

# case 1, changing any of these fields creates a new version of the dataset with the changed fields
if any(
field is not None
for field in [
data,
attributes,
default_target_attribute,
row_id_attribute,
ignore_attribute,
]
):
logger.warning("Creating a new version of dataset, cannot edit existing version")

# Get old dataset and features
dataset = get_dataset(data_id)
df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe")
attributes_old = attributes_arff_from_df(df)

# Sparse data needs to be provided in a different format from dense data
if dataset.format == "sparse_arff":
df, y, categorical, attribute_names = dataset.get_data(dataset_format="array")
data_old = coo_matrix(df)
else:
data_old = df
data_new = data if data is not None else data_old
dataset_new = create_dataset(
name=dataset.name,
description=description or dataset.description,
creator=creator or dataset.creator,
contributor=contributor or dataset.contributor,
collection_date=collection_date or dataset.collection_date,
language=language or dataset.language,
licence=dataset.licence,
attributes=attributes or attributes_old,
data=data_new,
default_target_attribute=default_target_attribute or dataset.default_target_attribute,
ignore_attribute=ignore_attribute or dataset.ignore_attribute,
citation=citation or dataset.citation,
row_id_attribute=row_id_attribute or dataset.row_id_attribute,
original_data_url=original_data_url or dataset.original_data_url,
paper_url=paper_url or dataset.paper_url,
update_comment=dataset.update_comment,
version_label=dataset.version_label,
)
dataset_new.publish()
return dataset_new.dataset_id

# case 2, changing any of these fields will update existing dataset
# compose data edit parameters as xml
form_data = {"data_id": data_id}
xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]'
Expand All @@ -937,6 +876,9 @@ def edit_dataset(
xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
xml["oml:data_edit_parameters"]["oml:language"] = language
xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
xml["oml:data_edit_parameters"]["oml:citation"] = citation
xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
Expand Down
69 changes: 34 additions & 35 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1341,47 +1341,34 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(len(attribute_names), X.shape[1])

def test_data_edit(self):

# admin key for test server (only admins or owners can edit datasets;
# all users can edit their own datasets)
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

# case 1, editing description, creator, contributor, collection_date, original_data_url,
# paper_url, citation, language edits existing dataset.
# Case 1
# All users can edit non-critical fields of datasets
desc = "xor dataset representing XOR operation"
did = 564
result = edit_dataset(
did,
description="xor dataset represents XOR operation",
contributor="",
description=desc,
contributor="xxx",
collection_date="2019-10-29 17:06:18",
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertEqual(result, did)

# case 2, editing data, attributes, default_target_attribute, row_id_attribute,
# ignore_attribute generates a new dataset
self.assertEqual(did, result)
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.description, desc)

column_names = [
("input1", "REAL"),
("input2", "REAL"),
("y", "REAL"),
]
# Case 2
# only admins or owners can edit all critical fields of datasets
# admin key for test server
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
Comment thread
PGijsbers marked this conversation as resolved.
Outdated
desc = "xor dataset represents XOR operation"
result = edit_dataset(
564,
description=desc,
contributor="",
collection_date="2019-10-29 17:06:18",
attributes=column_names,
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
paper_url="",
citation="kaggle",
language="English",
)
self.assertNotEqual(did, result)
did = 565
result = edit_dataset(did, default_target_attribute="y", ignore_attribute="input1")
self.assertEqual(did, result)
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.ignore_attribute, ["input1"])

def test_data_edit_errors(self):

Expand All @@ -1390,8 +1377,10 @@ def test_data_edit_errors(self):
# Check server exception when no field to edit is provided
self.assertRaisesRegex(
OpenMLServerException,
"Please provide atleast one field among description, creator, contributor, "
"collection_date, language, citation, original_data_url or paper_url to edit.",
"Please provide atleast one field among description, creator, "
"contributor, collection_date, language, citation, "
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
data_id=564,
)
Expand All @@ -1403,12 +1392,22 @@ def test_data_edit_errors(self):
data_id=100000,
description="xor operation dataset",
)
# Check server exception when a non-owner or non-admin tries to edit existing dataset
# Check server exception when owner/admin edits critical features of dataset with tasks
self.assertRaisesRegex(
OpenMLServerException,
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can only be edited for datasets without any tasks.",
edit_dataset,
data_id=1,
default_target_attribute="y",
)
# Check server exception when a non-owner or non-admin tries to edit critical features
openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
self.assertRaisesRegex(
OpenMLServerException,
"Dataset is not owned by you",
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can be edited only by the owner. Fork the dataset if changes are required.",
edit_dataset,
data_id=564,
description="xor data",
default_target_attribute="y",
)