Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b3b7867
Create first section: Creating Custom Flow
PGijsbers Jul 7, 2020
19d79d7
Add Section: Using the Flow
PGijsbers Jul 7, 2020
208f6cd
Allow run description text to be custom
PGijsbers Jul 10, 2020
2247bbc
Draft for Custom Flow tutorial
PGijsbers Jul 10, 2020
326510c
Add minimal docstring to OpenMLRun
PGijsbers Jul 10, 2020
872bd75
Process code review feedback
PGijsbers Jul 10, 2020
c3a5326
Use the format utility function in automatic runs
PGijsbers Jul 10, 2020
a7cb290
Process @mfeurer feedback
PGijsbers Jul 13, 2020
e5dcaf0
Rename arguments of list_evaluations (#933)
Bilgecelik Jul 14, 2020
1670050
adding config file to user guide (#931)
marcoslbueno Jul 14, 2020
9c93f5b
Edit api (#935)
sahithyaravi Jul 23, 2020
666ca68
Adding support for scikit-learn > 0.22 (#936)
Neeratyoy Aug 3, 2020
5d9c69c
Add flake8-print in pre-commit (#939)
22quinn Aug 3, 2020
7d51a76
Fix edit api (#940)
sahithyaravi Aug 7, 2020
75a5440
Update subflow paragraph
PGijsbers Aug 12, 2020
23a08ab
Check the ClassificationTask has class label set
PGijsbers Aug 14, 2020
95d1fcb
Test task is of supported type
PGijsbers Aug 17, 2020
41aa789
Add tests for format_prediction
PGijsbers Aug 17, 2020
5d2e0ce
Adding Python 3.8 support (#916)
Neeratyoy Aug 17, 2020
5ef24ab
Process feedback Neeratyoy
PGijsbers Aug 25, 2020
1ce5a12
Test Exception with Regex
PGijsbers Aug 28, 2020
f70c720
change edit_api to reflect server (#941)
sahithyaravi Aug 31, 2020
f8839de
Create first section: Creating Custom Flow
PGijsbers Jul 7, 2020
2a6903b
Add Section: Using the Flow
PGijsbers Jul 7, 2020
4802497
Allow run description text to be custom
PGijsbers Jul 10, 2020
7fb64b4
Draft for Custom Flow tutorial
PGijsbers Jul 10, 2020
a6f0a38
Add minimal docstring to OpenMLRun
PGijsbers Jul 10, 2020
3748ae0
Process code review feedback
PGijsbers Jul 10, 2020
5479d7b
Use the format utility function in automatic runs
PGijsbers Jul 10, 2020
4b71c30
Process @mfeurer feedback
PGijsbers Jul 13, 2020
942d66e
Update subflow paragraph
PGijsbers Aug 12, 2020
9ba363e
Check the ClassificationTask has class label set
PGijsbers Aug 14, 2020
a72053d
Test task is of supported type
PGijsbers Aug 17, 2020
de31490
Add tests for format_prediction
PGijsbers Aug 17, 2020
832f437
Process feedback Neeratyoy
PGijsbers Aug 25, 2020
3cc74de
Test Exception with Regex
PGijsbers Aug 28, 2020
03e1e8b
Merge branch 'feature_#753' of https://github.com/openml/openml-pytho…
PGijsbers Sep 1, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Edit api (#935)
* version1

* minor fixes

* tests

* reformat code

* check new version

* remove get data

* code format

* review comments

* fix duplicate

* type annotate

* example

* tests for exceptions

* fix pep8

* black format
  • Loading branch information
sahithyaravi authored Jul 23, 2020
commit 9c93f5b06a9802ae283ccba9d36a5e426378494a
2 changes: 1 addition & 1 deletion doc/progress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Changelog

0.11.0
~~~~~~

* ADD #929: Add data edit API
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
switching the server.
* FIX #885: Logger no longer registered by default. Added utility functions to easily register
Expand Down
43 changes: 40 additions & 3 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

How to list and download datasets.
"""
############################################################################
""

# License: BSD 3-Clauses

import openml
import pandas as pd
from openml.datasets.functions import edit_dataset, get_dataset

############################################################################
# Exercise 0
Expand Down Expand Up @@ -42,9 +43,9 @@
# * Find a dataset called 'eeg_eye_state'.
# * Find all datasets with more than 50 classes.
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
############################################################################
""
datalist.query('name == "eeg-eye-state"')
############################################################################
""
datalist.query("NumberOfClasses > 50")

############################################################################
Expand Down Expand Up @@ -108,3 +109,39 @@
alpha=0.8,
cmap="plasma",
)


############################################################################
# Edit a created dataset
# =================================================
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
# Edits to these fields update the existing dataset version in place;
# only the dataset owner is allowed to make them.
data_id = edit_dataset(
    564,
    description="xor dataset represents XOR operation",
    contributor="",
    collection_date="2019-10-29 17:06:18",
    original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
    paper_url="",
    citation="kaggle",
    language="English",
)
edited_dataset = get_dataset(data_id)
print(f"Edited dataset ID: {data_id}")


############################################################################
# Edits to attributes, default_target_attribute, row_id_attribute or
# ignore_attribute instead generate a new, edited version of the dataset;
# anyone is allowed to make them.
new_attributes = [
    ("x0", "REAL"),
    ("x1", "REAL"),
    ("y", "REAL"),
]
data_id = edit_dataset(564, attributes=new_attributes)
print(f"Edited dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()
148 changes: 148 additions & 0 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,154 @@ def status_update(data_id, status):
raise ValueError("Data id/status does not collide")


def edit_dataset(
    data_id: int,
    description=None,
    creator=None,
    contributor=None,
    collection_date=None,
    language=None,
    attributes=None,
    data=None,
    default_target_attribute=None,
    ignore_attribute=None,
    citation=None,
    row_id_attribute=None,
    original_data_url=None,
    paper_url=None,
) -> int:
    """
    Edits an OpenMLDataset.

    Specify at least one field to edit, apart from data_id:

    - For certain fields, a new dataset version is created: attributes, data,
      default_target_attribute, ignore_attribute, row_id_attribute.

    - For other fields, the uploader can edit the existing version.
      No one except the uploader can edit the existing version.

    Parameters
    ----------
    data_id : int
        ID of the dataset.
    description : str
        Description of the dataset.
    creator : str
        The person who created the dataset.
    contributor : str
        People who contributed to the current version of the dataset.
    collection_date : str
        The date the data was originally collected, given by the uploader.
    language : str
        Language in which the data is represented.
        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    attributes : list, dict, or 'auto'
        A list of tuples. Each tuple consists of the attribute name and type.
        If passing a pandas DataFrame, the attributes can be automatically
        inferred by passing ``'auto'``. Specific attributes can be manually
        specified by a passing a dictionary where the key is the name of the
        attribute and the value is the data type of the attribute.
    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
        An array that contains both the attributes and the targets. When
        providing a dataframe, the attribute names and type can be inferred by
        passing ``attributes='auto'``.
        The target feature is indicated as meta-data of the dataset.
    default_target_attribute : str
        The default target attribute, if it exists.
        Can have multiple values, comma separated.
    ignore_attribute : str | list
        Attributes that should be excluded in modelling,
        such as identifiers and indexes.
    citation : str
        Reference(s) that should be cited when building on this data.
    row_id_attribute : str, optional
        The attribute that represents the row-id column, if present in the
        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
        specified, the index of the dataframe will be used as the
        ``row_id_attribute``. If the name of the index is ``None``, it will
        be discarded.

        .. versionadded: 0.8
            Inference of ``row_id_attribute`` from a dataframe.
    original_data_url : str, optional
        For derived data, the url to the original dataset.
    paper_url : str, optional
        Link to a paper describing the dataset.


    Returns
    -------
    int
        data_id of the existing edited version or the new version created and
        published.
    """
    if not isinstance(data_id, int):
        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

    # Case 1: changing any of these fields cannot be done in place — a new
    # version of the dataset (with the changed fields) is created and published.
    if any(
        field is not None
        for field in [
            data,
            attributes,
            default_target_attribute,
            row_id_attribute,
            ignore_attribute,
        ]
    ):
        logger.warning("Creating a new version of dataset, cannot edit existing version")
        dataset = get_dataset(data_id)

        # Reuse the current ARFF payload for everything the caller did not
        # override; fields passed as None (or "") fall back to the old values.
        decoded_arff = dataset._get_arff(format="arff")
        data_old = decoded_arff["data"]
        data_new = data if data is not None else data_old
        dataset_new = create_dataset(
            name=dataset.name,
            description=description or dataset.description,
            creator=creator or dataset.creator,
            contributor=contributor or dataset.contributor,
            collection_date=collection_date or dataset.collection_date,
            language=language or dataset.language,
            licence=dataset.licence,
            attributes=attributes or decoded_arff["attributes"],
            data=data_new,
            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
            citation=citation or dataset.citation,
            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
            original_data_url=original_data_url or dataset.original_data_url,
            paper_url=paper_url or dataset.paper_url,
            update_comment=dataset.update_comment,
            version_label=dataset.version_label,
        )
        dataset_new.publish()
        return dataset_new.dataset_id

    # Case 2: the remaining fields are edited on the existing dataset version.
    # Compose the data-edit parameters as XML for the server call.
    form_data = {"data_id": data_id}
    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
    xml["oml:data_edit_parameters"] = OrderedDict()
    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
    xml["oml:data_edit_parameters"]["oml:description"] = description
    xml["oml:data_edit_parameters"]["oml:creator"] = creator
    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
    xml["oml:data_edit_parameters"]["oml:language"] = language
    xml["oml:data_edit_parameters"]["oml:citation"] = citation
    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

    # Drop parameters the caller did not provide. NOTE(review): this is a
    # falsy check, so empty strings are dropped too — a field cannot be
    # cleared by passing "".
    for k in list(xml["oml:data_edit_parameters"]):
        if not xml["oml:data_edit_parameters"][k]:
            del xml["oml:data_edit_parameters"][k]

    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
    result_xml = openml._api_calls._perform_api_call(
        "data/edit", "post", data=form_data, file_elements=file_elements
    )
    result = xmltodict.parse(result_xml)
    data_id = result["oml:data_edit"]["oml:id"]
    return int(data_id)


def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.

Expand Down
81 changes: 80 additions & 1 deletion tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@

import openml
from openml import OpenMLDataset
from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError
from openml.exceptions import (
OpenMLCacheException,
OpenMLHashException,
OpenMLPrivateDatasetError,
OpenMLServerException,
)
from openml.testing import TestBase
from openml.utils import _tag_entity, _create_cache_directory_for_id
from openml.datasets.functions import (
create_dataset,
edit_dataset,
attributes_arff_from_df,
_get_cached_dataset,
_get_cached_dataset_features,
Expand Down Expand Up @@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(X.shape, (150, 5))
self.assertEqual(len(categorical), X.shape[1])
self.assertEqual(len(attribute_names), X.shape[1])

def test_data_edit(self):
    # Admin key for the test server (only admins or owners can edit
    # datasets; all users can edit their own datasets).
    openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

    did = 564
    shared_kwargs = dict(
        description="xor dataset represents XOR operation",
        contributor="",
        collection_date="2019-10-29 17:06:18",
        original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
        paper_url="",
        citation="kaggle",
        language="English",
    )

    # Case 1: editing description, creator, contributor, collection_date,
    # original_data_url, paper_url, citation or language edits the existing
    # dataset in place, so the same id is returned.
    result = edit_dataset(did, **shared_kwargs)
    self.assertEqual(result, did)

    # Case 2: editing data, attributes, default_target_attribute,
    # row_id_attribute or ignore_attribute generates a new dataset,
    # so a different id is returned.
    column_names = [
        ("input1", "REAL"),
        ("input2", "REAL"),
        ("y", "REAL"),
    ]
    result = edit_dataset(did, attributes=column_names, **shared_kwargs)
    self.assertNotEqual(did, result)

def test_data_edit_errors(self):
    # Admin key for the test server (only admins or owners can edit datasets).
    openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"

    # The server rejects an edit request that names no field to edit.
    with self.assertRaisesRegex(
        OpenMLServerException,
        "Please provide atleast one field among description, creator, contributor, "
        "collection_date, language, citation, original_data_url or paper_url to edit.",
    ):
        edit_dataset(data_id=564)

    # The server rejects edits to a dataset id that does not exist.
    with self.assertRaisesRegex(OpenMLServerException, "Unknown dataset"):
        edit_dataset(data_id=100000, description="xor operation dataset")

    # A non-owner, non-admin key may not edit an existing dataset in place.
    openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
    with self.assertRaisesRegex(OpenMLServerException, "Dataset is not owned by you"):
        edit_dataset(data_id=564, description="xor data")