Skip to content

Commit 495162d

Browse files
authored
Deprecate output_format='dict' (#1258)
* Add deprecation warning for retrieving dict * Refactor check_datasets_active to work with dataframe * Update unit tests to use list_datasets with output_format dataframe * Move list_datasets test to proper file * Remove list_datasets test, duplicate in test_datasets_functions (duplicate of tests/test_datasets/test_dataset_functions.py::test_list_datasets) * Update list_flows calls to use output_format='dataframe' * Update list_runs calls to require dataframe output * Update list_setup calls for deprecation * Update list_study calls * Update list_tasks to specify output_format dataframe * Add `output_format` to `list_datasets` call * Add TODO markers for removing `dict` support of `list_*` functions * Make status check less strict, call list_dataset with output_format * Change index on id to did, since that's the dataset id's column name * Update test to reflect new error message * Fix bug introduced by refactor: must check results are (somewhat) complete before processing results * Fix minor oversights of refactoring * Rename variables to reflect they are no longer lists * Fix unsafe indexing on dataframe and remaining unit tests * Perform safer check for integer dtypes
1 parent 80a028a commit 495162d

23 files changed

Lines changed: 358 additions & 269 deletions

examples/30_extended/datasets_tutorial.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,9 @@
2121
# * Use the output_format parameter to select output type
2222
# * Default gives 'dict' (other option: 'dataframe', see below)
2323
#
24-
openml_list = openml.datasets.list_datasets() # returns a dict
25-
26-
# Show a nice table with some key data properties
27-
datalist = pd.DataFrame.from_dict(openml_list, orient="index")
24+
# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
25+
# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
26+
datalist = openml.datasets.list_datasets(output_format="dataframe")
2827
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
2928

3029
print(f"First 10 of {len(datalist)} datasets...")

examples/30_extended/suites_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575

7676
# We'll take a random subset of at least ten tasks of all available tasks on
7777
# the test server:
78-
all_tasks = list(openml.tasks.list_tasks().keys())
78+
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
7979
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
8080

8181
# The study needs a machine-readable and unique alias. To obtain this,

examples/30_extended/tasks_tutorial.py

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,28 +29,19 @@
2929
# Listing tasks
3030
# ^^^^^^^^^^^^^
3131
#
32-
# We will start by simply listing only *supervised classification* tasks:
33-
34-
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
35-
36-
############################################################################
37-
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert
38-
# into a
32+
# We will start by simply listing only *supervised classification* tasks.
33+
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
34+
# request a
3935
# `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
40-
# to have better visualization capabilities and easier access:
36+
# instead to have better visualization capabilities and easier access:
4137

42-
tasks = pd.DataFrame.from_dict(tasks, orient="index")
38+
tasks = openml.tasks.list_tasks(
39+
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
40+
)
4341
print(tasks.columns)
4442
print(f"First 5 of {len(tasks)} tasks:")
4543
print(tasks.head())
4644

47-
# As conversion to a pandas dataframe is a common task, we have added this functionality to the
48-
# OpenML-Python library which can be used by passing ``output_format='dataframe'``:
49-
tasks_df = openml.tasks.list_tasks(
50-
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
51-
)
52-
print(tasks_df.head())
53-
5445
############################################################################
5546
# We can filter the list of tasks to only contain datasets with more than
5647
# 500 samples, but less than 1000 samples:

openml/datasets/functions.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,15 @@ def list_datasets(
128128
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
129129
)
130130

131+
# TODO: [0.15]
132+
if output_format == "dict":
133+
msg = (
134+
"Support for `output_format` of 'dict' will be removed in 0.15 "
135+
"and pandas dataframes will be returned instead. To ensure your code "
136+
"will continue to work, use `output_format`='dataframe'."
137+
)
138+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
139+
131140
return openml.utils._list_all(
132141
data_id=data_id,
133142
output_format=output_format,
@@ -241,7 +250,8 @@ def check_datasets_active(
241250
Check if the dataset ids provided are active.
242251
243252
Raises an error if a dataset_id in the given list
244-
of dataset_ids does not exist on the server.
253+
of dataset_ids does not exist on the server and
254+
`raise_error_if_not_exist` is set to True (default).
245255
246256
Parameters
247257
----------
@@ -256,18 +266,12 @@ def check_datasets_active(
256266
dict
257267
A dictionary with items {did: bool}
258268
"""
259-
dataset_list = list_datasets(status="all", data_id=dataset_ids)
260-
active = {}
261-
262-
for did in dataset_ids:
263-
dataset = dataset_list.get(did, None)
264-
if dataset is None:
265-
if raise_error_if_not_exist:
266-
raise ValueError(f"Could not find dataset {did} in OpenML dataset list.")
267-
else:
268-
active[did] = dataset["status"] == "active"
269-
270-
return active
269+
datasets = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
270+
missing = set(dataset_ids) - set(datasets.get("did", []))
271+
if raise_error_if_not_exist and missing:
272+
missing_str = ", ".join(str(did) for did in missing)
273+
raise ValueError(f"Could not find dataset(s) {missing_str} in OpenML dataset list.")
274+
return dict(datasets["status"] == "active")
271275

272276

273277
def _name_to_id(
@@ -285,7 +289,7 @@ def _name_to_id(
285289
----------
286290
dataset_name : str
287291
The name of the dataset for which to find its id.
288-
version : int
292+
version : int, optional
289293
Version to retrieve. If not specified, the oldest active version is returned.
290294
error_if_multiple : bool (default=False)
291295
If `False`, if multiple datasets match, return the least recent active dataset.
@@ -299,16 +303,22 @@ def _name_to_id(
299303
The id of the dataset.
300304
"""
301305
status = None if version is not None else "active"
302-
candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
306+
candidates = cast(
307+
pd.DataFrame,
308+
list_datasets(
309+
data_name=dataset_name, status=status, data_version=version, output_format="dataframe"
310+
),
311+
)
303312
if error_if_multiple and len(candidates) > 1:
304-
raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
305-
if len(candidates) == 0:
306-
no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
307-
and_version = " and version {}".format(version) if version is not None else ""
313+
msg = f"Multiple active datasets exist with name '{dataset_name}'."
314+
raise ValueError(msg)
315+
if candidates.empty:
316+
no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
317+
and_version = f" and version '{version}'." if version is not None else "."
308318
raise RuntimeError(no_dataset_for_name + and_version)
309319

310320
# Dataset ids are chronological so we can just take the smallest id (instead of comparing versions)
311-
return sorted(candidates)[0]
321+
return candidates["did"].min()
312322

313323

314324
def get_datasets(

openml/evaluations/functions.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# License: BSD 3-Clause
22

33
import json
4+
import warnings
5+
46
import xmltodict
57
import pandas as pd
68
import numpy as np
@@ -77,6 +79,15 @@ def list_evaluations(
7779
"Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable."
7880
)
7981

82+
# TODO: [0.15]
83+
if output_format == "dict":
84+
msg = (
85+
"Support for `output_format` of 'dict' will be removed in 0.15. "
86+
"To ensure your code will continue to work, "
87+
"use `output_format`='dataframe' or `output_format`='object'."
88+
)
89+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
90+
8091
per_fold_str = None
8192
if per_fold is not None:
8293
per_fold_str = str(per_fold).lower()

openml/flows/functions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# License: BSD 3-Clause
2+
import warnings
23

34
import dateutil.parser
45
from collections import OrderedDict
@@ -188,6 +189,15 @@ def list_flows(
188189
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
189190
)
190191

192+
# TODO: [0.15]
193+
if output_format == "dict":
194+
msg = (
195+
"Support for `output_format` of 'dict' will be removed in 0.15 "
196+
"and pandas dataframes will be returned instead. To ensure your code "
197+
"will continue to work, use `output_format`='dataframe'."
198+
)
199+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
200+
191201
return openml.utils._list_all(
192202
output_format=output_format,
193203
listing_call=_list_flows,

openml/runs/functions.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import itertools
66
import os
77
import time
8-
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401
8+
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING, cast # noqa F401
99
import warnings
1010

1111
import sklearn.metrics
@@ -103,7 +103,7 @@ def run_model_on_task(
103103
"avoid_duplicate_runs is set to True, but no API key is set. "
104104
"Please set your API key in the OpenML configuration file, see"
105105
"https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial"
106-
+ ".html#authentication for more information on authentication.",
106+
".html#authentication for more information on authentication.",
107107
)
108108

109109
# TODO: At some point in the future do not allow for arguments in old order (6-2018).
@@ -428,11 +428,10 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
428428
return set()
429429

430430
try:
431-
result = list_runs(task=[task_id], setup=[setup_id])
432-
if len(result) > 0:
433-
return set(result.keys())
434-
else:
435-
return set()
431+
result = cast(
432+
pd.DataFrame, list_runs(task=[task_id], setup=[setup_id], output_format="dataframe")
433+
)
434+
return set() if result.empty else set(result["run_id"])
436435
except OpenMLServerException as exception:
437436
# error code 512 implies no results. The run does not exist yet
438437
assert exception.code == 512
@@ -1012,6 +1011,14 @@ def list_runs(
10121011
raise ValueError(
10131012
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
10141013
)
1014+
# TODO: [0.15]
1015+
if output_format == "dict":
1016+
msg = (
1017+
"Support for `output_format` of 'dict' will be removed in 0.15 "
1018+
"and pandas dataframes will be returned instead. To ensure your code "
1019+
"will continue to work, use `output_format`='dataframe'."
1020+
)
1021+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
10151022

10161023
if id is not None and (not isinstance(id, list)):
10171024
raise TypeError("id must be of type list.")

openml/setups/functions.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# License: BSD 3-Clause
2-
2+
import warnings
33
from collections import OrderedDict
44
import io
55
import os
@@ -142,6 +142,15 @@ def list_setups(
142142
"Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable."
143143
)
144144

145+
# TODO: [0.15]
146+
if output_format == "dict":
147+
msg = (
148+
"Support for `output_format` of 'dict' will be removed in 0.15. "
149+
"To ensure your code will continue to work, "
150+
"use `output_format`='dataframe' or `output_format`='object'."
151+
)
152+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
153+
145154
batch_size = 1000 # batch size for setups is lower
146155
return openml.utils._list_all(
147156
output_format=output_format,

openml/study/functions.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,14 @@ def list_suites(
463463
raise ValueError(
464464
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
465465
)
466+
# TODO: [0.15]
467+
if output_format == "dict":
468+
msg = (
469+
"Support for `output_format` of 'dict' will be removed in 0.15 "
470+
"and pandas dataframes will be returned instead. To ensure your code "
471+
"will continue to work, use `output_format`='dataframe'."
472+
)
473+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
466474

467475
return openml.utils._list_all(
468476
output_format=output_format,
@@ -536,6 +544,14 @@ def list_studies(
536544
raise ValueError(
537545
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
538546
)
547+
# TODO: [0.15]
548+
if output_format == "dict":
549+
msg = (
550+
"Support for `output_format` of 'dict' will be removed in 0.15 "
551+
"and pandas dataframes will be returned instead. To ensure your code "
552+
"will continue to work, use `output_format`='dataframe'."
553+
)
554+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
539555

540556
return openml.utils._list_all(
541557
output_format=output_format,

openml/tasks/functions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,14 @@ def list_tasks(
176176
raise ValueError(
177177
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
178178
)
179+
# TODO: [0.15]
180+
if output_format == "dict":
181+
msg = (
182+
"Support for `output_format` of 'dict' will be removed in 0.15 "
183+
"and pandas dataframes will be returned instead. To ensure your code "
184+
"will continue to work, use `output_format`='dataframe'."
185+
)
186+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
179187
return openml.utils._list_all(
180188
output_format=output_format,
181189
listing_call=_list_tasks,

0 commit comments

Comments
 (0)