Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
30dd55f
Add deprecation warning for retrieving dict
PGijsbers Jun 15, 2023
b502312
Refactor check_datasets_active to work with dataframe
PGijsbers Jun 15, 2023
357bb7d
Update unit tests to use list_datasets with output_format dataframe
PGijsbers Jun 15, 2023
29bbb57
Move list_datasets test to proper file
PGijsbers Jun 15, 2023
464e5dd
Remove list_datasets test, duplicate in test_datasets_functions
PGijsbers Jun 15, 2023
aaad25f
Update list_flows calls to use output_format='dataframe'
PGijsbers Jun 15, 2023
cf9dd7b
Update list_runs calls to require dataframe output
PGijsbers Jun 15, 2023
13f2fb5
Update list_setup calls for deprecation
PGijsbers Jun 15, 2023
d3342a1
Update list_study calls
PGijsbers Jun 15, 2023
b8a915b
Update list_tasks to specify output_format dataframe
PGijsbers Jun 15, 2023
3361b15
Add `output_format` to `list_datasets` call
PGijsbers Jun 15, 2023
be16355
Add TODO markers for removing `dict` support of `list_*` functions
PGijsbers Jun 15, 2023
5cc1287
Make status check less strict, call list_dataset with output_format
PGijsbers Jun 15, 2023
576e09c
Change index on id to did, since that's the dataset id's column name
PGijsbers Jun 15, 2023
b82febe
Update test to reflect new error message
PGijsbers Jun 15, 2023
cc944b5
Fix bug introduced by refactor
PGijsbers Jun 15, 2023
dca2590
Fix minor oversights of refactoring
PGijsbers Jun 15, 2023
5240504
Merge branch 'develop' into pandas_default
PGijsbers Jun 15, 2023
3cff453
Rename variables to reflect they are no longer lists
PGijsbers Jun 16, 2023
c130c41
Fix unsafe indexing on dataframe and remaining unit tests
PGijsbers Jun 16, 2023
22a6dd3
Perform safer check for integer dtypes
PGijsbers Jun 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add deprecation warning for retrieving dict
  • Loading branch information
PGijsbers committed Jun 15, 2023
commit 30dd55f66ca12b36dad30a5bd6d2818a2e295174
2 changes: 2 additions & 0 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
#
# Note: starting with version 0.15, `list_datasets` returns a pandas dataframe by default. When
# using openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
openml_list = openml.datasets.list_datasets() # returns a dict

# Show a nice table with some key data properties
Expand Down
30 changes: 22 additions & 8 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,14 @@ def list_datasets(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)

if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
data_id=data_id,
output_format=output_format,
Expand Down Expand Up @@ -259,7 +267,7 @@ def check_datasets_active(
dict
A dictionary with items {did: bool}
"""
dataset_list = list_datasets(status="all", data_id=dataset_ids)
dataset_list = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
active = {}

for did in dataset_ids:
Expand Down Expand Up @@ -288,7 +296,7 @@ def _name_to_id(
----------
dataset_name : str
The name of the dataset for which to find its id.
version : int
version : int, optional
Version to retrieve. If not specified, the oldest active version is returned.
error_if_multiple : bool (default=False)
If `False`, if multiple datasets match, return the least recent active dataset.
Expand All @@ -302,16 +310,22 @@ def _name_to_id(
The id of the dataset.
"""
status = None if version is not None else "active"
candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
candidates = cast(
pd.DataFrame,
list_datasets(
data_name=dataset_name, status=status, data_version=version, output_format="dataframe"
),
)
if error_if_multiple and len(candidates) > 1:
raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
if len(candidates) == 0:
no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
and_version = " and version {}".format(version) if version is not None else ""
msg = f"Multiple active datasets exist with name '{dataset_name}'."
raise ValueError(msg)
if candidates.empty:
no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
and_version = f" and version '{version}'." if version is not None else ""
raise RuntimeError(no_dataset_for_name + and_version)

# Dataset ids are chronological so we can just sort based on ids (instead of version)
return sorted(candidates)[0]
return candidates["id"].min()


def get_datasets(
Expand Down
10 changes: 10 additions & 0 deletions openml/evaluations/functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# License: BSD 3-Clause

import json
import warnings

import xmltodict
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -77,6 +79,14 @@ def list_evaluations(
"Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable."
)

if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15. "
"To ensure your code will continue to work, "
"use `output_format`='dataframe' or `output_format`='object'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

per_fold_str = None
if per_fold is not None:
per_fold_str = str(per_fold).lower()
Expand Down
9 changes: 9 additions & 0 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# License: BSD 3-Clause
import warnings

import dateutil.parser
from collections import OrderedDict
Expand Down Expand Up @@ -188,6 +189,14 @@ def list_flows(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)

if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
listing_call=_list_flows,
Expand Down
9 changes: 8 additions & 1 deletion openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def run_model_on_task(
warnings.warn(
"avoid_duplicate_runs is set to True, but no API key is set. "
"Please set your API key in the OpenML configuration file, see"
"https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication"
"https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication" # noqa: E501
"for more information on authentication.",
)

Expand Down Expand Up @@ -1012,6 +1012,13 @@ def list_runs(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

if id is not None and (not isinstance(id, list)):
raise TypeError("id must be of type list.")
Expand Down
10 changes: 9 additions & 1 deletion openml/setups/functions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# License: BSD 3-Clause

import warnings
from collections import OrderedDict
import io
import os
Expand Down Expand Up @@ -140,6 +140,14 @@ def list_setups(
"Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable."
)

if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15. "
"To ensure your code will continue to work, "
"use `output_format`='dataframe' or `output_format`='object'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

batch_size = 1000 # batch size for setups is lower
return openml.utils._list_all(
output_format=output_format,
Expand Down
14 changes: 14 additions & 0 deletions openml/study/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,13 @@ def list_suites(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
Expand Down Expand Up @@ -532,6 +539,13 @@ def list_studies(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

return openml.utils._list_all(
output_format=output_format,
Expand Down
7 changes: 7 additions & 0 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,13 @@ def list_tasks(
raise ValueError(
"Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
)
if output_format == "dict":
msg = (
"Support for `output_format` of 'dict' will be removed in 0.15 "
"and pandas dataframes will be returned instead. To ensure your code "
"will continue to work, use `output_format`='dataframe'."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)
return openml.utils._list_all(
output_format=output_format,
listing_call=_list_tasks,
Expand Down