Add deprecation warning for retrieving listings as dict (`output_format='dict'`)

openml · mfeurer · Jun 16, 2023 · Jun 15, 2023 · Jun 15, 2023 · Jun 15, 2023
commit 30dd55f66ca12b36dad30a5bd6d2818a2e295174
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -21,6 +21,8 @@
 #   * Use the output_format parameter to select output type
 #   * Default gives 'dict' (other option: 'dataframe', see below)
 #
+# Note: from openml-python 0.15 on, `list_datasets` will return a pandas dataframe by default.
+# In 0.14 it still returns a dict, but emits a FutureWarning asking for output_format='dataframe'.
 openml_list = openml.datasets.list_datasets()  # returns a dict
 
 # Show a nice table with some key data properties

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -131,6 +131,14 @@ def list_datasets(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
 
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     return openml.utils._list_all(
         data_id=data_id,
         output_format=output_format,
@@ -259,7 +267,7 @@ def check_datasets_active(
     dict
         A dictionary with items {did: bool}
     """
-    dataset_list = list_datasets(status="all", data_id=dataset_ids)
+    dataset_list = list_datasets(status="all", data_id=dataset_ids, output_format="dataframe")
     active = {}
 
     for did in dataset_ids:
@@ -288,7 +296,7 @@ def _name_to_id(
     ----------
     dataset_name : str
         The name of the dataset for which to find its id.
-    version : int
+    version : int, optional
         Version to retrieve. If not specified, the oldest active version is returned.
     error_if_multiple : bool (default=False)
         If `False`, if multiple datasets match, return the least recent active dataset.
@@ -302,16 +310,22 @@ def _name_to_id(
        The id of the dataset.
     """
     status = None if version is not None else "active"
-    candidates = list_datasets(data_name=dataset_name, status=status, data_version=version)
+    candidates = cast(
+        pd.DataFrame,
+        list_datasets(
+            data_name=dataset_name, status=status, data_version=version, output_format="dataframe"
+        ),
+    )
     if error_if_multiple and len(candidates) > 1:
-        raise ValueError("Multiple active datasets exist with name {}".format(dataset_name))
-    if len(candidates) == 0:
-        no_dataset_for_name = "No active datasets exist with name {}".format(dataset_name)
-        and_version = " and version {}".format(version) if version is not None else ""
+        msg = f"Multiple active datasets exist with name '{dataset_name}'."
+        raise ValueError(msg)
+    if candidates.empty:
+        no_dataset_for_name = f"No active datasets exist with name '{dataset_name}'"
+        and_version = f" and version '{version}'." if version is not None else ""
         raise RuntimeError(no_dataset_for_name + and_version)
 
     # Dataset ids are chronological so we can just sort based on ids (instead of version)
-    return sorted(candidates)[0]
+    return candidates["id"].min()
 
 
 def get_datasets(

diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
@@ -1,6 +1,8 @@
 # License: BSD 3-Clause
 
 import json
+import warnings
+
 import xmltodict
 import pandas as pd
 import numpy as np
@@ -77,6 +79,14 @@ def list_evaluations(
             "Invalid output format selected. " "Only 'object', 'dataframe', or 'dict' applicable."
         )
 
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15. "
+            "To ensure your code will continue to work, "
+            "use `output_format`='dataframe' or `output_format`='object'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     per_fold_str = None
     if per_fold is not None:
         per_fold_str = str(per_fold).lower()

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -1,4 +1,5 @@
 # License: BSD 3-Clause
+import warnings
 
 import dateutil.parser
 from collections import OrderedDict
@@ -188,6 +189,14 @@ def list_flows(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
 
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     return openml.utils._list_all(
         output_format=output_format,
         listing_call=_list_flows,

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -102,7 +102,7 @@ def run_model_on_task(
         warnings.warn(
             "avoid_duplicate_runs is set to True, but no API key is set. "
             "Please set your API key in the OpenML configuration file, see"
-            "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication"
+            "https://openml.github.io/openml-python/main/examples/20_basic/introduction_tutorial.html#authentication"  # noqa: E501
             "for more information on authentication.",
         )
 
@@ -1012,6 +1012,13 @@ def list_runs(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     if id is not None and (not isinstance(id, list)):
         raise TypeError("id must be of type list.")

diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -1,5 +1,5 @@
 # License: BSD 3-Clause
-
+import warnings
 from collections import OrderedDict
 import io
 import os
@@ -140,6 +140,14 @@ def list_setups(
             "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable."
         )
 
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15. "
+            "To ensure your code will continue to work, "
+            "use `output_format`='dataframe' or `output_format`='object'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
     batch_size = 1000  # batch size for setups is lower
     return openml.utils._list_all(
         output_format=output_format,

diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -459,6 +459,13 @@ def list_suites(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     return openml.utils._list_all(
         output_format=output_format,
@@ -532,6 +539,13 @@ def list_studies(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
     return openml.utils._list_all(
         output_format=output_format,

diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -177,6 +177,13 @@ def list_tasks(
         raise ValueError(
             "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable."
         )
+    if output_format == "dict":
+        msg = (
+            "Support for `output_format` of 'dict' will be removed in 0.15 "
+            "and pandas dataframes will be returned instead. To ensure your code "
+            "will continue to work, use `output_format`='dataframe'."
+        )
+        warnings.warn(msg, category=FutureWarning, stacklevel=2)
     return openml.utils._list_all(
         output_format=output_format,
         listing_call=_list_tasks,