Add future warning dataset format (#1265)

PGijsbers · web-flow · commit abf9506ff7ef · 2023-07-03T10:46:23.000+02:00
* Add future warning for more user-facing functions that return arrays

* Use dataframe instead of array, as array will be deprecated

* Update for 0.15 release

* Update for 0.15.0 release that phases out arrays

* Fix mistakes introduced by switching to default dataframe
diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -23,7 +23,7 @@
 # NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
 dataset = openml.datasets.get_dataset(20)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=3)
 clf.fit(X, y)
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -64,23 +64,16 @@
 ############################################################################
 # Get the actual data.
 #
-# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
-# sparse matrix, or as a Pandas DataFrame. The format is
-# controlled with the parameter ``dataset_format`` which can be either 'array'
-# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
-# and manually create a dataframe.
-X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
-)
-eeg = pd.DataFrame(X, columns=attribute_names)
-eeg["class"] = y
-print(eeg[:10])
+# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
+# and also some additional metadata that we don't care about right now.
+eeg, *_ = dataset.get_data()
 
 ############################################################################
-# Instead of manually creating the dataframe, you can already request a
-# dataframe with the correct dtypes.
+# You can optionally choose to have openml separate out a column from the
+# dataset. In particular, many datasets for supervised problems specify a
+# `default_target_attribute` which may help identify the target variable.
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    target=dataset.default_target_attribute, dataset_format="dataframe"
+    target=dataset.default_target_attribute
 )
 print(X.head())
 print(X.info())
@@ -91,6 +84,9 @@
 # data file. The dataset object can be used as normal.
 # Whenever you use any functionality that requires the data,
 # such as `get_data`, the data will be downloaded.
+# Starting from 0.15, not downloading data will be the default behavior instead.
+# The data will be downloaded automatically when you try to access it through
+# openml objects, e.g., using `dataset.features`.
 dataset = openml.datasets.get_dataset(1471, download_data=False)
 
 ############################################################################
@@ -99,8 +95,8 @@
 # * Explore the data visually.
 eegs = eeg.sample(n=1000)
 _ = pd.plotting.scatter_matrix(
-    eegs.iloc[:100, :4],
-    c=eegs[:100]["class"],
+    X.iloc[:100, :4],
+    c=y[:100],
     figsize=(10, 10),
     marker="o",
     hist_kwds={"bins": 20},
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
@@ -27,7 +27,7 @@
 # NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
 dataset = openml.datasets.get_dataset(68)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 clf = neighbors.KNeighborsClassifier(n_neighbors=1)
 clf.fit(X, y)
@@ -38,7 +38,7 @@
 # * e.g. categorical features -> do feature encoding
 dataset = openml.datasets.get_dataset(17)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="array", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 print(f"Categorical features: {categorical_indicator}")
 transformer = compose.ColumnTransformer(
@@ -160,7 +160,7 @@
     ]
 )
 
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
 myrun = run.publish()
 print(f"Uploaded to {myrun.openml_url}")
 
@@ -172,15 +172,14 @@
 
 # To perform the following line offline, it is required to have been called before
 # such that the task is cached on the local openml cache directory:
-task = openml.tasks.get_task(6)
+task = openml.tasks.get_task(96)
 
 # The following lines can then be executed offline:
 run = openml.runs.run_model_on_task(
     pipe,
     task,
     avoid_duplicate_runs=False,
     upload_flow=False,
-    dataset_format="array",
 )
 
 # The run may be stored offline, and the flow will be stored along with it:
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -716,9 +716,11 @@ def get_data(
             on the server in the dataset.
         dataset_format : string (default='dataframe')
             The format of returned dataset.
-            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
+            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse
+            matrix. Support for ``array`` will be removed in 0.15.
             If ``dataframe``, the returned dataset will be a Pandas DataFrame.
 
+
         Returns
         -------
         X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
@@ -730,6 +732,16 @@ def get_data(
         attribute_names : List[str]
             List of attribute names.
         """
+        # TODO: [0.15]
+        if dataset_format == "array":
+            warnings.warn(
+                "Support for `dataset_format='array'` will be removed in 0.15,"
+                "start using `dataset_format='dataframe' to ensure your code "
+                "will continue to work. You can use the dataframe's `to_numpy` "
+                "function to continue using numpy arrays.",
+                category=FutureWarning,
+                stacklevel=2,
+            )
         data, categorical, attribute_names = self._load_data()
 
         to_exclude = []
diff --git a/openml/tasks/task.py b/openml/tasks/task.py
@@ -1,5 +1,5 @@
 # License: BSD 3-Clause
-
+import warnings
 from abc import ABC
 from collections import OrderedDict
 from enum import Enum
@@ -256,6 +256,16 @@ def get_X_and_y(
         tuple - X and y
 
         """
+        # TODO: [0.15]
+        if dataset_format == "array":
+            warnings.warn(
+                "Support for `dataset_format='array'` will be removed in 0.15,"
+                "start using `dataset_format='dataframe' to ensure your code "
+                "will continue to work. You can use the dataframe's `to_numpy` "
+                "function to continue using numpy arrays.",
+                category=FutureWarning,
+                stacklevel=2,
+            )
         dataset = self.get_dataset()
         if self.task_type_id not in (
             TaskType.SUPERVISED_CLASSIFICATION,

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20`
`24`	`24`	`dataset = openml.datasets.get_dataset(20)`
`25`	`25`	`X, y, categorical_indicator, attribute_names = dataset.get_data(`
`26`		`- dataset_format="array", target=dataset.default_target_attribute`
	`26`	`+ target=dataset.default_target_attribute`
`27`	`27`	`)`
`28`	`28`	`clf = neighbors.KNeighborsClassifier(n_neighbors=3)`
`29`	`29`	`clf.fit(X, y)`