Skip to content

Commit abf9506

Browse files
authored
Add future warning dataset format (#1265)
* Add future warning for more user-facing functions that return arrays * Use dataframe instead of array, as array will be deprecated * Update for 0.15 release * Update for 0.15.0 release that phases out arrays * Fix mistakes introduced by switching to default dataframe
1 parent a186012 commit abf9506

File tree

5 files changed

+41
-24
lines changed

5 files changed

+41
-24
lines changed

examples/20_basic/simple_flows_and_runs_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
2424
dataset = openml.datasets.get_dataset(20)
2525
X, y, categorical_indicator, attribute_names = dataset.get_data(
26-
dataset_format="array", target=dataset.default_target_attribute
26+
target=dataset.default_target_attribute
2727
)
2828
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
2929
clf.fit(X, y)

examples/30_extended/datasets_tutorial.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -64,23 +64,16 @@
6464
############################################################################
6565
# Get the actual data.
6666
#
67-
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
68-
# sparse matrix, or as a Pandas DataFrame. The format is
69-
# controlled with the parameter ``dataset_format`` which can be either 'array'
70-
# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
71-
# and manually create a dataframe.
72-
X, y, categorical_indicator, attribute_names = dataset.get_data(
73-
dataset_format="array", target=dataset.default_target_attribute
74-
)
75-
eeg = pd.DataFrame(X, columns=attribute_names)
76-
eeg["class"] = y
77-
print(eeg[:10])
67+
# openml-python returns data as pandas dataframes (stored in the `eeg` variable below),
68+
# and also some additional metadata that we don't care about right now.
69+
eeg, *_ = dataset.get_data()
7870

7971
############################################################################
80-
# Instead of manually creating the dataframe, you can already request a
81-
# dataframe with the correct dtypes.
72+
# You can optionally choose to have openml separate out a column from the
73+
# dataset. In particular, many datasets for supervised problems have a set
74+
# `default_target_attribute` which may help identify the target variable.
8275
X, y, categorical_indicator, attribute_names = dataset.get_data(
83-
target=dataset.default_target_attribute, dataset_format="dataframe"
76+
target=dataset.default_target_attribute
8477
)
8578
print(X.head())
8679
print(X.info())
@@ -91,6 +84,9 @@
9184
# data file. The dataset object can be used as normal.
9285
# Whenever you use any functionality that requires the data,
9386
# such as `get_data`, the data will be downloaded.
87+
# Starting from 0.15, not downloading data will be the default behavior instead.
88+
# The data will be downloaded automatically when you try to access it through
89+
# openml objects, e.g., using `dataset.features`.
9490
dataset = openml.datasets.get_dataset(1471, download_data=False)
9591

9692
############################################################################
@@ -99,8 +95,8 @@
9995
# * Explore the data visually.
10096
eegs = eeg.sample(n=1000)
10197
_ = pd.plotting.scatter_matrix(
102-
eegs.iloc[:100, :4],
103-
c=eegs[:100]["class"],
98+
X.iloc[:100, :4],
99+
c=y[:100],
104100
figsize=(10, 10),
105101
marker="o",
106102
hist_kwds={"bins": 20},

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
2828
dataset = openml.datasets.get_dataset(68)
2929
X, y, categorical_indicator, attribute_names = dataset.get_data(
30-
dataset_format="array", target=dataset.default_target_attribute
30+
target=dataset.default_target_attribute
3131
)
3232
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
3333
clf.fit(X, y)
@@ -38,7 +38,7 @@
3838
# * e.g. categorical features -> do feature encoding
3939
dataset = openml.datasets.get_dataset(17)
4040
X, y, categorical_indicator, attribute_names = dataset.get_data(
41-
dataset_format="array", target=dataset.default_target_attribute
41+
target=dataset.default_target_attribute
4242
)
4343
print(f"Categorical features: {categorical_indicator}")
4444
transformer = compose.ColumnTransformer(
@@ -160,7 +160,7 @@
160160
]
161161
)
162162

163-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
163+
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
164164
myrun = run.publish()
165165
print(f"Uploaded to {myrun.openml_url}")
166166

@@ -172,15 +172,14 @@
172172

173173
# To perform the following line offline, it is required to have been called before
174174
# such that the task is cached on the local openml cache directory:
175-
task = openml.tasks.get_task(6)
175+
task = openml.tasks.get_task(96)
176176

177177
# The following lines can then be executed offline:
178178
run = openml.runs.run_model_on_task(
179179
pipe,
180180
task,
181181
avoid_duplicate_runs=False,
182182
upload_flow=False,
183-
dataset_format="array",
184183
)
185184

186185
# The run may be stored offline, and the flow will be stored along with it:

openml/datasets/dataset.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -716,9 +716,11 @@ def get_data(
716716
on the server in the dataset.
717717
dataset_format : string (default='dataframe')
718718
The format of returned dataset.
719-
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
719+
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse
720+
matrix. Support for ``array`` will be removed in 0.15.
720721
If ``dataframe``, the returned dataset will be a Pandas DataFrame.
721722
723+
722724
Returns
723725
-------
724726
X : ndarray, dataframe, or sparse matrix, shape (n_samples, n_columns)
@@ -730,6 +732,16 @@ def get_data(
730732
attribute_names : List[str]
731733
List of attribute names.
732734
"""
735+
# TODO: [0.15]
736+
if dataset_format == "array":
737+
warnings.warn(
738+
"Support for `dataset_format='array'` will be removed in 0.15,"
739+
"start using `dataset_format='dataframe' to ensure your code "
740+
"will continue to work. You can use the dataframe's `to_numpy` "
741+
"function to continue using numpy arrays.",
742+
category=FutureWarning,
743+
stacklevel=2,
744+
)
733745
data, categorical, attribute_names = self._load_data()
734746

735747
to_exclude = []

openml/tasks/task.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# License: BSD 3-Clause
2-
2+
import warnings
33
from abc import ABC
44
from collections import OrderedDict
55
from enum import Enum
@@ -256,6 +256,16 @@ def get_X_and_y(
256256
tuple - X and y
257257
258258
"""
259+
# TODO: [0.15]
260+
if dataset_format == "array":
261+
warnings.warn(
262+
"Support for `dataset_format='array'` will be removed in 0.15,"
263+
"start using `dataset_format='dataframe' to ensure your code "
264+
"will continue to work. You can use the dataframe's `to_numpy` "
265+
"function to continue using numpy arrays.",
266+
category=FutureWarning,
267+
stacklevel=2,
268+
)
259269
dataset = self.get_dataset()
260270
if self.task_type_id not in (
261271
TaskType.SUPERVISED_CLASSIFICATION,

0 commit comments

Comments
 (0)