Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,9 @@ def _encode_if_category(column):
)
elif array_format == "dataframe":
if scipy.sparse.issparse(data):
return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
if isinstance(data, pd.DataFrame) and data.shape[1] == 1:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this necessary? Don't you also do this in get_data()? Also, I think it'll break the balloon dataset, which has only a single attribute.

data = data.squeeze() # converting single column to Pandas Series
else:
data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
logger.warning(
Expand Down Expand Up @@ -732,6 +734,7 @@ def get_data(
else:
target = [target]
targets = np.array([True if column in target else False for column in attribute_names])
target_names = np.array([column for column in attribute_names if column in target])
if np.sum(targets) > 1:
raise NotImplementedError(
"Number of requested targets %d is not implemented." % np.sum(targets)
Expand All @@ -752,10 +755,12 @@ def get_data(
attribute_names = [att for att, k in zip(attribute_names, targets) if not k]

x = self._convert_array_format(x, dataset_format, attribute_names)
if scipy.sparse.issparse(y):
y = np.asarray(y.todense()).astype(target_dtype).flatten()
y = y.squeeze()
y = self._convert_array_format(y, dataset_format, attribute_names)
if dataset_format == "array" and scipy.sparse.issparse(y):
# scikit-learn requires dense representation of targets
y = np.asarray(y.todense()).astype(target_dtype)
if not scipy.sparse.issparse(y):
y = y.squeeze()
y = self._convert_array_format(y, dataset_format, target_names)
y = y.astype(target_dtype) if dataset_format == "array" else y
data, targets = x, y

Expand Down
6 changes: 6 additions & 0 deletions tests/test_tasks/test_task_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,9 @@ def test_deletion_of_cache_dir(self):
self.assertTrue(os.path.exists(tid_cache_dir))
openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir)
self.assertFalse(os.path.exists(tid_cache_dir))

def test_supervised_task_target_format(self):
Comment thread
mfeurer marked this conversation as resolved.
Outdated
openml.config.server = self.production_server
task = openml.tasks.get_task(12731)
_, y = task.get_X_and_y(dataset_format="dataframe")
self.assertIsInstance(y, pd.Series)