openml · mfeurer · Apr 26, 2021 · Apr 15, 2021 · Apr 19, 2021 · Apr 21, 2021
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -628,7 +628,9 @@ def _encode_if_category(column):
                 )
         elif array_format == "dataframe":
             if scipy.sparse.issparse(data):
-                return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
+                data = pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
+            if isinstance(data, pd.DataFrame) and data.shape[1] == 1:
+                data = data.squeeze(axis=1)  # single column -> Series, even for one row
         else:
             data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
             logger.warning(
@@ -732,6 +734,7 @@ def get_data(
                 else:
                     target = [target]
             targets = np.array([True if column in target else False for column in attribute_names])
+            target_names = [name for name in attribute_names if name in target]  # y's labels
             if np.sum(targets) > 1:
                 raise NotImplementedError(
                     "Number of requested targets %d is not implemented." % np.sum(targets)
@@ -752,10 +755,12 @@ def get_data(
             attribute_names = [att for att, k in zip(attribute_names, targets) if not k]
 
             x = self._convert_array_format(x, dataset_format, attribute_names)
-            if scipy.sparse.issparse(y):
-                y = np.asarray(y.todense()).astype(target_dtype).flatten()
-            y = y.squeeze()
-            y = self._convert_array_format(y, dataset_format, attribute_names)
+            if dataset_format == "array" and scipy.sparse.issparse(y):
+                # scikit-learn requires dense representation of targets
+                y = np.asarray(y.todense()).astype(target_dtype)
+            if not scipy.sparse.issparse(y):  # sparse y (non-"array" formats) is left as is
+                y = y.squeeze()  # drop the length-1 axes of the dense target
+            y = self._convert_array_format(y, dataset_format, target_names)
             y = y.astype(target_dtype) if dataset_format == "array" else y
             data, targets = x, y
 

diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
@@ -234,3 +234,9 @@ def test_deletion_of_cache_dir(self):
         self.assertTrue(os.path.exists(tid_cache_dir))
         openml.utils._remove_cache_dir_for_id("tasks", tid_cache_dir)
         self.assertFalse(os.path.exists(tid_cache_dir))
+
+    def test_supervised_task_target_format(self):
+        openml.config.server = self.production_server  # task is fetched from production
+        supervised_task = openml.tasks.get_task(12731)
+        _, target = supervised_task.get_X_and_y(dataset_format="dataframe")
+        self.assertIsInstance(target, pd.Series)  # dataframe format must give a Series target