Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
4956a51
add test and fix for switch of ground truth and predictions
LennartPurucker Feb 20, 2023
fc642c1
undo import optimization
LennartPurucker Feb 20, 2023
2da1109
fix bug with model passing to function
LennartPurucker Feb 20, 2023
0583668
fix order in other tests
LennartPurucker Feb 20, 2023
1fe8bc9
Merge branch 'openml:develop' into develop
LennartPurucker Feb 20, 2023
14cbd04
update progress.rst
LennartPurucker Feb 21, 2023
ceb1d53
new unit test for run consistency and bug fixed
LennartPurucker Feb 21, 2023
37500a7
clarify new assert
LennartPurucker Feb 21, 2023
921cf10
Merge pull request #1 from LennartPurucker/develop_ext
LennartPurucker Feb 22, 2023
3e97992
Merge branch 'openml:develop' into develop
LennartPurucker Feb 22, 2023
9f47b91
minor loop refactor
LennartPurucker Feb 22, 2023
14d4299
Merge remote-tracking branch 'origin/develop' into develop
LennartPurucker Feb 22, 2023
8686317
refactor default to None
LennartPurucker Feb 22, 2023
8adb0bd
directly test prediction data equal
LennartPurucker Feb 23, 2023
04ca611
Update tests/test_runs/test_run.py
LennartPurucker Feb 23, 2023
f996c0a
Merge branch 'develop' into develop
LennartPurucker Feb 23, 2023
3dac7a7
Mark sklearn tests (#1202)
PGijsbers Feb 23, 2023
1bf8c0e
add test and fix for switch of ground truth and predictions
LennartPurucker Feb 20, 2023
74e9c38
undo import optimization
LennartPurucker Feb 20, 2023
794cce8
Merge branch 'develop' of https://github.com/openml/openml-python int…
LennartPurucker Feb 23, 2023
b4c2030
fix mask error resulting from rebase
LennartPurucker Feb 23, 2023
3c5ff3e
make dummy classifier strategy consistent to avoid problems as a resu…
LennartPurucker Feb 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
add test and fix for switch of ground truth and predictions
  • Loading branch information
LennartPurucker committed Feb 20, 2023
commit 4956a51c4e40a7e6fccd7539f7747164aca24070
37 changes: 20 additions & 17 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
# License: BSD 3-Clause

from collections import OrderedDict
import io
import itertools
import os
import time
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401
import warnings
from collections import OrderedDict
from typing import Any, List, Dict, Optional, Set, Tuple, Union, TYPE_CHECKING # noqa F401

import sklearn.metrics
import xmltodict
import numpy as np
import pandas as pd
import sklearn.metrics
import xmltodict
from joblib.parallel import Parallel, delayed

import openml
import openml.utils
import openml._api_calls
import openml.utils
from openml import config
from openml.exceptions import PyOpenMLError
from openml.extensions import get_extension_by_model
from openml import config
from openml.flows.flow import _copy_server_fields
from .run import OpenMLRun
from .trace import OpenMLRunTrace
from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError
from ..flows import get_flow, flow_exists, OpenMLFlow
from ..setups import setup_exists, initialize_model
from ..exceptions import OpenMLCacheException, OpenMLServerException, OpenMLRunsExistError
from ..tasks import (
OpenMLTask,
OpenMLClassificationTask,
Expand All @@ -32,8 +34,6 @@
OpenMLSupervisedTask,
OpenMLLearningCurveTask,
)
from .run import OpenMLRun
from .trace import OpenMLRunTrace
from ..tasks import TaskType, get_task

# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
Expand Down Expand Up @@ -155,7 +155,6 @@ def run_flow_on_task(
dataset_format: str = "dataframe",
n_jobs: Optional[int] = None,
) -> OpenMLRun:

"""Run the model provided by the flow on the dataset defined by task.

Takes the flow and repeat information into account.
Expand Down Expand Up @@ -515,13 +514,13 @@ def _calculate_local_measure(sklearn_fn, openml_name):
else pred_y[i]
)
if isinstance(test_y, pd.Series):
test_prediction = (
truth = (
task.class_labels[test_y.iloc[i]]
if isinstance(test_y.iloc[i], int)
else test_y.iloc[i]
)
else:
test_prediction = (
truth = (
task.class_labels[test_y[i]]
if isinstance(test_y[i], (int, np.integer))
else test_y[i]
Expand All @@ -535,7 +534,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
sample=sample_no,
index=tst_idx,
prediction=prediction,
truth=test_prediction,
truth=truth,
proba=dict(zip(task.class_labels, pred_prob)),
)
else:
Expand All @@ -552,14 +551,14 @@ def _calculate_local_measure(sklearn_fn, openml_name):
elif isinstance(task, OpenMLRegressionTask):

for i, _ in enumerate(test_indices):
test_prediction = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
arff_line = format_prediction(
task=task,
repeat=rep_no,
fold=fold_no,
index=test_indices[i],
prediction=pred_y[i],
truth=test_prediction,
truth=truth,
)

arff_datacontent.append(arff_line)
Expand Down Expand Up @@ -1186,6 +1185,10 @@ def format_prediction(
-------
A list with elements for the prediction results of a run.

The returned order of the elements is (if available):
[repeat, fold, sample, index, prediction, truth, *probabilities]

This order follows the R Client API.
"""
if isinstance(task, OpenMLClassificationTask):
if proba is None:
Expand All @@ -1200,8 +1203,8 @@ def format_prediction(
else:
sample = 0
probabilities = [proba[c] for c in task.class_labels]
return [repeat, fold, sample, index, *probabilities, truth, prediction]
return [repeat, fold, sample, index, prediction, truth, *probabilities]
elif isinstance(task, OpenMLRegressionTask):
return [repeat, fold, index, truth, prediction]
return [repeat, fold, index, prediction, truth]
else:
raise NotImplementedError(f"Formatting for {type(task)} is not supported.")
10 changes: 6 additions & 4 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# License: BSD 3-Clause

from collections import OrderedDict
import os
import pickle
import time
from collections import OrderedDict
from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict # noqa F401
import os

import arff
import numpy as np
Expand Down Expand Up @@ -304,6 +304,8 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":

Assumes that the run has been executed.

The order of the attributes follows the order defined by the Client API for R.

Returns
-------
arf_dict : dict
Expand Down Expand Up @@ -337,11 +339,11 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
if class_labels is not None:
arff_dict["attributes"] = (
arff_dict["attributes"]
+ [("prediction", class_labels), ("correct", class_labels)]
+ [
("confidence." + class_labels[i], "NUMERIC")
for i in range(len(class_labels))
]
+ [("prediction", class_labels), ("correct", class_labels)]
)
else:
raise ValueError("The task has no class labels")
Expand All @@ -362,7 +364,7 @@ def _generate_arff_dict(self) -> "OrderedDict[str, Any]":
]
prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
arff_dict["attributes"] = (
arff_dict["attributes"] + prediction_confidences + prediction_and_true
arff_dict["attributes"] + prediction_and_true + prediction_confidences
)
else:
raise ValueError("The task has no class labels")
Expand Down
128 changes: 93 additions & 35 deletions tests/test_runs/test_run.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# License: BSD 3-Clause

import numpy as np
import random
import os
import random
from time import time

import numpy as np
import pytest
import xmltodict
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from openml import OpenMLRun
from openml.testing import TestBase, SimpleImputer
import openml
import openml.extensions.sklearn

import pytest
from openml import OpenMLRun
from openml.testing import TestBase, SimpleImputer


class TestRun(TestBase):
Expand Down Expand Up @@ -189,47 +189,105 @@ def test_to_from_filesystem_no_model(self):
with self.assertRaises(ValueError, msg="Could not find model.pkl"):
openml.runs.OpenMLRun.from_filesystem(cache_path)

@staticmethod
def assert_run_prediction_data(task, run):
# -- Get y_pred and y_true as it should be stored in the run
fold_map = np.full(int(task.get_dataset().qualities["NumberOfInstances"]), -1)
s_d = task.get_split_dimensions()
if (s_d[0] > 1) or (s_d[2] > 1):
raise ValueError("Test does not support this task type's split dimensions.")

for fold_id in range(s_d[1]):
_, test_indices = task.get_train_test_split_indices(repeat=0, fold=fold_id, sample=0)
fold_map[test_indices] = fold_id

X, y = task.get_X_and_y()

# Check correctness of y_true and y_pred in run
for fold_id in range(s_d[1]):
# Get data for fold
Comment thread
LennartPurucker marked this conversation as resolved.
test_indices = np.where(fold_map == fold_id)[0]
train_mask = np.full(len(fold_map), True)
train_mask[test_indices] = False
X_train = X[train_mask]
y_train = y[train_mask]
X_test = X[test_indices]
y_test = y[test_indices]
y_pred = LinearRegression().fit(X_train, y_train).predict(X_test)

# Get stored data for fold
saved_fold_data = run.predictions[run.predictions["fold"] == fold_id].sort_values(
by="row_id"
)
saved_y_pred = saved_fold_data["prediction"].values
gt_key = "truth" if "truth" in list(saved_fold_data) else "correct"
saved_y_test = saved_fold_data[gt_key].values

assert_method = np.testing.assert_array_almost_equal
if task.task_type == "Supervised Classification":
y_pred = np.take(task.class_labels, y_pred)
y_test = np.take(task.class_labels, y_test)
assert_method = np.testing.assert_array_equal

# Assert correctness
assert_method(y_pred, saved_y_pred)
assert_method(y_test, saved_y_test)

def test_publish_with_local_loaded_flow(self):
"""
Publish a run tied to a local flow after it has first been saved to
and loaded from disk.
"""
extension = openml.extensions.sklearn.SklearnExtension()

model = Pipeline(
model_clf = Pipeline(
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
)
task = openml.tasks.get_task(119) # diabetes; crossvalidation

# Make sure the flow does not exist on the server yet.
flow = extension.model_to_flow(model)
self._add_sentinel_to_flow_name(flow)
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))

run = openml.runs.run_flow_on_task(
flow=flow,
task=task,
add_local_measures=False,
avoid_duplicate_runs=False,
upload_flow=False,
model_reg = Pipeline(
[
("imputer", SimpleImputer(strategy="mean")),
(
"regressor",
# LR because dummy does not produce enough float-like values
LinearRegression(),
),
]
)

# Make sure that the flow has not been uploaded as requested.
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
task_clf = openml.tasks.get_task(119) # diabetes; hold out validation
task_reg = openml.tasks.get_task(733) # quake; crossvalidation

for model, task in [(model_clf, task_clf), (model_reg, task_reg)]:
# Make sure the flow does not exist on the server yet.
flow = extension.model_to_flow(model)
self._add_sentinel_to_flow_name(flow)
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))

run = openml.runs.run_flow_on_task(
flow=flow,
task=task,
add_local_measures=False,
avoid_duplicate_runs=False,
upload_flow=False,
)

cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
run.to_filesystem(cache_path)
# obtain run from filesystem
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
loaded_run.publish()
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
)
# Make sure that the flow has not been uploaded as requested.
self.assertFalse(openml.flows.flow_exists(flow.name, flow.external_version))
self.assert_run_prediction_data(task, run)

cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
run.to_filesystem(cache_path)
# obtain run from filesystem
loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path)
loaded_run.publish()
TestBase._mark_entity_for_removal("run", loaded_run.run_id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], loaded_run.run_id)
)

# make sure the flow is published as part of publishing the run.
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
openml.runs.get_run(loaded_run.run_id)
# make sure the flow is published as part of publishing the run.
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
openml.runs.get_run(loaded_run.run_id)

def test_run_setup_string_included_in_xml(self):
SETUP_STRING = "setup-string"
Expand Down