Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
8df347a
run on tasks allows dataframes
amueller Sep 3, 2019
ce8640c
don't force third subcomponent part to be list
amueller Sep 3, 2019
d8a2347
Merge branch 'develop' into dataframe_run_on_task
amueller Oct 16, 2019
b16a937
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
df7b496
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Dec 4, 2019
b31c1dc
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
05b55a3
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Dec 11, 2019
5cd59af
run on tasks allows dataframes
amueller Sep 3, 2019
1f8c37b
Attempting rebase
Neeratyoy Jan 21, 2020
9d45f6f
Fixing merge
Neeratyoy Jan 21, 2020
453fbdf
Fixing test cases
anonymous99199 Feb 12, 2020
6c7172c
Trying test case fixes
anonymous99199 Feb 14, 2020
b98fdb4
run on tasks allows dataframes
amueller Sep 3, 2019
8ac6468
don't force third subcomponent part to be list
amueller Sep 3, 2019
5c5fb31
Making DataFrame default behaviour for runs; Fixing test cases for th…
Neeratyoy Dec 4, 2019
b5729d1
Fixing PEP8 + Adding docstring to CustomImputer()
Neeratyoy Dec 11, 2019
b619361
Attempting rebase
Neeratyoy Jan 21, 2020
25e1b8e
Fixing test cases
anonymous99199 Feb 12, 2020
08b1be1
Trying test case fixes
anonymous99199 Feb 14, 2020
6a4eae3
Rebasing
Neeratyoy Mar 9, 2020
ae9c312
Allowing functions in subcomponents
Neeratyoy Mar 24, 2020
7f60589
Fixing test cases
Neeratyoy Jun 5, 2020
49584de
Adding dataset output param to run
Neeratyoy Jun 16, 2020
b25bbc4
Fixing test cases
Neeratyoy Jun 16, 2020
c0116e4
Changes suggested by mfeurer
Neeratyoy Jun 27, 2020
7379f0c
Editing predict_proba function
Neeratyoy Jun 27, 2020
ba9c1a2
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Jul 5, 2020
3305a12
Test case fix
Neeratyoy Jul 6, 2020
1b05089
Test case fix
Neeratyoy Jul 7, 2020
053beb6
Edit unit test to bypass server issue
Neeratyoy Jul 9, 2020
15743ee
Fixing unit test
Neeratyoy Jul 9, 2020
440c0ad
Reiterating with @PGijsbers comments
Neeratyoy Jul 10, 2020
7967624
Minor fixes to test cases
Neeratyoy Jul 24, 2020
c06eb0d
Adding unit test and suggestions from @mfeurer
Neeratyoy Aug 1, 2020
83f309a
Fixing test case for all sklearn versions
Neeratyoy Aug 1, 2020
c2a090a
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 3, 2020
2cb2028
Testing changes
Neeratyoy Aug 10, 2020
5ea4d31
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 10, 2020
f0ff562
Fixing import in example
Neeratyoy Aug 10, 2020
7200418
Triggering unit tests
Neeratyoy Aug 14, 2020
29af032
Debugging failed example script
Neeratyoy Aug 14, 2020
001ee74
Merge branch 'develop' into dataframe_run_on_task
Neeratyoy Aug 17, 2020
ae57bea
Adding unit tests
Neeratyoy Aug 17, 2020
1fee939
Merge branch 'dataframe_run_on_task' of https://github.com/amueller/o…
Neeratyoy Aug 17, 2020
418d9e6
Push for debugging
Neeratyoy Aug 24, 2020
c00c060
Push for @mfeurer to debug
Neeratyoy Aug 26, 2020
463a326
Merging with latest updates on [develop]
Neeratyoy Sep 7, 2020
44c9e65
Resetting to debug
Neeratyoy Oct 19, 2020
d439d73
Updating branch and fixing test cases
Neeratyoy Oct 19, 2020
3ae777e
Updating branch
Neeratyoy Oct 19, 2020
9fc6c10
Merging branches
Neeratyoy Oct 19, 2020
3acce3f
pre-commit fixes
Neeratyoy Oct 19, 2020
39011df
Handling failing examples
Neeratyoy Oct 20, 2020
4908237
Reiteration with clean ups and minor fixes
Neeratyoy Oct 21, 2020
90ad9e2
Closing comments
Neeratyoy Oct 21, 2020
9982afe
Black fixes
Neeratyoy Oct 21, 2020
da8dbb9
feedback from @mfeurer
Neeratyoy Oct 21, 2020
0a0f71f
Minor fix
Neeratyoy Oct 22, 2020
78ab677
suggestions from @PGijsbers
Neeratyoy Oct 22, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Making DataFrame default behaviour for runs; Fixing test cases for th…
…e same
  • Loading branch information
Neeratyoy committed Mar 9, 2020
commit 5c5fb31c5531832865ba3de2f2cfee946f6b73c8
35 changes: 26 additions & 9 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,7 @@ def _get_external_version_string(
# requirements for their subcomponents. The external version string is a
# sorted concatenation of all modules which are present in this run.
model_package_name = model.__module__.split('.')[0]

module = importlib.import_module(model_package_name)
model_package_version_number = module.__version__ # type: ignore
external_version = self._format_external_version(
Expand Down Expand Up @@ -1512,10 +1513,11 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
if not isinstance(classes, list):
raise ValueError('please convert model classes to list prior to '
'calling this fn')
result = np.zeros((len(y), len(classes)), dtype=np.float32)
for obs, prediction_idx in enumerate(y):
result[obs][prediction_idx] = 1.0
return result
# DataFrame allows more accurate mapping of classes as column names
Comment thread
mfeurer marked this conversation as resolved.
result = pd.DataFrame(0, index=np.arange(len(y)), columns=classes, dtype=np.float32)
for obs, prediction in enumerate(y):
result.loc[obs, prediction] = 1.0
return result.to_numpy()

if isinstance(task, OpenMLSupervisedTask):
Comment thread
PGijsbers marked this conversation as resolved.
if y_train is None:
Expand Down Expand Up @@ -1573,6 +1575,11 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
else:
model_classes = used_estimator.classes_

# to handle the case when dataset is numpy and categories are encoded
# however the class labels stored in task are still categories
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
model_classes = [task.class_labels[i] for i in model_classes]

modelpredict_start_cputime = time.process_time()
modelpredict_start_walltime = time.time()

Expand Down Expand Up @@ -1601,9 +1608,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

try:
proba_y = model_copy.predict_proba(X_test)
except AttributeError:
except AttributeError: # predict_proba is not available when probability=False
if task.class_labels is not None:
proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
# mapping (decoding) the predictions to the categories
# creating a separate copy to not change the expected pred_y type
preds = [task.class_labels[pred] for pred in pred_y]
Comment thread
mfeurer marked this conversation as resolved.
Outdated
proba_y = _prediction_to_probabilities(preds, model_classes)
else:
proba_y = _prediction_to_probabilities(pred_y, model_classes)

else:
raise ValueError('The task has no class labels')

Expand All @@ -1619,10 +1633,13 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
# then we need to add a column full of zeros into the probabilities
# for class 3 because the rest of the library expects that the
# probabilities are ordered the same way as the classes are ordered).
proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))

# DataFrame allows more accurate mapping of classes as column names
proba_y_new = pd.DataFrame(0, index=np.arange(proba_y.shape[0]),
columns=task.class_labels, dtype=np.float32)
for idx, model_class in enumerate(model_classes):
proba_y_new[:, model_class] = proba_y[:, idx]
proba_y = proba_y_new
proba_y_new.loc[:, model_class] = proba_y[:, idx]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it perhaps not be clearer to:

  • keep the result of prediction_to_probability as a dataframe.
  • convert proba_y to a dataframe if predict_proba was called:
    proba_y = pd.DataFrame(proba_y, columns=model_classes)
  • then add missing columns
  • then reorder the dataframe:
    proba_y = proba_y[task.class_labels]

Advantages:

  • Don't produce a whole copy in memory.
  • Keep indexing by class label only, never integers.
  • Can make (and keep) _prediction_to_proba store the probabilities as e.g. uint8 for much smaller footprint (though conversion might be required later anyway, so I don't know if it is a win).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good spot. Thanks.
I made some changes to not have the extra copy and retain it as dataframe whenever applicable, without a conversion back and forth.

However, right at the end, needed to convert the probability array to numpy since _run_task_get_arffcontent appears to require a numpy array as the probability matrix.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, right at the end, needed to convert the probability array to numpy since _run_task_get_arffcontent appears to require a numpy array as the probability matrix.

You could also change that function to accept a dataframe - it appears easier and safer to work with.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, I just checked but I think you did not yet update the return value type annotation for the functions openml.extensions.sklearn.extension._prediction_to_probabilities and openml.extensions.sklearn.extension._run_model_on_fold.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated it, thanks for pointing it out.
However, not sure how to edit the docstrings since they don't appear to bear semblance to what the function is actually returning.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That appears to have been wrong for two years; you can just update the docstring.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the update. I think Pieter's original suggestion is not yet addressed - do you think you could have a look whether that's still possible?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this resolved now then?

proba_y = proba_y_new.to_numpy()

if proba_y.shape[1] != len(task.class_labels):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a repeated clause from Line#1666.
If it was not true there (they are equal), then it is not true here.
If it was true there (they were unequal), then Line#1682 specifically creates proba_y_new to have the columns task.class_labels, and proba_y is later assigned the value of proba_y_new.
So I see no scenario where this statement is true (i.e. at this point proba_y should always have a column for each label in task.class_labels). Am I missing something? Or is it forgotten code?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you're right. I was scratching my head too. I think it makes sense to remove the redundant if but retain the warning message that is being logged.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me.

message = "Estimator only predicted for {}/{} classes!".format(
Expand Down
2 changes: 1 addition & 1 deletion openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
elif isinstance(task, OpenMLRegressionTask):

for i in range(0, len(test_indices)):
Comment thread
Neeratyoy marked this conversation as resolved.
Outdated
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y[i]]
arff_line = [rep_no, fold_no, test_indices[i], pred_y[i], test_y.iloc[i]]
arff_datacontent.append(arff_line)

if add_local_measures:
Expand Down
14 changes: 13 additions & 1 deletion openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,4 +249,16 @@ def _check_fold_timing_evaluations(
from sklearn.preprocessing import Imputer as SimpleImputer


__all__ = ['TestBase', 'SimpleImputer']
class CustomImputer(SimpleImputer):
pass


def cont(X):
return X.dtypes != 'category'


def cat(X):
return X.dtypes == 'category'


__all__ = ['TestBase', 'SimpleImputer', 'CustomImputer', 'cat', 'cont']
Original file line number Diff line number Diff line change
Expand Up @@ -1340,7 +1340,8 @@ def test_run_model_on_task(self):
class MyPipe(sklearn.pipeline.Pipeline):
pass
task = openml.tasks.get_task(1)
pipe = MyPipe([('imp', SimpleImputer()),
# using most_frequent imputer since dataset has mixed types and to keep things simple
pipe = MyPipe([('imp', SimpleImputer(strategy='most_frequent')),
('dummy', sklearn.dummy.DummyClassifier())])
openml.runs.run_model_on_task(pipe, task)

Expand Down
116 changes: 89 additions & 27 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import unittest.mock

import scipy
import numpy as np
import pytest

Expand All @@ -20,7 +21,7 @@
import pandas as pd

import openml.extensions.sklearn
from openml.testing import TestBase, SimpleImputer
from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
from openml.runs.functions import (
_run_task_get_arffcontent,
run_exists,
Expand All @@ -31,17 +32,16 @@
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection._search import BaseSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, SGDClassifier, \
LinearRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \
StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline, make_pipeline


class TestRun(TestBase):
Expand Down Expand Up @@ -342,7 +342,9 @@ def test_run_regression_on_classif_task(self):

clf = LinearRegression()
task = openml.tasks.get_task(task_id)
with self.assertRaises(AttributeError):
# internally dataframe is loaded and targets are categorical
# which LinearRegression() cannot handle
with self.assertRaises(ValueError):
Comment thread
mfeurer marked this conversation as resolved.
Outdated
openml.runs.run_model_on_task(
model=clf,
task=task,
Expand Down Expand Up @@ -537,15 +539,17 @@ def test_run_and_upload_column_transformer_pipeline(self):
def get_ct_cf(nominal_indices, numeric_indices):
inner = sklearn.compose.ColumnTransformer(
transformers=[
('numeric', sklearn.preprocessing.StandardScaler(),
nominal_indices),
('nominal', sklearn.preprocessing.OneHotEncoder(
handle_unknown='ignore'), numeric_indices)],
('numeric',
make_pipeline(SimpleImputer(strategy='most_frequent'),
sklearn.preprocessing.StandardScaler()),
numeric_indices),
('nominal',
make_pipeline(CustomImputer(),
sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
nominal_indices)],
remainder='passthrough')
return sklearn.pipeline.Pipeline(
steps=[
('imputer', sklearn.impute.SimpleImputer(
strategy='constant', fill_value=-1)),
('transformer', inner),
('classifier', sklearn.tree.DecisionTreeClassifier())
]
Expand All @@ -567,8 +571,17 @@ def get_ct_cf(nominal_indices, numeric_indices):
self.TEST_SERVER_TASK_MISSING_VALS[2],
'62501', sentinel=sentinel)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0")
def test_run_and_upload_decision_tree_pipeline(self):
pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),

cat_imp = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('cat', cat_imp, cat),
('cont', cont_imp, cont)])
pipeline2 = Pipeline(steps=[('Imputer', ct),
('VarianceThreshold', VarianceThreshold()),
('Estimator', RandomizedSearchCV(
DecisionTreeClassifier(),
Expand Down Expand Up @@ -689,6 +702,8 @@ def test_learning_curve_task_2(self):
self._check_sample_evaluations(run.sample_evaluations, num_repeats,
num_folds, num_samples)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21",
reason="Pipelines don't support indexing (used for the assert check)")
def test_initialize_cv_from_run(self):
randomsearch = RandomizedSearchCV(
RandomForestClassifier(n_estimators=5),
Expand All @@ -701,9 +716,11 @@ def test_initialize_cv_from_run(self):
cv=StratifiedKFold(n_splits=2, shuffle=True),
n_iter=2)

clf = make_pipeline(OneHotEncoder(handle_unknown='ignore'), randomsearch)
Comment thread
mfeurer marked this conversation as resolved.
Outdated

task = openml.tasks.get_task(11)
run = openml.runs.run_model_on_task(
model=randomsearch,
model=clf,
task=task,
avoid_duplicate_runs=False,
seed=1,
Expand All @@ -716,8 +733,8 @@ def test_initialize_cv_from_run(self):
modelR = openml.runs.initialize_model_from_run(run_id=run.run_id)
modelS = openml.setups.initialize_model(setup_id=run.setup_id)

self.assertEqual(modelS.cv.random_state, 62501)
self.assertEqual(modelR.cv.random_state, 62501)
self.assertEqual(modelS[-1].cv.random_state, 62501)
self.assertEqual(modelR[-1].cv.random_state, 62501)

def _test_local_evaluations(self, run):

Expand Down Expand Up @@ -749,10 +766,14 @@ def _test_local_evaluations(self, run):
self.assertGreaterEqual(alt_scores[idx], 0)
self.assertLessEqual(alt_scores[idx], 1)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_swapped_parameter_order_model(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
clf = Pipeline(steps=[('imputer', make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))),
# random forest doesn't take categoricals
('estimator', RandomForestClassifier())])

# download task
Expand All @@ -767,11 +788,14 @@ def test_local_run_swapped_parameter_order_model(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_swapped_parameter_order_flow(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
('estimator', RandomForestClassifier())])
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore')),
('estimator', RandomForestClassifier(n_estimators=10))])

flow = self.extension.model_to_flow(clf)
# download task
Expand All @@ -786,11 +810,13 @@ def test_local_run_swapped_parameter_order_flow(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_local_run_metric_score(self):

# construct sci-kit learn classifier
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
('estimator', RandomForestClassifier())])
clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder(handle_unknown='ignore')),
('estimator', RandomForestClassifier(n_estimators=10))])

# download task
task = openml.tasks.get_task(7)
Expand All @@ -814,11 +840,31 @@ def test_online_run_metric_score(self):

self._test_local_evaluations(run)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test_initialize_model_from_run(self):
class MyGaussianNB(GaussianNB):
def fit(self, X, y, **kwargs):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().fit(X=X, y=y, **kwargs)

def predict(self, X, **kwargs):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().predict(X=X, **kwargs)

def predict_proba(self, X, **kwargs):
# if isinstance(X, scipy.sparse.csr.csr_matrix):
if scipy.sparse.issparse(X):
X = X.toarray()
return super().predict_proba(X=X, **kwargs)

clf = sklearn.pipeline.Pipeline(steps=[
('Imputer', SimpleImputer(strategy='median')),
('Imputer', SimpleImputer(strategy='most_frequent')),
('Encoder', OneHotEncoder(handle_unknown='ignore')),
('VarianceThreshold', VarianceThreshold(threshold=0.05)),
('Estimator', GaussianNB())])
('Estimator', MyGaussianNB())])
task = openml.tasks.get_task(11)
run = openml.runs.run_model_on_task(
model=clf,
Expand Down Expand Up @@ -894,6 +940,8 @@ def test_get_run_trace(self):
run_trace = openml.runs.get_run_trace(run_id)
self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input")
def test__run_exists(self):
# would be better to not sentinel these clfs,
# so we do not have to perform the actual runs
Expand Down Expand Up @@ -1059,6 +1107,8 @@ def test_run_with_illegal_flow_id_1_after_load(self):
loaded_run.publish
)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="OneHotEncoder cannot handle mixed type DataFrame as input")
def test__run_task_get_arffcontent(self):
task = openml.tasks.get_task(7)
num_instances = 3196
Expand All @@ -1067,7 +1117,8 @@ def test__run_task_get_arffcontent(self):

flow = unittest.mock.Mock()
flow.name = 'dummy'
clf = SGDClassifier(loss='log', random_state=1)
clf = make_pipeline(OneHotEncoder(handle_unknown='ignore'),
SGDClassifier(loss='log', random_state=1))
res = openml.runs.functions._run_task_get_arffcontent(
flow=flow,
extension=self.extension,
Expand Down Expand Up @@ -1272,17 +1323,28 @@ def test_get_runs_list_by_tag(self):
runs = openml.runs.list_runs(tag='curves')
self.assertGreaterEqual(len(runs), 1)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0")
def test_run_on_dataset_with_missing_labels(self):
# Check that _run_task_get_arffcontent works when one of the class
# labels only declared in the arff file, but is not present in the
# actual data

flow = unittest.mock.Mock()
flow.name = 'dummy'
task = openml.tasks.get_task(2)

model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')),
('Estimator', DecisionTreeClassifier())])
from sklearn.compose import ColumnTransformer
cat_imp = make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder(handle_unknown='ignore'))
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
ct = ColumnTransformer([('cat', cat_imp, cat),
('cont', cont_imp, cont)])
model = Pipeline(
steps=[
('preprocess', ct),
('estimator', sklearn.tree.DecisionTreeClassifier())
]
) # build a sklearn classifier

data_content, _, _, _ = _run_task_get_arffcontent(
flow=flow,
Expand Down
Loading