|
9 | 9 |
|
10 | 10 | import openml |
11 | 11 | import numpy as np |
12 | | -from sklearn import compose, ensemble, neighbors, preprocessing, pipeline, tree |
| 12 | +from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree |
13 | 13 |
|
14 | 14 | ############################################################################ |
15 | 15 | # Train machine learning models |
|
# Fetch the dataset as numeric arrays and one-hot encode its categorical
# features before fitting the classifier defined above.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
# NOTE(review): ColumnTransformer drops unselected columns by default, so only
# the one-hot encoded categorical columns survive this transform — confirm that
# discarding the numeric columns is intended for this dataset.
transformer = compose.ColumnTransformer(
    [("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"), categorical_indicator)]
)
X = transformer.fit_transform(X)
clf.fit(X, y)
|
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
task = openml.tasks.get_task(1)

# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
from openml.extensions.sklearn import cat, cont

|
103 | 91 | pipe = pipeline.Pipeline( |
104 | 92 | steps=[ |
|
107 | 95 | compose.ColumnTransformer( |
108 | 96 | [ |
109 | 97 | ( |
110 | | - "Nominal", |
111 | | - preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore",), |
112 | | - nominal_feature_indices, |
| 98 | + "categorical", |
| 99 | + pipeline.Pipeline( |
| 100 | + [ |
| 101 | + ("Imputer", impute.SimpleImputer(strategy="most_frequent")), |
| 102 | + ( |
| 103 | + "Encoder", |
| 104 | + preprocessing.OneHotEncoder( |
| 105 | + sparse=False, handle_unknown="ignore" |
| 106 | + ), |
| 107 | + ), |
| 108 | + ] |
| 109 | + ), |
| 110 | + cat, # returns the categorical feature indices |
113 | 111 | ), |
114 | | - ("Numeric", "passthrough", numeric_feature_indices,), |
| 112 | + ("continuous", "passthrough", cont), # returns the numeric feature indices |
115 | 113 | ] |
116 | 114 | ), |
117 | 115 | ), |
|
123 | 121 | myrun = run.publish() |
124 | 122 | print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) |
125 | 123 |
|

# The above pipeline works with the helper functions that internally deal with pandas DataFrame.
# In case pandas is not available, or NumPy-based data processing is required, the same
# pipeline is shown below working directly with NumPy column indices.

# Split the feature positions into categorical and numeric groups, skipping the
# target column itself.  Any non-"nominal" feature is treated as numeric.
features = task.get_dataset().features
categorical_feature_indices = [
    i
    for i in range(len(features))
    if features[i].name != task.target_name and features[i].data_type == "nominal"
]
numeric_feature_indices = [
    i
    for i in range(len(features))
    if features[i].name != task.target_name and features[i].data_type != "nominal"
]

| 140 | + |
# Categorical columns: impute missing values with the most frequent category,
# then one-hot encode (unknown categories at predict time are ignored).
_categorical_steps = pipeline.Pipeline(
    [
        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
        (
            "Encoder",
            preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
        ),
    ]
)

# Same model as the helper-based pipeline, but driven by explicit NumPy column
# indices instead of the OpenML `cat`/`cont` selectors.
pipe = pipeline.Pipeline(
    steps=[
        (
            "Preprocessing",
            compose.ColumnTransformer(
                [
                    ("categorical", _categorical_steps, categorical_feature_indices),
                    ("continuous", "passthrough", numeric_feature_indices),
                ]
            ),
        ),
        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
    ]
)

| 169 | + |
# Execute the NumPy-based pipeline on the task and publish the result to the
# test server.  `dataset_format="array"` forces NumPy input instead of pandas.
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
myrun = run.publish()
print(f"Uploaded to http://test.openml.org/r/{myrun.run_id}")

126 | 174 | ############################################################################### |
127 | 175 | # Running flows on tasks offline for later upload |
128 | 176 | # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
0 commit comments