Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Making tentative changes; need to test deserialization
  • Loading branch information
Neeratyoy committed Nov 12, 2019
commit 4875dbadf075aa6d8cc2c345bf4c4a3f11b426ea
32 changes: 20 additions & 12 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,10 +694,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
# will be part of the name (in brackets)
sub_components_names = ""
for key in subcomponents:
if isinstance(subcomponents[key], str):
name = subcomponents[key]
else:
if isinstance(subcomponents[key], OpenMLFlow):
name = subcomponents[key].name
elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed
name = subcomponents[key]
elif subcomponents[key] is None:
name = "None"
if key in subcomponents_explicit:
sub_components_names += "," + key + "=" + name
else:
Expand Down Expand Up @@ -752,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
def _get_external_version_string(
self,
model: Any,
sub_components: Dict[str, OpenMLFlow],
sub_components: Dict[str, Union[OpenMLFlow, str, None]],
) -> str:
# Create external version string for a flow, given the model and the
# already parsed dictionary of sub_components. Retrieves the external
Expand All @@ -773,7 +775,8 @@ def _get_external_version_string(
external_versions.add(openml_version)
external_versions.add(sklearn_version)
for visitee in sub_components.values():
if isinstance(visitee, str):
# 'drop', 'passthrough', None can be passed as estimators
if isinstance(visitee, str) or visitee is None:
continue
for external_version in visitee.external_version.split(','):
external_versions.add(external_version)
Expand All @@ -782,15 +785,18 @@ def _get_external_version_string(
def _check_multiple_occurence_of_component_in_flow(
self,
model: Any,
sub_components: Dict[str, Any],
sub_components: Dict[str, Union[OpenMLFlow, str, None]],
) -> None:
to_visit_stack = [] # type: List[OpenMLFlow]
to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]]
to_visit_stack.extend(sub_components.values())
known_sub_components = set() # type: Set[str]

while len(to_visit_stack) > 0:
visitee = to_visit_stack.pop()
if isinstance(visitee, str):
if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators
known_sub_components.add(visitee)
elif visitee is None: # a None step can be included in a Pipeline
Comment thread
Neeratyoy marked this conversation as resolved.
Outdated
known_sub_components.add(str(visitee))
elif visitee.name in known_sub_components:
raise ValueError('Found a second occurence of component %s when '
'trying to serialize %s.' % (visitee.name, model))
Expand All @@ -804,7 +810,7 @@ def _extract_information_from_model(
) -> Tuple[
'OrderedDict[str, Optional[str]]',
'OrderedDict[str, Optional[Dict]]',
'OrderedDict[str, Any]',
'OrderedDict[str, Union[OpenMLFlow, str, None]]',
Set,
]:
# This function contains four "global" states and is quite long and
Expand All @@ -814,7 +820,7 @@ def _extract_information_from_model(
# separate class methods

# stores all entities that should become subcomponents
sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow]
sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]]
# stores the keys of all subcomponents that should become
sub_components_explicit = set()
parameters = OrderedDict() # type: OrderedDict[str, Optional[str]]
Expand Down Expand Up @@ -858,7 +864,7 @@ def flatten_all(list_):
parameter_value = list() # type: List
reserved_keywords = set(model.get_params(deep=False).keys())

for sub_component_tuple in rval:
for i, sub_component_tuple in enumerate(rval):
identifier = sub_component_tuple[0]
sub_component = sub_component_tuple[1]
sub_component_type = type(sub_component_tuple)
Expand Down Expand Up @@ -891,13 +897,15 @@ def flatten_all(list_):
raise PyOpenMLError(msg)

if sub_component is None:
# In a FeatureUnion it is legal to have a None step
                # In a FeatureUnion or a Pipeline, it is legal to have a None step

pv = [identifier, None]
if sub_component_type is tuple:
parameter_value.append(tuple(pv))
else:
parameter_value.append(pv)
sub_components_explicit.add(identifier)
sub_components[identifier] = sub_component

else:
# Add the component to the list of components, add a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,18 @@
import sklearn.preprocessing
import sklearn.tree
import sklearn.cluster

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

import openml
from openml.extensions.sklearn import SklearnExtension
from openml.exceptions import PyOpenMLError
from openml.flows import OpenMLFlow
from openml.flows.functions import assert_flows_equal
from openml.runs.trace import OpenMLRunTrace
from openml.testing import TestBase, SimpleImputer
from openml.testing import TestBase


this_directory = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -678,6 +681,7 @@ def test_serialize_feature_union(self):
self.assertEqual(serialization.name,
'sklearn.pipeline.FeatureUnion('
'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
'scaler=None'
.format(module_name_encoder))
new_model = self.extension.flow_to_model(serialization)
self.assertEqual(type(new_model), type(fu))
Expand Down Expand Up @@ -1776,3 +1780,48 @@ def test_trim_flow_name(self):

self.assertEqual("weka.IsolationForest",
SklearnExtension.trim_flow_name("weka.IsolationForest"))

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer available only after 0.19")
def test_run_on_model_with_empty_steps(self):
# testing 'drop', 'passthrough', None as non-actionable sklearn estimators
dataset = openml.datasets.get_dataset(128)
task = openml.tasks.get_task(59)

X, y, categorical_ind, feature_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format='array')
categorical_ind = np.array(categorical_ind)
cat_idx, = np.where(categorical_ind)
cont_idx, = np.where(~categorical_ind)

clf = make_pipeline(
ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder()), cat_idx.tolist()),
('cont', make_pipeline(SimpleImputer(strategy='median'),
StandardScaler()), cont_idx.tolist())])
)

clf = sklearn.pipeline.Pipeline([
('dummystep', 'passthrough'), # adding 'passthrough' as an estimator
('prep', clf),
('variancethreshold', None), # adding 'None' as an estimator
('classifier', sklearn.svm.SVC(gamma='auto'))
])

# adding 'drop' to a ColumnTransformer
if not categorical_ind.any():
clf[1][0].set_params(cat='drop')
if not (~categorical_ind).any():
clf[1][0].set_params(cont='drop')

run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)

self.assertEqual(len(flow.components), 4)
self.assertEqual(flow.components['dummystep'], 'passthrough')
self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow))
self.assertEqual(flow.components['variancethreshold'], None)
self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow))
self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'],
OpenMLFlow))
self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'],
'drop')