Merged
Changes from 1 commit
88 commits
f1919e1
Using sklearn docstring as flow descriptions for sklearn flows
Neeratyoy Aug 5, 2019
0b5137f
Extracting parameter type and descriptions
Neeratyoy Aug 5, 2019
b0ad048
Handling certain edge cases
Neeratyoy Aug 6, 2019
d90f333
More robust failure checks + improved docstrings
Neeratyoy Aug 7, 2019
6dc4345
Trimming of all strings to be uploaded
Neeratyoy Aug 7, 2019
64fa568
Re-enable unit test as server issue is resolved.
PGijsbers Aug 13, 2019
80e5b33
pass skipna=False explicitly
TwsThomas Aug 19, 2019
3880d9a
Sync master and development (#768)
mfeurer Aug 20, 2019
4a6c980
Bump version number (#769)
mfeurer Aug 20, 2019
3d08c2d
Mark unit test as flaky (#770)
mfeurer Aug 20, 2019
58a6609
Fixing edge cases to pass tests
Neeratyoy Aug 24, 2019
41549b0
Fixing PEP8
Neeratyoy Aug 25, 2019
235ded8
Leaner implementation for parameter docstring
Neeratyoy Aug 26, 2019
1c9f64d
Add #737 (#772)
sahithyaravi Sep 2, 2019
9b5d382
Making suggested changes
Neeratyoy Sep 2, 2019
7cbf428
add missing whitespace in error message
amueller Sep 3, 2019
33db051
Merge pull request #776 from amueller/whitespace_typo
mfeurer Sep 4, 2019
27521ac
Merge pull request #766 from TwsThomas/patch-1
mfeurer Sep 4, 2019
43bf02d
Version handling and warning log
Neeratyoy Sep 5, 2019
579498a
Debugging
Neeratyoy Sep 5, 2019
52cbdb7
Debugging phase 2
Neeratyoy Sep 5, 2019
3b44e86
Fixing test cases
Neeratyoy Sep 9, 2019
6710b40
Handling different sklearn versions in unit testing
Neeratyoy Sep 9, 2019
7d685e1
Replace logging.info by logging.warning
mfeurer Sep 13, 2019
c39b9f7
Merge pull request #756 from openml/fix_175
mfeurer Sep 13, 2019
afc7445
Merge pull request #761 from openml/reenable_unittest
mfeurer Sep 13, 2019
5cc1638
FIX assign study's id to study_id for uniformity. (#782)
PGijsbers Sep 20, 2019
fe218bc
raise a warning, not an error, when not matching version exactly (#744)
amueller Sep 26, 2019
dcac17e
store predictions_url in runs (#783)
amueller Sep 26, 2019
8eac076
[WIP] Restructuring the examples section (#785)
ArlindKadra Sep 30, 2019
de0335c
Fix 779 (#787)
PGijsbers Sep 30, 2019
4e03906
Instructions to publish new extensions (#778)
Neeratyoy Sep 30, 2019
f461732
Add username (#790)
sahithyaravi Oct 1, 2019
8cc302d
Add example (#791)
mfeurer Oct 2, 2019
5a2830c
added example strang, and more filter options (#793)
janvanrijn Oct 2, 2019
4020c1e
Add manual task iteration tutorial (#788)
mfeurer Oct 7, 2019
04a6b65
Improve the usage of dataframes in examples (#789)
mfeurer Oct 7, 2019
f241cde
Address comment from Arlind (#802)
mfeurer Oct 7, 2019
1dd54bf
#799: fix mistake in the docs of openml.datasets.functions (#801)
mfeurer Oct 7, 2019
382959f
Add new convenience function get_flow_id (#792)
mfeurer Oct 7, 2019
20a7b62
Replace %-formatting by f-strings in code examples (#798)
konrad Oct 8, 2019
a32f556
Rename argument to be more intuitive (#796)
mfeurer Oct 8, 2019
e1b1652
extended
janvanrijn Oct 11, 2019
3e23a3b
Add example rijn (#803)
janvanrijn Oct 11, 2019
9041dc6
strang example update
janvanrijn Oct 11, 2019
1e85bb6
[WIP] An example that loads and visualizes the iris dataset (#808)
ArlindKadra Oct 11, 2019
2f11939
Fix failing simple_datasets_tutorial example (#812)
ArlindKadra Oct 11, 2019
77cd94b
Merge pull request #807 from openml/extend_example_strang
janvanrijn Oct 11, 2019
24c4821
make output of rijn example a bit nicer
amueller Oct 14, 2019
5f86908
Unit test enabled for list_runs (#817)
prabhant Oct 14, 2019
9467ed4
Add additional part of OpenML error message to exception message (#811)
mfeurer Oct 14, 2019
b259a34
maybe fix link (#816)
amueller Oct 14, 2019
3e14267
make sure repr workes with blank / fresh datasets (#820)
amueller Oct 14, 2019
b96c564
fix issue #305 by not requiring external version in the flow xml (#818)
mfeurer Oct 14, 2019
ef3e4d1
add validation for strings in datasets (#822)
amueller Oct 14, 2019
4853d7c
Example for study and suite (#810)
mfeurer Oct 14, 2019
5b0d4dc
only check strings for new datasets (#824)
amueller Oct 15, 2019
23d4e6f
Fixing fetching of categorical sparse data (#823)
Neeratyoy Oct 15, 2019
29a023c
don't warn if we can convert to dataframe (#829)
amueller Oct 15, 2019
2796b9a
Adding Perrone example for building surrogate
Neeratyoy Oct 15, 2019
17657ab
Merge pull request #815 from amueller/rijn_example_cleanup
janvanrijn Oct 16, 2019
40799f9
warn if there's an empty flow description (#831)
amueller Oct 16, 2019
1a3f456
Intermediate changes; pipeline additions remain
Neeratyoy Oct 16, 2019
6395cd7
Adding list_evaluations_setups() to API docs
Neeratyoy Oct 16, 2019
78e7032
also check dependencies for sklearn string (#830)
amueller Oct 16, 2019
e35262c
Merge pull request #840 from openml/neeratyoy-patch-1
amueller Oct 16, 2019
34d784a
Better error message (#837)
mfeurer Oct 16, 2019
c40e474
add new example regarding svm hyperparameter plotting (#834)
mfeurer Oct 16, 2019
43596e0
Create OpenMLBase, have most OpenML objects derive from it (#828)
PGijsbers Oct 17, 2019
547901f
Fix typos and grammatical errors in docs and examples. (#845)
tashay Oct 17, 2019
35dd7d3
Replace code health by appveyor badge (#843)
mfeurer Oct 17, 2019
c59c3b8
Fix 838 (#846)
sahithyaravi Oct 17, 2019
b1dae0b
Improve SVM test (#848)
mfeurer Oct 17, 2019
cfba39d
Finishing the whole example design
Neeratyoy Oct 17, 2019
9ca9d87
Making pandas related changes suggested by Matthias
Neeratyoy Oct 17, 2019
a5b35e6
Allow datasets without qualities to be downloaded. (#847)
PGijsbers Oct 17, 2019
cd3ba29
minor reformatting
mfeurer Oct 17, 2019
f6a2a95
add a print statement
mfeurer Oct 17, 2019
56fa7f9
Merge pull request #832 from openml/transfer_learning_example
Neeratyoy Oct 18, 2019
2a25ed3
Remove OpenMLDemo unit tests. (#850)
PGijsbers Oct 18, 2019
f74b73a
Put shared logic of Publish into OpenMLBase (#849)
PGijsbers Oct 18, 2019
433f1e7
Optimizing Perrone example (#853)
Neeratyoy Oct 23, 2019
1c025db
Convert non-str column names to str when creating a dataset. (#851)
PGijsbers Oct 23, 2019
d321aba
Add long description (#856)
PGijsbers Oct 24, 2019
d312da0
MAINT prepare new release (#855)
mfeurer Oct 25, 2019
4a13100
redirect test to live server (#859)
mfeurer Oct 29, 2019
882b06b
Add debug output (#860)
mfeurer Nov 4, 2019
34d54d9
Fix736 (#861)
PGijsbers Nov 5, 2019
Fixing edge cases to pass tests
Neeratyoy committed Aug 24, 2019
commit 58a66097456bed82ed7b5ff8fabb81c42ae99fd2
196 changes: 105 additions & 91 deletions openml/extensions/sklearn/extension.py
@@ -501,6 +501,8 @@ def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
if s is None:
return ''
if len(s) <= char_lim:
# if the fetched docstring is smaller than char_lim, no trimming required
return s.strip()
@@ -528,6 +530,105 @@ def match_format(s):
s = "{}...".format(s[:char_lim - 3])
return s.strip()
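The trimming behaviour added above can be sketched as a standalone helper (hypothetical names, not part of the openml API; the real method also cuts at numpydoc section headings before truncating):

```python
import inspect


def get_description(model, char_lim=1024):
    """Sketch: fetch an object's docstring, capped at char_lim characters."""
    s = inspect.getdoc(model)
    if s is None:
        # objects without any docstring yield an empty description
        return ''
    if len(s) <= char_lim:
        # short docstrings pass through untrimmed
        return s.strip()
    # hard cap with an ellipsis; OpenML string fields hold 1024 characters
    return "{}...".format(s[:char_lim - 3]).strip()


class TinyEstimator:
    """A tiny estimator used only for illustration."""


print(get_description(TinyEstimator()))  # A tiny estimator used only for illustration.
```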

def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
'''Extracts the part of sklearn docstring containing parameter information

Fetches the entire docstring and trims just the Parameter section.
The assumption is that 'Parameters' is the first section in sklearn docstrings,
followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
appearing in that order if defined.
Returns None if no 'Parameters' section can be found in the docstring.

Parameters
----------
model : sklearn model

Returns
-------
str, or None
'''
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
if s is None:
return None
try:
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
for h in headings:
try:
# to find end of Parameters section
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
index2 = len(s)
s = s[index1:index2]
return s.strip()
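In isolation, the section-slicing logic reads like this (a minimal sketch with hypothetical names; the heading list mirrors the one used above):

```python
def heading(title):
    # numpydoc renders a section heading as the title underlined with dashes
    return "{}\n{}\n".format(title, len(title) * '-')


def parameters_section(doc):
    """Sketch: slice the 'Parameters' section out of a numpydoc docstring."""
    if doc is None:
        return None
    try:
        start = doc.index(heading("Parameters"))
    except ValueError:
        return None  # the docstring has no 'Parameters' section
    end = len(doc)  # if no later section exists, keep everything to the end
    for h in ("Attributes", "Notes", "See also", "Note", "References"):
        try:
            end = doc.index(heading(h))
            break
        except ValueError:
            continue
    return doc[start:end].strip()


doc = """Toy estimator.

Parameters
----------
alpha : float
    Regularisation strength.

Attributes
----------
coef_ : ndarray
"""
print(parameters_section(doc))
```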

def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
'''Parses parameter type and description from sklearn docstring

Parameters
----------
model : sklearn model
char_lim : int
Specifying the max length of the returned string.
OpenML servers have a constraint of 1024 characters string fields.

Returns
-------
Dict, or None
'''
docstring = self._extract_sklearn_parameter_docstring(model)
if docstring is None:
# when sklearn docstring has no 'Parameters' section
return None

n = re.compile("[.]*\n", flags=IGNORECASE)
lines = n.split(docstring)
p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
parameter_docs = OrderedDict() # type: Dict
description = [] # type: List

# collecting parameters and their descriptions
for i, s in enumerate(lines):
param = p.findall(s)
if param != []:
if len(description) > 0:
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])
description.append([])
else:
if len(description) > 0:
description[-1].append(s)
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])

# collecting parameters and their types
matches = p.findall(docstring)
for i, param in enumerate(matches):
key, value = param.split(':')
parameter_docs[key.strip()] = [value.strip(), description[i]]

# to avoid KeyError for missing parameters
param_list_true = list(model.get_params().keys())
param_list_found = list(parameter_docs.keys())
for param in list(set(param_list_true) - set(param_list_found)):
parameter_docs[param] = [None, None]

return parameter_docs
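The regular expression above recognises numpydoc parameter declarations of the form `name : type`. A reduced sketch of the name/type half of the parsing (descriptions omitted, helper names hypothetical):

```python
import re
from collections import OrderedDict

# same pattern as above: a parameter name, ' : ', then a type token
PARAM_LINE = re.compile(r"[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=re.IGNORECASE)


def parse_param_types(parameters_block):
    """Sketch: map each declared parameter name to its declared type."""
    types = OrderedDict()
    for match in PARAM_LINE.findall(parameters_block):
        # description lines carry no ' : ' separator and are never matched
        name, type_ = match.split(':')
        types[name.strip()] = type_.strip()
    return types


block = """Parameters
----------
alpha : float
    Regularisation strength.
fit_intercept : bool
    Whether to fit an intercept term.
"""
print(parse_param_types(block))
```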

def _serialize_model(self, model: Any) -> OpenMLFlow:
"""Create an OpenMLFlow.

@@ -656,97 +757,6 @@ def _check_multiple_occurence_of_component_in_flow(
known_sub_components.add(visitee.name)
to_visit_stack.extend(visitee.components.values())

def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
'''Extracts the part of sklearn docstring containing parameter information

Fetches the entire docstring and trims just the Parameter section.
The assumption is that 'Parameters' is the first section in sklearn docstrings,
followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
appearing in that order if defined.
Returns None if no 'Parameters' section can be found in the docstring.

Parameters
----------
model : sklearn model

Returns
-------
str, or None
'''
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
try:
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
for h in headings:
try:
# to find end of Parameters section
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
index2 = len(s)
s = s[index1:index2]
return s.strip()

def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
'''Parses parameter type and description from sklearn docstring

Parameters
----------
model : sklearn model
char_lim : int
Specifying the max length of the returned string.
OpenML servers have a constraint of 1024 characters string fields.

Returns
-------
Dict, or None
'''
docstring = self._extract_sklearn_parameter_docstring(model)
if docstring is None:
# when sklearn docstring has no 'Parameters' section
return None

n = re.compile("[.]*\n", flags=IGNORECASE)
lines = n.split(docstring)
p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
parameter_docs = OrderedDict() # type: Dict
description = [] # type: List

# collecting parameters and their descriptions
for i, s in enumerate(lines):
param = p.findall(s)
if param != []:
if len(description) > 0:
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])
description.append([])
else:
if len(description) > 0:
description[-1].append(s)
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])

# collecting parameters and their types
matches = p.findall(docstring)
for i, param in enumerate(matches):
key, value = param.split(':')
parameter_docs[key.strip()] = [value.strip(), description[i]]

return parameter_docs

def _extract_information_from_model(
self,
model: Any,
@@ -890,6 +900,10 @@ def flatten_all(list_):
parameters[k] = None

if parameters_docs is not None:
# print(type(model))
# print(sorted(parameters_docs.keys()))
# print(sorted(model_parameters.keys()))
# print()
data_type, description = parameters_docs[k]
parameters_meta_info[k] = OrderedDict((('description', description),
('data_type', data_type)))
31 changes: 31 additions & 0 deletions openml/flows/functions.py
@@ -366,6 +366,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_custom_name_if_none)
elif key == '_extension':
continue
elif key == 'description':
# ignore matching of descriptions, since sklearn-based flows may have
# differing docstrings which are not guaranteed to be consistent
continue
else:
if key == 'parameters':
if ignore_parameter_values or \
@@ -397,6 +401,33 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
# before it used to be `None`.
continue
elif key == 'parameters_meta_info':
# this value is a dictionary where each key is a parameter name, containing another
# dictionary with keys specifying the parameter's 'description' and 'data_type'
# check of descriptions can be ignored since that might change
# data type check can be ignored if one of them is not defined, i.e., None
params1 = set(flow1.parameters_meta_info.keys())
params2 = set(flow2.parameters_meta_info.keys())
if params1 != params2:
raise ValueError('Parameter list in meta info for parameters differ in the two flows.')
# iterating over the parameter's meta info list
for param in params1:
if isinstance(flow1.parameters_meta_info[param], Dict) and \
isinstance(flow2.parameters_meta_info[param], Dict) and \
'data_type' in flow1.parameters_meta_info[param] and \
'data_type' in flow2.parameters_meta_info[param]:
value1 = flow1.parameters_meta_info[param]['data_type']
value2 = flow2.parameters_meta_info[param]['data_type']
else:
value1 = flow1.parameters_meta_info[param]
value2 = flow2.parameters_meta_info[param]
if value1 is None or value2 is None:
continue
elif value1 != value2:
raise ValueError("Flow {}: data type for parameter {} in parameters_meta_info "
"differs: {}\nvs\n{}".format(flow1.name, param, value1, value2))
# the continue is to avoid the 'attr != attr2' check at end of function
continue
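The relaxed comparison introduced here can be summarised in a standalone sketch (hypothetical helper, assuming plain dicts for the meta info): parameter sets must match, descriptions are ignored, and a data type is only compared when both flows define one.

```python
def data_types_match(meta1, meta2):
    """Sketch of the relaxed parameters_meta_info comparison."""
    if set(meta1) != set(meta2):
        raise ValueError("Parameter list in meta info differs between the flows.")
    for param in meta1:
        t1 = meta1[param].get('data_type')
        t2 = meta2[param].get('data_type')
        if t1 is None or t2 is None:
            continue  # an undefined type on either side is not a mismatch
        if t1 != t2:
            raise ValueError("Data type for parameter {} differs: {} vs {}"
                             .format(param, t1, t2))
    return True


meta_a = {'alpha': {'description': 'old text', 'data_type': 'float'},
          'tol': {'description': None, 'data_type': None}}
meta_b = {'alpha': {'description': 'new text', 'data_type': 'float'},
          'tol': {'description': 'tolerance', 'data_type': 'float'}}
print(data_types_match(meta_a, meta_b))  # True
```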

if attr1 != attr2:
raise ValueError("Flow %s: values for attribute '%s' differ: "
@@ -75,7 +75,7 @@ def test_serialize_model(self):

fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_short_name = 'sklearn.DecisionTreeClassifier'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# min_impurity_decrease has been introduced in 0.20
@@ -143,7 +143,7 @@ def test_serialize_model_clustering(self):

fixture_name = 'sklearn.cluster.k_means_.KMeans'
fixture_short_name = 'sklearn.KMeans'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# n_jobs default has changed to None in 0.20
@@ -207,10 +207,10 @@ def test_serialize_model_with_subcomponent(self):
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
fixture_short_name = 'sklearn.AdaBoostClassifier'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
- fixture_subcomponent_description = 'Automatically created scikit-learn flow.'
+ fixture_subcomponent_description = self.extension._get_sklearn_description(model.base_estimator)
fixture_structure = {
fixture_name: [],
'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
@@ -264,7 +264,7 @@ def test_serialize_pipeline(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'dummy=sklearn.dummy.DummyClassifier)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -353,7 +353,7 @@ def test_serialize_pipeline_clustering(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'clusterer=sklearn.cluster.k_means_.KMeans)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -445,7 +445,7 @@ def test_serialize_column_transformer(self):
'numeric=sklearn.preprocessing.data.StandardScaler,' \
'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
fixture_short_name = 'sklearn.ColumnTransformer'
- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture: [],
'sklearn.preprocessing.data.StandardScaler': ['numeric'],
@@ -504,7 +504,7 @@ def test_serialize_column_transformer_pipeline(self):
fixture_name: [],
}

- fixture_description = 'Automatically created scikit-learn flow.'
+ fixture_description = self.extension._get_sklearn_description(model)
serialization = self.extension.model_to_flow(model)
structure = serialization.get_structure('name')
self.assertEqual(serialization.name, fixture_name)
1 change: 0 additions & 1 deletion tests/test_flows/test_flow_functions.py
@@ -95,7 +95,6 @@ def test_are_flows_equal(self):
# Test most important values that can be set by a user
openml.flows.functions.assert_flows_equal(flow, flow)
for attribute, new_value in [('name', 'Tes'),
- ('description', 'Test flo'),
('external_version', '2'),
('language', 'english'),
('dependencies', 'ab'),