Making suggested changes
Neeratyoy committed Sep 2, 2019
commit 9b5d382c6686e7b86b7768239543dcfb776687ab
14 changes: 5 additions & 9 deletions openml/extensions/sklearn/extension.py
@@ -503,9 +503,6 @@ def match_format(s):
s = inspect.getdoc(model)
if s is None:
return ''
if len(s) <= char_lim:
# if the fetched docstring is smaller than char_lim, no trimming required
return s.strip()
try:
# trim till 'Read more'
pattern = "Read more in the :ref:"
@@ -516,13 +513,16 @@ def match_format(s):
s = "{}...".format(s[:char_lim - 3])
return s.strip()
except ValueError:
logging.info("'Read more' not found in descriptions. "
"Trying to trim till 'Parameters' if available in docstring.")
pass
try:
# if 'Read more' doesn't exist, trim till 'Parameters'
pattern = "Parameters"
index = s.index(match_format(pattern))
except ValueError:
# returning full docstring
logging.info("'Parameters' not found in docstring. Omitting docstring trimming.")
index = len(s)
s = s[:index]
# trimming docstring to be within char_lim
@@ -556,7 +556,7 @@ def match_format(s):
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
logging.info("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
@@ -566,7 +566,7 @@ def match_format(s):
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
logging.info("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
@@ -909,10 +909,6 @@ def flatten_all(list_):
parameters[k] = None

if parameters_docs is not None:
# print(type(model))
# print(sorted(parameters_docs.keys()))
# print(sorted(model_parameters.keys()))
# print()
data_type, description = parameters_docs[k]
parameters_meta_info[k] = OrderedDict((('description', description),
('data_type', data_type)))
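For context, a minimal sketch of the description-trimming order the hunks above implement: cut the docstring at 'Read more', else at 'Parameters', else keep it whole, then enforce the character limit. The helper name, the char_lim default, and the omission of match_format's heading handling are assumptions for illustration, not part of this diff.

import inspect
import logging

def trim_sklearn_description(model, char_lim=1024):
    # Hypothetical standalone helper; the real code wraps the patterns with
    # match_format() to locate docstring section headings.
    s = inspect.getdoc(model)
    if s is None:
        return ''
    for pattern in ("Read more in the :ref:", "Parameters"):
        try:
            s = s[:s.index(pattern)]  # trim at the first marker found
            break
        except ValueError:
            logging.info("%r not found in docstring, trying next marker.", pattern)
    if len(s) > char_lim:
        s = "{}...".format(s[:char_lim - 3])  # leave room for the ellipsis
    return s.strip()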
14 changes: 9 additions & 5 deletions openml/flows/functions.py
@@ -308,7 +308,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False) -> None:
ignore_custom_name_if_none: bool = False,
check_description: bool = True) -> None:
"""Check equality of two flows.

Two flows are equal if all their keys which are not set by the server
@@ -327,8 +328,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values : bool
Whether to ignore parameter values when comparing flows.

ignore_custom_name_if_none : bool
ignore_custom_name_if_none : bool
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.

check_description : bool
Whether to ignore matching of flow descriptions.
"""
if not isinstance(flow1, OpenMLFlow):
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
@@ -366,7 +370,7 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_custom_name_if_none)
elif key == '_extension':
continue
elif key == 'description':
elif check_description and key == 'description':
# ignore matching of descriptions since sklearn-based flows may have
# differing docstrings and are not guaranteed to be consistent
continue
@@ -404,8 +408,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
elif key == 'parameters_meta_info':
# this value is a dictionary where each key is a parameter name, containing another
# dictionary with keys specifying the parameter's 'description' and 'data_type'
# check of descriptions can be ignored since that might change
# data type check can be ignored if one of them is not defined, i.e., None
# checking parameter descriptions can be ignored since that might change
# data type check can also be ignored if one of them is not defined, i.e., None
params1 = set(flow1.parameters_meta_info.keys())
params2 = set(flow2.parameters_meta_info.keys())
if params1 != params2:
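A sketch of how the new check_description flag could be exercised; the extension and model choice here are illustrative and not taken from this diff.

from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import SklearnExtension
from openml.flows.functions import assert_flows_equal

extension = SklearnExtension()
flow1 = extension.model_to_flow(DecisionTreeClassifier())
flow2 = extension.model_to_flow(DecisionTreeClassifier())
# Description matching is skipped since sklearn docstrings can differ across versions.
assert_flows_equal(flow1, flow2, check_description=False)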
@@ -75,7 +75,8 @@ def test_serialize_model(self):

fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_short_name = 'sklearn.DecisionTreeClassifier'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'A decision tree classifier.'
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# min_impurity_decrease has been introduced in 0.20
@@ -143,7 +144,8 @@ def test_serialize_model_clustering(self):

fixture_name = 'sklearn.cluster.k_means_.KMeans'
fixture_short_name = 'sklearn.KMeans'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'K-Means clustering'
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# n_jobs default has changed to None in 0.20
@@ -207,11 +209,18 @@ def test_serialize_model_with_subcomponent(self):
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
fixture_short_name = 'sklearn.AdaBoostClassifier'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a '\
'meta-estimator that begins by fitting a\nclassifier on the original'\
' dataset and then fits additional copies of the\nclassifier on the '\
'same dataset but where the weights of incorrectly\nclassified '\
'instances are adjusted such that subsequent classifiers focus\nmore'\
' on difficult cases.\n\nThis class implements the algorithm known '\
'as AdaBoost-SAMME [2].'
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_description = \
self.extension._get_sklearn_description(model.base_estimator)
# str obtained from self.extension._get_sklearn_description(model.base_estimator)
fixture_subcomponent_description = 'A decision tree classifier.'
fixture_structure = {
fixture_name: [],
'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
@@ -265,7 +274,20 @@ def test_serialize_pipeline(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'dummy=sklearn.dummy.DummyClassifier)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially " \
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline is"\
" to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the "\
"parameter\nwith its name to another estimator, or a transformer "\
"removed by setting\nit to 'passthrough' or ``None``."
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -354,7 +376,20 @@ def test_serialize_pipeline_clustering(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'clusterer=sklearn.cluster.k_means_.KMeans)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially "\
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline is"\
" to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the parameter"\
"\nwith its name to another estimator, or a transformer removed "\
"by setting\nit to 'passthrough' or ``None``."
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -446,7 +481,14 @@ def test_serialize_column_transformer(self):
'numeric=sklearn.preprocessing.data.StandardScaler,' \
'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
fixture_short_name = 'sklearn.ColumnTransformer'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'Applies transformers to columns of an array or pandas DataFrame.\n' \
'\nThis estimator allows different columns or column subsets of the '\
'input\nto be transformed separately and the features generated by '\
'each transformer\nwill be concatenated to form a single feature '\
'space.\nThis is useful for heterogeneous or columnar data, to '\
'combine several\nfeature extraction mechanisms or transformations '\
'into a single transformer.'
fixture_structure = {
fixture: [],
'sklearn.preprocessing.data.StandardScaler': ['numeric'],
@@ -505,7 +547,20 @@ def test_serialize_column_transformer_pipeline(self):
fixture_name: [],
}

fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially "\
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline "\
"is to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the parameter"\
"\nwith its name to another estimator, or a transformer removed by "\
"setting\nit to 'passthrough' or ``None``."
serialization = self.extension.model_to_flow(model)
structure = serialization.get_structure('name')
self.assertEqual(serialization.name, fixture_name)