Making suggested changes
Neeratyoy committed Sep 2, 2019
commit 9b5d382c6686e7b86b7768239543dcfb776687ab
14 changes: 5 additions & 9 deletions openml/extensions/sklearn/extension.py
@@ -503,9 +503,6 @@ def match_format(s):
s = inspect.getdoc(model)
if s is None:
return ''
if len(s) <= char_lim:
# if the fetched docstring is smaller than char_lim, no trimming required
return s.strip()
try:
# trim till 'Read more'
pattern = "Read more in the :ref:"
@@ -516,13 +513,16 @@ def match_format(s):
s = "{}...".format(s[:char_lim - 3])
return s.strip()
except ValueError:
logging.info("'Read more' not found in descriptions. "
"Trying to trim till 'Parameters' if available in docstring.")
pass
try:
# if 'Read more' doesn't exist, trim till 'Parameters'
pattern = "Parameters"
index = s.index(match_format(pattern))
except ValueError:
# returning full docstring
logging.info("'Parameters' not found in docstring. Omitting docstring trimming.")
index = len(s)
s = s[:index]
# trimming docstring to be within char_lim
@@ -556,7 +556,7 @@ def match_format(s):
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
logging.info("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
@@ -566,7 +566,7 @@ def match_format(s):
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
logging.info("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
@@ -909,10 +909,6 @@ def flatten_all(list_):
parameters[k] = None

if parameters_docs is not None:
# print(type(model))
# print(sorted(parameters_docs.keys()))
# print(sorted(model_parameters.keys()))
# print()
data_type, description = parameters_docs[k]
parameters_meta_info[k] = OrderedDict((('description', description),
('data_type', data_type)))
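For context, a minimal sketch of the description-trimming order the hunks above implement: cut the docstring at 'Read more', else at 'Parameters', else keep it whole, then enforce the character limit. The helper name, the char_lim default, and the omission of match_format's heading handling are assumptions for illustration, not part of this diff.

import inspect
import logging

def trim_sklearn_description(model, char_lim=1024):
    # Hypothetical standalone helper; the real code wraps the patterns with
    # match_format() to locate docstring section headings.
    s = inspect.getdoc(model)
    if s is None:
        return ''
    for pattern in ("Read more in the :ref:", "Parameters"):
        try:
            s = s[:s.index(pattern)]  # trim at the first marker found
            break
        except ValueError:
            logging.info("%r not found in docstring, trying next marker.", pattern)
    if len(s) > char_lim:
        s = "{}...".format(s[:char_lim - 3])  # leave room for the ellipsis
    return s.strip()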
14 changes: 9 additions & 5 deletions openml/flows/functions.py
@@ -308,7 +308,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None:
def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values_on_older_children: str = None,
ignore_parameter_values: bool = False,
ignore_custom_name_if_none: bool = False) -> None:
ignore_custom_name_if_none: bool = False,
check_description: bool = True) -> None:
"""Check equality of two flows.

Two flows are equal if all their keys which are not set by the server
@@ -327,8 +328,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_parameter_values : bool
Whether to ignore parameter values when comparing flows.

ignore_custom_name_if_none : bool
ignore_custom_name_if_none : bool
Whether to ignore the custom name field if either flow has `custom_name` equal to `None`.

check_description : bool
Whether to ignore matching of flow descriptions.
"""
if not isinstance(flow1, OpenMLFlow):
raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' %
@@ -366,7 +370,7 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_custom_name_if_none)
elif key == '_extension':
continue
elif key == 'description':
elif check_description and key == 'description':
# ignore matching of descriptions since sklearn-based flows may have
# differing docstrings and are not guaranteed to be consistent
continue
@@ -404,8 +408,8 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
elif key == 'parameters_meta_info':
# this value is a dictionary where each key is a parameter name, containing another
# dictionary with keys specifying the parameter's 'description' and 'data_type'
# check of descriptions can be ignored since that might change
# data type check can be ignored if one of them is not defined, i.e., None
# checking parameter descriptions can be ignored since that might change
# data type check can also be ignored if one of them is not defined, i.e., None
params1 = set(flow1.parameters_meta_info.keys())
params2 = set(flow2.parameters_meta_info.keys())
if params1 != params2:
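A sketch of how the new check_description flag could be exercised; the extension and model choice here are illustrative and not taken from this diff.

from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import SklearnExtension
from openml.flows.functions import assert_flows_equal

extension = SklearnExtension()
flow1 = extension.model_to_flow(DecisionTreeClassifier())
flow2 = extension.model_to_flow(DecisionTreeClassifier())
# Description matching is skipped since sklearn docstrings can differ across versions.
assert_flows_equal(flow1, flow2, check_description=False)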
@@ -75,7 +75,8 @@ def test_serialize_model(self):

fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_short_name = 'sklearn.DecisionTreeClassifier'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'A decision tree classifier.'
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# min_impurity_decrease has been introduced in 0.20
@@ -143,7 +144,8 @@ def test_serialize_model_clustering(self):

fixture_name = 'sklearn.cluster.k_means_.KMeans'
fixture_short_name = 'sklearn.KMeans'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'K-Means clustering'
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# n_jobs default has changed to None in 0.20
@@ -207,11 +209,18 @@ def test_serialize_model_with_subcomponent(self):
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
fixture_short_name = 'sklearn.AdaBoostClassifier'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a '\
'meta-estimator that begins by fitting a\nclassifier on the original'\
' dataset and then fits additional copies of the\nclassifier on the '\
'same dataset but where the weights of incorrectly\nclassified '\
'instances are adjusted such that subsequent classifiers focus\nmore'\
' on difficult cases.\n\nThis class implements the algorithm known '\
'as AdaBoost-SAMME [2].'
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_description = \
self.extension._get_sklearn_description(model.base_estimator)
# str obtained from self.extension._get_sklearn_description(model.base_estimator)
fixture_subcomponent_description = 'A decision tree classifier.'
fixture_structure = {
fixture_name: [],
'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
@@ -265,7 +274,20 @@ def test_serialize_pipeline(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'dummy=sklearn.dummy.DummyClassifier)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially " \
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline is"\
" to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the "\
"parameter\nwith its name to another estimator, or a transformer "\
"removed by setting\nit to 'passthrough' or ``None``."
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -354,7 +376,20 @@ def test_serialize_pipeline_clustering(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'clusterer=sklearn.cluster.k_means_.KMeans)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially "\
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline is"\
" to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the parameter"\
"\nwith its name to another estimator, or a transformer removed "\
"by setting\nit to 'passthrough' or ``None``."
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
@@ -446,7 +481,14 @@ def test_serialize_column_transformer(self):
'numeric=sklearn.preprocessing.data.StandardScaler,' \
'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
fixture_short_name = 'sklearn.ColumnTransformer'
fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = 'Applies transformers to columns of an array or pandas DataFrame.\n' \
'\nThis estimator allows different columns or column subsets of the '\
'input\nto be transformed separately and the features generated by '\
'each transformer\nwill be concatenated to form a single feature '\
'space.\nThis is useful for heterogeneous or columnar data, to '\
'combine several\nfeature extraction mechanisms or transformations '\
'into a single transformer.'
fixture_structure = {
fixture: [],
'sklearn.preprocessing.data.StandardScaler': ['numeric'],
@@ -505,7 +547,20 @@ def test_serialize_column_transformer_pipeline(self):
fixture_name: [],
}

fixture_description = self.extension._get_sklearn_description(model)
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially "\
"apply a list of transforms and a final estimator.\nIntermediate "\
"steps of the pipeline must be 'transforms', that is, they\nmust "\
"implement fit and transform methods.\nThe final estimator only "\
"needs to implement fit.\nThe transformers in the pipeline can be "\
"cached using ``memory`` argument.\n\nThe purpose of the pipeline "\
"is to assemble several steps that can be\ncross-validated together "\
"while setting different parameters.\nFor this, it enables setting "\
"parameters of the various steps using their\nnames and the "\
"parameter name separated by a '__', as in the example below.\nA "\
"step's estimator may be replaced entirely by setting the parameter"\
"\nwith its name to another estimator, or a transformer removed by "\
"setting\nit to 'passthrough' or ``None``."
serialization = self.extension.model_to_flow(model)
structure = serialization.get_structure('name')
self.assertEqual(serialization.name, fixture_name)