Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fixing edge cases to pass tests
  • Loading branch information
Neeratyoy committed Aug 24, 2019
commit 58a66097456bed82ed7b5ff8fabb81c42ae99fd2
196 changes: 105 additions & 91 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,8 @@ def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str:
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
if s is None:
return ''
if len(s) <= char_lim:
Comment thread
mfeurer marked this conversation as resolved.
Outdated
# if the fetched docstring is smaller than char_lim, no trimming required
return s.strip()
Expand Down Expand Up @@ -528,6 +530,105 @@ def match_format(s):
s = "{}...".format(s[:char_lim - 3])
return s.strip()

def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
'''Extracts the part of sklearn docstring containing parameter information

Fetches the entire docstring and trims just the Parameter section.
The assumption is that 'Parameters' is the first section in sklearn docstrings,
followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
appearing in that order if defined.
Returns a None if no section with 'Parameters' can be found in the docstring.

Parameters
----------
model : sklearn model

Returns
-------
str, or None
'''
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
if s is None:
return None
try:
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
for h in headings:
try:
# to find end of Parameters section
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
index2 = len(s)
s = s[index1:index2]
return s.strip()

def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
'''Parses parameter type and description from sklearn dosctring

Parameters
----------
model : sklearn model
char_lim : int
Specifying the max length of the returned string.
OpenML servers have a constraint of 1024 characters string fields.

Returns
-------
Dict, or None
'''
docstring = self._extract_sklearn_parameter_docstring(model)
if docstring is None:
# when sklearn docstring has no 'Parameters' section
return None

n = re.compile("[.]*\n", flags=IGNORECASE)
lines = n.split(docstring)
p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
parameter_docs = OrderedDict() # type: Dict
description = [] # type: List

# collecting parameters and their descriptions
for i, s in enumerate(lines):
param = p.findall(s)
if param != []:
if len(description) > 0:
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])
description.append([])
else:
if len(description) > 0:
description[-1].append(s)
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])

# collecting parameters and their types
matches = p.findall(docstring)
for i, param in enumerate(matches):
key, value = param.split(':')
parameter_docs[key.strip()] = [value.strip(), description[i]]

# to avoid KeyError for missing parameters
param_list_true = list(model.get_params().keys())
param_list_found = list(parameter_docs.keys())
for param in list(set(param_list_true) - set(param_list_found)):
parameter_docs[param] = [None, None]

return parameter_docs

def _serialize_model(self, model: Any) -> OpenMLFlow:
"""Create an OpenMLFlow.

Expand Down Expand Up @@ -656,97 +757,6 @@ def _check_multiple_occurence_of_component_in_flow(
known_sub_components.add(visitee.name)
to_visit_stack.extend(visitee.components.values())

def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]:
'''Extracts the part of sklearn docstring containing parameter information

Fetches the entire docstring and trims just the Parameter section.
The assumption is that 'Parameters' is the first section in sklearn docstrings,
followed by other sections titled 'Attributes', 'See also', 'Note', 'References',
appearing in that order if defined.
Returns a None if no section with 'Parameters' can be found in the docstring.

Parameters
----------
model : sklearn model

Returns
-------
str, or None
'''
def match_format(s):
return "{}\n{}\n".format(s, len(s) * '-')
s = inspect.getdoc(model)
try:
index1 = s.index(match_format("Parameters"))
except ValueError as e:
# when sklearn docstring has no 'Parameters' section
print("{} {}".format(match_format("Parameters"), e))
return None

headings = ["Attributes", "Notes", "See also", "Note", "References"]
for h in headings:
try:
# to find end of Parameters section
index2 = s.index(match_format(h))
break
except ValueError:
print("{} not available in docstring".format(h))
continue
else:
# in the case only 'Parameters' exist, trim till end of docstring
index2 = len(s)
s = s[index1:index2]
return s.strip()

def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]:
'''Parses parameter type and description from sklearn dosctring

Parameters
----------
model : sklearn model
char_lim : int
Specifying the max length of the returned string.
OpenML servers have a constraint of 1024 characters string fields.

Returns
-------
Dict, or None
'''
docstring = self._extract_sklearn_parameter_docstring(model)
if docstring is None:
# when sklearn docstring has no 'Parameters' section
return None

n = re.compile("[.]*\n", flags=IGNORECASE)
lines = n.split(docstring)
p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE)
parameter_docs = OrderedDict() # type: Dict
description = [] # type: List

# collecting parameters and their descriptions
for i, s in enumerate(lines):
param = p.findall(s)
if param != []:
if len(description) > 0:
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])
description.append([])
else:
if len(description) > 0:
description[-1].append(s)
description[-1] = '\n'.join(description[-1]).strip()
if len(description[-1]) > char_lim:
description[-1] = "{}...".format(description[-1][:char_lim - 3])

# collecting parameters and their types
matches = p.findall(docstring)
for i, param in enumerate(matches):
key, value = param.split(':')
parameter_docs[key.strip()] = [value.strip(), description[i]]

return parameter_docs

def _extract_information_from_model(
self,
model: Any,
Expand Down Expand Up @@ -890,6 +900,10 @@ def flatten_all(list_):
parameters[k] = None

if parameters_docs is not None:
# print(type(model))
Comment thread
mfeurer marked this conversation as resolved.
Outdated
# print(sorted(parameters_docs.keys()))
# print(sorted(model_parameters.keys()))
# print()
data_type, description = parameters_docs[k]
parameters_meta_info[k] = OrderedDict((('description', description),
('data_type', data_type)))
Expand Down
31 changes: 31 additions & 0 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
ignore_custom_name_if_none)
elif key == '_extension':
continue
elif key == 'description':
Comment thread
mfeurer marked this conversation as resolved.
Outdated
# to ignore matching of descriptions since sklearn-based flows may have
# differing docstrings, which are not guaranteed to be consistent
continue
else:
if key == 'parameters':
if ignore_parameter_values or \
Expand Down Expand Up @@ -397,6 +401,33 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow,
# Helps with backwards compatibility as `custom_name` is now auto-generated, but
# before it used to be `None`.
continue
elif key == 'parameters_meta_info':
# this value is a dictionary where each key is a parameter name, containing another
# dictionary with keys specifying the parameter's 'description' and 'data_type'
# the check on descriptions can be skipped since descriptions may change
Comment thread
mfeurer marked this conversation as resolved.
Outdated
# data type check can be ignored if one of them is not defined, i.e., None
params1 = set(flow1.parameters_meta_info.keys())
params2 = set(flow2.parameters_meta_info.keys())
if params1 != params2:
raise ValueError('Parameter list in meta info for parameters differ in the two flows.')
# iterating over the parameter's meta info list
for param in params1:
if isinstance(flow1.parameters_meta_info[param], Dict) and \
isinstance(flow2.parameters_meta_info[param], Dict) and \
'data_type' in flow1.parameters_meta_info[param] and \
'data_type' in flow2.parameters_meta_info[param]:
value1 = flow1.parameters_meta_info[param]['data_type']
value2 = flow2.parameters_meta_info[param]['data_type']
else:
value1 = flow1.parameters_meta_info[param]
value2 = flow2.parameters_meta_info[param]
if value1 is None or value2 is None:
continue
elif value1 != value2:
raise ValueError("Flow {}: data type for parameter {} in parameters_meta_info differ as "
"{}\nvs\n{}".format(flow1.name, key, value1, value2))
# the continue is to avoid the 'attr != attr2' check at end of function
continue

if attr1 != attr2:
raise ValueError("Flow %s: values for attribute '%s' differ: "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_serialize_model(self):

fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_short_name = 'sklearn.DecisionTreeClassifier'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
Comment thread
mfeurer marked this conversation as resolved.
Outdated
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# min_impurity_decrease has been introduced in 0.20
Expand Down Expand Up @@ -143,7 +143,7 @@ def test_serialize_model_clustering(self):

fixture_name = 'sklearn.cluster.k_means_.KMeans'
fixture_short_name = 'sklearn.KMeans'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \
% sklearn.__version__
# n_jobs default has changed to None in 0.20
Expand Down Expand Up @@ -207,10 +207,10 @@ def test_serialize_model_with_subcomponent(self):
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
fixture_short_name = 'sklearn.AdaBoostClassifier'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'
fixture_subcomponent_description = 'Automatically created scikit-learn flow.'
fixture_subcomponent_description = self.extension._get_sklearn_description(model.base_estimator)
fixture_structure = {
fixture_name: [],
'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator']
Expand Down Expand Up @@ -264,7 +264,7 @@ def test_serialize_pipeline(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'dummy=sklearn.dummy.DummyClassifier)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
Expand Down Expand Up @@ -353,7 +353,7 @@ def test_serialize_pipeline_clustering(self):
'scaler=sklearn.preprocessing.data.StandardScaler,' \
'clusterer=sklearn.cluster.k_means_.KMeans)'
fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture_name: [],
'sklearn.preprocessing.data.StandardScaler': ['scaler'],
Expand Down Expand Up @@ -445,7 +445,7 @@ def test_serialize_column_transformer(self):
'numeric=sklearn.preprocessing.data.StandardScaler,' \
'nominal=sklearn.preprocessing._encoders.OneHotEncoder)'
fixture_short_name = 'sklearn.ColumnTransformer'
fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
fixture_structure = {
fixture: [],
'sklearn.preprocessing.data.StandardScaler': ['numeric'],
Expand Down Expand Up @@ -504,7 +504,7 @@ def test_serialize_column_transformer_pipeline(self):
fixture_name: [],
}

fixture_description = 'Automatically created scikit-learn flow.'
fixture_description = self.extension._get_sklearn_description(model)
serialization = self.extension.model_to_flow(model)
structure = serialization.get_structure('name')
self.assertEqual(serialization.name, fixture_name)
Expand Down
1 change: 0 additions & 1 deletion tests/test_flows/test_flow_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def test_are_flows_equal(self):
# Test most important values that can be set by a user
openml.flows.functions.assert_flows_equal(flow, flow)
for attribute, new_value in [('name', 'Tes'),
('description', 'Test flo'),
('external_version', '2'),
('language', 'english'),
('dependencies', 'ab'),
Expand Down