Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Making tentative changes; need to test deserialization
  • Loading branch information
Neeratyoy committed Nov 12, 2019
commit 4875dbadf075aa6d8cc2c345bf4c4a3f11b426ea
32 changes: 20 additions & 12 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,10 +694,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
# will be part of the name (in brackets)
sub_components_names = ""
for key in subcomponents:
if isinstance(subcomponents[key], str):
name = subcomponents[key]
else:
if isinstance(subcomponents[key], OpenMLFlow):
name = subcomponents[key].name
elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed
name = subcomponents[key]
elif subcomponents[key] is None:
name = "None"
if key in subcomponents_explicit:
sub_components_names += "," + key + "=" + name
else:
Expand Down Expand Up @@ -752,7 +754,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
def _get_external_version_string(
self,
model: Any,
sub_components: Dict[str, OpenMLFlow],
sub_components: Dict[str, Union[OpenMLFlow, str, None]],
) -> str:
# Create external version string for a flow, given the model and the
# already parsed dictionary of sub_components. Retrieves the external
Expand All @@ -773,7 +775,8 @@ def _get_external_version_string(
external_versions.add(openml_version)
external_versions.add(sklearn_version)
for visitee in sub_components.values():
if isinstance(visitee, str):
# 'drop', 'passthrough', None can be passed as estimators
if isinstance(visitee, str) or visitee is None:
continue
for external_version in visitee.external_version.split(','):
external_versions.add(external_version)
Expand All @@ -782,15 +785,18 @@ def _get_external_version_string(
def _check_multiple_occurence_of_component_in_flow(
self,
model: Any,
sub_components: Dict[str, Any],
sub_components: Dict[str, Union[OpenMLFlow, str, None]],
) -> None:
to_visit_stack = [] # type: List[OpenMLFlow]
to_visit_stack = [] # type: List[Union[OpenMLFlow, str, None]]
to_visit_stack.extend(sub_components.values())
known_sub_components = set() # type: Set[str]

while len(to_visit_stack) > 0:
visitee = to_visit_stack.pop()
if isinstance(visitee, str):
if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators
known_sub_components.add(visitee)
elif visitee is None: # a None step can be included in a Pipeline
Comment thread
Neeratyoy marked this conversation as resolved.
Outdated
known_sub_components.add(str(visitee))
elif visitee.name in known_sub_components:
raise ValueError('Found a second occurence of component %s when '
'trying to serialize %s.' % (visitee.name, model))
Expand All @@ -804,7 +810,7 @@ def _extract_information_from_model(
) -> Tuple[
'OrderedDict[str, Optional[str]]',
'OrderedDict[str, Optional[Dict]]',
'OrderedDict[str, Any]',
'OrderedDict[str, Union[OpenMLFlow, str, None]]',
Set,
]:
# This function contains four "global" states and is quite long and
Expand All @@ -814,7 +820,7 @@ def _extract_information_from_model(
# separate class methods

# stores all entities that should become subcomponents
sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow]
sub_components = OrderedDict() # type: OrderedDict[str, Union[OpenMLFlow, str, None]]
# stores the keys of all subcomponents that should become
sub_components_explicit = set()
parameters = OrderedDict() # type: OrderedDict[str, Optional[str]]
Expand Down Expand Up @@ -858,7 +864,7 @@ def flatten_all(list_):
parameter_value = list() # type: List
reserved_keywords = set(model.get_params(deep=False).keys())

for sub_component_tuple in rval:
for i, sub_component_tuple in enumerate(rval):
identifier = sub_component_tuple[0]
sub_component = sub_component_tuple[1]
sub_component_type = type(sub_component_tuple)
Expand Down Expand Up @@ -891,13 +897,15 @@ def flatten_all(list_):
raise PyOpenMLError(msg)

if sub_component is None:
# In a FeatureUnion it is legal to have a None step
                # In a FeatureUnion or a Pipeline, it is legal to have a None step

pv = [identifier, None]
if sub_component_type is tuple:
parameter_value.append(tuple(pv))
else:
parameter_value.append(pv)
sub_components_explicit.add(identifier)
sub_components[identifier] = sub_component

else:
# Add the component to the list of components, add a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,18 @@
import sklearn.preprocessing
import sklearn.tree
import sklearn.cluster

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

import openml
from openml.extensions.sklearn import SklearnExtension
from openml.exceptions import PyOpenMLError
from openml.flows import OpenMLFlow
from openml.flows.functions import assert_flows_equal
from openml.runs.trace import OpenMLRunTrace
from openml.testing import TestBase, SimpleImputer
from openml.testing import TestBase


this_directory = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -678,6 +681,7 @@ def test_serialize_feature_union(self):
self.assertEqual(serialization.name,
'sklearn.pipeline.FeatureUnion('
'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
'scaler=None'
.format(module_name_encoder))
new_model = self.extension.flow_to_model(serialization)
self.assertEqual(type(new_model), type(fu))
Expand Down Expand Up @@ -1776,3 +1780,48 @@ def test_trim_flow_name(self):

self.assertEqual("weka.IsolationForest",
SklearnExtension.trim_flow_name("weka.IsolationForest"))

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer available only after 0.19")
def test_run_on_model_with_empty_steps(self):
# testing 'drop', 'passthrough', None as non-actionable sklearn estimators
dataset = openml.datasets.get_dataset(128)
task = openml.tasks.get_task(59)

X, y, categorical_ind, feature_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format='array')
categorical_ind = np.array(categorical_ind)
cat_idx, = np.where(categorical_ind)
cont_idx, = np.where(~categorical_ind)

clf = make_pipeline(
ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
OneHotEncoder()), cat_idx.tolist()),
('cont', make_pipeline(SimpleImputer(strategy='median'),
StandardScaler()), cont_idx.tolist())])
)

clf = sklearn.pipeline.Pipeline([
('dummystep', 'passthrough'), # adding 'passthrough' as an estimator
('prep', clf),
('variancethreshold', None), # adding 'None' as an estimator
('classifier', sklearn.svm.SVC(gamma='auto'))
])

# adding 'drop' to a ColumnTransformer
if not categorical_ind.any():
clf[1][0].set_params(cat='drop')
if not (~categorical_ind).any():
clf[1][0].set_params(cont='drop')

run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)

self.assertEqual(len(flow.components), 4)
self.assertEqual(flow.components['dummystep'], 'passthrough')
self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow))
self.assertEqual(flow.components['variancethreshold'], None)
self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow))
self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'],
OpenMLFlow))
self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'],
'drop')