Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 43 additions & 32 deletions openml/extensions/sklearn/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,10 +694,14 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
# will be part of the name (in brackets)
sub_components_names = ""
for key in subcomponents:
if isinstance(subcomponents[key], OpenMLFlow):
name = subcomponents[key].name
elif isinstance(subcomponents[key], str): # 'drop', 'passthrough' can be passed
name = subcomponents[key]
if key in subcomponents_explicit:
sub_components_names += "," + key + "=" + subcomponents[key].name
sub_components_names += "," + key + "=" + name
else:
sub_components_names += "," + subcomponents[key].name
sub_components_names += "," + name

if sub_components_names:
# slice operation on string in order to get rid of leading comma
Expand Down Expand Up @@ -769,6 +773,9 @@ def _get_external_version_string(
external_versions.add(openml_version)
external_versions.add(sklearn_version)
for visitee in sub_components.values():
# 'drop', 'passthrough', None can be passed as estimators
if isinstance(visitee, str):
continue
for external_version in visitee.external_version.split(','):
external_versions.add(external_version)
return ','.join(list(sorted(external_versions)))
Expand All @@ -781,9 +788,12 @@ def _check_multiple_occurence_of_component_in_flow(
to_visit_stack = [] # type: List[OpenMLFlow]
to_visit_stack.extend(sub_components.values())
known_sub_components = set() # type: Set[str]

while len(to_visit_stack) > 0:
visitee = to_visit_stack.pop()
if visitee.name in known_sub_components:
if isinstance(visitee, str): # 'drop', 'passthrough' can be passed as estimators
known_sub_components.add(visitee)
elif visitee.name in known_sub_components:
raise ValueError('Found a second occurence of component %s when '
'trying to serialize %s.' % (visitee.name, model))
else:
Expand Down Expand Up @@ -820,7 +830,7 @@ def _extract_information_from_model(
def flatten_all(list_):
    """ Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]).

    Only non-empty lists/tuples are descended into; an empty list or tuple
    is yielded as an element itself rather than silently disappearing.
    (The scraped diff left both the old and new ``if`` headers in this span;
    this is the merged, post-change version with the ``len(el) > 0`` guard.)
    """
    for el in list_:
        # Recurse only into non-empty sequences; everything else is a leaf.
        if isinstance(el, (list, tuple)) and len(el) > 0:
            yield from flatten_all(el)
        else:
            yield el
Expand Down Expand Up @@ -850,17 +860,31 @@ def flatten_all(list_):
parameter_value = list() # type: List
reserved_keywords = set(model.get_params(deep=False).keys())

for sub_component_tuple in rval:
for i, sub_component_tuple in enumerate(rval):
identifier = sub_component_tuple[0]
sub_component = sub_component_tuple[1]
sub_component_type = type(sub_component_tuple)
# sub_component_type = type(sub_component_tuple)
if not 2 <= len(sub_component_tuple) <= 3:
# length 2 is for {VotingClassifier.estimators,
# Pipeline.steps, FeatureUnion.transformer_list}
# length 3 is for ColumnTransformer
msg = 'Length of tuple does not match assumptions'
raise ValueError(msg)
if not isinstance(sub_component, (OpenMLFlow, type(None))):

if isinstance(sub_component, str):
if sub_component != 'drop' and sub_component != 'passthrough':
msg = 'Second item of tuple does not match assumptions. ' \
'If string, can be only \'drop\' or \'passthrough\' but' \
'got %s' % sub_component
raise ValueError(msg)
else:
pass
elif isinstance(sub_component, type(None)):
msg = 'Cannot serialize objects of None type. Please use a valid ' \
'placeholder for None. Note that empty sklearn estimators can be '\
'replaced with \'drop\' or \'passthrough\'.'
raise ValueError(msg)
elif not isinstance(sub_component, OpenMLFlow):
msg = 'Second item of tuple does not match assumptions. ' \
'Expected OpenMLFlow, got %s' % type(sub_component)
raise TypeError(msg)
Expand All @@ -873,31 +897,18 @@ def flatten_all(list_):
identifier)
raise PyOpenMLError(msg)

if sub_component is None:
# In a FeatureUnion it is legal to have a None step

pv = [identifier, None]
if sub_component_type is tuple:
parameter_value.append(tuple(pv))
else:
parameter_value.append(pv)

else:
# Add the component to the list of components, add a
# component reference as a placeholder to the list of
# parameters, which will be replaced by the real component
# when deserializing the parameter
sub_components_explicit.add(identifier)
sub_components[identifier] = sub_component
component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]]
component_reference['oml-python:serialized_object'] = 'component_reference'
cr_value = OrderedDict() # type: Dict[str, Any]
cr_value['key'] = identifier
cr_value['step_name'] = identifier
if len(sub_component_tuple) == 3:
cr_value['argument_1'] = sub_component_tuple[2]
component_reference['value'] = cr_value
parameter_value.append(component_reference)
# when deserializing the parameter
sub_components_explicit.add(identifier)
sub_components[identifier] = sub_component
component_reference = OrderedDict() # type: Dict[str, Union[str, Dict]]
component_reference['oml-python:serialized_object'] = 'component_reference'
cr_value = OrderedDict() # type: Dict[str, Any]
cr_value['key'] = identifier
cr_value['step_name'] = identifier
if len(sub_component_tuple) == 3:
cr_value['argument_1'] = sub_component_tuple[2]
component_reference['value'] = cr_value
parameter_value.append(component_reference)

# Here (and in the elif and else branch below) are the only
# places where we encode a value as json to make sure that all
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
import sklearn.preprocessing
import sklearn.tree
import sklearn.cluster

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import openml
from openml.extensions.sklearn import SklearnExtension
Expand Down Expand Up @@ -607,6 +608,8 @@ def test_serialize_column_transformer_pipeline(self):
serialization2 = self.extension.model_to_flow(new_model)
assert_flows_equal(serialization, serialization2)

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.20",
reason="Pipeline processing behaviour updated")
def test_serialize_feature_union(self):
ohe_params = {'sparse': False}
if LooseVersion(sklearn.__version__) >= "0.20":
Expand Down Expand Up @@ -673,16 +676,17 @@ def test_serialize_feature_union(self):
self.assertEqual(new_model_params, fu_params)
new_model.fit(self.X, self.y)

fu.set_params(scaler=None)
fu.set_params(scaler='drop')
serialization = self.extension.model_to_flow(fu)
self.assertEqual(serialization.name,
'sklearn.pipeline.FeatureUnion('
'ohe=sklearn.preprocessing.{}.OneHotEncoder)'
'ohe=sklearn.preprocessing.{}.OneHotEncoder,'
'scaler=drop)'
.format(module_name_encoder))
new_model = self.extension.flow_to_model(serialization)
self.assertEqual(type(new_model), type(fu))
self.assertIsNot(new_model, fu)
self.assertIs(new_model.transformer_list[1][1], None)
self.assertIs(new_model.transformer_list[1][1], 'drop')

def test_serialize_feature_union_switched_names(self):
ohe_params = ({'categories': 'auto'}
Expand Down Expand Up @@ -1776,3 +1780,66 @@ def test_trim_flow_name(self):

self.assertEqual("weka.IsolationForest",
SklearnExtension.trim_flow_name("weka.IsolationForest"))

@unittest.skipIf(LooseVersion(sklearn.__version__) < "0.21",
                 reason="SimpleImputer, ColumnTransformer available only after 0.19 and "
                        "Pipeline till 0.20 doesn't support indexing and 'passthrough'")
def test_run_on_model_with_empty_steps(self):
    """Round-trip a pipeline containing non-actionable placeholder steps.

    Builds a Pipeline that embeds 'passthrough' as a step and (conditionally)
    'drop' inside a ColumnTransformer, serializes it to an OpenML flow via
    ``run_model_on_task``, checks the placeholders survive as plain strings in
    the flow, then deserializes back to a model and fits it.
    NOTE(review): performs network calls (dataset 128 / task 59 from OpenML);
    ``self.extension`` is presumably the SklearnExtension under test — confirm
    against the test class setUp.
    """
    from sklearn.compose import ColumnTransformer
    # testing 'drop', 'passthrough', None as non-actionable sklearn estimators
    dataset = openml.datasets.get_dataset(128)
    task = openml.tasks.get_task(59)

    X, y, categorical_ind, feature_names = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format='array')
    categorical_ind = np.array(categorical_ind)
    # Split feature indices into categorical and continuous columns.
    cat_idx, = np.where(categorical_ind)
    cont_idx, = np.where(~categorical_ind)

    # Preprocessing: impute + encode categoricals, impute + scale continuous.
    clf = make_pipeline(
        ColumnTransformer([('cat', make_pipeline(SimpleImputer(strategy='most_frequent'),
                                                 OneHotEncoder()), cat_idx.tolist()),
                           ('cont', make_pipeline(SimpleImputer(strategy='median'),
                                                  StandardScaler()), cont_idx.tolist())])
    )

    clf = sklearn.pipeline.Pipeline([
        ('dummystep', 'passthrough'),  # adding 'passthrough' as an estimator
        ('prep', clf),
        ('classifier', sklearn.svm.SVC(gamma='auto'))
    ])

    # adding 'drop' to a ColumnTransformer
    # (only when that column group is empty for this dataset)
    if not categorical_ind.any():
        clf[1][0].set_params(cat='drop')
    if not (~categorical_ind).any():
        clf[1][0].set_params(cont='drop')

    # serializing model with non-actionable step
    run, flow = openml.runs.run_model_on_task(model=clf, task=task, return_flow=True)

    # Placeholder steps must be serialized as bare strings, real estimators as flows.
    self.assertEqual(len(flow.components), 3)
    self.assertEqual(flow.components['dummystep'], 'passthrough')
    self.assertTrue(isinstance(flow.components['classifier'], OpenMLFlow))
    self.assertTrue(isinstance(flow.components['prep'], OpenMLFlow))
    self.assertTrue(isinstance(flow.components['prep'].components['columntransformer'],
                               OpenMLFlow))
    self.assertEqual(flow.components['prep'].components['columntransformer'].components['cat'],
                     'drop')

    # de-serializing flow to a model with non-actionable step
    model = self.extension.flow_to_model(flow)
    model.fit(X, y)
    self.assertEqual(type(model), type(clf))
    self.assertNotEqual(model, clf)
    self.assertEqual(len(model.named_steps), 3)
    self.assertEqual(model.named_steps['dummystep'], 'passthrough')

def test_sklearn_serialization_with_none_step(self):
    """A ``None`` pipeline step must be rejected during serialization.

    The extension raises ValueError and points the user at the valid
    placeholders ('drop' / 'passthrough') instead of silently serializing
    ``None``.
    """
    expected_msg = ('Cannot serialize objects of None type. Please use a valid '
                    'placeholder for None. Note that empty sklearn estimators can be '
                    'replaced with \'drop\' or \'passthrough\'.')
    pipeline_with_none = sklearn.pipeline.Pipeline(
        [('dummystep', None),
         ('classifier', sklearn.svm.SVC(gamma='auto'))]
    )
    with self.assertRaisesRegex(ValueError, expected_msg):
        self.extension.model_to_flow(pipeline_with_none)