Skip to content

Commit 902cd3f

Browse files
committed
Updating with PR #982
2 parents ac173aa + 1be82c3 commit 902cd3f

File tree

8 files changed

+153
-56
lines changed

8 files changed

+153
-56
lines changed

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 69 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import openml
1111
import numpy as np
12-
from sklearn import compose, ensemble, neighbors, preprocessing, pipeline, tree
12+
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1313

1414
############################################################################
1515
# Train machine learning models
@@ -38,13 +38,9 @@
3838
X, y, categorical_indicator, attribute_names = dataset.get_data(
3939
dataset_format="array", target=dataset.default_target_attribute
4040
)
41-
numerical_indicator = list(~np.array(categorical_indicator))
4241
print(f"Categorical features: {categorical_indicator}")
4342
transformer = compose.ColumnTransformer(
44-
[
45-
("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"), categorical_indicator),
46-
("numeric_pass", "passthrough", numerical_indicator),
47-
]
43+
[("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"), categorical_indicator)]
4844
)
4945
X = transformer.fit_transform(X)
5046
clf.fit(X, y)
@@ -88,17 +84,9 @@
8884
#
8985
# When you need to handle 'dirty' data, build pipelines to model them automatically.
9086
task = openml.tasks.get_task(1)
91-
features = task.get_dataset().features
92-
nominal_feature_indices = [
93-
i
94-
for i in range(len(features))
95-
if features[i].name != task.target_name and features[i].data_type == "nominal"
96-
]
97-
numeric_feature_indices = [
98-
i
99-
for i in range(len(features))
100-
if features[i].name != task.target_name and features[i].data_type == "numeric"
101-
]
87+
88+
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
89+
from openml.extensions.sklearn import cat, cont
10290

10391
pipe = pipeline.Pipeline(
10492
steps=[
@@ -107,11 +95,21 @@
10795
compose.ColumnTransformer(
10896
[
10997
(
110-
"Nominal",
111-
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore",),
112-
nominal_feature_indices,
98+
"categorical",
99+
pipeline.Pipeline(
100+
[
101+
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102+
(
103+
"Encoder",
104+
preprocessing.OneHotEncoder(
105+
sparse=False, handle_unknown="ignore"
106+
),
107+
),
108+
]
109+
),
110+
cat, # returns the categorical feature indices
113111
),
114-
("Numeric", "passthrough", numeric_feature_indices,),
112+
("continuous", "passthrough", cont), # returns the numeric feature indices
115113
]
116114
),
117115
),
@@ -123,6 +121,56 @@
123121
myrun = run.publish()
124122
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
125123

124+
125+
# The above pipeline works with the helper functions that internally deal with pandas DataFrame.
126+
# In case pandas is not available, or NumPy-based data processing is required, the
127+
# above pipeline is presented below to work with NumPy.
128+
129+
# Extracting the indices of the categorical columns
130+
features = task.get_dataset().features
131+
categorical_feature_indices = []
132+
numeric_feature_indices = []
133+
for i in range(len(features)):
134+
if features[i].name == task.target_name:
135+
continue
136+
if features[i].data_type == "nominal":
137+
categorical_feature_indices.append(i)
138+
else:
139+
numeric_feature_indices.append(i)
140+
141+
pipe = pipeline.Pipeline(
142+
steps=[
143+
(
144+
"Preprocessing",
145+
compose.ColumnTransformer(
146+
[
147+
(
148+
"categorical",
149+
pipeline.Pipeline(
150+
[
151+
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152+
(
153+
"Encoder",
154+
preprocessing.OneHotEncoder(
155+
sparse=False, handle_unknown="ignore"
156+
),
157+
),
158+
]
159+
),
160+
categorical_feature_indices,
161+
),
162+
("continuous", "passthrough", numeric_feature_indices),
163+
]
164+
),
165+
),
166+
("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
167+
]
168+
)
169+
170+
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
171+
myrun = run.publish()
172+
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
173+
126174
###############################################################################
127175
# Running flows on tasks offline for later upload
128176
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

examples/30_extended/run_setup_tutorial.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,14 @@
3434

3535
import numpy as np
3636
import openml
37+
from openml.extensions.sklearn import cat, cont
38+
3739
from sklearn.pipeline import make_pipeline, Pipeline
3840
from sklearn.compose import ColumnTransformer
41+
from sklearn.impute import SimpleImputer
3942
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
4043
from sklearn.ensemble import RandomForestClassifier
44+
from sklearn.decomposition import TruncatedSVD
4145

4246

4347
openml.config.start_using_configuration_for_example()
@@ -55,18 +59,12 @@
5559
# easy as you want it to be
5660

5761

58-
# Helper functions to return required columns for ColumnTransformer
59-
def cont(X):
60-
return X.dtypes != "category"
61-
62-
63-
def cat(X):
64-
return X.dtypes == "category"
65-
66-
67-
ct = ColumnTransformer(
68-
[("cat", OneHotEncoder(handle_unknown="ignore"), cat), ("cont", "passthrough", cont)]
62+
cat_imp = make_pipeline(
63+
SimpleImputer(strategy="most_frequent"),
64+
OneHotEncoder(handle_unknown="ignore", sparse=False),
65+
TruncatedSVD(),
6966
)
67+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
7068
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
7169

7270
# Let's change some hyperparameters. Of course, in any good application we

openml/extensions/sklearn/__init__.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,31 @@
77
__all__ = ["SklearnExtension"]
88

99
register_extension(SklearnExtension)
10+
11+
12+
def cont(X):
13+
"""Returns True for all non-categorical columns, False for the rest.
14+
15+
This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
16+
of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
17+
required to process each type of columns separately.
18+
This function allows transformations meant for continuous/numeric columns to access the
19+
continuous/numeric columns given the dataset as DataFrame.
20+
"""
21+
if not hasattr(X, "dtypes"):
22+
raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
23+
return X.dtypes != "category"
24+
25+
26+
def cat(X):
27+
"""Returns True for all categorical columns, False for the rest.
28+
29+
This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
30+
of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
31+
required to process each type of columns separately.
32+
This function allows transformations meant for categorical columns to access the
33+
categorical columns given the dataset as DataFrame.
34+
"""
35+
if not hasattr(X, "dtypes"):
36+
raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
37+
return X.dtypes == "category"

openml/runs/functions.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import sklearn.metrics
1212
import xmltodict
13+
import numpy as np
1314
import pandas as pd
1415

1516
import openml
@@ -508,7 +509,9 @@ def _calculate_local_measure(sklearn_fn, openml_name):
508509
for i, tst_idx in enumerate(test_indices):
509510
if task.class_labels is not None:
510511
prediction = (
511-
task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i]
512+
task.class_labels[pred_y[i]]
513+
if isinstance(pred_y[i], (int, np.integer))
514+
else pred_y[i]
512515
)
513516
if isinstance(test_y, pd.Series):
514517
test_prediction = (
@@ -519,7 +522,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
519522
else:
520523
test_prediction = (
521524
task.class_labels[test_y[i]]
522-
if isinstance(test_y[i], int)
525+
if isinstance(test_y[i], (int, np.integer))
523526
else test_y[i]
524527
)
525528
pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i]

openml/testing.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -318,12 +318,4 @@ class CustomImputer(SimpleImputer):
318318
pass
319319

320320

321-
def cont(X):
322-
return X.dtypes != "category"
323-
324-
325-
def cat(X):
326-
return X.dtypes == "category"
327-
328-
329-
__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont", "check_task_existence"]
321+
__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"]

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@
4040
from openml.flows import OpenMLFlow
4141
from openml.flows.functions import assert_flows_equal
4242
from openml.runs.trace import OpenMLRunTrace
43-
from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
43+
from openml.testing import TestBase, SimpleImputer, CustomImputer
44+
from openml.extensions.sklearn import cat, cont
4445

4546

4647
this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -2195,16 +2196,6 @@ def test_failed_serialization_of_custom_class(self):
21952196
# for lower versions
21962197
from sklearn.preprocessing import Imputer as SimpleImputer
21972198

2198-
class CustomImputer(SimpleImputer):
2199-
pass
2200-
2201-
def cont(X):
2202-
return X.dtypes != "category"
2203-
2204-
def cat(X):
2205-
return X.dtypes == "category"
2206-
2207-
import sklearn.metrics
22082199
import sklearn.tree
22092200
from sklearn.pipeline import Pipeline, make_pipeline
22102201
from sklearn.compose import ColumnTransformer
@@ -2227,3 +2218,38 @@ def cat(X):
22272218
raise AttributeError(e)
22282219
else:
22292220
raise Exception(e)
2221+
2222+
@unittest.skipIf(
2223+
LooseVersion(sklearn.__version__) < "0.20",
2224+
reason="columntransformer introduction in 0.20.0",
2225+
)
2226+
def test_setupid_with_column_transformer(self):
2227+
"""Test to check if inclusion of ColumnTransformer in a pipeline is treated as a new
2228+
flow each time.
2229+
"""
2230+
import sklearn.compose
2231+
from sklearn.svm import SVC
2232+
2233+
def column_transformer_pipe(task_id):
2234+
task = openml.tasks.get_task(task_id)
2235+
# make columntransformer
2236+
preprocessor = sklearn.compose.ColumnTransformer(
2237+
transformers=[
2238+
("num", StandardScaler(), cont),
2239+
("cat", OneHotEncoder(handle_unknown="ignore"), cat),
2240+
]
2241+
)
2242+
# make pipeline
2243+
clf = SVC(gamma="scale", random_state=1)
2244+
pipe = make_pipeline(preprocessor, clf)
2245+
# run task
2246+
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
2247+
run.publish()
2248+
new_run = openml.runs.get_run(run.run_id)
2249+
return new_run
2250+
2251+
run1 = column_transformer_pipe(11) # only categorical
2252+
TestBase._mark_entity_for_removal("run", run1.run_id)
2253+
run2 = column_transformer_pipe(23) # only numeric
2254+
TestBase._mark_entity_for_removal("run", run2.run_id)
2255+
self.assertEqual(run1.setup_id, run2.setup_id)

tests/test_runs/test_run_functions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
import pandas as pd
2121

2222
import openml.extensions.sklearn
23-
from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
23+
from openml.testing import TestBase, SimpleImputer, CustomImputer
24+
from openml.extensions.sklearn import cat, cont
2425
from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction
2526
from openml.runs.trace import OpenMLRunTrace
2627
from openml.tasks import TaskType

tests/test_study/test_study_examples.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# License: BSD 3-Clause
22

3-
from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
3+
from openml.testing import TestBase, SimpleImputer, CustomImputer
4+
from openml.extensions.sklearn import cat, cont
45

56
import sklearn
67
import unittest

0 commit comments

Comments
 (0)