|
9 | 9 |
|
10 | 10 | import openml |
11 | 11 | import numpy as np |
12 | | -from sklearn import compose, ensemble, neighbors, preprocessing, pipeline, tree |
| 12 | +from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree |
13 | 13 |
|
14 | 14 | ############################################################################ |
15 | 15 | # Train machine learning models |
|
# Fetch the dataset as numeric arrays and one-hot encode its categorical
# features before fitting the classifier defined above.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
# NOTE(review): ColumnTransformer drops unselected columns by default, so only
# the one-hot encoded categorical columns survive this transform — confirm that
# discarding the numeric columns is intended for this dataset.
transformer = compose.ColumnTransformer(
    [("one_hot_encoder", preprocessing.OneHotEncoder(categories="auto"), categorical_indicator)]
)
X = transformer.fit_transform(X)
clf.fit(X, y)
|
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
task = openml.tasks.get_task(1)

# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
from openml.extensions.sklearn import cat, cont

|
103 | 91 | pipe = pipeline.Pipeline( |
104 | 92 | steps=[ |
|
107 | 95 | compose.ColumnTransformer( |
108 | 96 | [ |
109 | 97 | ( |
110 | | - "Nominal", |
111 | | - preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore",), |
112 | | - nominal_feature_indices, |
| 98 | + "categorical", |
| 99 | + pipeline.Pipeline( |
| 100 | + [ |
| 101 | + ("Imputer", impute.SimpleImputer(strategy="most_frequent")), |
| 102 | + ( |
| 103 | + "Encoder", |
| 104 | + preprocessing.OneHotEncoder( |
| 105 | + sparse=False, handle_unknown="ignore" |
| 106 | + ), |
| 107 | + ), |
| 108 | + ] |
| 109 | + ), |
| 110 | + cat, # returns the categorical feature indices |
113 | 111 | ), |
114 | | - ("Numeric", "passthrough", numeric_feature_indices,), |
| 112 | + ("continuous", "passthrough", cont), # returns the numeric feature indices |
115 | 113 | ] |
116 | 114 | ), |
117 | 115 | ), |
|
123 | 121 | myrun = run.publish() |
124 | 122 | print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) |
125 | 123 |
|

# The above pipeline works with the helper functions that internally deal with pandas DataFrame.
# In case pandas is not available, or NumPy-based data processing is required, the same
# pipeline is shown below working directly with NumPy column indices.

# Split the feature positions into categorical and numeric groups, skipping the
# target column itself.  Any non-"nominal" feature is treated as numeric.
features = task.get_dataset().features
categorical_feature_indices = [
    i
    for i in range(len(features))
    if features[i].name != task.target_name and features[i].data_type == "nominal"
]
numeric_feature_indices = [
    i
    for i in range(len(features))
    if features[i].name != task.target_name and features[i].data_type != "nominal"
]

| 140 | + |
# Categorical columns: impute missing values with the most frequent category,
# then one-hot encode (unknown categories at predict time are ignored).
_categorical_steps = pipeline.Pipeline(
    [
        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
        (
            "Encoder",
            preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
        ),
    ]
)

# Same model as the helper-based pipeline, but driven by explicit NumPy column
# indices instead of the OpenML `cat`/`cont` selectors.
pipe = pipeline.Pipeline(
    steps=[
        (
            "Preprocessing",
            compose.ColumnTransformer(
                [
                    ("categorical", _categorical_steps, categorical_feature_indices),
                    ("continuous", "passthrough", numeric_feature_indices),
                ]
            ),
        ),
        ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
    ]
)

| 169 | + |
# Execute the NumPy-based pipeline on the task and publish the result to the
# test server.  `dataset_format="array"` forces NumPy input instead of pandas.
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
myrun = run.publish()
print(f"Uploaded to http://test.openml.org/r/{myrun.run_id}")

126 | 174 | ############################################################################### |
127 | 175 | # Running flows on tasks offline for later upload |
128 | 176 | # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
|
0 commit comments