Skip to content

Commit e87f1c7

Browse files
author
Github Actions
committed
Neeratyoy Mallik: Parallel evaluation of tasks (#1020)
1 parent fd2390b commit e87f1c7

147 files changed

Lines changed: 2586 additions & 1701 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

develop/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 916e7408cab0414f9d499b4504128786
3+
config: 4dbf2556f08259b7ee1a59289245f882
44
tags: 645f666f9bcd5a90fca523b33c5a78b7

develop/_downloads/2fc23bfc18345b110ab68bc5f3939dc8/2018_neurips_perrone_example.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
177177
cat_cols = list_categorical_attributes(flow_type=flow_type)
178178
num_cols = list(set(X.columns) - set(cat_cols))
179179

180-
# Missing value imputers
181-
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
180+
# Missing value imputers for numeric columns
182181
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
183182

184-
# Creating the one-hot encoder
183+
# Creating the one-hot encoder for numerical representation of categorical columns
185184
enc = OneHotEncoder(handle_unknown="ignore")
186185

187-
# Pipeline to handle categorical column transformations
188-
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
189-
190186
# Combining column transformers
191-
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
187+
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
192188

193189
# Creating the full pipeline with the surrogate model
194190
clf = RandomForestRegressor(n_estimators=50)

develop/_downloads/42ecf9b9ca30a385452934aeb1a420d5/2018_neurips_perrone_example.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"# Separating data into categorical and non-categorical (numeric for this example) columns\ncat_cols = list_categorical_attributes(flow_type=flow_type)\nnum_cols = list(set(X.columns) - set(cat_cols))\n\n# Missing value imputers\ncat_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=\"None\")\nnum_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=-1)\n\n# Creating the one-hot encoder\nenc = OneHotEncoder(handle_unknown=\"ignore\")\n\n# Pipeline to handle categorical column transformations\ncat_transforms = Pipeline(steps=[(\"impute\", cat_imputer), (\"encode\", enc)])\n\n# Combining column transformers\nct = ColumnTransformer([(\"cat\", cat_transforms, cat_cols), (\"num\", num_imputer, num_cols)])\n\n# Creating the full pipeline with the surrogate model\nclf = RandomForestRegressor(n_estimators=50)\nmodel = Pipeline(steps=[(\"preprocess\", ct), (\"surrogate\", clf)])"
83+
"# Separating data into categorical and non-categorical (numeric for this example) columns\ncat_cols = list_categorical_attributes(flow_type=flow_type)\nnum_cols = list(set(X.columns) - set(cat_cols))\n\n# Missing value imputers for numeric columns\nnum_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=-1)\n\n# Creating the one-hot encoder for numerical representation of categorical columns\nenc = OneHotEncoder(handle_unknown=\"ignore\")\n\n# Combining column transformers\nct = ColumnTransformer([(\"cat\", enc, cat_cols), (\"num\", num_imputer, num_cols)])\n\n# Creating the full pipeline with the surrogate model\nclf = RandomForestRegressor(n_estimators=50)\nmodel = Pipeline(steps=[(\"preprocess\", ct), (\"surrogate\", clf)])"
8484
]
8585
},
8686
{

develop/_downloads/6b1e091fbd3ac8d106b6552c91cf05cc/run_setup_tutorial.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@
5959
# easy as you want it to be
6060

6161

62-
cat_imp = make_pipeline(
63-
SimpleImputer(strategy="most_frequent"),
64-
OneHotEncoder(handle_unknown="ignore", sparse=False),
65-
TruncatedSVD(),
66-
)
67-
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
62+
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
63+
cont_imp = SimpleImputer(strategy="median")
64+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
6865
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
6966

7067
# Let's change some hyperparameters. Of course, in any good application we

develop/_downloads/9e0617073c8209f15abf91f273871776/flows_and_runs_tutorial.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# License: BSD 3-Clause
99

1010
import openml
11-
import numpy as np
1211
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1312

1413
############################################################################
@@ -54,7 +53,7 @@
5453
task = openml.tasks.get_task(403)
5554

5655
# Build any classifier or pipeline
57-
clf = tree.ExtraTreeClassifier()
56+
clf = tree.DecisionTreeClassifier()
5857

5958
# Run the flow
6059
run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
8382
# ############################
8483
#
8584
# When you need to handle 'dirty' data, build pipelines to model them automatically.
86-
task = openml.tasks.get_task(1)
85+
# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
86+
# `task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical
87+
# variables and missing values in both.
88+
task = openml.tasks.get_task(96)
8789

8890
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
8991
from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
9698
[
9799
(
98100
"categorical",
99-
pipeline.Pipeline(
100-
[
101-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102-
(
103-
"Encoder",
104-
preprocessing.OneHotEncoder(
105-
sparse=False, handle_unknown="ignore"
106-
),
107-
),
108-
]
109-
),
101+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
110102
cat, # returns the categorical feature indices
111103
),
112-
("continuous", "passthrough", cont), # returns the numeric feature indices
104+
(
105+
"continuous",
106+
impute.SimpleImputer(strategy="median"),
107+
cont,
108+
), # returns the numeric feature indices
113109
]
114110
),
115111
),
@@ -146,20 +142,14 @@
146142
[
147143
(
148144
"categorical",
149-
pipeline.Pipeline(
150-
[
151-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152-
(
153-
"Encoder",
154-
preprocessing.OneHotEncoder(
155-
sparse=False, handle_unknown="ignore"
156-
),
157-
),
158-
]
159-
),
145+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
160146
categorical_feature_indices,
161147
),
162-
("continuous", "passthrough", numeric_feature_indices),
148+
(
149+
"continuous",
150+
impute.SimpleImputer(strategy="median"),
151+
numeric_feature_indices,
152+
),
163153
]
164154
),
165155
),
@@ -182,7 +172,9 @@
182172
task = openml.tasks.get_task(6)
183173

184174
# The following lines can then be executed offline:
185-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
175+
run = openml.runs.run_model_on_task(
176+
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
177+
)
186178

187179
# The run may be stored offline, and the flow will be stored along with it:
188180
run.to_filesystem(directory="myrun")
Binary file not shown.

develop/_downloads/d2ac26453b3a61789aac1d07c7d96d8f/flows_and_runs_tutorial.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# License: BSD 3-Clause\n\nimport openml\nimport numpy as np\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree"
29+
"# License: BSD 3-Clause\n\nimport openml\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree"
3030
]
3131
},
3232
{
@@ -80,7 +80,7 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.ExtraTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)"
83+
"# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.DecisionTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)"
8484
]
8585
},
8686
{
@@ -123,7 +123,7 @@
123123
"cell_type": "markdown",
124124
"metadata": {},
125125
"source": [
126-
"### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model then automatically.\n\n"
126+
"### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model them automatically.\nTo demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via\n`task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical\nvariables and missing values in both.\n\n"
127127
]
128128
},
129129
{
@@ -134,7 +134,7 @@
134134
},
135135
"outputs": [],
136136
"source": [
137-
"task = openml.tasks.get_task(1)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n cat, # returns the categorical feature indices\n ),\n (\"continuous\", \"passthrough\", cont), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n categorical_feature_indices,\n ),\n (\"continuous\", \"passthrough\", numeric_feature_indices),\n ]\n ),\n ),\n (\"Classifier\", 
ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))"
137+
"task = openml.tasks.get_task(96)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n cat, # returns the categorical feature indices\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n cont,\n ), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n categorical_feature_indices,\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n numeric_feature_indices,\n ),\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = run.publish()\nprint(\"Uploaded to 
http://test.openml.org/r/\" + str(myrun.run_id))"
138138
]
139139
},
140140
{
@@ -152,7 +152,7 @@
152152
},
153153
"outputs": [],
154154
"source": [
155-
"# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server."
155+
"# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(\n pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format=\"array\",\n)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server."
156156
]
157157
},
158158
{

develop/_downloads/e981986e86f01c0fbdffc604d78a6bf3/run_setup_tutorial.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
},
4545
"outputs": [],
4646
"source": [
47-
"# first, let's download the task that we are interested in\ntask = openml.tasks.get_task(6)\n\n\n# we will create a fairly complex model, with many preprocessing components and\n# many potential hyperparameters. Of course, the model can be as complex and as\n# easy as you want it to be\n\n\ncat_imp = make_pipeline(\n SimpleImputer(strategy=\"most_frequent\"),\n OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n TruncatedSVD(),\n)\nct = ColumnTransformer([(\"cat\", cat_imp, cat), (\"cont\", \"passthrough\", cont)])\nmodel_original = Pipeline(steps=[(\"transform\", ct), (\"estimator\", RandomForestClassifier()),])\n\n# Let's change some hyperparameters. Of course, in any good application we\n# would tune them using, e.g., Random Search or Bayesian Optimization, but for\n# the purpose of this tutorial we set them to some specific values that might\n# or might not be optimal\nhyperparameters_original = {\n \"estimator__criterion\": \"gini\",\n \"estimator__n_estimators\": 50,\n \"estimator__max_depth\": 10,\n \"estimator__min_samples_leaf\": 1,\n}\nmodel_original.set_params(**hyperparameters_original)\n\n# solve the task and upload the result (this implicitly creates the flow)\nrun = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)\nrun_original = run.publish() # this implicitly uploads the flow"
47+
"# first, let's download the task that we are interested in\ntask = openml.tasks.get_task(6)\n\n\n# we will create a fairly complex model, with many preprocessing components and\n# many potential hyperparameters. Of course, the model can be as complex and as\n# easy as you want it to be\n\n\ncat_imp = make_pipeline(OneHotEncoder(handle_unknown=\"ignore\", sparse=False), TruncatedSVD(),)\ncont_imp = SimpleImputer(strategy=\"median\")\nct = ColumnTransformer([(\"cat\", cat_imp, cat), (\"cont\", cont_imp, cont)])\nmodel_original = Pipeline(steps=[(\"transform\", ct), (\"estimator\", RandomForestClassifier()),])\n\n# Let's change some hyperparameters. Of course, in any good application we\n# would tune them using, e.g., Random Search or Bayesian Optimization, but for\n# the purpose of this tutorial we set them to some specific values that might\n# or might not be optimal\nhyperparameters_original = {\n \"estimator__criterion\": \"gini\",\n \"estimator__n_estimators\": 50,\n \"estimator__max_depth\": 10,\n \"estimator__min_samples_leaf\": 1,\n}\nmodel_original.set_params(**hyperparameters_original)\n\n# solve the task and upload the result (this implicitly creates the flow)\nrun = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)\nrun_original = run.publish() # this implicitly uploads the flow"
4848
]
4949
},
5050
{
Binary file not shown.
1.55 KB
Loading

0 commit comments

Comments
 (0)