Skip to content

Commit e87f1c7

Browse files
author
Github Actions
committed
Neeratyoy Mallik: Parallel evaluation of tasks (#1020)
1 parent fd2390b commit e87f1c7

147 files changed

Lines changed: 2586 additions & 1701 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

develop/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config: 916e7408cab0414f9d499b4504128786
3+
config: 4dbf2556f08259b7ee1a59289245f882
44
tags: 645f666f9bcd5a90fca523b33c5a78b7

develop/_downloads/2fc23bfc18345b110ab68bc5f3939dc8/2018_neurips_perrone_example.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
177177
cat_cols = list_categorical_attributes(flow_type=flow_type)
178178
num_cols = list(set(X.columns) - set(cat_cols))
179179

180-
# Missing value imputers
181-
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
180+
# Missing value imputers for numeric columns
182181
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
183182

184-
# Creating the one-hot encoder
183+
# Creating the one-hot encoder for numerical representation of categorical columns
185184
enc = OneHotEncoder(handle_unknown="ignore")
186185

187-
# Pipeline to handle categorical column transformations
188-
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
189-
190186
# Combining column transformers
191-
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
187+
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
192188

193189
# Creating the full pipeline with the surrogate model
194190
clf = RandomForestRegressor(n_estimators=50)

develop/_downloads/42ecf9b9ca30a385452934aeb1a420d5/2018_neurips_perrone_example.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"# Separating data into categorical and non-categorical (numeric for this example) columns\ncat_cols = list_categorical_attributes(flow_type=flow_type)\nnum_cols = list(set(X.columns) - set(cat_cols))\n\n# Missing value imputers\ncat_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=\"None\")\nnum_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=-1)\n\n# Creating the one-hot encoder\nenc = OneHotEncoder(handle_unknown=\"ignore\")\n\n# Pipeline to handle categorical column transformations\ncat_transforms = Pipeline(steps=[(\"impute\", cat_imputer), (\"encode\", enc)])\n\n# Combining column transformers\nct = ColumnTransformer([(\"cat\", cat_transforms, cat_cols), (\"num\", num_imputer, num_cols)])\n\n# Creating the full pipeline with the surrogate model\nclf = RandomForestRegressor(n_estimators=50)\nmodel = Pipeline(steps=[(\"preprocess\", ct), (\"surrogate\", clf)])"
83+
"# Separating data into categorical and non-categorical (numeric for this example) columns\ncat_cols = list_categorical_attributes(flow_type=flow_type)\nnum_cols = list(set(X.columns) - set(cat_cols))\n\n# Missing value imputers for numeric columns\nnum_imputer = SimpleImputer(missing_values=np.nan, strategy=\"constant\", fill_value=-1)\n\n# Creating the one-hot encoder for numerical representation of categorical columns\nenc = OneHotEncoder(handle_unknown=\"ignore\")\n\n# Combining column transformers\nct = ColumnTransformer([(\"cat\", enc, cat_cols), (\"num\", num_imputer, num_cols)])\n\n# Creating the full pipeline with the surrogate model\nclf = RandomForestRegressor(n_estimators=50)\nmodel = Pipeline(steps=[(\"preprocess\", ct), (\"surrogate\", clf)])"
8484
]
8585
},
8686
{

develop/_downloads/6b1e091fbd3ac8d106b6552c91cf05cc/run_setup_tutorial.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@
5959
# easy as you want it to be
6060

6161

62-
cat_imp = make_pipeline(
63-
SimpleImputer(strategy="most_frequent"),
64-
OneHotEncoder(handle_unknown="ignore", sparse=False),
65-
TruncatedSVD(),
66-
)
67-
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
62+
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
63+
cont_imp = SimpleImputer(strategy="median")
64+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
6865
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
6966

7067
# Let's change some hyperparameters. Of course, in any good application we

develop/_downloads/9e0617073c8209f15abf91f273871776/flows_and_runs_tutorial.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# License: BSD 3-Clause
99

1010
import openml
11-
import numpy as np
1211
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1312

1413
############################################################################
@@ -54,7 +53,7 @@
5453
task = openml.tasks.get_task(403)
5554

5655
# Build any classifier or pipeline
57-
clf = tree.ExtraTreeClassifier()
56+
clf = tree.DecisionTreeClassifier()
5857

5958
# Run the flow
6059
run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
8382
# ############################
8483
#
8584
# When you need to handle 'dirty' data, build pipelines to model them automatically.
86-
task = openml.tasks.get_task(1)
85+
# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
86+
# `task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical
87+
# variables and missing values in both.
88+
task = openml.tasks.get_task(96)
8789

8890
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
8991
from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
9698
[
9799
(
98100
"categorical",
99-
pipeline.Pipeline(
100-
[
101-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102-
(
103-
"Encoder",
104-
preprocessing.OneHotEncoder(
105-
sparse=False, handle_unknown="ignore"
106-
),
107-
),
108-
]
109-
),
101+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
110102
cat, # returns the categorical feature indices
111103
),
112-
("continuous", "passthrough", cont), # returns the numeric feature indices
104+
(
105+
"continuous",
106+
impute.SimpleImputer(strategy="median"),
107+
cont,
108+
), # returns the numeric feature indices
113109
]
114110
),
115111
),
@@ -146,20 +142,14 @@
146142
[
147143
(
148144
"categorical",
149-
pipeline.Pipeline(
150-
[
151-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152-
(
153-
"Encoder",
154-
preprocessing.OneHotEncoder(
155-
sparse=False, handle_unknown="ignore"
156-
),
157-
),
158-
]
159-
),
145+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
160146
categorical_feature_indices,
161147
),
162-
("continuous", "passthrough", numeric_feature_indices),
148+
(
149+
"continuous",
150+
impute.SimpleImputer(strategy="median"),
151+
numeric_feature_indices,
152+
),
163153
]
164154
),
165155
),
@@ -182,7 +172,9 @@
182172
task = openml.tasks.get_task(6)
183173

184174
# The following lines can then be executed offline:
185-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
175+
run = openml.runs.run_model_on_task(
176+
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
177+
)
186178

187179
# The run may be stored offline, and the flow will be stored along with it:
188180
run.to_filesystem(directory="myrun")
Binary file not shown.

develop/_downloads/d2ac26453b3a61789aac1d07c7d96d8f/flows_and_runs_tutorial.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
},
2727
"outputs": [],
2828
"source": [
29-
"# License: BSD 3-Clause\n\nimport openml\nimport numpy as np\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree"
29+
"# License: BSD 3-Clause\n\nimport openml\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree"
3030
]
3131
},
3232
{
@@ -80,7 +80,7 @@
8080
},
8181
"outputs": [],
8282
"source": [
83-
"# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.ExtraTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)"
83+
"# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.DecisionTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)"
8484
]
8585
},
8686
{
@@ -123,7 +123,7 @@
123123
"cell_type": "markdown",
124124
"metadata": {},
125125
"source": [
126-
"### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model then automatically.\n\n"
126+
"### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model them automatically.\nTo demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via\n`task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical\nvariables and missing values in both.\n\n"
127127
]
128128
},
129129
{
@@ -134,7 +134,7 @@
134134
},
135135
"outputs": [],
136136
"source": [
137-
"task = openml.tasks.get_task(1)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n cat, # returns the categorical feature indices\n ),\n (\"continuous\", \"passthrough\", cont), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n categorical_feature_indices,\n ),\n (\"continuous\", \"passthrough\", numeric_feature_indices),\n ]\n ),\n ),\n (\"Classifier\", 
ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))"
137+
"task = openml.tasks.get_task(96)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n cat, # returns the categorical feature indices\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n cont,\n ), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n categorical_feature_indices,\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n numeric_feature_indices,\n ),\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = run.publish()\nprint(\"Uploaded to 
http://test.openml.org/r/\" + str(myrun.run_id))"
138138
]
139139
},
140140
{
@@ -152,7 +152,7 @@
152152
},
153153
"outputs": [],
154154
"source": [
155-
"# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server."
155+
"# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(\n pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format=\"array\",\n)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server."
156156
]
157157
},
158158
{

develop/_downloads/e981986e86f01c0fbdffc604d78a6bf3/run_setup_tutorial.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
},
4545
"outputs": [],
4646
"source": [
47-
"# first, let's download the task that we are interested in\ntask = openml.tasks.get_task(6)\n\n\n# we will create a fairly complex model, with many preprocessing components and\n# many potential hyperparameters. Of course, the model can be as complex and as\n# easy as you want it to be\n\n\ncat_imp = make_pipeline(\n SimpleImputer(strategy=\"most_frequent\"),\n OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n TruncatedSVD(),\n)\nct = ColumnTransformer([(\"cat\", cat_imp, cat), (\"cont\", \"passthrough\", cont)])\nmodel_original = Pipeline(steps=[(\"transform\", ct), (\"estimator\", RandomForestClassifier()),])\n\n# Let's change some hyperparameters. Of course, in any good application we\n# would tune them using, e.g., Random Search or Bayesian Optimization, but for\n# the purpose of this tutorial we set them to some specific values that might\n# or might not be optimal\nhyperparameters_original = {\n \"estimator__criterion\": \"gini\",\n \"estimator__n_estimators\": 50,\n \"estimator__max_depth\": 10,\n \"estimator__min_samples_leaf\": 1,\n}\nmodel_original.set_params(**hyperparameters_original)\n\n# solve the task and upload the result (this implicitly creates the flow)\nrun = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)\nrun_original = run.publish() # this implicitly uploads the flow"
47+
"# first, let's download the task that we are interested in\ntask = openml.tasks.get_task(6)\n\n\n# we will create a fairly complex model, with many preprocessing components and\n# many potential hyperparameters. Of course, the model can be as complex and as\n# easy as you want it to be\n\n\ncat_imp = make_pipeline(OneHotEncoder(handle_unknown=\"ignore\", sparse=False), TruncatedSVD(),)\ncont_imp = SimpleImputer(strategy=\"median\")\nct = ColumnTransformer([(\"cat\", cat_imp, cat), (\"cont\", cont_imp, cont)])\nmodel_original = Pipeline(steps=[(\"transform\", ct), (\"estimator\", RandomForestClassifier()),])\n\n# Let's change some hyperparameters. Of course, in any good application we\n# would tune them using, e.g., Random Search or Bayesian Optimization, but for\n# the purpose of this tutorial we set them to some specific values that might\n# or might not be optimal\nhyperparameters_original = {\n \"estimator__criterion\": \"gini\",\n \"estimator__n_estimators\": 50,\n \"estimator__max_depth\": 10,\n \"estimator__min_samples_leaf\": 1,\n}\nmodel_original.set_params(**hyperparameters_original)\n\n# solve the task and upload the result (this implicitly creates the flow)\nrun = openml.runs.run_model_on_task(model_original, task, avoid_duplicate_runs=False)\nrun_original = run.publish() # this implicitly uploads the flow"
4848
]
4949
},
5050
{
Binary file not shown.
1.55 KB
Loading

0 commit comments

Comments
 (0)