|
26 | 26 | }, |
27 | 27 | "outputs": [], |
28 | 28 | "source": [ |
29 | | - "# License: BSD 3-Clause\n\nimport openml\nimport numpy as np\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree" |
| 29 | + "# License: BSD 3-Clause\n\nimport openml\nfrom sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree" |
30 | 30 | ] |
31 | 31 | }, |
32 | 32 | { |
|
80 | 80 | }, |
81 | 81 | "outputs": [], |
82 | 82 | "source": [ |
83 | | - "# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.ExtraTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)" |
| 83 | + "# Get a task\ntask = openml.tasks.get_task(403)\n\n# Build any classifier or pipeline\nclf = tree.DecisionTreeClassifier()\n\n# Run the flow\nrun = openml.runs.run_model_on_task(clf, task)\n\nprint(run)" |
84 | 84 | ] |
85 | 85 | }, |
86 | 86 | { |
|
123 | 123 | "cell_type": "markdown", |
124 | 124 | "metadata": {}, |
125 | 125 | "source": [ |
126 | | - "### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model then automatically.\n\n" |
| 126 | + "### It also works with pipelines\n\nWhen you need to handle 'dirty' data, build pipelines to model them automatically.\nTo demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via\n`task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical\nvariables and missing values in both.\n\n" |
127 | 127 | ] |
128 | 128 | }, |
129 | 129 | { |
|
134 | 134 | }, |
135 | 135 | "outputs": [], |
136 | 136 | "source": [ |
137 | | - "task = openml.tasks.get_task(1)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n cat, # returns the categorical feature indices\n ),\n (\"continuous\", \"passthrough\", cont), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n pipeline.Pipeline(\n [\n (\"Imputer\", impute.SimpleImputer(strategy=\"most_frequent\")),\n (\n \"Encoder\",\n preprocessing.OneHotEncoder(\n sparse=False, handle_unknown=\"ignore\"\n ),\n ),\n ]\n ),\n categorical_feature_indices,\n ),\n (\"continuous\", \"passthrough\", numeric_feature_indices),\n ]\n ),\n ),\n (\"Classifier\", 
ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))" |
| 137 | + "task = openml.tasks.get_task(96)\n\n# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines\nfrom openml.extensions.sklearn import cat, cont\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n cat, # returns the categorical feature indices\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n cont,\n ), # returns the numeric feature indices\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)\nmyrun = run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))\n\n\n# The above pipeline works with the helper functions that internally deal with pandas DataFrame.\n# In the case, pandas is not available, or a NumPy based data processing is the requirement, the\n# above pipeline is presented below to work with NumPy.\n\n# Extracting the indices of the categorical columns\nfeatures = task.get_dataset().features\ncategorical_feature_indices = []\nnumeric_feature_indices = []\nfor i in range(len(features)):\n if features[i].name == task.target_name:\n continue\n if features[i].data_type == \"nominal\":\n categorical_feature_indices.append(i)\n else:\n numeric_feature_indices.append(i)\n\npipe = pipeline.Pipeline(\n steps=[\n (\n \"Preprocessing\",\n compose.ColumnTransformer(\n [\n (\n \"categorical\",\n preprocessing.OneHotEncoder(sparse=False, handle_unknown=\"ignore\"),\n categorical_feature_indices,\n ),\n (\n \"continuous\",\n impute.SimpleImputer(strategy=\"median\"),\n numeric_feature_indices,\n ),\n ]\n ),\n ),\n (\"Classifier\", ensemble.RandomForestClassifier(n_estimators=10)),\n ]\n)\n\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format=\"array\")\nmyrun = 
run.publish()\nprint(\"Uploaded to http://test.openml.org/r/\" + str(myrun.run_id))" |
138 | 138 | ] |
139 | 139 | }, |
140 | 140 | { |
|
152 | 152 | }, |
153 | 153 | "outputs": [], |
154 | 154 | "source": [ |
155 | | - "# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server." |
| 155 | + "# To perform the following line offline, it is required to have been called before\n# such that the task is cached on the local openml cache directory:\ntask = openml.tasks.get_task(6)\n\n# The following lines can then be executed offline:\nrun = openml.runs.run_model_on_task(\n pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format=\"array\",\n)\n\n# The run may be stored offline, and the flow will be stored along with it:\nrun.to_filesystem(directory=\"myrun\")\n\n# They may be loaded and uploaded at a later time\nrun = openml.runs.OpenMLRun.from_filesystem(directory=\"myrun\")\nrun.publish()\n\n# Publishing the run will automatically upload the related flow if\n# it does not yet exist on the server." |
156 | 156 | ] |
157 | 157 | }, |
158 | 158 | { |
|
0 commit comments