test_study_examples.py
# License: BSD 3-Clause
from openml.testing import TestBase, SimpleImputer


class TestStudyFunctions(TestBase):
    """Test the example code of Bischl et al. (2018)."""

    _multiprocess_can_split_ = True

    def test_Figure1a(self):
        """Test the listing in Figure 1a on a single task and the old OpenML100 study.

        The original listing is pasted into the comment below because the actual unit
        test differs a bit from it; for example, it runs on only a single task rather
        than on all of them.

        import openml
        import sklearn.tree, sklearn.preprocessing
        benchmark_suite = openml.study.get_study('OpenML-CC18','tasks') # obtain the benchmark suite
        clf = sklearn.pipeline.Pipeline(steps=[('imputer',sklearn.preprocessing.Imputer()), ('estimator',sklearn.tree.DecisionTreeClassifier())]) # build a sklearn classifier
        for task_id in benchmark_suite.tasks: # iterate over all tasks
            task = openml.tasks.get_task(task_id) # download the OpenML task
            X, y = task.get_X_and_y() # get the data (not used in this example)
            openml.config.apikey = 'FILL_IN_OPENML_API_KEY' # set the OpenML Api Key
            run = openml.runs.run_model_on_task(task,clf) # run classifier on splits (requires API key)
            score = run.get_metric_fn(sklearn.metrics.accuracy_score) # print accuracy score
            print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean()))
            run.publish() # publish the experiment on OpenML (optional)
            print('URL for run: %s/run/%d' %(openml.config.server,run.run_id))
        """  # noqa: E501
        import openml
        import sklearn.metrics
        import sklearn.pipeline
        import sklearn.preprocessing
        import sklearn.tree

        benchmark_suite = openml.study.get_study(
            'OpenML100', 'tasks'
        )  # obtain the benchmark suite
        clf = sklearn.pipeline.Pipeline(
            steps=[
                ('imputer', SimpleImputer()),
                ('estimator', sklearn.tree.DecisionTreeClassifier()),
            ]
        )  # build a sklearn classifier
        for task_id in benchmark_suite.tasks[:1]:  # iterate over the first task only
            task = openml.tasks.get_task(task_id)  # download the OpenML task
            X, y = task.get_X_and_y()  # get the data (not used in this example)
            openml.config.apikey = openml.config.apikey  # kept as a no-op to mirror the listing above
            run = openml.runs.run_model_on_task(
                clf, task, avoid_duplicate_runs=False
            )  # run the classifier on the splits (requires an API key)
            score = run.get_metric_fn(
                sklearn.metrics.accuracy_score
            )  # compute the per-fold accuracy scores
            print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean()))
            run.publish()  # publish the experiment on OpenML (optional)
            # register the published run so the test teardown deletes it from the server
            TestBase._mark_entity_for_removal('run', run.run_id)
            TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                                run.run_id))
            print('URL for run: %s/run/%d' % (openml.config.server, run.run_id))
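

# A minimal companion sketch (not part of the upstream test): the same workflow
# written against the newer benchmark-suite API, assuming a version of
# openml-python that provides `openml.study.get_suite`. Guarded so it only runs
# when the file is executed directly, never during test collection.
if __name__ == '__main__':
    import openml
    import sklearn.metrics
    import sklearn.pipeline
    import sklearn.tree

    suite = openml.study.get_suite('OpenML-CC18')  # download the benchmark suite
    clf = sklearn.pipeline.Pipeline(
        steps=[
            ('imputer', SimpleImputer()),
            ('estimator', sklearn.tree.DecisionTreeClassifier()),
        ]
    )
    task = openml.tasks.get_task(suite.tasks[0])  # a single task, as in the test
    run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
    print('Accuracy: %0.2f' % run.get_metric_fn(sklearn.metrics.accuracy_score).mean())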